C++ library to handle BagIt structures. BagIt is a standard format for creating transfer packages for digital preservation purposes. See https://en.wikipedia.org/wiki/BagIt for details. http://andreas-romeyke.de
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

297 lines
9.3 KiB

  1. #include "bag.hpp"
  2. #include <boost/filesystem.hpp>
  3. #include <fstream>
  4. #include <iostream>
  5. #include "payload.hpp"
  6. #include <sstream>
  7. #include <string>
  8. //#include <filesystem> // c++17
  9. //namespace fs = std::filesystem;
  10. namespace fs = boost::filesystem;
  11. using namespace std;
  12. Bag::Bag( string dfname ) {
  13. // log << "load constructor (" << dfname << ")" << endl;
  14. // read in file bagit.txt
  15. string bagit_txt_path = dfname + "bagit.txt";
  16. //log << "parse " << bagit_txt_path << endl;
  17. ifstream bagit_txt_file;
  18. bagit_txt_file.open( bagit_txt_path );
  19. if (bagit_txt_file.is_open()) {
  20. string version_line;
  21. string utf8_line;
  22. getline(bagit_txt_file, version_line);
  23. getline(bagit_txt_file, utf8_line);
  24. bagit_txt_file.close();
  25. stringstream version_ss ( version_line );
  26. string major;
  27. string minor;
  28. string vprefix;
  29. getline(version_ss, vprefix, ' ');
  30. getline(version_ss, major ,'.');
  31. getline(version_ss, minor, '.');
  32. if (0 != vprefix.compare("BagIt-Version:")) {
  33. // log << "wrong vprefix='" << vprefix << "', but 'BagIt-Version:' expected" << endl;
  34. }
  35. //log << "major:'"<<major<<"'"<<endl;
  36. //log << "minor:'"<<minor<<"'"<<endl;
  37. Bag::bagit_version_major = stoi(major);
  38. Bag::bagit_version_minor = stoi(minor);
  39. stringstream utf8_ss (utf8_line);
  40. string uprefix;
  41. string uvalue;
  42. getline(utf8_ss, uprefix, ' ');
  43. getline(utf8_ss, uvalue, ' ');
  44. if (0 != uprefix.compare("Tag-File-Character-Encoding:")) {
  45. //log << "wrong uprefix='" << uprefix << "', but 'Tag-File-Character-Encoding:' expected" << endl;
  46. }
  47. Bag::tag_file_character_encoding = uvalue;
  48. //log << "Bagit Version ("<< version_line << ") major=" << Bag::bagit_version_major << " minor=" << Bag::bagit_version_minor << endl;
  49. } else {
  50. Bag::log << "file " << bagit_txt_path << " could not be opened" <<endl;
  51. }
  52. // read in payload
  53. Bag::payload_p = new Payload( dfname ) ;
  54. list<string> files = Bag::payload_p->get_all_relative_paths();
  55. // read in payload manifest
  56. Bag::payloadmanifest_p = new Payloadmanifest(dfname);
  57. // read in tagmanifest
  58. Bag::tagmanifest_p = new Tagmanifest(dfname);
  59. // read in baginfo
  60. Bag::bagmetadata_p = new Bagmetadata(dfname);
  61. map<string,string> md = Bag::bagmetadata_p->get_metadata();
  62. map<string,string>::iterator m;
  63. for (m=md.begin(); m!=md.end(); m++) {
  64. // log << m->first << " = " << m->second <<endl;
  65. }
  66. list<string>::iterator i;
  67. for (i=files.begin(); i!=files.end(); i++) {
  68. // log << "file/dir (rel):" << (*i) << endl;
  69. //log << "file/dir (abs):" << (*i) << endl;
  70. }
  71. Bag::payloadmanifest_p->get_checksum_file_pairs( md5 );
  72. Bag::tagmanifest_p->get_checksum_file_pairs( md5 );
  73. }
  74. list<string> Bag::get_all_bag_files() {
  75. list<string> files;
  76. files.emplace_back("bagit.txt" );
  77. files.emplace_back("bag-info.txt" );
  78. if (nullptr != this->tagmanifest_p) {
  79. // TODO(art1): this->tagmanifest_p->get_all_checksum_files()
  80. }
  81. return files;
  82. }
  83. bool Bag::store( string basedir ) {
  84. fs::path p{ basedir };
  85. fs::file_status s = fs::status( p );
  86. if (fs::is_directory( s)) {
  87. log << "directory '" << basedir << "' already exists" << endl;
  88. return false;
  89. }
  90. fs::create_directory(p);
  91. // store payload
  92. if (nullptr == Bag::payload_p) {
  93. log << "Payload object needed" << endl;
  94. return false;
  95. }
  96. Bag::payload_p->store( basedir );
  97. // store payload manifest
  98. if (nullptr == Bag::payloadmanifest_p) {
  99. log << "Payloadmanifest object needed" << endl;
  100. return false;
  101. }
  102. list<string> payload_files =Bag::payload_p->get_all_relative_paths();
  103. log << "PAYLOAD" <<endl;
  104. Bag::payloadmanifest_p->store( basedir, payload_files);
  105. // store baginfo
  106. if (nullptr == Bag::bagmetadata_p) {
  107. log << "Bagmetadata object needed" << endl;
  108. return false;
  109. }
  110. Bag::bagmetadata_p->store( basedir );
  111. // store fetchfile (if needed)
  112. if (nullptr == Bag::fetchfile_p) {
  113. log << "Fetchfile object needed" << endl;
  114. } else {
  115. Bag::fetchfile_p->store( basedir );
  116. }
  117. // store other
  118. if (nullptr == Bag::othertags_p) {
  119. log << "Othertags object needed" << endl;
  120. } else {
  121. Bag::othertags_p->store( basedir );
  122. }
  123. // store bag itself
  124. string bagit_txt_path = basedir + "bagit.txt";
  125. ofstream bagit_txt_file;
  126. bagit_txt_file.open( bagit_txt_path );
  127. if (bagit_txt_file.is_open()) {
  128. bagit_txt_file << ("BagIt-Version: " + to_string(Bag::bagit_version_major) + "." + to_string(Bag::bagit_version_minor)) << endl;
  129. bagit_txt_file << ("Tag-File-Character-Encoding: " + Bag::tag_file_character_encoding) << endl;
  130. bagit_txt_file.close();
  131. } else {
  132. log << "file " << bagit_txt_path << "could not be open for writing" << endl;
  133. return false;
  134. }
  135. // at least (!), store tagmanifest
  136. if (nullptr == Bag::tagmanifest_p) {
  137. log << "Tagmanifest object needed" << endl;
  138. return false;
  139. }
  140. list<string> bagfiles = Bag::get_all_bag_files();
  141. log << "TAGMANIFEST" << endl;
  142. list<string>::iterator it;
  143. for (it=bagfiles.begin(); it!= bagfiles.end(); ++it) {
  144. log << "TAGMANIFEST-file '" << *it << "'" << endl;
  145. }
  146. Bag::tagmanifest_p->store( basedir, bagfiles );
  147. return true;
  148. }
  149. bool Bag::validate() {
  150. bool is_valid = true;
  151. if (this->bagit_version_major != 0) {
  152. this->log << "Bagit major version 0 is expected, but got: " << to_string(this->bagit_version_major) << endl;
  153. is_valid = false;
  154. }
  155. if (this->bagit_version_minor != 97) {
  156. this->log << "Bagit minor version 97 is expected, but got: " << to_string(this->bagit_version_minor) << endl;
  157. is_valid = false;
  158. }
  159. if (0 != tag_file_character_encoding.compare( "UTF-8" )) {
  160. this->log << "Bagit character encoding UTF-8 is expected, but got: " << this->tag_file_character_encoding << endl;
  161. is_valid = false;
  162. }
  163. if (nullptr == this->payload_p) {
  164. this->log << "Bagit payload directory 'data/' is expected, but could not found" << endl;
  165. is_valid = false;
  166. } else {
  167. bool ret = this->payload_p->validate();
  168. if (!ret) {
  169. is_valid = false;
  170. }
  171. }
  172. if (nullptr == this->payloadmanifest_p) {
  173. is_valid = false;
  174. } else {
  175. // checksums check
  176. this->log << "Bagit payload manifest" << endl;
  177. bool ret = this->payloadmanifest_p->validate();
  178. if (!ret) {
  179. is_valid = false;
  180. }
  181. if (nullptr != this->payload_p) {
  182. // check if payload checksums missed for payload files
  183. // HINT: not requested by draft, therefore only a warning
  184. list<string> payload_files = this->payload_p->get_all_relative_paths();
  185. list<string> payload_manifest_files = this->payloadmanifest_p->get_checksummed_files();
  186. list<string> missed_files;
  187. payload_files.sort();
  188. payload_manifest_files.sort();
  189. //log << "PAYLOAD_FILES:" << endl;
  190. //auto it = payload_files.begin();
  191. //while( it != payload_files.end()) {
  192. // log << "\t"<<(*it++) << endl;
  193. //}
  194. //log << "PAYLOADMANIFEST_FILES:" << endl;
  195. //it = payload_manifest_files.begin();
  196. //while( it != payload_manifest_files.end()) {
  197. // log << "\t"<<(*it++) << endl;
  198. //}
  199. auto it1 = payload_files.begin();
  200. auto it2 = payload_manifest_files.begin();
  201. while( it1 != payload_files.end() && it2 != payload_manifest_files.end() ) {
  202. int cmp_res = (*it1).compare( *it2);
  203. //log << "COMP: "<<cmp_res<<" file='"<< (*it1) << "' checksummed file='" << (*it2) << "'" << endl;
  204. if ( cmp_res < 0) {
  205. this->log << "Bagit warning, file '" << (*it1) << "' in payload has no checksum entry in payload manifest" << endl;
  206. it1++;
  207. } else if ( cmp_res > 0) {
  208. it2++;
  209. } else {
  210. it1++;
  211. it2++;
  212. }
  213. }
  214. }
  215. }
  216. // next elements are optional
  217. if (nullptr == this->tagmanifest_p) {
  218. } else {
  219. this->log << "Bagit tag manifest" << endl;
  220. bool ret = this->tagmanifest_p->validate();
  221. if (!ret) {
  222. is_valid = false;
  223. }
  224. }
  225. if (nullptr == this->bagmetadata_p) {
  226. } else {
  227. bool ret = this->bagmetadata_p->validate();
  228. if (!ret) {
  229. is_valid = false;
  230. }
  231. if (this->bagmetadata_p->has_PayloadOxum()) {
  232. // check oxum of payload
  233. Checksum c;
  234. if (this->payload_p != nullptr) {
  235. list<string> files = this->payload_p->get_all_absolute_paths();
  236. oxum_t expected_oxum = this->bagmetadata_p->get_PayloadOxum();
  237. oxum_t calculated_oxum = c.oxum_of_filelist( files );
  238. if (expected_oxum.octetcount != calculated_oxum.octetcount) {
  239. this->log << "Bagit payload oxum octectcount=" << to_string(expected_oxum.octetcount) << " expected, but " << to_string(calculated_oxum.octetcount) << " found" << endl;
  240. is_valid = false;
  241. }
  242. if (expected_oxum.streamcount != calculated_oxum.streamcount) {
  243. this->log << "Bagit payload oxum streamcount=" << to_string(expected_oxum.streamcount) << " expected, but " << to_string(calculated_oxum.streamcount) << " found" << endl;
  244. is_valid = false;
  245. }
  246. }
  247. }
  248. }
  249. /*
  250. if (NULL == this->fetchfile_p) {
  251. } else {
  252. bool ret = this->fetchfile_p->validate( log );
  253. if (ret == false) {
  254. is_valid = false;
  255. }
  256. }
  257. if (NULL == this->othertags_p) {
  258. } else {
  259. bool ret = this->othertags_p->validate( log);
  260. if (ret == false) {
  261. is_valid = false;
  262. }
  263. }
  264. */
  265. return is_valid;
  266. }
  267. void Bag::get_logstream( stringstream & log ) {
  268. log << this->log.rdbuf();
  269. }
  270. void Bag::reset_logstream() {
  271. this->log.str(std::string());
  272. }
  273. // vim: set tabstop=4 softtabstop=0 expandtab shiftwidth=4 smarttab