C++ library to handle BagIt structures. BagIt is a standard format for creating transfer packages for digital preservation purposes. See https://en.wikipedia.org/wiki/BagIt for details. http://andreas-romeyke.de
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

311 lines
10 KiB

  1. // Copyright (C) 2018 Andreas Romeyke (art1@andreas-romeyke.de), 2018.
  2. //
  3. // This program is free software: you can redistribute it and/or modify
  4. // it under the terms of the GNU General Public License as published by
  5. // the Free Software Foundation, either version 3 of the License, or
  6. // (at your option) any later version.
  7. //
  8. // This program is distributed in the hope that it will be useful,
  9. // but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. // GNU General Public License for more details.
  12. //
  13. // You should have received a copy of the GNU General Public License
  14. // along with this program. If not, see <https://www.gnu.org/licenses/>.
  15. #include "bag.hpp"
  16. #include <boost/filesystem.hpp>
  17. #include <fstream>
  18. #include <iostream>
  19. #include "payload.hpp"
  20. #include <sstream>
  21. #include <string>
  22. //#include <filesystem> // c++17
  23. //namespace fs = std::filesystem;
  24. namespace fs = boost::filesystem;
  25. using namespace std;
  26. Bag::Bag( string dfname ) {
  27. // log << "load constructor (" << dfname << ")" << endl;
  28. // read in file bagit.txt
  29. string bagit_txt_path = dfname + "bagit.txt";
  30. //log << "parse " << bagit_txt_path << endl;
  31. ifstream bagit_txt_file;
  32. bagit_txt_file.open( bagit_txt_path );
  33. if (bagit_txt_file.is_open()) {
  34. string version_line;
  35. string utf8_line;
  36. getline(bagit_txt_file, version_line);
  37. getline(bagit_txt_file, utf8_line);
  38. bagit_txt_file.close();
  39. stringstream version_ss ( version_line );
  40. string major;
  41. string minor;
  42. string vprefix;
  43. getline(version_ss, vprefix, ' ');
  44. getline(version_ss, major ,'.');
  45. getline(version_ss, minor, '.');
  46. if (0 != vprefix.compare("BagIt-Version:")) {
  47. // log << "wrong vprefix='" << vprefix << "', but 'BagIt-Version:' expected" << endl;
  48. }
  49. //log << "major:'"<<major<<"'"<<endl;
  50. //log << "minor:'"<<minor<<"'"<<endl;
  51. Bag::bagit_version_major = stoi(major);
  52. Bag::bagit_version_minor = stoi(minor);
  53. stringstream utf8_ss (utf8_line);
  54. string uprefix;
  55. string uvalue;
  56. getline(utf8_ss, uprefix, ' ');
  57. getline(utf8_ss, uvalue, ' ');
  58. if (0 != uprefix.compare("Tag-File-Character-Encoding:")) {
  59. //log << "wrong uprefix='" << uprefix << "', but 'Tag-File-Character-Encoding:' expected" << endl;
  60. }
  61. Bag::tag_file_character_encoding = uvalue;
  62. //log << "Bagit Version ("<< version_line << ") major=" << Bag::bagit_version_major << " minor=" << Bag::bagit_version_minor << endl;
  63. } else {
  64. Bag::log << "file " << bagit_txt_path << " could not be opened" <<endl;
  65. }
  66. // read in payload
  67. Bag::payload_p = new Payload( dfname ) ;
  68. list<string> files = Bag::payload_p->get_all_relative_paths();
  69. // read in payload manifest
  70. Bag::payloadmanifest_p = new Payloadmanifest(dfname);
  71. // read in tagmanifest
  72. Bag::tagmanifest_p = new Tagmanifest(dfname);
  73. // read in baginfo
  74. Bag::bagmetadata_p = new Bagmetadata(dfname);
  75. map<string,string> md = Bag::bagmetadata_p->get_metadata();
  76. map<string,string>::iterator m;
  77. for (m=md.begin(); m!=md.end(); m++) {
  78. // log << m->first << " = " << m->second <<endl;
  79. }
  80. list<string>::iterator i;
  81. for (i=files.begin(); i!=files.end(); i++) {
  82. // log << "file/dir (rel):" << (*i) << endl;
  83. //log << "file/dir (abs):" << (*i) << endl;
  84. }
  85. Bag::payloadmanifest_p->get_checksum_file_pairs( md5 );
  86. Bag::tagmanifest_p->get_checksum_file_pairs( md5 );
  87. }
  88. list<string> Bag::get_all_bag_files() {
  89. list<string> files;
  90. files.emplace_back("bagit.txt" );
  91. files.emplace_back("bag-info.txt" );
  92. if (nullptr != this->tagmanifest_p) {
  93. // TODO(art1): this->tagmanifest_p->get_all_checksum_files()
  94. }
  95. return files;
  96. }
  97. bool Bag::store( string basedir ) {
  98. fs::path p{ basedir };
  99. fs::file_status s = fs::status( p );
  100. if (fs::is_directory( s)) {
  101. log << "directory '" << basedir << "' already exists" << endl;
  102. return false;
  103. }
  104. fs::create_directory(p);
  105. // store payload
  106. if (nullptr == Bag::payload_p) {
  107. log << "Payload object needed" << endl;
  108. return false;
  109. }
  110. Bag::payload_p->store( basedir );
  111. // store payload manifest
  112. if (nullptr == Bag::payloadmanifest_p) {
  113. log << "Payloadmanifest object needed" << endl;
  114. return false;
  115. }
  116. list<string> payload_files =Bag::payload_p->get_all_relative_paths();
  117. log << "PAYLOAD" <<endl;
  118. Bag::payloadmanifest_p->store( basedir, payload_files);
  119. // store baginfo
  120. if (nullptr == Bag::bagmetadata_p) {
  121. log << "Bagmetadata object needed" << endl;
  122. return false;
  123. }
  124. Bag::bagmetadata_p->store( basedir );
  125. // store fetchfile (if needed)
  126. if (nullptr == Bag::fetchfile_p) {
  127. log << "Fetchfile object needed" << endl;
  128. } else {
  129. Bag::fetchfile_p->store( basedir );
  130. }
  131. // store other
  132. if (nullptr == Bag::othertags_p) {
  133. log << "Othertags object needed" << endl;
  134. } else {
  135. Bag::othertags_p->store( basedir );
  136. }
  137. // store bag itself
  138. string bagit_txt_path = basedir + "bagit.txt";
  139. ofstream bagit_txt_file;
  140. bagit_txt_file.open( bagit_txt_path );
  141. if (bagit_txt_file.is_open()) {
  142. bagit_txt_file << ("BagIt-Version: " + to_string(Bag::bagit_version_major) + "." + to_string(Bag::bagit_version_minor)) << endl;
  143. bagit_txt_file << ("Tag-File-Character-Encoding: " + Bag::tag_file_character_encoding) << endl;
  144. bagit_txt_file.close();
  145. } else {
  146. log << "file " << bagit_txt_path << "could not be open for writing" << endl;
  147. return false;
  148. }
  149. // at least (!), store tagmanifest
  150. if (nullptr == Bag::tagmanifest_p) {
  151. log << "Tagmanifest object needed" << endl;
  152. return false;
  153. }
  154. list<string> bagfiles = Bag::get_all_bag_files();
  155. log << "TAGMANIFEST" << endl;
  156. list<string>::iterator it;
  157. for (it=bagfiles.begin(); it!= bagfiles.end(); ++it) {
  158. log << "TAGMANIFEST-file '" << *it << "'" << endl;
  159. }
  160. Bag::tagmanifest_p->store( basedir, bagfiles );
  161. return true;
  162. }
  163. bool Bag::validate() {
  164. bool is_valid = true;
  165. if (this->bagit_version_major != 0) {
  166. this->log << "Bagit major version 0 is expected, but got: " << to_string(this->bagit_version_major) << endl;
  167. is_valid = false;
  168. }
  169. if (this->bagit_version_minor != 97) {
  170. this->log << "Bagit minor version 97 is expected, but got: " << to_string(this->bagit_version_minor) << endl;
  171. is_valid = false;
  172. }
  173. if (0 != tag_file_character_encoding.compare( "UTF-8" )) {
  174. this->log << "Bagit character encoding UTF-8 is expected, but got: " << this->tag_file_character_encoding << endl;
  175. is_valid = false;
  176. }
  177. if (nullptr == this->payload_p) {
  178. this->log << "Bagit payload directory 'data/' is expected, but could not found" << endl;
  179. is_valid = false;
  180. } else {
  181. bool ret = this->payload_p->validate();
  182. if (!ret) {
  183. is_valid = false;
  184. }
  185. }
  186. if (nullptr == this->payloadmanifest_p) {
  187. is_valid = false;
  188. } else {
  189. // checksums check
  190. this->log << "Bagit payload manifest" << endl;
  191. bool ret = this->payloadmanifest_p->validate();
  192. if (!ret) {
  193. is_valid = false;
  194. }
  195. if (nullptr != this->payload_p) {
  196. // check if payload checksums missed for payload files
  197. // HINT: not requested by draft, therefore only a warning
  198. list<string> payload_files = this->payload_p->get_all_relative_paths();
  199. list<string> payload_manifest_files = this->payloadmanifest_p->get_checksummed_files();
  200. list<string> missed_files;
  201. payload_files.sort();
  202. payload_manifest_files.sort();
  203. //log << "PAYLOAD_FILES:" << endl;
  204. //auto it = payload_files.begin();
  205. //while( it != payload_files.end()) {
  206. // log << "\t"<<(*it++) << endl;
  207. //}
  208. //log << "PAYLOADMANIFEST_FILES:" << endl;
  209. //it = payload_manifest_files.begin();
  210. //while( it != payload_manifest_files.end()) {
  211. // log << "\t"<<(*it++) << endl;
  212. //}
  213. auto it1 = payload_files.begin();
  214. auto it2 = payload_manifest_files.begin();
  215. while( it1 != payload_files.end() && it2 != payload_manifest_files.end() ) {
  216. int cmp_res = (*it1).compare( *it2);
  217. //log << "COMP: "<<cmp_res<<" file='"<< (*it1) << "' checksummed file='" << (*it2) << "'" << endl;
  218. if ( cmp_res < 0) {
  219. this->log << "Bagit warning, file '" << (*it1) << "' in payload has no checksum entry in payload manifest" << endl;
  220. it1++;
  221. } else if ( cmp_res > 0) {
  222. it2++;
  223. } else {
  224. it1++;
  225. it2++;
  226. }
  227. }
  228. }
  229. }
  230. // next elements are optional
  231. if (nullptr == this->tagmanifest_p) {
  232. } else {
  233. this->log << "Bagit tag manifest" << endl;
  234. bool ret = this->tagmanifest_p->validate();
  235. if (!ret) {
  236. is_valid = false;
  237. }
  238. }
  239. if (nullptr == this->bagmetadata_p) {
  240. } else {
  241. bool ret = this->bagmetadata_p->validate();
  242. if (!ret) {
  243. is_valid = false;
  244. }
  245. if (this->bagmetadata_p->has_PayloadOxum()) {
  246. // check oxum of payload
  247. Checksum c;
  248. if (this->payload_p != nullptr) {
  249. list<string> files = this->payload_p->get_all_absolute_paths();
  250. oxum_t expected_oxum = this->bagmetadata_p->get_PayloadOxum();
  251. oxum_t calculated_oxum = c.oxum_of_filelist( files );
  252. if (expected_oxum.octetcount != calculated_oxum.octetcount) {
  253. this->log << "Bagit payload oxum octectcount=" << to_string(expected_oxum.octetcount) << " expected, but " << to_string(calculated_oxum.octetcount) << " found" << endl;
  254. is_valid = false;
  255. }
  256. if (expected_oxum.streamcount != calculated_oxum.streamcount) {
  257. this->log << "Bagit payload oxum streamcount=" << to_string(expected_oxum.streamcount) << " expected, but " << to_string(calculated_oxum.streamcount) << " found" << endl;
  258. is_valid = false;
  259. }
  260. }
  261. }
  262. }
  263. /*
  264. if (NULL == this->fetchfile_p) {
  265. } else {
  266. bool ret = this->fetchfile_p->validate( log );
  267. if (ret == false) {
  268. is_valid = false;
  269. }
  270. }
  271. if (NULL == this->othertags_p) {
  272. } else {
  273. bool ret = this->othertags_p->validate( log);
  274. if (ret == false) {
  275. is_valid = false;
  276. }
  277. }
  278. */
  279. return is_valid;
  280. }
  281. void Bag::get_logstream( stringstream & log ) {
  282. log << this->log.rdbuf();
  283. }
  284. void Bag::reset_logstream() {
  285. this->log.str(std::string());
  286. }
  287. // vim: set tabstop=4 softtabstop=0 expandtab shiftwidth=4 smarttab