C++ library to handle BagIt structures. BagIt is a standard format for creating transfer packages for digital preservation purposes. See https://en.wikipedia.org/wiki/BagIt for details. http://andreas-romeyke.de
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

340 lines
11 KiB

// Copyright (C) 2018 Andreas Romeyke (art1@andreas-romeyke.de), 2018.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
#include "bag.hpp"
#include <boost/filesystem.hpp>
#include <fstream>
#include <iostream>
#include "payload.hpp"
#include <sstream>
#include <string>
//#include <filesystem> // c++17
//namespace fs = std::filesystem;
namespace fs = boost::filesystem;
using namespace std;
/**
 * Load constructor: reads the bag stored in directory `dfname`.
 *
 * Sets the defaults (BagIt version 0.97, tag file encoding "UTF-8"), parses
 * `<dfname>/bagit.txt` when it can be opened, and creates the Payload,
 * Payloadmanifest, Tagmanifest and Bagmetadata objects for the directory.
 * Problems found while reading bagit.txt are written to the bag's log
 * instead of being thrown.
 *
 * @param dfname directory of the bag; a trailing '/' is appended when it is
 *               missing, an empty string is treated as "./".
 */
Bag::Bag( string dfname ) {
    // back() on an empty string is undefined behaviour, so handle "" first
    if (dfname.empty()) {
        dfname = "./";
    } else if ('/' != dfname.back()) {
        dfname += "/";
    }
    const string bagit_txt_path = dfname + "bagit.txt";
    /* add default metadata and stuff */
    this->base_dir = dfname;
    this->bagit_version_major = 0;
    this->bagit_version_minor = 97;
    this->tag_file_character_encoding = "UTF-8";
    // The optional parts are only ever assigned by add_fetchfile() and
    // add_othertags(); store() compares them against nullptr, so make sure
    // they hold a defined value.
    this->fetchfile_p = nullptr;
    this->othertags_p = nullptr;
    // read in file bagit.txt
    ifstream bagit_txt_file( bagit_txt_path );
    if (bagit_txt_file.is_open()) {
        string version_line;
        string encoding_line;
        getline(bagit_txt_file, version_line);
        getline(bagit_txt_file, encoding_line);
        bagit_txt_file.close();
        // tolerate CRLF line endings, otherwise the encoding would be read
        // as "UTF-8\r" and rejected by validate()
        if (!version_line.empty() && '\r' == version_line.back()) {
            version_line.pop_back();
        }
        if (!encoding_line.empty() && '\r' == encoding_line.back()) {
            encoding_line.pop_back();
        }
        // expected form: "BagIt-Version: <major>.<minor>"
        stringstream version_ss( version_line );
        string vprefix;
        getline(version_ss, vprefix, ' ');
        if (0 != vprefix.compare("BagIt-Version:")) {
            this->log << "wrong prefix '" << vprefix << "' in " << bagit_txt_path
                << ", but 'BagIt-Version:' expected" << endl;
        }
        int parsed_major = 0;
        int parsed_minor = 0;
        char separator = '\0';
        if ((version_ss >> parsed_major >> separator >> parsed_minor) && ('.' == separator)) {
            this->bagit_version_major = parsed_major;
            this->bagit_version_minor = parsed_minor;
        } else {
            // Stream extraction never throws here (std::stoi did on empty or
            // malformed input). 0.0 is not a version accepted by validate(),
            // so the broken declaration is reported there.
            this->log << "could not parse BagIt version from line '" << version_line
                << "' in " << bagit_txt_path << endl;
            this->bagit_version_major = 0;
            this->bagit_version_minor = 0;
        }
        // expected form: "Tag-File-Character-Encoding: <encoding>"
        stringstream encoding_ss( encoding_line );
        string uprefix;
        string uvalue;
        getline(encoding_ss, uprefix, ' ');
        getline(encoding_ss, uvalue, ' ');
        if (0 != uprefix.compare("Tag-File-Character-Encoding:")) {
            this->log << "wrong prefix '" << uprefix << "' in " << bagit_txt_path
                << ", but 'Tag-File-Character-Encoding:' expected" << endl;
        }
        this->tag_file_character_encoding = uvalue;
    } else {
        // the defaults set above stay in effect
        this->log << "file " << bagit_txt_path << " could not be opened" <<endl;
    }
    // NOTE(review): the return values of the getter calls below are not used
    // here. The calls are kept in their original order in case they load data
    // as a side effect -- confirm against the Payload, Bagmetadata and
    // manifest implementations before removing them.
    // read in payload
    this->payload_p = new Payload( dfname );
    static_cast<void>( this->payload_p->get_all_relative_paths() );
    // read in payload manifest
    this->payloadmanifest_p = new Payloadmanifest( dfname );
    // read in tagmanifest
    this->tagmanifest_p = new Tagmanifest( dfname );
    // read in baginfo
    this->bagmetadata_p = new Bagmetadata( dfname );
    static_cast<void>( this->bagmetadata_p->get_metadata() );
    this->payloadmanifest_p->get_checksum_file_pairs( md5 );
    this->tagmanifest_p->get_checksum_file_pairs( md5 );
}
/**
 * List the tag files that describe the bag itself.
 *
 * @return the names "bagit.txt" and "bag-info.txt", in that order
 */
list<string> Bag::get_all_bag_files() {
    list<string> bag_files{ "bagit.txt", "bag-info.txt" };
    if (this->tagmanifest_p != nullptr) {
        // TODO(art1): this->tagmanifest_p->get_all_checksum_files()
    }
    return bag_files;
}
bool Bag::store( const string basedir ) {
fs::path p{ basedir };
fs::file_status s = fs::status( p );
if (fs::is_directory( s)) {
log << "directory '" << basedir << "' already exists" << endl;
// return false;
}
fs::create_directory(p);
// store payload
if (nullptr == Bag::payload_p) {
log << "Payload object needed" << endl;
return false;
}
Bag::payload_p->store( basedir );
// store payload manifest
if (nullptr == Bag::payloadmanifest_p) {
log << "Payloadmanifest object needed" << endl;
return false;
}
list<string> payload_files =Bag::payload_p->get_all_relative_paths();
log << "PAYLOAD" <<endl;
Bag::payloadmanifest_p->store( basedir, payload_files);
// store baginfo
if (nullptr == Bag::bagmetadata_p) {
log << "Bagmetadata object needed" << endl;
return false;
}
Bag::bagmetadata_p->store( basedir );
// store fetchfile (if needed)
if (nullptr == Bag::fetchfile_p) {
log << "Fetchfile object needed" << endl;
} else {
Bag::fetchfile_p->store( basedir );
}
// store other
if (nullptr == Bag::othertags_p) {
log << "Othertags object needed" << endl;
} else {
Bag::othertags_p->store( basedir );
}
// store bag itself
string bagit_txt_path = basedir + "bagit.txt";
ofstream bagit_txt_file;
bagit_txt_file.open( bagit_txt_path );
if (bagit_txt_file.is_open()) {
bagit_txt_file << ("BagIt-Version: " + to_string(Bag::bagit_version_major) + "." + to_string(Bag::bagit_version_minor)) << endl;
bagit_txt_file << ("Tag-File-Character-Encoding: " + Bag::tag_file_character_encoding) << endl;
bagit_txt_file.close();
} else {
log << "file " << bagit_txt_path << "could not be open for writing" << endl;
return false;
}
// at least (!), store tagmanifest
if (nullptr == Bag::tagmanifest_p) {
log << "Tagmanifest object needed" << endl;
return false;
}
list<string> bagfiles = Bag::get_all_bag_files();
log << "TAGMANIFEST" << endl;
list<string>::iterator it;
for (it=bagfiles.begin(); it!= bagfiles.end(); ++it) {
log << "TAGMANIFEST-file '" << *it << "'" << endl;
}
Bag::tagmanifest_p->store( basedir, bagfiles );
return true;
}
bool Bag::store() {
return Bag::store( this->base_dir );
}
/**
 * Check the bag and report every problem found to the bag's log.
 *
 * Checks the BagIt version (0.97 or 1.0), the tag file encoding (UTF-8), the
 * payload, the payload manifest and, when present, the tag manifest and the
 * bag metadata including the Payload-Oxum. Payload files without an entry in
 * the payload manifest only produce a warning.
 *
 * @return true when none of the checks failed, false otherwise
 */
bool Bag::validate() {
    bool is_valid = true;
    const bool version_supported =
        ((this->bagit_version_major == 0) && (this->bagit_version_minor == 97)) ||
        ((this->bagit_version_major == 1) && (this->bagit_version_minor == 0));
    if (!version_supported) {
        // print "major.minor"; without the dot 0.97 showed up as "097"
        this->log << "Bagit version 0.97 or 1.0 is expected, but got: "
            << to_string(this->bagit_version_major) << "." << to_string(this->bagit_version_minor) << endl;
        is_valid = false;
    }
    if (0 != tag_file_character_encoding.compare( "UTF-8" )) {
        this->log << "Bagit character encoding UTF-8 is expected, but got: " << this->tag_file_character_encoding << endl;
        is_valid = false;
    }
    if (nullptr == this->payload_p) {
        this->log << "Bagit payload directory 'data/' is expected, but could not found" << endl;
        is_valid = false;
    } else if (!this->payload_p->validate()) {
        is_valid = false;
    }
    if (nullptr == this->payloadmanifest_p) {
        this->log << "Bagit payload manifest is expected, but could not be found" << endl;
        is_valid = false;
    } else {
        // checksums check
        this->log << "Bagit payload manifest" << endl;
        if (!this->payloadmanifest_p->validate()) {
            is_valid = false;
        }
        if (nullptr != this->payload_p) {
            // check if payload checksums missed for payload files
            // HINT: not requested by draft, therefore only a warning
            list<string> payload_files = this->payload_p->get_all_relative_paths();
            list<string> payload_manifest_files = this->payloadmanifest_p->get_checksummed_files();
            payload_files.sort();
            payload_manifest_files.sort();
            // Walk both sorted lists in parallel. The outer loop visits every
            // payload file, so files that sort after the last manifest entry
            // are reported as well.
            auto manifest_it = payload_manifest_files.begin();
            for (auto payload_it = payload_files.begin(); payload_it != payload_files.end(); ++payload_it) {
                // skip manifest entries that sort before the current payload file
                while (manifest_it != payload_manifest_files.end() && manifest_it->compare( *payload_it ) < 0) {
                    ++manifest_it;
                }
                if (manifest_it != payload_manifest_files.end() && 0 == manifest_it->compare( *payload_it )) {
                    ++manifest_it;
                } else {
                    this->log << "Bagit warning, file '" << (*payload_it) << "' in payload has no checksum entry in payload manifest" << endl;
                }
            }
        }
    }
    // next elements are optional
    if (nullptr != this->tagmanifest_p) {
        this->log << "Bagit tag manifest" << endl;
        if (!this->tagmanifest_p->validate()) {
            is_valid = false;
        }
    }
    if (nullptr != this->bagmetadata_p) {
        if (!this->bagmetadata_p->validate()) {
            is_valid = false;
        }
        if (this->bagmetadata_p->has_PayloadOxum() && (this->payload_p != nullptr)) {
            // check oxum of payload: compare the declared octet and stream
            // counts with the ones calculated from the payload files
            Checksum c;
            list<string> files = this->payload_p->get_all_absolute_paths();
            oxum_t expected_oxum = this->bagmetadata_p->get_PayloadOxum();
            oxum_t calculated_oxum = c.oxum_of_filelist( files );
            if (expected_oxum.octetcount != calculated_oxum.octetcount) {
                this->log << "Bagit payload oxum octectcount=" << to_string(expected_oxum.octetcount) << " expected, but " << to_string(calculated_oxum.octetcount) << " found" << endl;
                is_valid = false;
            }
            if (expected_oxum.streamcount != calculated_oxum.streamcount) {
                this->log << "Bagit payload oxum streamcount=" << to_string(expected_oxum.streamcount) << " expected, but " << to_string(calculated_oxum.streamcount) << " found" << endl;
                is_valid = false;
            }
        }
    }
    /* validation of fetch file and other tags is currently disabled:
    if (nullptr != this->fetchfile_p) {
        bool ret = this->fetchfile_p->validate( log );
        if (ret == false) {
            is_valid = false;
        }
    }
    if (nullptr != this->othertags_p) {
        bool ret = this->othertags_p->validate( log );
        if (ret == false) {
            is_valid = false;
        }
    }
    */
    return is_valid;
}
/**
 * Replace the payload object used by this bag.
 *
 * NOTE(review): the previously held pointer is overwritten without being
 * released here -- confirm who owns and frees Payload objects.
 *
 * @param payload_p payload object to use from now on
 */
void Bag::add_payload( Payload * payload_p ) {
    // the qualified name selects the member, the plain name is the parameter
    Bag::payload_p = payload_p;
}
/**
 * Replace the bag metadata object used by this bag.
 *
 * NOTE(review): the previously held pointer is overwritten without being
 * released here -- confirm who owns and frees Bagmetadata objects.
 *
 * @param bagmetadata_p metadata object to use from now on
 */
void Bag::add_bagmetadata( Bagmetadata * bagmetadata_p ) {
    // the qualified name selects the member, the plain name is the parameter
    Bag::bagmetadata_p = bagmetadata_p;
}
/**
 * Set the fetch file object of this bag; store() writes it when it is set.
 *
 * @param fetchfile_p fetch file object to use from now on
 */
void Bag::add_fetchfile( Fetchfile * fetchfile_p ) {
    // the qualified name selects the member, the plain name is the parameter
    Bag::fetchfile_p = fetchfile_p;
}
/**
 * Set the other-tags object of this bag; store() writes it when it is set.
 *
 * @param othertags_p other-tags object to use from now on
 */
void Bag::add_othertags( Othertags * othertags_p ) {
    // the qualified name selects the member, the plain name is the parameter
    Bag::othertags_p = othertags_p;
}
/**
 * Append the pending content of the bag's log to `log`.
 *
 * The text is read through the log's stream buffer, so text handed out by
 * an earlier call is not delivered again.
 *
 * @param log stream that receives the log text; its state is left untouched
 *            when there is nothing to deliver
 */
void Bag::get_logstream( stringstream & log ) {
    const bool usable_before = log.good();
    log << this->log.rdbuf();
    // Inserting a stream buffer sets failbit when no character was
    // transferred. An empty bag log is not an error for the caller, and a
    // set failbit would silently drop all later writes to `log`, so undo it.
    // A stream that was already unusable, or that hit badbit, is left alone.
    if (usable_before && log.fail() && !log.bad()) {
        log.clear();
    }
}
/**
 * Discard everything collected in the bag's log by replacing its content
 * with an empty string.
 */
void Bag::reset_logstream() {
    const std::string empty_content;
    this->log.str( empty_content );
}
// vim: set tabstop=4 softtabstop=0 expandtab shiftwidth=4 smarttab