Perl-module which allows to use Droid/PRONOM signatures and to convert it to Perl regular expressions, analyze files using wxHexEditor tags to display matches and calc statistics. For PRONOM see https://www.nationalarchives.gov.uk/PRONOM/
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1060 lines
35 KiB

  1. package File::FormatIdentification::Pronom;
  2. use feature qw(say);
  3. use strict;
  4. use warnings;
  5. use XML::LibXML;
  6. use Carp;
  7. use List::Util qw( none first );
  8. use Scalar::Util;
  9. use YAML::XS;
  10. use File::FormatIdentification::Regex;
  11. use Moose;
  12. our $VERSION = '0.02';
  13. # Preloaded methods go here.
  14. # flattens a regex-structure to a regex-string, expects a signature-pattern and a list of regex-structures
  15. # returns regex
  16. #
  17. no warnings 'recursion';
  18. sub _flatten_rx_recursive ($$$@) {
  19. my $regex = shift;
  20. my $lastpos = shift;
  21. my $open_brackets = shift;
  22. my @rx_groups = @_;
  23. my $rx = shift @rx_groups;
  24. #use Data::Printer;
  25. #say "_flatten_rx_recursive";
  26. #p( @rx_groups );
  27. #p( $rx );
  28. my $bracket_symbol = "(";
  29. if ( !defined $regex ) { confess; }
  30. if ( !defined $rx ) { # do nothing
  31. while ( $open_brackets > 0 ) {
  32. $regex .= ")";
  33. $open_brackets--;
  34. }
  35. }
  36. else {
  37. my $pos_diff = $rx->{position} - $lastpos;
  38. my $local_regex = $rx->{regex};
  39. if ( !defined $local_regex ) {
  40. $local_regex = '';
  41. }
  42. if ( 0 == $pos_diff ) {
  43. # TODO:
  44. File::FormatIdentification::Regex::simplify_two_or_combined_regex(
  45. $regex, $local_regex );
  46. $regex =
  47. &_flatten_rx_recursive( "$regex|$local_regex", $lastpos,
  48. $open_brackets, @rx_groups );
  49. }
  50. elsif ( $pos_diff > 0 ) { # is deeper
  51. # look a head, if same pos found, then use bracket, otherwise not
  52. if (
  53. (
  54. scalar @rx_groups > 0
  55. && ( $rx_groups[0]->{position} == $rx->{position} )
  56. )
  57. || $pos_diff > 1
  58. )
  59. { # use (
  60. $regex = &_flatten_rx_recursive(
  61. "$regex" . ( $bracket_symbol x $pos_diff ) . $local_regex,
  62. $rx->{position}, $open_brackets += $pos_diff, @rx_groups );
  63. }
  64. else {
  65. $regex = &_flatten_rx_recursive(
  66. "$regex$local_regex", $rx->{position},
  67. $open_brackets, @rx_groups
  68. );
  69. } ## end else [ if ( scalar @rx_groups...)]
  70. }
  71. elsif ( $pos_diff < 0 ) { # is higher
  72. $regex = &_flatten_rx_recursive(
  73. "$regex)$local_regex",
  74. $rx->{position},
  75. $open_brackets - 1, #($rx->{position} - $lastpos),
  76. @rx_groups
  77. );
  78. }
  79. else {
  80. confess
  81. "FL: pos=$rx->{position} lastpos=$lastpos regex='$regex' open=$open_brackets\n";
  82. }
  83. }
  84. return $regex;
  85. } ## end sub _flatten_rx_recursive ($$$@)
  86. use warnings 'recursion';
  87. sub _flatten_rx ($@) {
  88. my $regex = shift;
  89. my @rx_groups = @_;
  90. #say "calling flatten_rx with regex=$regex quality=$quality";
  91. #use Data::Printer;
  92. #p( @rx_groups );
  93. $regex = _flatten_rx_recursive( $regex, 0, 0, @rx_groups );
  94. return $regex;
  95. } ## end sub _flatten_rx ($@)
  96. # expands pattern of form "FFFB[10:EB]" to FFFB10, FFFB11, ... FFFBEB
  97. sub _expand_pattern ($) {
  98. my $pattern = $_[0];
  99. $pattern =~ s/(?<=\[)!/^/g;
  100. $pattern =~ s/(?<=[0-9A-F]{2}):(?=[0-9A-F]{2})\]/-]/g;
  101. $pattern =~ s/([0-9A-F]{2})/\\x{$1}/g;
  102. # substitute hex with printable ASCII-Output
  103. $pattern =~ s#\\x\{(3[0-9]|[46][1-9A-F]|[57][0-9A])\}#chr( hex($1) );#egs;
  104. return $pattern;
  105. } ## end sub _expand_pattern ($)
  106. # expands offsets min,max to regex ".{$min,$max}" and uses workarounds if $min or $max exceeds 32766
  107. sub _expand_offsets($$) {
  108. my $minoffset = shift;
  109. my $maxoffset = shift;
  110. my $byte =
  111. '.'; # HINT: needs the character set modifier "aa" in $foo=~m/$regex/aa
  112. #my $byte = '[\x00-\xff]';
  113. my $offset_expanded = "";
  114. if ( ( ( not defined $minoffset ) || ( length($minoffset) == 0 ) )
  115. && ( ( not defined $maxoffset ) || ( length($maxoffset) == 0 ) ) )
  116. {
  117. $offset_expanded = "";
  118. }
  119. elsif (( defined $minoffset )
  120. && ( length($minoffset) > 0 )
  121. && ( defined $maxoffset )
  122. && ( length($maxoffset) > 0 )
  123. && ( $minoffset == $maxoffset ) )
  124. {
  125. if ( $minoffset > 0 ) {
  126. my $maxloops = int( $maxoffset / 32766 );
  127. my $maxresidual = $maxoffset % 32766;
  128. for ( my $i = 0 ; $i < $maxloops ; $i++ ) {
  129. $offset_expanded .= $byte . "{32766}";
  130. }
  131. $offset_expanded .= $byte . "{$maxresidual}";
  132. } ## end if ( $minoffset > 0 )
  133. }
  134. else {
  135. # workaround, because perl quantifier limits,
  136. # calc How many repetitions we need! Both offsets should be less than 32766
  137. #TODO: check if this comes from Droid or is calculated
  138. my $mintmp = 0;
  139. my $maxtmp = 0;
  140. if ( defined $minoffset && ( length($minoffset) > 0 ) ) {
  141. $mintmp = $minoffset;
  142. }
  143. if ( defined $maxoffset && ( length($maxoffset) > 0 ) ) {
  144. $maxtmp = $maxoffset;
  145. }
  146. my $maxloops;
  147. if ( $maxtmp >= $mintmp ) {
  148. $maxloops = int( $maxtmp / 32766 );
  149. }
  150. else {
  151. $maxloops = int( $mintmp / 32766 );
  152. }
  153. my $maxresidual = $maxtmp % 32766;
  154. my $minresidual = $mintmp % 32766;
  155. #say "\tMaxloops=$maxloops maxres = $maxresidual minres=$minresidual";
  156. my @offsets;
  157. my $minstr = "";
  158. my $maxstr = "";
  159. if ( defined $minoffset && length($minoffset) > 0 ) {
  160. $minstr = $minresidual;
  161. $mintmp = $mintmp - $minresidual;
  162. }
  163. for ( my $i = 0 ; $i <= $maxloops ; $i++ ) {
  164. # loop, so we assure the special handling of residuals
  165. if ( $maxtmp > $maxresidual ) {
  166. $maxstr = 32766;
  167. }
  168. elsif ( $maxtmp < 0 ) {
  169. $maxstr = 0;
  170. }
  171. else {
  172. $maxstr = $maxresidual;
  173. }
  174. if ( $mintmp > $minresidual ) {
  175. $minstr = 32766;
  176. }
  177. elsif ( $mintmp < 0 ) {
  178. $minstr = 0;
  179. }
  180. else {
  181. $minstr = $minresidual;
  182. }
  183. #### handle residuals
  184. if ( $i == 0 ) {
  185. $minstr = $minresidual;
  186. $mintmp = $mintmp - $minresidual;
  187. }
  188. elsif ( $i == $maxloops ) {
  189. $maxstr = $maxresidual;
  190. $maxtmp = $maxtmp - $maxresidual;
  191. }
  192. # mark offsets
  193. my $tmp;
  194. $tmp->{minoffset} = $minstr;
  195. $tmp->{maxoffset} = $maxstr;
  196. push @offsets, $tmp;
  197. } ## end for ( my $i = 0 ; $i <=...)
  198. my @filtered = map {
  199. if ( !defined $maxoffset || length($maxoffset) == 0 ) {
  200. $_->{maxoffset} = "";
  201. }
  202. if ( !defined $minoffset || length($minoffset) == 0 ) {
  203. $_->{minoffset} = "";
  204. }
  205. $_;
  206. } @offsets;
  207. foreach my $tmp (@filtered) {
  208. # ? at the end - means non-greedy
  209. #$offset_expanded .= $byte."{" . $tmp->{minoffset} . "," . $tmp->{maxoffset} . "}?";
  210. $offset_expanded .=
  211. $byte . "{" . $tmp->{minoffset} . "," . $tmp->{maxoffset} . "}";
  212. } ## end foreach my $tmp (@filtered)
  213. } ## end else [ if ( ( ( not defined $minoffset...)))]
  214. #say "DEBUG: minoffset='$minoffset' maxoffset='$maxoffset' --> offset_expanded='$offset_expanded'";
  215. # minimization steps
  216. $offset_expanded =~ s#{0,}#*#g;
  217. $offset_expanded =~ s#{1,}#+#g;
  218. $offset_expanded =~ s#{0,1}#?#g;
  219. return $offset_expanded;
  220. } ## end sub _expand_offsets($$)
  221. # got XPath-object and returns a regex-structure as hashref
  222. sub _parse_fragments ($) {
  223. my $fq = shift;
  224. my $position = $fq->getAttribute('Position');
  225. my $minoffset = $fq->getAttribute('MinOffset');
  226. my $maxoffset = $fq->getAttribute('MaxOffset');
  227. my $rx = $fq->textContent;
  228. my $expanded = _expand_pattern($rx);
  229. my $ret;
  230. $ret->{position} = $position;
  231. $ret->{direction} = "left";
  232. $ret->{regex} = "";
  233. my ($offset_expanded) = _expand_offsets( $minoffset, $maxoffset );
  234. if ( $fq->localname eq "LeftFragment" ) {
  235. $ret->{direction} = "left";
  236. $ret->{regex} = "($expanded)$offset_expanded";
  237. }
  238. elsif ( $fq->localname eq "RightFragment" ) {
  239. $ret->{direction} = "right";
  240. $ret->{regex} = "$offset_expanded($expanded)";
  241. }
  242. #say "pF: rx=$rx expanded=$expanded offset=$offset_expanded";
  243. return $ret;
  244. } ## end sub _parse_fragments ($)
  245. # got XPath-object and search direction and returns a regex-structure as hashref
  246. sub _parse_subsequence ($$) {
  247. my $ssq = shift;
  248. my $dir = shift;
  249. my $position = $ssq->getAttribute('Position');
  250. my $minoffset = $ssq->getAttribute('SubSeqMinOffset');
  251. my $maxoffset = $ssq->getAttribute('SubSeqMaxOffset');
  252. my $rx = $ssq->getElementsByTagName('Sequence')->get_node(1)->textContent;
  253. my @lnodes = $ssq->getElementsByTagName('LeftFragment');
  254. my @rnodes = $ssq->getElementsByTagName('RightFragment');
  255. my @lrx_fragments = map { _parse_fragments($_) } @lnodes;
  256. my @rrx_fragments = map { _parse_fragments($_) } @rnodes;
  257. my $lregex = _flatten_rx( "", @lrx_fragments );
  258. my $rregex = _flatten_rx( "", @rrx_fragments );
  259. my $expanded = _expand_pattern($rx);
  260. #if ( length($minoffset) > 0
  261. # && length($maxoffset) > 0
  262. # && $minoffset > $maxoffset ) {
  263. # confess(
  264. #"parse_subsequence: Maxoffset=$maxoffset < Minoffset=$minoffset! regex= '$rx'"
  265. # );
  266. # } ## end if ( length($minoffset...))
  267. my $offset_expanded = _expand_offsets( $minoffset, $maxoffset );
  268. my $prefix;
  269. my $suffix;
  270. my $ret;
  271. my $regex;
  272. if ( !defined $dir || length($dir) == 0 ) {
  273. $regex = join( "", $lregex, $expanded, $rregex );
  274. }
  275. elsif ( $dir eq "BOFoffset" ) {
  276. $regex =
  277. join( "", $offset_expanded, "(", $lregex, $expanded, $rregex, ")" );
  278. }
  279. elsif ( $dir eq "EOFoffset" ) {
  280. $regex =
  281. join( "", "(", $lregex, $expanded, $rregex, ")", $offset_expanded );
  282. }
  283. else {
  284. warn "unknown reference '$dir' found\n";
  285. $regex = join( "", $lregex, $expanded, $rregex );
  286. }
  287. $ret->{regex} =
  288. File::FormatIdentification::Regex::peep_hole_optimizer($regex);
  289. $ret->{position} = $position;
  290. return $ret;
  291. } ## end sub _parse_subsequence ($$)
  292. # got XPath-object and returns regex-string
  293. sub _parse_bytesequence ($) {
  294. my $bsq = shift;
  295. #say "rx_groups in parse_byte_sequence:";
  296. my $reference = $bsq->getAttribute('Reference');
  297. ; # if BOFoffset -> anchored begin of file, EOFofset -> end of file
  298. my @nodes = $bsq->getElementsByTagName('SubSequence');
  299. my @rx_groups = map { _parse_subsequence( $_, $reference ) } @nodes;
  300. my $expanded = "";
  301. my $regex_flattened = _flatten_rx( $expanded, @rx_groups );
  302. #my $ro = Regexp::Optimizer->new;
  303. #my $ro = Regexp::Assemble->new;
  304. #$ro->add( $regex_flattened);
  305. #$regex_flattened = $ro->as_string($regex_flattened);
  306. #$regex_flattened = $ro->re;
  307. my $regex;
  308. if ( !defined $reference || 0 == length($reference) ) {
  309. $regex = "$regex_flattened";
  310. }
  311. elsif ( $reference eq "BOFoffset" ) {
  312. $regex = "\\A$regex_flattened";
  313. }
  314. elsif ( $reference eq "EOFoffset" ) {
  315. $regex = "$regex_flattened\\Z";
  316. }
  317. else {
  318. warn "unknown reference '$reference' found\n";
  319. $regex = "$regex_flattened";
  320. }
  321. use Regexp::Optimizer;
  322. my $ro = Regexp::Optimizer->new;
  323. #say "regex='$regex'";
  324. #$regex = $ro->as_string( $regex );
  325. return $regex;
  326. } ## end sub _parse_bytesequence ($)
  327. # ($%signatures, $%internal) = parse_signaturefile( $file )
  328. sub _parse_signaturefile($) {
  329. my $pronomfile = shift;
  330. my %signatures;
  331. # hash{internalid}->{regex} = $regex
  332. # ->{signature} = $signature
  333. my %internal_signatures;
  334. my $dom = XML::LibXML->load_xml( location => $pronomfile );
  335. $dom->indexElements();
  336. my $xp = XML::LibXML::XPathContext->new($dom);
  337. $xp->registerNs( 'droid',
  338. 'http://www.nationalarchives.gov.uk/pronom/SignatureFile' );
  339. # find Fileformats
  340. #my $tmp = $xp->find('/*[local-name() = "FFSignatureFile"]')->get_node(1);
  341. #say "E:", $tmp->nodeName;
  342. #say "EXISTS:", $xp->exists('/droid:FFSignatureFile');
  343. #say "EXISTS2", $xp->exists('/droid:FFSignatureFile/droid:FileFormatCollection/droid:FileFormat');
  344. my $fmts = $xp->find(
  345. '/*[local-name() = "FFSignatureFile"]/*[local-name() = "FileFormatCollection"]/*[local-name() = "FileFormat"]'
  346. );
  347. foreach my $fmt ( $fmts->get_nodelist() ) {
  348. my $id = $fmt->getAttribute('ID');
  349. my $mimetype = $fmt->getAttribute('MIMEtype');
  350. my $name = $fmt->getAttribute('Name');
  351. my $puid = $fmt->getAttribute('PUID');
  352. my $version = $fmt->getAttribute('Version');
  353. #
  354. ##
  355. my @extensions =
  356. map { $_->textContent() } $fmt->getElementsByTagName('Extension');
  357. my @internalsignatures =
  358. map { $_->textContent() }
  359. $fmt->getElementsByTagName('InternalSignatureID');
  360. my @haspriorityover = map { $_->textContent() }
  361. $fmt->getElementsByTagName('HasPriorityOverFileFormatID');
  362. $signatures{$id}->{mimetype} = $mimetype;
  363. $signatures{$id}->{name} = $name;
  364. $signatures{$id}->{puid} = $puid;
  365. $signatures{$id}->{version} = $version; # optional
  366. $signatures{$id}->{extensions} = \@extensions;
  367. $signatures{$id}->{internal_signatures} = \@internalsignatures;
  368. foreach my $prio (@haspriorityover) {
  369. $signatures{$id}->{priorityover}->{$prio} = 1;
  370. }
  371. foreach my $internal (@internalsignatures) {
  372. $internal_signatures{$internal}->{signature} = $id;
  373. }
  374. } ## end foreach my $fmt ( $fmts->get_nodelist...)
  375. # find InternalSignatures
  376. my $sigs =
  377. $xp->find(
  378. '/*[local-name() = "FFSignatureFile"]/*[local-name() = "InternalSignatureCollection"]/*[local-name() = "InternalSignature"]'
  379. );
  380. foreach my $sig ( $sigs->get_nodelist() ) {
  381. my $id = $sig->getAttribute('ID');
  382. my $specificity = $sig->getAttribute('Specificity');
  383. $internal_signatures{$id}->{specificity} = $specificity;
  384. #p( $sig->toString() );
  385. my @nodes = $sig->getElementsByTagName('ByteSequence');
  386. #p( @nodes );
  387. my @rx_groups = map { _parse_bytesequence($_) } @nodes;
  388. my @rx_quality =
  389. map { File::FormatIdentification::Regex::calc_quality($_); }
  390. @rx_groups;
  391. $internal_signatures{$id}->{regex} = \@rx_groups;
  392. $internal_signatures{$id}->{quality} = \@rx_quality;
  393. } ## end foreach my $sig ( $sigs->get_nodelist...)
  394. return ( \%signatures, \%internal_signatures );
  395. } ## end sub _parse_signaturefile($)
  396. sub uniq_signature_ids_by_priority {
  397. my $self = shift;
  398. my @signatures = @_;
  399. my %found_signature_ids;
  400. # which PUIDs are in list?
  401. foreach my $signatureid (@signatures) {
  402. if ( defined $signatureid ) {
  403. $found_signature_ids{$signatureid} = 1;
  404. }
  405. }
  406. # remove all signatures when actual signature has priority over
  407. foreach my $signatureid ( keys %found_signature_ids ) {
  408. foreach my $priority_over_sid (
  409. keys %{ $self->{signatures}->{$signatureid}->{priorityover} } )
  410. {
  411. if ( exists $found_signature_ids{$priority_over_sid} ) {
  412. delete $found_signature_ids{$priority_over_sid};
  413. }
  414. } ## end foreach my $priority_over_sid...
  415. } ## end foreach my $signatureid ( keys...)
  416. # reduce list to all signatures with correct priority
  417. my @result =
  418. grep { defined $found_signature_ids{ $_->{signature} } } @signatures;
  419. return @result;
  420. } ## end sub uniq_signature_ids_by_priority
  421. has 'droid_signature_filename' => (
  422. is => 'ro',
  423. required => 1,
  424. reader => 'get_droid_signature_filename',
  425. trigger => sub {
  426. my $self = shift;
  427. #say "TRIGGER";
  428. my $yaml_file = $self->get_droid_signature_filename() . ".yaml";
  429. if ( $self->{auto_load} && -e $yaml_file ) {
  430. $self->load_from_yamlfile($yaml_file);
  431. }
  432. else {
  433. my ( $signatures, $internal_signatures ) =
  434. _parse_signaturefile( $self->{droid_signature_filename} );
  435. $self->{signatures} = $signatures;
  436. $self->{internal_signatures} = $internal_signatures;
  437. #die;
  438. if ( $self->{auto_store} ) {
  439. $self->save_as_yamlfile($yaml_file);
  440. }
  441. } ## end else [ if ( $self->{auto_load...})]
  442. foreach my $s ( keys %{ $self->{signatures} } ) {
  443. my $puid = $self->{signatures}->{$s}->{puid};
  444. if ( defined $puid && length($puid) > 0 ) {
  445. $self->{puids}->{$puid} = $s;
  446. }
  447. }
  448. }
  449. );
  450. sub save_as_yamlfile {
  451. my $self = shift;
  452. my $filename = shift;
  453. my @res;
  454. push @res, $self->{signatures};
  455. push @res, $self->{internal_signatures};
  456. YAML::XS::DumpFile( "$filename", @res );
  457. return;
  458. } ## end sub save_as_yamlfile
  459. sub load_from_yamlfile {
  460. my $self = shift;
  461. my $filename = shift;
  462. my ( $sig, $int ) = YAML::XS::LoadFile($filename);
  463. $self->{signatures} = $sig;
  464. $self->{internal_signatures} = $int;
  465. return;
  466. } ## end sub load_from_yamlfile
  467. has 'auto_store' => (
  468. is => 'ro',
  469. default => 1,
  470. );
  471. has 'auto_load' => (
  472. is => 'ro',
  473. default => 1,
  474. );
  475. sub get_all_signature_ids {
  476. my $self = shift;
  477. my @sigs = sort { $a <=> $b } keys %{ $self->{signatures} };
  478. return @sigs;
  479. }
  480. sub get_signature_id_by_puid {
  481. my $self = shift;
  482. my $puid = shift;
  483. my $sig = $self->{puids}->{$puid};
  484. return $sig;
  485. }
  486. sub get_internal_ids_by_puid {
  487. my $self = shift;
  488. my $puid = shift;
  489. my $sig = $self->get_signature_id_by_puid($puid);
  490. my @ids = ();
  491. if ( defined $sig ) {
  492. @ids = grep { defined $_ }
  493. @{ $self->{signatures}->{$sig}->{internal_signatures} };
  494. }
  495. return @ids;
  496. }
  497. sub get_file_endings_by_puid {
  498. my $self = shift;
  499. my $puid = shift;
  500. my $sig = $self->get_signature_id_by_puid($puid);
  501. my @endings = ();
  502. if ( defined $sig ) {
  503. @endings = $self->{signatures}->{$sig}->{extensions};
  504. }
  505. return @endings;
  506. }
  507. sub get_all_internal_ids {
  508. my $self = shift;
  509. my @ids = sort { $a <=> $b } keys %{ $self->{internal_signatures} };
  510. foreach my $id (@ids) {
  511. if ( !defined $id ) { confess("$id not defined") }
  512. }
  513. return @ids;
  514. }
  515. sub get_all_puids {
  516. my $self = shift;
  517. my @ids =
  518. sort grep { defined $_ }
  519. map { $self->{signatures}->{$_}->{puid}; }
  520. grep { defined $_ } $self->get_all_signature_ids();
  521. return @ids;
  522. }
  523. sub get_regular_expressions_by_internal_id {
  524. my $self = shift;
  525. my $internalid = shift;
  526. if ( !defined $internalid ) { confess("internalid must exists!"); }
  527. my @rx = @{ $self->{internal_signatures}->{$internalid}->{regex} };
  528. return @rx;
  529. }
  530. sub get_all_regular_expressions {
  531. my $self = shift;
  532. my @ids = $self->get_all_internal_ids();
  533. my @regexes = ();
  534. foreach my $id (@ids) {
  535. my @rx = $self->get_regular_expressions_by_internal_id($id);
  536. push @regexes, @rx;
  537. }
  538. my @ret = sort @regexes;
  539. return @ret;
  540. }
  541. sub get_qualities_by_internal_id {
  542. my $self = shift;
  543. my $internalid = shift;
  544. if ( !defined $internalid ) { confess("internalid must exists!"); }
  545. my $value = $self->{internal_signatures}->{$internalid}->{quality};
  546. if ( defined $value ) {
  547. return @{$value};
  548. }
  549. return;
  550. }
  551. sub get_signature_id_by_internal_id {
  552. my $self = shift;
  553. my $internalid = shift;
  554. if ( !defined $internalid ) { confess("internalid must exists!"); }
  555. return $self->{internal_signatures}->{$internalid}->{signature};
  556. }
  557. sub get_name_by_signature_id {
  558. my $self = shift;
  559. my $signature = shift;
  560. return $self->{signatures}->{$signature}->{name};
  561. }
  562. sub get_puid_by_signature_id {
  563. my $self = shift;
  564. my $signature = shift;
  565. return $self->{signatures}->{$signature}->{puid};
  566. }
  567. sub get_puid_by_internal_id {
  568. my $self = shift;
  569. my $internalid = shift;
  570. if ( !defined $internalid ) { confess("internalid must exists!"); }
  571. my $signature = $self->get_signature_id_by_internal_id($internalid);
  572. return $self->get_puid_by_signature_id($signature);
  573. }
  574. sub get_quality_sorted_internal_ids {
  575. my $self = shift;
  576. my @ids = sort {
  577. # sort by regexes
  578. my @a_rxq = @{ $self->{internal_signatures}->{$a}->{quality} };
  579. my @b_rxq = @{ $self->{internal_signatures}->{$b}->{quality} };
  580. my $aq = 0;
  581. foreach my $as (@a_rxq) { $aq += $as; }
  582. my $bq = 0;
  583. foreach my $bs (@b_rxq) { $bq += $bs; }
  584. #use Data::Printer;
  585. #p( $a );
  586. #p( $aq );
  587. $aq <=> $bq;
  588. } $self->get_all_internal_ids();
  589. return @ids;
  590. }
  591. sub get_combined_regex_by_puid {
  592. my $self = shift;
  593. my $puid = shift;
  594. my @internals = $self->get_internal_ids_by_puid($puid);
  595. #use Data::Printer;
  596. #p( $puid );
  597. #p( @internals );
  598. my @regexes = map {
  599. my @regexes_per_internal =
  600. $self->get_regular_expressions_by_internal_id($_);
  601. my $combined =
  602. File::FormatIdentification::Regex::and_combine(@regexes_per_internal);
  603. #p( $combined );
  604. $combined;
  605. } @internals;
  606. my $result = File::FormatIdentification::Regex::or_combine(@regexes);
  607. #p( $result );
  608. return $result;
  609. }
  610. sub _prepare_statistics {
  611. my $self = shift;
  612. my $results;
  613. # count of PUIDs
  614. # count of internal ids (IDs per PUID)
  615. # count of regexes
  616. # count of file endings only
  617. # count of internal ids without PUID
  618. # larges and shortest regex
  619. # complex and simple regex
  620. # common regexes
  621. #say "stat";
  622. my @puids = $self->get_all_puids();
  623. my $puids = scalar(@puids);
  624. my @internals = $self->get_all_internal_ids();
  625. my $internals = scalar(@internals);
  626. my $regexes = 0;
  627. my $fileendingsonly = 0;
  628. my @fileendingsonly = ();
  629. my $fileendings = 0;
  630. my $int_per_puid = 0;
  631. my $internal_without_puid = 0;
  632. my @internal_without_puid = ();
  633. my @quality_sorted_internal_ids = $self->get_quality_sorted_internal_ids();
  634. my %uniq_regexes;
  635. foreach my $internalid (@internals) {
  636. my @regexes =
  637. $self->get_regular_expressions_by_internal_id($internalid);
  638. foreach my $rx (@regexes) {
  639. my @tmp = ();
  640. if ( exists $uniq_regexes{$rx} ) {
  641. @tmp = @{ $uniq_regexes{$rx} };
  642. }
  643. push @tmp, $internalid;
  644. $uniq_regexes{$rx} = \@tmp;
  645. }
  646. $regexes += scalar(@regexes);
  647. my $sigid = $self->get_signature_id_by_internal_id($internalid);
  648. if ( !defined $sigid ) {
  649. $internal_without_puid++;
  650. push @internal_without_puid, $internalid;
  651. }
  652. }
  653. foreach my $puid (@puids) {
  654. my @ints = $self->get_internal_ids_by_puid($puid);
  655. my @fileendings = $self->get_file_endings_by_puid($puid);
  656. if ( 0 == scalar(@ints) ) {
  657. $fileendingsonly++;
  658. push @fileendingsonly, $puid;
  659. }
  660. else {
  661. $fileendings += scalar(@fileendings);
  662. $int_per_puid += scalar(@ints);
  663. }
  664. }
  665. foreach my $i (@quality_sorted_internal_ids) {
  666. my $regex =
  667. join( "#", $self->get_regular_expressions_by_internal_id($i) );
  668. my $quality = join( " ", $self->get_qualities_by_internal_id($i) );
  669. }
  670. $results->{filename} = $self->get_droid_signature_filename();
  671. $results->{count_of_puids} = $puids;
  672. $results->{count_of_internal_ids} = $internals;
  673. $results->{count_of_regular_expressions} = $regexes;
  674. $results->{count_of_fileendings} = $fileendings;
  675. $results->{count_of_puid_with_fileendings_only} = $fileendingsonly;
  676. $results->{puids_with_fileendings_only} = \@fileendingsonly;
  677. $results->{count_of_orphaned_internal_ids} = $internal_without_puid;
  678. $results->{internal_ids_without_puids} = \@internal_without_puid;
  679. no warnings;
  680. for ( my $i = 0 ; $i <= 4 ; $i++ ) {
  681. my $best_quality_internal = pop @quality_sorted_internal_ids;
  682. if ( defined $best_quality_internal ) {
  683. my $best_quality = join( ";",
  684. $self->get_qualities_by_internal_id($best_quality_internal) );
  685. my $best_puid =
  686. $self->get_puid_by_internal_id($best_quality_internal);
  687. my $best_name =
  688. $self->get_name_by_signature_id(
  689. $self->get_signature_id_by_internal_id($best_quality_internal)
  690. );
  691. my $best_regex = $self->get_combined_regex_by_puid($best_puid);
  692. $results->{nth_best_quality}->[$i]->{internal_id} =
  693. $best_quality_internal;
  694. $results->{nth_best_quality}->[$i]->{puid} = $best_puid;
  695. $results->{nth_best_quality}->[$i]->{name} = $best_name;
  696. $results->{nth_best_quality}->[$i]->{quality} = $best_quality;
  697. $results->{nth_best_quality}->[$i]->{combined_regex} = $best_regex;
  698. }
  699. }
  700. for ( my $i = 0 ; $i <= 4 ; $i++ ) {
  701. my $worst_quality_internal = shift @quality_sorted_internal_ids;
  702. if ( defined $worst_quality_internal ) {
  703. my $worst_quality = join( ";",
  704. $self->get_qualities_by_internal_id($worst_quality_internal) );
  705. my $worst_puid =
  706. $self->get_puid_by_internal_id($worst_quality_internal);
  707. my $worst_name =
  708. $self->get_name_by_signature_id(
  709. $self->get_signature_id_by_internal_id($worst_quality_internal)
  710. );
  711. my $worst_regex = $self->get_combined_regex_by_puid($worst_puid);
  712. $results->{nth_worst_quality}->[$i]->{internal_id} =
  713. $worst_quality_internal;
  714. $results->{nth_worst_quality}->[$i]->{puid} = $worst_puid;
  715. $results->{nth_worst_quality}->[$i]->{name} = $worst_name;
  716. $results->{nth_worst_quality}->[$i]->{quality} = $worst_quality;
  717. $results->{nth_worst_quality}->[$i]->{combined_regex} =
  718. $worst_regex;
  719. }
  720. }
  721. my @multiple_used_regex = grep {
  722. my $tmp = $uniq_regexes{$_};
  723. my @tmp = @{$tmp};
  724. scalar(@tmp) > 1
  725. } sort keys %uniq_regexes;
  726. $results->{count_of_multiple_used_regex} = scalar(@multiple_used_regex);
  727. for ( my $i = 0 ; $i <= $#multiple_used_regex ; $i++ ) {
  728. $results->{multiple_used_regex}->[$i]->{regex} =
  729. $multiple_used_regex[$i];
  730. my @ids = join( ",", @{ $uniq_regexes{ $multiple_used_regex[$i] } } );
  731. $results->{multiple_used_regex}->[$i]->{internal_ids} = \@ids;
  732. }
  733. return $results;
  734. }
  735. sub print_csv_statistics {
  736. my $self = shift;
  737. my $csv_file = shift;
  738. my $results = $self->_prepare_statistics();
  739. my $version = $results->{filename};
  740. $version =~ s/DROID_SignatureFile_V(\d+)\.xml/$1/;
  741. $results->{version} = $version;
  742. $results->{best_quality_puid} = $results->{nth_best_quality}->[0]->{puid};
  743. $results->{best_quality_internal_id} =
  744. $results->{nth_best_quality}->[0]->{internal_id};
  745. $results->{best_quality_quality} =
  746. $results->{nth_best_quality}->[0]->{quality};
  747. $results->{best_quality_combined_regex} =
  748. $results->{nth_best_quality}->[0]->{combined_regex};
  749. $results->{worst_quality_puid} = $results->{nth_worst_quality}->[0]->{puid};
  750. $results->{worst_quality_internal_id} =
  751. $results->{nth_worst_quality}->[0]->{internal_id};
  752. $results->{worst_quality_quality} =
  753. $results->{nth_worst_quality}->[0]->{quality};
  754. $results->{worst_quality_combined_regex} =
  755. $results->{nth_worst_quality}->[0]->{combined_regex};
  756. my @headers =
  757. qw(version filename count_of_puids count_of_internal_ids count_of_regular_expressions count_of_fileendings count_of_puid_with_fileendings_only count_of_orphaned_internal_ids count_of_multiple_used_regex best_quality_puid best_quality_internal_id best_quality_quality best_quality_combined_regex worst_quality_puid worst_quality_internal_id worst_quality_quality worst_quality_combined_regex);
  758. my $file_exists = (-e $csv_file);
  759. open (my $FH, ">>", "$csv_file") or croak "Can't open file '$csv_file', $0";
  760. if (not $file_exists) {
  761. say $FH "#", join( ",", @headers );
  762. }
  763. say $FH join(
  764. ",",
  765. map {
  766. my $result = $results->{$_};
  767. if ( !defined $result ) { $result = ""; }
  768. "\"$result\"";
  769. } @headers
  770. );
  771. close ($FH);
  772. return;
  773. }
  774. sub print_statistics {
  775. my $self = shift;
  776. my $verbose = shift;
  777. my $results = $self->_prepare_statistics();
  778. say "Statistics of file $results->{filename}";
  779. say "=======================================";
  780. say "";
  781. say "Countings";
  782. say "---------------------------------------";
  783. say "Count of PUIDs: $results->{count_of_puids}";
  784. say
  785. " internal IDs: $results->{count_of_internal_ids}";
  786. say
  787. " regular expressions: $results->{count_of_regular_expressions}";
  788. say
  789. " file endings: $results->{count_of_fileendings}";
  790. say
  791. " PUIDs with file endings only: $results->{count_of_puid_with_fileendings_only}";
  792. if ( defined $verbose ) {
  793. say " (",
  794. join( ", ", sort @{ $results->{puids_with_fileendings_only} } ), ")";
  795. }
  796. say
  797. " orphaned internal IDs: $results->{count_of_orphaned_internal_ids}";
  798. if ( defined $verbose ) {
  799. say " (",
  800. join( ", ", sort {$a <=> $b} @{ $results->{internal_ids_without_puids} } ), ")";
  801. }
  802. say "";
  803. say "Quality of internal IDs";
  804. say "---------------------------------------";
  805. my $nth = 1;
  806. foreach my $n ( @{ $results->{nth_best_quality} } ) {
  807. say
  808. "$nth-best quality internal ID (PUID, name): $n->{internal_id} ($n->{puid}, $n->{name}) -> $n->{quality}";
  809. if ( defined $verbose ) {
  810. say " combined regex: ", $n->{combined_regex};
  811. }
  812. $nth++;
  813. }
  814. say "";
  815. $nth = 1;
  816. foreach my $n ( @{ $results->{nth_worst_quality} } ) {
  817. say
  818. "$nth-worst quality internal ID (PUID, name): $n->{internal_id} ($n->{puid}, $n->{name}) -> $n->{quality}";
  819. if ( defined $verbose ) {
  820. say " combined regex: ", $n->{combined_regex};
  821. }
  822. $nth++;
  823. }
  824. say "";
  825. say "";
  826. say "Regular expressions";
  827. say "---------------------------------------";
  828. say
  829. "Count of multiple used regular expressions: $results->{count_of_multiple_used_regex}";
  830. if ( defined $verbose ) {
  831. for ( my $i = 0 ; $i < $results->{count_of_multiple_used_regex} ; $i++ )
  832. {
  833. say " common regex group no $i:";
  834. say " regex='"
  835. . $results->{multiple_used_regex}->[$i]->{regex} . "'";
  836. say " internal IDs: ",
  837. join( ",", @{ $results->{multiple_used_regex}->[$i]->{internal_ids} } );
  838. }
  839. }
  840. say "";
  841. #my @rx = $self->get_all_regular_expressions();
  842. #use Data::Printer;
  843. #p( %uniq_regexes );
  844. return;
  845. }
  846. 1;
  847. no Moose;
  848. __PACKAGE__->meta->make_immutable;
  849. 1;
  850. __END__
  851. # Below is stub documentation for your module. You'd better edit it!
  852. =head1 NAME
  853. File::FormatIdentification::Pronom - Perl extension for parsing PRONOM-Signatures using DROID-Signature file
  854. =head1 SYNOPSIS
  855. use File::FormatIdentification::Pronom;
  856. my $pronomfile = "Droid-Signature.xml";
  857. my ( $signatures, $internals ) = parse_signaturefile($pronomfile);
  858. =head1 DESCRIPTION
  859. The module allows to handle Droid signatures. Droid is a utility which
  860. uses the PRONOM database to identify file formats.
  861. See https://www.nationalarchives.gov.uk/PRONOM/ for details.
  862. With this module you could:
  863. =over
  864. =item convert Droid signatures to Perl regular expressions
  865. =item analyze files and display which/where pattern of Droid signature matches via tag-files for wxHexEditor
  866. =item calc statistics about Droid signatures
  867. =back
  868. The module is in early alpha state and should not be used in production.
  869. =head2 Examples
  870. =head3 Colorize wxHexeditor fields
  871. See example file F<bin/pronom2wxhexeditor.pl>. This colorizes the hex-blob to check PRONOM pattern matches for a given file.
  872. =head3 Identify file
  873. There are better tools for the job, but as a proof of concept certainly not bad: Identifying the file type of a file.
  874. my $pronom = File::FormatIdentification::Pronom->new(
  875. "droid_signature_filename" => $pronomfile
  876. );
  877. # .. $filestream is a scalar representing a file
  878. foreach my $internalid ( $pronom->get_all_internal_ids() ) {
  879. my $sig = $pronom->get_signature_id_by_internal_id($internalid);
  880. my $puid = $pronom->get_puid_by_signature_id($sig);
  881. my $name = $pronom->get_name_by_signature_id($sig);
  882. my $quality = $pronom->get_qualities_by_internal_id($internalid);
  883. my @regexes = $pronom->get_regular_expressions_by_internal_id($internalid);
  884. if ( all {$filestream =~ m/$_/saa} @regexes ) {
  885. say "$binaryfile identified as $name with PUID $puid (regex quality $quality)";
  886. }
  887. }
  888. See example file F<bin/pronomidentify.pl> for a full working script.
  889. =head3 Get PRONOM Statistics
  890. To get a feeling for which signatures need to be revised in PRONOM, or why which file formats are difficult to recognize,
  891. you can get detailed statistics for given signature files.
  892. In the blog entry under L<https://kulturreste.blogspot.com/2018/10/heres-tool-make-it-work.html> the statistic report is presented in more detail.
  893. =head2 EXPORT
  894. None by default.
  895. =head1 SEE ALSO
  896. L<File::FormatIdentification::Regex>
  897. =head1 AUTHOR
  898. Andreas Romeyke L<pause@andreas-romeyke.de>
  899. =head1 COPYRIGHT AND LICENSE
  900. Copyright (C) 2018/19/20 by Andreas Romeyke
  901. This library is free software; you can redistribute it and/or modify
  902. it under the same terms as Perl itself, either Perl version 5.24.1 or,
  903. at your option, any later version of Perl 5 you may have available.
  904. The droid-signature file in t/ is from L<https://www.nationalarchives.gov.uk/PRONOM/Default.aspx>
  905. and without guarantee, it does not look like it is legally protected. If there are any legal claims,
  906. please let me know that I can remove them from the distribution.
  907. =head1 BUGS
  908. =over
  909. =item Some droid recipes results in PCREs which are greedy and therefore the running
  910. time could be exponential with size of binary file.
  911. =back
  912. =head1 CONTRIBUTING
  913. Please feel free to send me comments and patches to my email address. You can clone the modules
  914. from L<https://art1pirat.spdns.org/art1/File-FormatIdentification-Pronom> and send me merge requests.
  915. =cut