Perl-module which allows to use Droid/PRONOM signatures and to convert it to Perl regular expressions, analyze files using wxHexEditor tags to display matches and calc statistics. For PRONOM see https://www.nationalarchives.gov.uk/PRONOM/
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

997 lines
33 KiB

  1. package File::FormatIdentification::Pronom;
  2. use 5.024001;
  3. use strict;
  4. use warnings;
  5. use diagnostics;
  6. use XML::LibXML;
  7. use Carp;
  8. use List::Util qw( none first );
  9. use Scalar::Util;
  10. use YAML::XS;
  11. use File::FormatIdentification::Regex;
  12. use Moose;
  13. our $VERSION = '0.01';
  14. # Preloaded methods go here.
  15. # flattens a regex-structure to a regex-string, expects a signature-pattern and a list of regex-structures
  16. # returns regex
  17. #
  18. no warnings 'recursion';
  19. sub _flatten_rx_recursive ($$$@) {
  20. my $regex = shift;
  21. my $lastpos = shift;
  22. my $open_brackets = shift;
  23. my @rx_groups = @_;
  24. my $rx = shift @rx_groups;
  25. #use Data::Printer;
  26. #say "_flatten_rx_recursive";
  27. #p( @rx_groups );
  28. #p( $rx );
  29. my $bracket_symbol = "(";
  30. if ( !defined $regex ) { confess; }
  31. if ( !defined $rx ) { # do nothing
  32. while ( $open_brackets > 0 ) {
  33. $regex .= ")";
  34. $open_brackets--;
  35. }
  36. }
  37. else {
  38. my $pos_diff = $rx->{position} - $lastpos;
  39. my $local_regex = $rx->{regex};
  40. if ( !defined $local_regex ) {
  41. $local_regex = '';
  42. }
  43. if ( 0 == $pos_diff ) {
  44. # TODO:
  45. File::FormatIdentification::Regex::simplify_two_or_combined_regex(
  46. $regex, $local_regex );
  47. $regex =
  48. &_flatten_rx_recursive( "$regex|$local_regex", $lastpos,
  49. $open_brackets, @rx_groups );
  50. }
  51. elsif ( $pos_diff > 0 ) { # is deeper
  52. # look a head, if same pos found, then use bracket, otherwise not
  53. if (
  54. (
  55. scalar @rx_groups > 0
  56. && ( $rx_groups[0]->{position} == $rx->{position} )
  57. )
  58. || $pos_diff > 1
  59. )
  60. { # use (
  61. $regex = &_flatten_rx_recursive(
  62. "$regex" . ( $bracket_symbol x $pos_diff ) . $local_regex,
  63. $rx->{position}, $open_brackets += $pos_diff, @rx_groups );
  64. }
  65. else {
  66. $regex = &_flatten_rx_recursive(
  67. "$regex$local_regex", $rx->{position},
  68. $open_brackets, @rx_groups
  69. );
  70. } ## end else [ if ( scalar @rx_groups...)]
  71. }
  72. elsif ( $pos_diff < 0 ) { # is higher
  73. $regex = &_flatten_rx_recursive(
  74. "$regex)$local_regex",
  75. $rx->{position},
  76. $open_brackets - 1, #($rx->{position} - $lastpos),
  77. @rx_groups
  78. );
  79. }
  80. else {
  81. confess
  82. "FL: pos=$rx->{position} lastpos=$lastpos regex='$regex' open=$open_brackets\n";
  83. }
  84. }
  85. return $regex;
  86. } ## end sub _flatten_rx_recursive ($$$@)
  87. use warnings 'recursion';
  88. sub _flatten_rx ($@) {
  89. my $regex = shift;
  90. my @rx_groups = @_;
  91. #say "calling flatten_rx with regex=$regex quality=$quality";
  92. #use Data::Printer;
  93. #p( @rx_groups );
  94. $regex = _flatten_rx_recursive( $regex, 0, 0, @rx_groups );
  95. return $regex;
  96. } ## end sub _flatten_rx ($@)
  97. # expands pattern of form "FFFB[10:EB]" to FFFB10, FFFB11, ... FFFBEB
  98. sub _expand_pattern ($) {
  99. my $pattern = shift;
  100. $pattern =~ s/\[!/[^/g;
  101. $pattern =~ s/([0-9A-F]{2}):([0-9A-F]{2})\]/$1-$2]/g;
  102. $pattern =~ s/([0-9A-F]{2})/\\x$1/g;
  103. # substitute hex with printable ASCII-Output
  104. $pattern =~ s#\\x(3[0-9]|[46][1-9A-F]|[57][0-9A])#chr( hex($1) );#egs;
  105. return $pattern;
  106. } ## end sub _expand_pattern ($)
  107. # expands offsets min,max to regex ".{$min,$max}" and uses workarounds if $min or $max exceeds 32766
  108. sub _expand_offsets($$) {
  109. my $minoffset = shift;
  110. my $maxoffset = shift;
  111. my $byte =
  112. '.'; # HINT: needs the character set modifier "aa" in $foo=~m/$regex/aa
  113. #my $byte = '[\x00-\xff]';
  114. my $offset_expanded = "";
  115. if ( ( ( not defined $minoffset ) || ( length($minoffset) == 0 ) )
  116. && ( ( not defined $maxoffset ) || ( length($maxoffset) == 0 ) ) )
  117. {
  118. $offset_expanded = "";
  119. }
  120. elsif (( defined $minoffset )
  121. && ( length($minoffset) > 0 )
  122. && ( defined $maxoffset )
  123. && ( length($maxoffset) > 0 )
  124. && ( $minoffset == $maxoffset ) )
  125. {
  126. if ( $minoffset > 0 ) {
  127. my $maxloops = int( $maxoffset / 32766 );
  128. my $maxresidual = $maxoffset % 32766;
  129. for ( my $i = 0 ; $i < $maxloops ; $i++ ) {
  130. $offset_expanded .= $byte . "{32766}";
  131. }
  132. $offset_expanded .= $byte . "{$maxresidual}";
  133. } ## end if ( $minoffset > 0 )
  134. }
  135. else {
  136. # workaround, because perl quantifier limits,
  137. # calc How many repetitions we need! Both offsets should be less than 32766
  138. #TODO: check if this comes from Droid or is calculated
  139. my $mintmp = 0;
  140. my $maxtmp = 0;
  141. if ( defined $minoffset && ( length($minoffset) > 0 ) ) {
  142. $mintmp = $minoffset;
  143. }
  144. if ( defined $maxoffset && ( length($maxoffset) > 0 ) ) {
  145. $maxtmp = $maxoffset;
  146. }
  147. my $maxloops;
  148. if ( $maxtmp >= $mintmp ) {
  149. $maxloops = int( $maxtmp / 32766 );
  150. }
  151. else {
  152. $maxloops = int( $mintmp / 32766 );
  153. }
  154. my $maxresidual = $maxtmp % 32766;
  155. my $minresidual = $mintmp % 32766;
  156. #say "\tMaxloops=$maxloops maxres = $maxresidual minres=$minresidual";
  157. my @offsets;
  158. my $minstr = "";
  159. my $maxstr = "";
  160. if ( defined $minoffset && length($minoffset) > 0 ) {
  161. $minstr = $minresidual;
  162. $mintmp = $mintmp - $minresidual;
  163. }
  164. for ( my $i = 0 ; $i <= $maxloops ; $i++ ) {
  165. # loop, so we assure the special handling of residuals
  166. if ( $maxtmp > $maxresidual ) {
  167. $maxstr = 32766;
  168. }
  169. elsif ( $maxtmp < 0 ) {
  170. $maxstr = 0;
  171. }
  172. else {
  173. $maxstr = $maxresidual;
  174. }
  175. if ( $mintmp > $minresidual ) {
  176. $minstr = 32766;
  177. }
  178. elsif ( $mintmp < 0 ) {
  179. $minstr = 0;
  180. }
  181. else {
  182. $minstr = $minresidual;
  183. }
  184. #### handle residuals
  185. if ( $i == 0 ) {
  186. $minstr = $minresidual;
  187. $mintmp = $mintmp - $minresidual;
  188. }
  189. elsif ( $i == $maxloops ) {
  190. $maxstr = $maxresidual;
  191. $maxtmp = $maxtmp - $maxresidual;
  192. }
  193. # mark offsets
  194. my $tmp;
  195. $tmp->{minoffset} = $minstr;
  196. $tmp->{maxoffset} = $maxstr;
  197. push @offsets, $tmp;
  198. } ## end for ( my $i = 0 ; $i <=...)
  199. my @filtered = map {
  200. if ( !defined $maxoffset || length($maxoffset) == 0 ) {
  201. $_->{maxoffset} = "";
  202. }
  203. if ( !defined $minoffset || length($minoffset) == 0 ) {
  204. $_->{minoffset} = "";
  205. }
  206. $_;
  207. } @offsets;
  208. foreach my $tmp (@filtered) {
  209. # ? at the end - means non-greedy
  210. #$offset_expanded .= $byte."{" . $tmp->{minoffset} . "," . $tmp->{maxoffset} . "}?";
  211. $offset_expanded .=
  212. $byte . "{" . $tmp->{minoffset} . "," . $tmp->{maxoffset} . "}";
  213. } ## end foreach my $tmp (@filtered)
  214. } ## end else [ if ( ( ( not defined $minoffset...)))]
  215. #say "DEBUG: minoffset='$minoffset' maxoffset='$maxoffset' --> offset_expanded='$offset_expanded'";
  216. # minimization steps
  217. $offset_expanded =~ s#{0,}#*#g;
  218. $offset_expanded =~ s#{1,}#+#g;
  219. $offset_expanded =~ s#{0,1}#?#g;
  220. return $offset_expanded;
  221. } ## end sub _expand_offsets($$)
  222. # got XPath-object and returns a regex-structure as hashref
  223. sub _parse_fragments ($) {
  224. my $fq = shift;
  225. my $position = $fq->getAttribute('Position');
  226. my $minoffset = $fq->getAttribute('MinOffset');
  227. my $maxoffset = $fq->getAttribute('MaxOffset');
  228. my $rx = $fq->textContent;
  229. my $expanded = _expand_pattern($rx);
  230. my $ret;
  231. $ret->{position} = $position;
  232. $ret->{direction} = "left";
  233. $ret->{regex} = "";
  234. my ($offset_expanded) = _expand_offsets( $minoffset, $maxoffset );
  235. if ( $fq->localname eq "LeftFragment" ) {
  236. $ret->{direction} = "left";
  237. $ret->{regex} = "($expanded)$offset_expanded";
  238. }
  239. elsif ( $fq->localname eq "RightFragment" ) {
  240. $ret->{direction} = "right";
  241. $ret->{regex} = "$offset_expanded($expanded)";
  242. }
  243. #say "pF: rx=$rx expanded=$expanded offset=$offset_expanded";
  244. return $ret;
  245. } ## end sub _parse_fragments ($)
  246. # got XPath-object and search direction and returns a regex-structure as hashref
  247. sub _parse_subsequence ($$) {
  248. my $ssq = shift;
  249. my $dir = shift;
  250. my $position = $ssq->getAttribute('Position');
  251. my $minoffset = $ssq->getAttribute('SubSeqMinOffset');
  252. my $maxoffset = $ssq->getAttribute('SubSeqMaxOffset');
  253. my $rx = $ssq->getElementsByTagName('Sequence')->get_node(1)->textContent;
  254. my @lnodes = $ssq->getElementsByTagName('LeftFragment');
  255. my @rnodes = $ssq->getElementsByTagName('RightFragment');
  256. my @lrx_fragments = map { _parse_fragments($_) } @lnodes;
  257. my @rrx_fragments = map { _parse_fragments($_) } @rnodes;
  258. my $lregex = _flatten_rx( "", @lrx_fragments );
  259. my $rregex = _flatten_rx( "", @rrx_fragments );
  260. my $expanded = _expand_pattern($rx);
  261. #if ( length($minoffset) > 0
  262. # && length($maxoffset) > 0
  263. # && $minoffset > $maxoffset ) {
  264. # confess(
  265. #"parse_subsequence: Maxoffset=$maxoffset < Minoffset=$minoffset! regex= '$rx'"
  266. # );
  267. # } ## end if ( length($minoffset...))
  268. my $offset_expanded = _expand_offsets( $minoffset, $maxoffset );
  269. my $prefix;
  270. my $suffix;
  271. my $ret;
  272. my $regex;
  273. if ( !defined $dir || length($dir) == 0 ) {
  274. $regex = join( "", $lregex, $expanded, $rregex );
  275. }
  276. elsif ( $dir eq "BOFoffset" ) {
  277. $regex =
  278. join( "", $offset_expanded, "(", $lregex, $expanded, $rregex, ")" );
  279. }
  280. elsif ( $dir eq "EOFoffset" ) {
  281. $regex =
  282. join( "", "(", $lregex, $expanded, $rregex, ")", $offset_expanded );
  283. }
  284. else {
  285. warn "unknown reference '$dir' found\n";
  286. $regex = join( "", $lregex, $expanded, $rregex );
  287. }
  288. $ret->{regex} =
  289. File::FormatIdentification::Regex::peep_hole_optimizer($regex);
  290. $ret->{position} = $position;
  291. return $ret;
  292. } ## end sub _parse_subsequence ($$)
  293. # got XPath-object and returns regex-string
  294. sub _parse_bytesequence ($) {
  295. my $bsq = shift;
  296. #say "rx_groups in parse_byte_sequence:";
  297. my $reference = $bsq->getAttribute('Reference');
  298. ; # if BOFoffset -> anchored begin of file, EOFofset -> end of file
  299. my @nodes = $bsq->getElementsByTagName('SubSequence');
  300. my @rx_groups = map { _parse_subsequence( $_, $reference ) } @nodes;
  301. my $expanded = "";
  302. my $regex_flattened = _flatten_rx( $expanded, @rx_groups );
  303. #my $ro = Regexp::Optimizer->new;
  304. #my $ro = Regexp::Assemble->new;
  305. #$ro->add( $regex_flattened);
  306. #$regex_flattened = $ro->as_string($regex_flattened);
  307. #$regex_flattened = $ro->re;
  308. my $regex;
  309. if ( !defined $reference || 0 == length($reference) ) {
  310. $regex = "$regex_flattened";
  311. }
  312. elsif ( $reference eq "BOFoffset" ) {
  313. $regex = "\\A$regex_flattened";
  314. }
  315. elsif ( $reference eq "EOFoffset" ) {
  316. $regex = "$regex_flattened\\Z";
  317. }
  318. else {
  319. warn "unknown reference '$reference' found\n";
  320. $regex = "$regex_flattened";
  321. }
  322. use Regexp::Optimizer;
  323. my $ro = Regexp::Optimizer->new;
  324. #say "regex='$regex'";
  325. #$regex = $ro->as_string( $regex );
  326. return $regex;
  327. } ## end sub _parse_bytesequence ($)
  328. # ($%signatures, $%internal) = parse_signaturefile( $file )
  329. sub _parse_signaturefile($) {
  330. my $pronomfile = shift;
  331. my %signatures;
  332. # hash{internalid}->{regex} = $regex
  333. # ->{signature} = $signature
  334. my %internal_signatures;
  335. my $dom = XML::LibXML->load_xml( location => $pronomfile );
  336. $dom->indexElements();
  337. my $xp = XML::LibXML::XPathContext->new($dom);
  338. $xp->registerNs( 'droid',
  339. 'http://www.nationalarchives.gov.uk/pronom/SignatureFile' );
  340. # find Fileformats
  341. #my $tmp = $xp->find('/*[local-name() = "FFSignatureFile"]')->get_node(1);
  342. #say "E:", $tmp->nodeName;
  343. #say "EXISTS:", $xp->exists('/droid:FFSignatureFile');
  344. #say "EXISTS2", $xp->exists('/droid:FFSignatureFile/droid:FileFormatCollection/droid:FileFormat');
  345. my $fmts = $xp->find(
  346. '/*[local-name() = "FFSignatureFile"]/*[local-name() = "FileFormatCollection"]/*[local-name() = "FileFormat"]'
  347. );
  348. foreach my $fmt ( $fmts->get_nodelist() ) {
  349. my $id = $fmt->getAttribute('ID');
  350. my $mimetype = $fmt->getAttribute('MIMEtype');
  351. my $name = $fmt->getAttribute('Name');
  352. my $puid = $fmt->getAttribute('PUID');
  353. my $version = $fmt->getAttribute('Version');
  354. #
  355. ##
  356. my @extensions =
  357. map { $_->textContent() } $fmt->getElementsByTagName('Extension');
  358. my @internalsignatures =
  359. map { $_->textContent() }
  360. $fmt->getElementsByTagName('InternalSignatureID');
  361. my @haspriorityover = map { $_->textContent() }
  362. $fmt->getElementsByTagName('HasPriorityOverFileFormatID');
  363. $signatures{$id}->{mimetype} = $mimetype;
  364. $signatures{$id}->{name} = $name;
  365. $signatures{$id}->{puid} = $puid;
  366. $signatures{$id}->{version} = $version; # optional
  367. $signatures{$id}->{extensions} = \@extensions;
  368. $signatures{$id}->{internal_signatures} = \@internalsignatures;
  369. foreach my $prio (@haspriorityover) {
  370. $signatures{$id}->{priorityover}->{$prio} = 1;
  371. }
  372. foreach my $internal (@internalsignatures) {
  373. $internal_signatures{$internal}->{signature} = $id;
  374. }
  375. } ## end foreach my $fmt ( $fmts->get_nodelist...)
  376. # find InternalSignatures
  377. my $sigs =
  378. $xp->find(
  379. '/*[local-name() = "FFSignatureFile"]/*[local-name() = "InternalSignatureCollection"]/*[local-name() = "InternalSignature"]'
  380. );
  381. foreach my $sig ( $sigs->get_nodelist() ) {
  382. my $id = $sig->getAttribute('ID');
  383. my $specificity = $sig->getAttribute('Specificity');
  384. $internal_signatures{$id}->{specificity} = $specificity;
  385. #p( $sig->toString() );
  386. my @nodes = $sig->getElementsByTagName('ByteSequence');
  387. #p( @nodes );
  388. my @rx_groups = map { _parse_bytesequence($_) } @nodes;
  389. my @rx_quality =
  390. map { File::FormatIdentification::Regex::calc_quality($_); }
  391. @rx_groups;
  392. $internal_signatures{$id}->{regex} = \@rx_groups;
  393. $internal_signatures{$id}->{quality} = \@rx_quality;
  394. } ## end foreach my $sig ( $sigs->get_nodelist...)
  395. return ( \%signatures, \%internal_signatures );
  396. } ## end sub _parse_signaturefile($)
  397. sub uniq_signature_ids_by_priority {
  398. my $self = shift;
  399. my @signatures = @_;
  400. my %found_signature_ids;
  401. # which PUIDs are in list?
  402. foreach my $signatureid (@signatures) {
  403. if ( defined $signatureid ) {
  404. $found_signature_ids{$signatureid} = 1;
  405. }
  406. }
  407. # remove all signatures when actual signature has priority over
  408. foreach my $signatureid ( keys %found_signature_ids ) {
  409. foreach my $priority_over_sid (
  410. keys %{ $self->{signatures}->{$signatureid}->{priorityover} } )
  411. {
  412. if ( exists $found_signature_ids{$priority_over_sid} ) {
  413. delete $found_signature_ids{$priority_over_sid};
  414. }
  415. } ## end foreach my $priority_over_sid...
  416. } ## end foreach my $signatureid ( keys...)
  417. # reduce list to all signatures with correct priority
  418. my @result =
  419. grep { defined $found_signature_ids{ $_->{signature} } } @signatures;
  420. return @result;
  421. } ## end sub uniq_signature_ids_by_priority
  422. has 'droid_signature_filename' => (
  423. is => 'ro',
  424. required => 1,
  425. reader => 'get_droid_signature_filename',
  426. trigger => sub {
  427. my $self = shift;
  428. #say "TRIGGER";
  429. my $yaml_file = $self->get_droid_signature_filename() . ".yaml";
  430. if ( $self->{auto_load} && -e $yaml_file ) {
  431. $self->load_from_yamlfile($yaml_file);
  432. }
  433. else {
  434. my ( $signatures, $internal_signatures ) =
  435. _parse_signaturefile( $self->{droid_signature_filename} );
  436. $self->{signatures} = $signatures;
  437. $self->{internal_signatures} = $internal_signatures;
  438. #die;
  439. if ( $self->{auto_store} ) {
  440. $self->save_as_yamlfile($yaml_file);
  441. }
  442. } ## end else [ if ( $self->{auto_load...})]
  443. foreach my $s ( keys %{ $self->{signatures} } ) {
  444. my $puid = $self->{signatures}->{$s}->{puid};
  445. if ( defined $puid && length($puid) > 0 ) {
  446. $self->{puids}->{$puid} = $s;
  447. }
  448. }
  449. }
  450. );
  451. sub save_as_yamlfile {
  452. my $self = shift;
  453. my $filename = shift;
  454. my @res;
  455. push @res, $self->{signatures};
  456. push @res, $self->{internal_signatures};
  457. YAML::XS::DumpFile( "$filename", @res );
  458. } ## end sub save_as_yamlfile
  459. sub load_from_yamlfile {
  460. my $self = shift;
  461. my $filename = shift;
  462. my ( $sig, $int ) = YAML::XS::LoadFile($filename);
  463. $self->{signatures} = $sig;
  464. $self->{internal_signatures} = $int;
  465. } ## end sub load_from_yamlfile
  466. has 'auto_store' => (
  467. is => 'ro',
  468. default => 1,
  469. );
  470. has 'auto_load' => (
  471. is => 'ro',
  472. default => 1,
  473. );
  474. sub get_all_signature_ids {
  475. my $self = shift;
  476. my @sigs = sort { $a <=> $b } keys %{ $self->{signatures} };
  477. return @sigs;
  478. }
  479. sub get_signature_id_by_puid {
  480. my $self = shift;
  481. my $puid = shift;
  482. my $sig = $self->{puids}->{$puid};
  483. no warnings;
  484. return $sig;
  485. use warnings;
  486. }
  487. sub get_internal_ids_by_puid {
  488. my $self = shift;
  489. my $puid = shift;
  490. my $sig = $self->get_signature_id_by_puid($puid);
  491. my @ids = ();
  492. if ( defined $sig ) {
  493. @ids = grep { defined $_ }
  494. @{ $self->{signatures}->{$sig}->{internal_signatures} };
  495. }
  496. return @ids;
  497. }
  498. sub get_file_endings_by_puid {
  499. my $self = shift;
  500. my $puid = shift;
  501. my $sig = $self->get_signature_id_by_puid($puid);
  502. my @endings = ();
  503. if ( defined $sig ) {
  504. @endings = $self->{signatures}->{$sig}->{extensions};
  505. }
  506. return @endings;
  507. }
  508. sub get_all_internal_ids {
  509. my $self = shift;
  510. my @ids = sort { $a <=> $b } keys %{ $self->{internal_signatures} };
  511. foreach my $id (@ids) {
  512. if ( !defined $id ) { confess("$id not defined") }
  513. }
  514. return @ids;
  515. }
  516. sub get_all_puids {
  517. my $self = shift;
  518. my @ids =
  519. sort grep { defined $_ }
  520. map { $self->{signatures}->{$_}->{puid}; }
  521. grep { defined $_ } $self->get_all_signature_ids();
  522. return @ids;
  523. }
  524. sub get_regular_expressions_by_internal_id {
  525. my $self = shift;
  526. my $internalid = shift;
  527. if ( !defined $internalid ) { confess("internalid must exists!"); }
  528. my @rx = @{ $self->{internal_signatures}->{$internalid}->{regex} };
  529. return @rx;
  530. }
  531. sub get_all_regular_expressions {
  532. my $self = shift;
  533. my @ids = $self->get_all_internal_ids();
  534. my @regexes = ();
  535. foreach my $id (@ids) {
  536. my @rx = $self->get_regular_expressions_by_internal_id($id);
  537. push @regexes, @rx;
  538. }
  539. my @ret = sort @regexes;
  540. return @ret;
  541. }
  542. sub get_qualities_by_internal_id {
  543. my $self = shift;
  544. my $internalid = shift;
  545. if ( !defined $internalid ) { confess("internalid must exists!"); }
  546. my $value = $self->{internal_signatures}->{$internalid}->{quality};
  547. if ( defined $value ) {
  548. return @{$value};
  549. }
  550. return;
  551. }
  552. sub get_signature_id_by_internal_id {
  553. my $self = shift;
  554. my $internalid = shift;
  555. if ( !defined $internalid ) { confess("internalid must exists!"); }
  556. return $self->{internal_signatures}->{$internalid}->{signature};
  557. }
  558. sub get_name_by_signature_id {
  559. my $self = shift;
  560. my $signature = shift;
  561. return $self->{signatures}->{$signature}->{name};
  562. }
  563. sub get_puid_by_signature_id {
  564. my $self = shift;
  565. my $signature = shift;
  566. return $self->{signatures}->{$signature}->{puid};
  567. }
  568. sub get_puid_by_internal_id {
  569. my $self = shift;
  570. my $internalid = shift;
  571. if ( !defined $internalid ) { confess("internalid must exists!"); }
  572. my $signature = $self->get_signature_id_by_internal_id($internalid);
  573. return $self->get_puid_by_signature_id($signature);
  574. }
  575. sub get_quality_sorted_internal_ids {
  576. my $self = shift;
  577. my @ids = sort {
  578. # sort by regexes
  579. my @a_rxq = @{ $self->{internal_signatures}->{$a}->{quality} };
  580. my @b_rxq = @{ $self->{internal_signatures}->{$b}->{quality} };
  581. my $aq = 0;
  582. foreach my $as (@a_rxq) { $aq += $as; }
  583. my $bq = 0;
  584. foreach my $bs (@b_rxq) { $bq += $bs; }
  585. #use Data::Printer;
  586. #p( $a );
  587. #p( $aq );
  588. $aq <=> $bq;
  589. } $self->get_all_internal_ids();
  590. return @ids;
  591. }
  592. sub get_combined_regex_by_puid {
  593. my $self = shift;
  594. my $puid = shift;
  595. my @internals = $self->get_internal_ids_by_puid($puid);
  596. #use Data::Printer;
  597. #p( $puid );
  598. #p( @internals );
  599. my @regexes = map {
  600. my @regexes_per_internal =
  601. $self->get_regular_expressions_by_internal_id($_);
  602. my $combined =
  603. File::FormatIdentification::Regex::and_combine(@regexes_per_internal);
  604. #p( $combined );
  605. $combined;
  606. } @internals;
  607. my $result = File::FormatIdentification::Regex::or_combine(@regexes);
  608. #p( $result );
  609. return $result;
  610. }
  611. sub _prepare_statistics {
  612. my $self = shift;
  613. my $results;
  614. # count of PUIDs
  615. # count of internal ids (IDs per PUID)
  616. # count of regexes
  617. # count of file endings only
  618. # count of internal ids without PUID
  619. # larges and shortest regex
  620. # complex and simple regex
  621. # common regexes
  622. #say "stat";
  623. my @puids = $self->get_all_puids();
  624. my $puids = scalar(@puids);
  625. my @internals = $self->get_all_internal_ids();
  626. my $internals = scalar(@internals);
  627. my $regexes = 0;
  628. my $fileendingsonly = 0;
  629. my @fileendingsonly = ();
  630. my $fileendings = 0;
  631. my $int_per_puid = 0;
  632. my $internal_without_puid = 0;
  633. my @internal_without_puid = ();
  634. my @quality_sorted_internal_ids = $self->get_quality_sorted_internal_ids();
  635. my %uniq_regexes;
  636. foreach my $internalid (@internals) {
  637. my @regexes =
  638. $self->get_regular_expressions_by_internal_id($internalid);
  639. foreach my $rx (@regexes) {
  640. my @tmp = ();
  641. if ( exists $uniq_regexes{$rx} ) {
  642. @tmp = @{ $uniq_regexes{$rx} };
  643. }
  644. push @tmp, $internalid;
  645. $uniq_regexes{$rx} = \@tmp;
  646. }
  647. $regexes += scalar(@regexes);
  648. my $sigid = $self->get_signature_id_by_internal_id($internalid);
  649. if ( !defined $sigid ) {
  650. $internal_without_puid++;
  651. push @internal_without_puid, $internalid;
  652. }
  653. }
  654. foreach my $puid (@puids) {
  655. my @ints = $self->get_internal_ids_by_puid($puid);
  656. my @fileendings = $self->get_file_endings_by_puid($puid);
  657. if ( 0 == scalar(@ints) ) {
  658. $fileendingsonly++;
  659. push @fileendingsonly, $puid;
  660. }
  661. else {
  662. $fileendings += scalar(@fileendings);
  663. $int_per_puid += scalar(@ints);
  664. }
  665. }
  666. foreach my $i (@quality_sorted_internal_ids) {
  667. my $regex =
  668. join( "#", $self->get_regular_expressions_by_internal_id($i) );
  669. my $quality = join( " ", $self->get_qualities_by_internal_id($i) );
  670. }
  671. $results->{filename} = $self->get_droid_signature_filename();
  672. $results->{count_of_puids} = $puids;
  673. $results->{count_of_internal_ids} = $internals;
  674. $results->{count_of_regular_expressions} = $regexes;
  675. $results->{count_of_fileendings} = $fileendings;
  676. $results->{count_of_puid_with_fileendings_only} = $fileendingsonly;
  677. $results->{puids_with_fileendings_only} = \@fileendingsonly;
  678. $results->{count_of_orphaned_internal_ids} = $internal_without_puid;
  679. $results->{internal_ids_without_puids} = \@internal_without_puid;
  680. no warnings;
  681. for ( my $i = 0 ; $i <= 4 ; $i++ ) {
  682. my $best_quality_internal = pop @quality_sorted_internal_ids;
  683. if ( defined $best_quality_internal ) {
  684. my $best_quality = join( ";",
  685. $self->get_qualities_by_internal_id($best_quality_internal) );
  686. my $best_puid =
  687. $self->get_puid_by_internal_id($best_quality_internal);
  688. my $best_name =
  689. $self->get_name_by_signature_id(
  690. $self->get_signature_id_by_internal_id($best_quality_internal)
  691. );
  692. my $best_regex = $self->get_combined_regex_by_puid($best_puid);
  693. $results->{nth_best_quality}->[$i]->{internal_id} =
  694. $best_quality_internal;
  695. $results->{nth_best_quality}->[$i]->{puid} = $best_puid;
  696. $results->{nth_best_quality}->[$i]->{name} = $best_name;
  697. $results->{nth_best_quality}->[$i]->{quality} = $best_quality;
  698. $results->{nth_best_quality}->[$i]->{combined_regex} = $best_regex;
  699. }
  700. }
  701. for ( my $i = 0 ; $i <= 4 ; $i++ ) {
  702. my $worst_quality_internal = shift @quality_sorted_internal_ids;
  703. if ( defined $worst_quality_internal ) {
  704. my $worst_quality = join( ";",
  705. $self->get_qualities_by_internal_id($worst_quality_internal) );
  706. my $worst_puid =
  707. $self->get_puid_by_internal_id($worst_quality_internal);
  708. my $worst_name =
  709. $self->get_name_by_signature_id(
  710. $self->get_signature_id_by_internal_id($worst_quality_internal)
  711. );
  712. my $worst_regex = $self->get_combined_regex_by_puid($worst_puid);
  713. $results->{nth_worst_quality}->[$i]->{internal_id} =
  714. $worst_quality_internal;
  715. $results->{nth_worst_quality}->[$i]->{puid} = $worst_puid;
  716. $results->{nth_worst_quality}->[$i]->{name} = $worst_name;
  717. $results->{nth_worst_quality}->[$i]->{quality} = $worst_quality;
  718. $results->{nth_worst_quality}->[$i]->{combined_regex} =
  719. $worst_regex;
  720. }
  721. }
  722. my @multiple_used_regex = grep {
  723. my $tmp = $uniq_regexes{$_};
  724. my @tmp = @{$tmp};
  725. scalar(@tmp) > 1
  726. } sort keys %uniq_regexes;
  727. $results->{count_of_multiple_used_regex} = scalar(@multiple_used_regex);
  728. for ( my $i = 0 ; $i <= $#multiple_used_regex ; $i++ ) {
  729. $results->{multiple_used_regex}->[$i]->{regex} =
  730. $multiple_used_regex[$i];
  731. my @ids = join( ",", @{ $uniq_regexes{ $multiple_used_regex[$i] } } );
  732. $results->{multiple_used_regex}->[$i]->{internal_ids} = \@ids;
  733. }
  734. return $results;
  735. }
  736. sub print_csv_statistics {
  737. my $self = shift;
  738. my $results = $self->_prepare_statistics();
  739. my $version = $results->{filename};
  740. $version =~ s/DROID_SignatureFile_V(\d+)\.xml/$1/;
  741. $results->{version} = $version;
  742. $results->{best_quality_puid} = $results->{nth_best_quality}->[0]->{puid};
  743. $results->{best_quality_internal_id} =
  744. $results->{nth_best_quality}->[0]->{internal_id};
  745. $results->{best_quality_quality} =
  746. $results->{nth_best_quality}->[0]->{quality};
  747. $results->{best_quality_combined_regex} =
  748. $results->{nth_best_quality}->[0]->{combined_regex};
  749. $results->{worst_quality_puid} = $results->{nth_worst_quality}->[0]->{puid};
  750. $results->{worst_quality_internal_id} =
  751. $results->{nth_worst_quality}->[0]->{internal_id};
  752. $results->{worst_quality_quality} =
  753. $results->{nth_worst_quality}->[0]->{quality};
  754. $results->{worst_quality_combined_regex} =
  755. $results->{nth_worst_quality}->[0]->{combined_regex};
  756. my @headers =
  757. qw(version filename count_of_puids count_of_internal_ids count_of_regular_expressions count_of_fileendings count_of_puid_with_fileendings_only count_of_orphaned_internal_ids count_of_multiple_used_regex best_quality_puid best_quality_internal_id best_quality_quality best_quality_combined_regex worst_quality_puid worst_quality_internal_id worst_quality_quality worst_quality_combined_regex);
  758. say "#", join( ",", @headers );
  759. say join(
  760. ",",
  761. map {
  762. my $result = $results->{$_};
  763. if ( !defined $result ) { $result = ""; }
  764. $result;
  765. } @headers
  766. );
  767. }
  768. sub print_statistics {
  769. my $self = shift;
  770. my $verbose = shift;
  771. my $results = $self->_prepare_statistics();
  772. say "Statistics of file $results->{filename}";
  773. say "=======================================";
  774. say "";
  775. say "Countings";
  776. say "---------------------------------------";
  777. say "Count of PUIDs: $results->{count_of_puids}";
  778. say
  779. " internal IDs: $results->{count_of_internal_ids}";
  780. say
  781. " regular expressions: $results->{count_of_regular_expressions}";
  782. say
  783. " file endings: $results->{count_of_fileendings}";
  784. say
  785. " PUIDs with file endings only: $results->{count_of_puid_with_fileendings_only}";
  786. if ( defined $verbose ) {
  787. say " (",
  788. join( ",", @{ $results->{internal_ids_without_puids} } ), ")";
  789. }
  790. say
  791. " orphaned internal IDs: $results->{count_of_orphaned_internal_ids}";
  792. if ( defined $verbose ) {
  793. say " (",
  794. join( ",", @{ $results->{internal_ids_without_puids} } ), ")";
  795. }
  796. say "";
  797. say "Quality of internal IDs";
  798. say "---------------------------------------";
  799. my $nth = 1;
  800. foreach my $n ( @{ $results->{nth_best_quality} } ) {
  801. say
  802. "$nth-best quality internal ID (PUID, name): $n->{internal_id} ($n->{puid}, $n->{name}) -> $n->{quality}";
  803. if ( defined $verbose ) {
  804. say " combined regex: ", $n->{combined_regex};
  805. }
  806. $nth++;
  807. }
  808. say "";
  809. $nth = 1;
  810. foreach my $n ( @{ $results->{nth_worst_quality} } ) {
  811. say
  812. "$nth-worst quality internal ID (PUID, name): $n->{internal_id} ($n->{puid}, $n->{name}) -> $n->{quality}";
  813. if ( defined $verbose ) {
  814. say " combined regex: ", $n->{combined_regex};
  815. }
  816. $nth++;
  817. }
  818. say "";
  819. say "";
  820. say "Regular expressions";
  821. say "---------------------------------------";
  822. say
  823. "Count of multiple used regular expressions: $results->{count_of_multiple_used_regex}";
  824. if ( defined $verbose ) {
  825. for ( my $i = 0 ; $i < $results->{count_of_multiple_used_regex} ; $i++ )
  826. {
  827. say " common regex group no $i:";
  828. say " regex='"
  829. . $results->{multiple_used_regex}->[$i]->{regex} . "'";
  830. say " internal IDs: ",
  831. join( ",",
  832. @{ $results->{multiple_used_regex}->[$i]->{internal_ids} } );
  833. }
  834. }
  835. say "";
  836. #my @rx = $self->get_all_regular_expressions();
  837. #use Data::Printer;
  838. #p( %uniq_regexes );
  839. }
  840. 1;
  841. no Moose;
  842. __PACKAGE__->meta->make_immutable;
  843. __END__
  844. # Below is stub documentation for your module. You'd better edit it!
  845. =head1 NAME
  846. File::FormatIdentification::Pronom - Perl extension for parsing Pronom-Signatures using DROID-Signature file
  847. =head1 SYNOPSIS
  848. use File::FormatIdentification::Pronom;
  849. my $pronomfile = "Droid-Signature.xml";
  850. my ( $signatures, $internals ) = parse_signaturefile($pronomfile);
  851. =head1 DESCRIPTION
  852. Stub documentation for File::FormatIdentification::Pronom, created by h2xs. It looks like the
  853. author of the extension was negligent enough to leave the stub
  854. unedited.
  855. Blah blah blah.
  856. =head2 EXPORT
  857. None by default.
  858. =head1 SEE ALSO
  859. Mention other useful documentation such as the documentation of
  860. related modules or operating system documentation (such as man pages
  861. in UNIX), or any relevant external documentation such as RFCs or
  862. standards.
  863. If you have a mailing list set up for your module, mention it here.
  864. If you have a web site set up for your module, mention it here.
  865. =head1 AUTHOR
  866. art1, E<lt>art1@E<gt>
  867. =head1 COPYRIGHT AND LICENSE
  868. Copyright (C) 2018 by art1
  869. This library is free software; you can redistribute it and/or modify
  870. it under the same terms as Perl itself, either Perl version 5.24.1 or,
  871. at your option, any later version of Perl 5 you may have available.
  872. =cut