Browse Source

- init

master
commit
1b0e7d299f
11 changed files with 44435 additions and 0 deletions
  1. +0
    -0
      Changes
  2. +11
    -0
      MANIFEST
  3. +31
    -0
      Makefile.PL
  4. +73
    -0
      README
  5. +334
    -0
      bin/pronom2wxhexeditor.pl
  6. +35
    -0
      bin/pronom_statistics.pl
  7. +996
    -0
      lib/File/FormatIdentification/Pronom.pm
  8. +251
    -0
      lib/File/FormatIdentification/Regex.pm
  9. +42544
    -0
      t/DROID_SignatureFile_V93.xml
  10. +49
    -0
      t/File/FormatIdentification/Pronom.t
  11. +111
    -0
      t/File/FormatIdentification/Regex.t

+ 0
- 0
Changes View File


+ 11
- 0
MANIFEST View File

@@ -0,0 +1,11 @@
Changes
Makefile.PL
MANIFEST
README
t/File/FormatIdentification/Pronom.t
t/File/FormatIdentification/Regex.t
t/DROID_SignatureFile_V93.xml
lib/File/FormatIdentification/Pronom.pm
lib/File/FormatIdentification/Regex.pm
bin/pronom2wxhexeditor.pl
bin/pronom_statistics.pl

+ 31
- 0
Makefile.PL View File

@@ -0,0 +1,31 @@
use 5.024001;
use ExtUtils::MakeMaker;
# See lib/ExtUtils/MakeMaker.pm for details of how to influence
# the contents of the Makefile that is written.
WriteMakefile(
NAME => 'File::FormatIdentification::Pronom',
VERSION_FROM => 'lib/File/FormatIdentification/Pronom.pm', # finds $VERSION, requires EU::MM from perl >= 5.5
PREREQ_PM => {
"XML::LibXML" => 0,
"Carp" => 0,
"List::Util" => 0,
"Scalar::Util" => 0,
"YAML::XS" => 0,
"Moose" => 0,
"String::LCSS" => 0,
"Carp" => 0,
"Regexp::Assemble" => 0,
"Regexp::Optimizer" => 0,

},
ABSTRACT_FROM => 'lib/File/FormatIdentification/Pronom.pm', # retrieve abstract from module
AUTHOR => 'Andreas Romeyke <art1@andreas-romeyke.de>',
LICENSE => 'perl_5',
test => {
"TESTS" => "t/File/FormatIdentification/*.t"
},
EXE_FILES => [
"bin/pronom2wxhexeditor.pl",
"bin/pronom_statistics.pl"
],
);

+ 73
- 0
README View File

@@ -0,0 +1,73 @@
File-FormatIdentification-Pronom version 0.01
=============================================

The module allows to handle Droid signatures. Droid is a utility which
uses the PRONOM database to identify file formats.

See https://www.nationalarchives.gov.uk/PRONOM/ for details.

With this module you could:

* convert Droid signatures to Perl regular expressions
* analyze files and display which/where pattern of Droid signature matches via tag-files for wxHexEditor
* calc statistics about Droid signatures

The module is in early alpha state and should not be used in production.

INSTALLATION

To install this module type the following:

perl Makefile.PL
make
make test
make install

DEPENDENCIES

This module requires these other modules and libraries:

* XML::LibXML
* Carp
* List::Util
* Scalar::Util
* YAML::XS
* Moose
* String::LCSS
* Regexp::Assemble
* Regexp::Optimizer

The script is only tested under GNU/Linux Debian (Stretch)

CALL

$> perl ./pronom2wxhexeditor.pl DROID-SIGNATURE.xml binaryfile

The first run produces a file "DROID-SIGNATURE.xml.yaml" which holds the PCREs
to avoid parsing DROID-SIGNATURE.xml again and again. This file is human
readable

The run produces a file "binaryfile.tags" which holds the tags used by
wxHexEditor when you open the file "binaryfile".

The run also produces a file "binaryfile.html" which can be viewed in
Webbrowser to get all matches as an overview.

$> perl ./pronom_statistics.pl DROID-SIGNATURE.xml

This prints some useful statistics about the signature file.

BUGS

* Some droid recipes results in PCREs which are greedy and therefore the running
time could be exponential with size of binary file.


COPYRIGHT AND LICENCE

Copyright (C) 2018 by Andreas Romeyke

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.24.1 or,
at your option, any later version of Perl 5 you may have available.


+ 334
- 0
bin/pronom2wxhexeditor.pl View File

@@ -0,0 +1,334 @@
#!/usr/bin/env perl
#===============================================================================
#
# FILE: pronom2wxhexeditor.pl
#
# USAGE: ./pronom2wxhexeditor.pl
#
# DESCRIPTION: perl ./pronom2wxhexeditor.pl <DROIDSIGNATURE-FILE> <BINARYFILE>
#
# OPTIONS: ---
# REQUIREMENTS: ---
# BUGS: ---
# NOTES: ---
# AUTHOR: Andreas Romeyke,
# CREATED: 28.08.2018 14:26:43
# REVISION: ---
#===============================================================================
use strict;
use warnings 'FATAL';
use utf8;
use v5.24;
use Fcntl qw(:seek);
use Digest::CRC qw( crc8 );
use Scalar::Util;
use File::Map qw(:map :extra);
use File::FormatIdentification::Pronom;

# calc a random color
sub rndcolor {
my $rgb = int( rand( 256 * 256 * 256 ) );
return sprintf( "#%06x", $rgb );
}

sub puidcolor {
my $puid = shift;
my $crc = crc8($puid);
return sprintf( "#%02x%02x%02x", $crc, $crc, $crc );
}

sub dircolor {
my $direction = shift;
my $pos = shift;
if ( $direction > 0 ) {
return sprintf( "#ff%04x", $pos * $pos );
}
elsif ( $direction < 0 ) {
return sprintf( "#%04xff", $pos * $pos );
}
else {
return sprintf( "#%02xff%02x", $pos, $pos );
}
} ## end sub dircolor

# helper function to collect all things needed for output and adds to a given buffer
sub push_output ($$$$$$$$$$) {
my %tmp;
$tmp{puid} = shift;
$tmp{name} = shift;
$tmp{begin} = shift;
$tmp{end} = shift;
$tmp{regex} = shift;

#$tmp{hexdump} = shift;
$tmp{position} = shift;
$tmp{signature} = shift;
$tmp{internal_signature} = shift;
$tmp{bytesequence} = shift;
my $ref_buffer = shift;
push @{$ref_buffer}, \%tmp;
return;
} ## end sub push_output

#render HTML output
sub render_for_html {
my $ref_buffer = shift;
my $fh = shift;
my $binaryfile = shift;
my @tmp = sort { $a->{begin} <=> $b->{begin} } ( @{$ref_buffer} );
say $fh <<HEAD;
<html><head />
<body>
<h1> Result for "$binaryfile"</h1>
HEAD
foreach my $tagid ( 0 .. $#tmp ) {
my $pos = $tmp[$tagid]->{position};
my $begin = $tmp[$tagid]->{begin};
my $end = $tmp[$tagid]->{end};
my $puid = $tmp[$tagid]->{puid};
my $name = $tmp[$tagid]->{name};
my $regex = $tmp[$tagid]->{regex};
my $internal = $tmp[$tagid]->{internal_signature};
my $bytesequence = $tmp[$tagid]->{bytesequence};
my $partial = get_partial_regex( $pos, $regex );

#my $hexdump = $tmp[$tagid]->{hexdump};
#if ( length($hexdump) > 10 ) {
# $hexdump = substr( $hexdump, 0, 10 ) . "...";
#}
my $fgcolor = puidcolor($puid);

#my $bgcolor = dircolor( $begin <=> $end, $pos );
my $bgcolor = rndcolor();
say $fh "
<h2>$puid</h2>
<p>Internal Signature: $internal</p>
<p>Byte Sequence: $bytesequence</p>
<p>Bytes $begin - $end</p>
<p>$name</p>
<p>regex=$regex</p>
<p>matching $pos-th partial regex: $partial</p>
"

#<p>pos=$pos</p>
#<p>hexdump:<br />$hexdump</p>"
} ## end foreach my $tagid ( 0 .. $#tmp)

say $fh <<FOOT;
</body>
</html>
FOOT
return;
} ## end sub render_for_html

# render output for wxhexeditor
sub render_for_wxhexeditor {
my $ref_buffer = shift;
my $fh = shift;
my $binaryfile = shift;
my @tmp = sort {
if ( $a->{begin} == $b->{begin} ) {
return ( $a->{end} <=> $b->{end} );
}
else {
return ( $a->{begin} <=> $b->{begin} );
}
} ( @{$ref_buffer} );
say $fh <<HEAD;
<?xml version="1.0" encoding="UTF-8"?>
<wxHexEditor_XML_TAG>
<filename path="$binaryfile">
HEAD

foreach my $tagid ( 0 .. $#tmp ) {
my $pos = $tmp[$tagid]->{position};
my $begin = $tmp[$tagid]->{begin};
my $end = $tmp[$tagid]->{end};
my $puid = $tmp[$tagid]->{puid};
my $name = $tmp[$tagid]->{name};
my $regex = $tmp[$tagid]->{regex};
my $internal = $tmp[$tagid]->{internal_signature};
my $bytesequence = $tmp[$tagid]->{bytesequence};

#my $hexdump = $tmp[$tagid]->{hexdump};
#if ( length($hexdump) > 10 ) {
# $hexdump = substr( $hexdump, 0, 10 ) . "...";
#}
my $fgcolor = puidcolor($puid);

#my $bgcolor = dircolor( $begin <=> $end, $pos );
my $bgcolor = rndcolor();
my $partial = get_partial_regex( $pos, $regex );
say $fh "
<TAG id='$tagid'>
<start_offset>$begin</start_offset>
<end_offset>$end</end_offset>
<tag_text>$puid
$name
at Bytes($begin, $end)
$regex
matching $pos-th partial regex: $partial
Internal Signature: $internal
Byte Sequence: $bytesequence

</tag_text>
<font_colour>$fgcolor</font_colour>
<note_colour>$bgcolor</note_colour>
</TAG>";
} ## end foreach my $tagid ( 0 .. $#tmp)

say $fh <<FOOT;
</filename>
</wxHexEditor_XML_TAG>
FOOT
return;
} ## end sub render_for_wxhexeditor

sub get_partial_regex($$) {
my $position = shift;
my $regex = shift;
if ( $regex =~ m/\({$position}(.{20})/ ) { return "'$1'..."; }
return "";
}

################################################################################
# main
################################################################################

my $pronomfile = shift @ARGV;
my $binaryfile = shift @ARGV;

if ( !defined $pronomfile ) {
say "you need at least a pronom signature file";
}
if ( !defined $binaryfile ) {
say "you need an binaryfile";
}

# write basic main.osd

open( my $filehandle, "<", "$binaryfile" );
binmode($filehandle);
seek( $filehandle, 0, SEEK_END );
my $eof = tell($filehandle);

my $pronom = File::FormatIdentification::Pronom->new(
"droid_signature_filename" => $pronomfile );

my @output_buffer;

#my $pathobj = path($binaryfile);
#my $filestream = $pathobj->slurp_raw;
map_file my $filestream, $binaryfile, "<";
advise( $filestream, 'random' );
foreach my $internalid ( $pronom->get_all_internal_ids() ) {
my $sig = $pronom->get_signature_id_by_internal_id($internalid);
if ( !defined $sig ) { next; }
my $puid = $pronom->get_puid_by_signature_id($sig);
my $name = $pronom->get_name_by_signature_id($sig);

my @regexes = $pronom->get_regular_expressions_by_internal_id($internalid);
my @res;
my $timer = time;

#print "internalid=$internalid";
foreach my $regex (@regexes) {

# MATCHed?
#warn "$internalid, regex='$regex'\n";
if ( !defined $regex ) {
warn "No regex found for internalid $internalid\n";
}

#say "REGEX='$regex'";
if ( $filestream =~ /$regex/saa ) {
my $tmp;
$tmp->{matched} = 1;
$tmp->{regex} = $regex;

#$tmp->{groups};

#use Data::Printer;
#p( @+ );
#p( @- );
my %groups;
for ( my $match = 0 ; $match <= $#- ; $match++ ) {
if ( defined $-[$match] && defined $+[$match] ) {
my $matches;
my $begin = $-[$match];
my $end = $+[$match];
$matches->{begin} = $begin;
$matches->{end} = $end;
$matches->{pos} = $match;
$groups{ ( $begin, $end ) } = $matches;
}
}
my @uniqgroups = values %groups;

#use Data::Printer;
#p( @uniqgroups );
$tmp->{groups} = \@uniqgroups;

#p( $tmp->{groups} );
#die "matched '$_'";
push @res, $tmp;
}
else {
last; # break for loop
}
}

if ( ( scalar @res ) == ( scalar @regexes ) ) { # all matches successfull
# my %tmp;
# $tmp{puid} = shift;
# $tmp{name} = shift;
# $tmp{begin} = shift;
# $tmp{end} = shift;
# $tmp{regex} = shift;
# #$tmp{hexdump} = shift;
# $tmp{position} = shift;
# $tmp{signature} = shift;
# my $ref_buffer = shift;

for ( my $receiptidx = 0 ; $receiptidx <= $#res ; $receiptidx++ ) {
my $receipt = $res[$receiptidx];
foreach my $group ( @{ $receipt->{groups} } ) {
push_output(
$puid,
$name,
$group->{begin},
$group->{end},
$receipt->{regex},
$group->{pos},
$sig,
$internalid,
$receiptidx,

\@output_buffer
);
}
}
}

#say " ... time=", (time - $timer), "s";
} ## end foreach my $internal ( keys...)

open( my $OUT, ">", "$binaryfile.tags" );
open( my $HTML, ">", "$binaryfile.html" );

render_for_wxhexeditor(

#filter_matches_by_signature_priority( $signatures, \@output_buffer ),
\@output_buffer,
$OUT, $binaryfile
);
render_for_html(

#filter_matches_by_signature_priority( $signatures, \@output_buffer ),
\@output_buffer,
$HTML, $binaryfile
);
close $HTML;
close $OUT;

1;

+ 35
- 0
bin/pronom_statistics.pl View File

@@ -0,0 +1,35 @@
#!/usr/bin/env perl
#===============================================================================
#
# FILE: pronom_statistics.pl
#
# USAGE: ./pronom_statistics.pl
#
# DESCRIPTION: perl ./pronom_statistics.pl <DROIDSIGNATURE-FILE>
#
# OPTIONS: ---
# REQUIREMENTS: ---
# BUGS: ---
# NOTES: ---
# AUTHOR: Andreas Romeyke,
# CREATED: 28.08.2018 14:26:43
# REVISION: ---
#===============================================================================
use strict;
use warnings 'FATAL';
use utf8;
use v5.24;
use File::FormatIdentification::Pronom;

################################################################################
# main
################################################################################

my $pronomfile = shift @ARGV;
if ( !defined $pronomfile ) {
say "you need at least a pronom signature file";
}
my $pronom = File::FormatIdentification::Pronom->new(
"droid_signature_filename" => $pronomfile );
$pronom->print_csv_statistics();
1;

+ 996
- 0
lib/File/FormatIdentification/Pronom.pm View File

@@ -0,0 +1,996 @@
package File::FormatIdentification::Pronom;

use 5.024001;
use strict;
use warnings;
use diagnostics;
use XML::LibXML;
use Carp;
use List::Util qw( none first );
use Scalar::Util;
use YAML::XS;
use File::FormatIdentification::Regex;
use Moose;

our $VERSION = '0.01';

# Preloaded methods go here.
# flattens a regex-structure to a regex-string, expects a signature-pattern and a list of regex-structures
# returns regex
#
no warnings 'recursion';

sub _flatten_rx_recursive ($$$@) {
my $regex = shift;
my $lastpos = shift;
my $open_brackets = shift;
my @rx_groups = @_;
my $rx = shift @rx_groups;

#use Data::Printer;
#say "_flatten_rx_recursive";
#p( @rx_groups );
#p( $rx );
my $bracket_symbol = "(";
if ( !defined $regex ) { confess; }

if ( !defined $rx ) { # do nothing
while ( $open_brackets > 0 ) {
$regex .= ")";
$open_brackets--;
}
}
else {
my $pos_diff = $rx->{position} - $lastpos;
my $local_regex = $rx->{regex};
if ( !defined $local_regex ) {
$local_regex = '';
}
if ( 0 == $pos_diff ) {

# TODO:
File::FormatIdentification::Regex::simplify_two_or_combined_regex(
$regex, $local_regex );
$regex =
&_flatten_rx_recursive( "$regex|$local_regex", $lastpos,
$open_brackets, @rx_groups );
}
elsif ( $pos_diff > 0 ) { # is deeper
# look a head, if same pos found, then use bracket, otherwise not
if (
(
scalar @rx_groups > 0
&& ( $rx_groups[0]->{position} == $rx->{position} )
)
|| $pos_diff > 1
)
{ # use (
$regex = &_flatten_rx_recursive(
"$regex" . ( $bracket_symbol x $pos_diff ) . $local_regex,
$rx->{position}, $open_brackets += $pos_diff, @rx_groups );
}
else {
$regex = &_flatten_rx_recursive(
"$regex$local_regex", $rx->{position},
$open_brackets, @rx_groups
);
} ## end else [ if ( scalar @rx_groups...)]
}
elsif ( $pos_diff < 0 ) { # is higher
$regex = &_flatten_rx_recursive(
"$regex)$local_regex",
$rx->{position},
$open_brackets - 1, #($rx->{position} - $lastpos),
@rx_groups
);
}
else {
confess
"FL: pos=$rx->{position} lastpos=$lastpos regex='$regex' open=$open_brackets\n";
}
}
return $regex;
} ## end sub _flatten_rx_recursive ($$$@)
use warnings 'recursion';

sub _flatten_rx ($@) {
my $regex = shift;
my @rx_groups = @_;

#say "calling flatten_rx with regex=$regex quality=$quality";
#use Data::Printer;
#p( @rx_groups );
$regex = _flatten_rx_recursive( $regex, 0, 0, @rx_groups );
return $regex;
} ## end sub _flatten_rx ($@)

# expands pattern of form "FFFB[10:EB]" to FFFB10, FFFB11, ... FFFBEB
sub _expand_pattern ($) {
my $pattern = shift;
$pattern =~ s/\[!/[^/g;
$pattern =~ s/([0-9A-F]{2}):([0-9A-F]{2})\]/$1-$2]/g;
$pattern =~ s/([0-9A-F]{2})/\\x$1/g;

# substitute hex with printable ASCII-Output
$pattern =~ s#\\x(3[0-9]|[46][1-9A-F]|[57][0-9A])#chr( hex($1) );#egs;
return $pattern;
} ## end sub _expand_pattern ($)

# expands offsets min,max to regex ".{$min,$max}" and uses workarounds if $min or $max exceeds 32766
sub _expand_offsets($$) {
my $minoffset = shift;
my $maxoffset = shift;
my $byte =
'.'; # HINT: needs the character set modifier "aa" in $foo=~m/$regex/aa
#my $byte = '[\x00-\xff]';
my $offset_expanded = "";
if ( ( ( not defined $minoffset ) || ( length($minoffset) == 0 ) )
&& ( ( not defined $maxoffset ) || ( length($maxoffset) == 0 ) ) )
{
$offset_expanded = "";
}
elsif (( defined $minoffset )
&& ( length($minoffset) > 0 )
&& ( defined $maxoffset )
&& ( length($maxoffset) > 0 )
&& ( $minoffset == $maxoffset ) )
{
if ( $minoffset > 0 ) {
my $maxloops = int( $maxoffset / 32766 );
my $maxresidual = $maxoffset % 32766;
for ( my $i = 0 ; $i < $maxloops ; $i++ ) {
$offset_expanded .= $byte . "{32766}";
}
$offset_expanded .= $byte . "{$maxresidual}";
} ## end if ( $minoffset > 0 )
}
else {

# workaround, because perl quantifier limits,
# calc How many repetitions we need! Both offsets should be less than 32766
#TODO: check if this comes from Droid or is calculated

my $mintmp = 0;
my $maxtmp = 0;
if ( defined $minoffset && ( length($minoffset) > 0 ) ) {
$mintmp = $minoffset;
}
if ( defined $maxoffset && ( length($maxoffset) > 0 ) ) {
$maxtmp = $maxoffset;
}

my $maxloops;
if ( $maxtmp >= $mintmp ) {
$maxloops = int( $maxtmp / 32766 );
}
else {
$maxloops = int( $mintmp / 32766 );
}
my $maxresidual = $maxtmp % 32766;
my $minresidual = $mintmp % 32766;

#say "\tMaxloops=$maxloops maxres = $maxresidual minres=$minresidual";
my @offsets;
my $minstr = "";
my $maxstr = "";
if ( defined $minoffset && length($minoffset) > 0 ) {
$minstr = $minresidual;
$mintmp = $mintmp - $minresidual;
}

for ( my $i = 0 ; $i <= $maxloops ; $i++ ) {

# loop, so we assure the special handling of residuals
if ( $maxtmp > $maxresidual ) {
$maxstr = 32766;
}
elsif ( $maxtmp < 0 ) {
$maxstr = 0;
}
else {
$maxstr = $maxresidual;
}
if ( $mintmp > $minresidual ) {
$minstr = 32766;
}
elsif ( $mintmp < 0 ) {
$minstr = 0;
}
else {
$minstr = $minresidual;
}
#### handle residuals
if ( $i == 0 ) {
$minstr = $minresidual;
$mintmp = $mintmp - $minresidual;
}
elsif ( $i == $maxloops ) {
$maxstr = $maxresidual;
$maxtmp = $maxtmp - $maxresidual;
}

# mark offsets
my $tmp;
$tmp->{minoffset} = $minstr;
$tmp->{maxoffset} = $maxstr;
push @offsets, $tmp;
} ## end for ( my $i = 0 ; $i <=...)
my @filtered = map {
if ( !defined $maxoffset || length($maxoffset) == 0 ) {
$_->{maxoffset} = "";
}
if ( !defined $minoffset || length($minoffset) == 0 ) {
$_->{minoffset} = "";
}
$_;
} @offsets;
foreach my $tmp (@filtered) {

# ? at the end - means non-greedy
#$offset_expanded .= $byte."{" . $tmp->{minoffset} . "," . $tmp->{maxoffset} . "}?";
$offset_expanded .=
$byte . "{" . $tmp->{minoffset} . "," . $tmp->{maxoffset} . "}";
} ## end foreach my $tmp (@filtered)
} ## end else [ if ( ( ( not defined $minoffset...)))]

#say "DEBUG: minoffset='$minoffset' maxoffset='$maxoffset' --> offset_expanded='$offset_expanded'";

# minimization steps
$offset_expanded =~ s#{0,}#*#g;
$offset_expanded =~ s#{1,}#+#g;
$offset_expanded =~ s#{0,1}#?#g;
return $offset_expanded;
} ## end sub _expand_offsets($$)

# got XPath-object and returns a regex-structure as hashref
sub _parse_fragments ($) {
my $fq = shift;
my $position = $fq->getAttribute('Position');
my $minoffset = $fq->getAttribute('MinOffset');
my $maxoffset = $fq->getAttribute('MaxOffset');
my $rx = $fq->textContent;
my $expanded = _expand_pattern($rx);
my $ret;
$ret->{position} = $position;
$ret->{direction} = "left";
$ret->{regex} = "";

my ($offset_expanded) = _expand_offsets( $minoffset, $maxoffset );

if ( $fq->localname eq "LeftFragment" ) {
$ret->{direction} = "left";
$ret->{regex} = "($expanded)$offset_expanded";
}
elsif ( $fq->localname eq "RightFragment" ) {
$ret->{direction} = "right";
$ret->{regex} = "$offset_expanded($expanded)";
}

#say "pF: rx=$rx expanded=$expanded offset=$offset_expanded";
return $ret;
} ## end sub _parse_fragments ($)

# got XPath-object and search direction and returns a regex-structure as hashref
sub _parse_subsequence ($$) {
my $ssq = shift;
my $dir = shift;
my $position = $ssq->getAttribute('Position');
my $minoffset = $ssq->getAttribute('SubSeqMinOffset');
my $maxoffset = $ssq->getAttribute('SubSeqMaxOffset');

my $rx = $ssq->getElementsByTagName('Sequence')->get_node(1)->textContent;

my @lnodes = $ssq->getElementsByTagName('LeftFragment');
my @rnodes = $ssq->getElementsByTagName('RightFragment');
my @lrx_fragments = map { _parse_fragments($_) } @lnodes;
my @rrx_fragments = map { _parse_fragments($_) } @rnodes;
my $lregex = _flatten_rx( "", @lrx_fragments );
my $rregex = _flatten_rx( "", @rrx_fragments );
my $expanded = _expand_pattern($rx);

#if ( length($minoffset) > 0
# && length($maxoffset) > 0
# && $minoffset > $maxoffset ) {
# confess(
#"parse_subsequence: Maxoffset=$maxoffset < Minoffset=$minoffset! regex= '$rx'"
# );
# } ## end if ( length($minoffset...))

my $offset_expanded = _expand_offsets( $minoffset, $maxoffset );
my $prefix;
my $suffix;
my $ret;
my $regex;
if ( !defined $dir || length($dir) == 0 ) {
$regex = join( "", $lregex, $expanded, $rregex );
}
elsif ( $dir eq "BOFoffset" ) {
$regex =
join( "", $offset_expanded, "(", $lregex, $expanded, $rregex, ")" );
}
elsif ( $dir eq "EOFoffset" ) {
$regex =
join( "", "(", $lregex, $expanded, $rregex, ")", $offset_expanded );
}
else {
warn "unknown reference '$dir' found\n";
$regex = join( "", $lregex, $expanded, $rregex );
}
$ret->{regex} =
File::FormatIdentification::Regex::peep_hole_optimizer($regex);
$ret->{position} = $position;

return $ret;
} ## end sub _parse_subsequence ($$)

# got XPath-object and returns regex-string
sub _parse_bytesequence ($) {
my $bsq = shift;

#say "rx_groups in parse_byte_sequence:";
my $reference = $bsq->getAttribute('Reference');
; # if BOFoffset -> anchored begin of file, EOFofset -> end of file
my @nodes = $bsq->getElementsByTagName('SubSequence');
my @rx_groups = map { _parse_subsequence( $_, $reference ) } @nodes;
my $expanded = "";
my $regex_flattened = _flatten_rx( $expanded, @rx_groups );

#my $ro = Regexp::Optimizer->new;
#my $ro = Regexp::Assemble->new;
#$ro->add( $regex_flattened);
#$regex_flattened = $ro->as_string($regex_flattened);
#$regex_flattened = $ro->re;
my $regex;
if ( !defined $reference || 0 == length($reference) ) {
$regex = "$regex_flattened";
}
elsif ( $reference eq "BOFoffset" ) {
$regex = "\\A$regex_flattened";
}
elsif ( $reference eq "EOFoffset" ) {
$regex = "$regex_flattened\\Z";
}
else {
warn "unknown reference '$reference' found\n";
$regex = "$regex_flattened";
}

use Regexp::Optimizer;
my $ro = Regexp::Optimizer->new;

#say "regex='$regex'";
#$regex = $ro->as_string( $regex );
return $regex;
} ## end sub _parse_bytesequence ($)

# ($%signatures, $%internal) = parse_signaturefile( $file )
sub _parse_signaturefile($) {
my $pronomfile = shift;
my %signatures;

# hash{internalid}->{regex} = $regex
# ->{signature} = $signature
my %internal_signatures;

my $dom = XML::LibXML->load_xml( location => $pronomfile );
$dom->indexElements();
my $xp = XML::LibXML::XPathContext->new($dom);
$xp->registerNs( 'droid',
'http://www.nationalarchives.gov.uk/pronom/SignatureFile' );

# find Fileformats
#my $tmp = $xp->find('/*[local-name() = "FFSignatureFile"]')->get_node(1);
#say "E:", $tmp->nodeName;
#say "EXISTS:", $xp->exists('/droid:FFSignatureFile');
#say "EXISTS2", $xp->exists('/droid:FFSignatureFile/droid:FileFormatCollection/droid:FileFormat');

my $fmts = $xp->find(
'/*[local-name() = "FFSignatureFile"]/*[local-name() = "FileFormatCollection"]/*[local-name() = "FileFormat"]'
);
foreach my $fmt ( $fmts->get_nodelist() ) {
my $id = $fmt->getAttribute('ID');
my $mimetype = $fmt->getAttribute('MIMEtype');
my $name = $fmt->getAttribute('Name');
my $puid = $fmt->getAttribute('PUID');
my $version = $fmt->getAttribute('Version');
#

##
my @extensions =
map { $_->textContent() } $fmt->getElementsByTagName('Extension');
my @internalsignatures =
map { $_->textContent() }
$fmt->getElementsByTagName('InternalSignatureID');
my @haspriorityover = map { $_->textContent() }
$fmt->getElementsByTagName('HasPriorityOverFileFormatID');
$signatures{$id}->{mimetype} = $mimetype;
$signatures{$id}->{name} = $name;
$signatures{$id}->{puid} = $puid;
$signatures{$id}->{version} = $version; # optional
$signatures{$id}->{extensions} = \@extensions;
$signatures{$id}->{internal_signatures} = \@internalsignatures;

foreach my $prio (@haspriorityover) {
$signatures{$id}->{priorityover}->{$prio} = 1;
}

foreach my $internal (@internalsignatures) {
$internal_signatures{$internal}->{signature} = $id;
}
} ## end foreach my $fmt ( $fmts->get_nodelist...)

# find InternalSignatures
my $sigs =
$xp->find(
'/*[local-name() = "FFSignatureFile"]/*[local-name() = "InternalSignatureCollection"]/*[local-name() = "InternalSignature"]'
);

foreach my $sig ( $sigs->get_nodelist() ) {

my $id = $sig->getAttribute('ID');
my $specificity = $sig->getAttribute('Specificity');
$internal_signatures{$id}->{specificity} = $specificity;

#p( $sig->toString() );
my @nodes = $sig->getElementsByTagName('ByteSequence');

#p( @nodes );
my @rx_groups = map { _parse_bytesequence($_) } @nodes;
my @rx_quality =
map { File::FormatIdentification::Regex::calc_quality($_); }
@rx_groups;

$internal_signatures{$id}->{regex} = \@rx_groups;
$internal_signatures{$id}->{quality} = \@rx_quality;
} ## end foreach my $sig ( $sigs->get_nodelist...)

return ( \%signatures, \%internal_signatures );
} ## end sub _parse_signaturefile($)

sub uniq_signature_ids_by_priority {
my $self = shift;
my @signatures = @_;
my %found_signature_ids;

# which PUIDs are in list?
foreach my $signatureid (@signatures) {
if ( defined $signatureid ) {
$found_signature_ids{$signatureid} = 1;
}
}

# remove all signatures when actual signature has priority over
foreach my $signatureid ( keys %found_signature_ids ) {
foreach my $priority_over_sid (
keys %{ $self->{signatures}->{$signatureid}->{priorityover} } )
{
if ( exists $found_signature_ids{$priority_over_sid} ) {
delete $found_signature_ids{$priority_over_sid};
}
} ## end foreach my $priority_over_sid...
} ## end foreach my $signatureid ( keys...)

# reduce list to all signatures with correct priority
my @result =
grep { defined $found_signature_ids{ $_->{signature} } } @signatures;
return @result;
} ## end sub uniq_signature_ids_by_priority

has 'droid_signature_filename' => (
is => 'ro',
required => 1,
reader => 'get_droid_signature_filename',
trigger => sub {
my $self = shift;

#say "TRIGGER";
my $yaml_file = $self->get_droid_signature_filename() . ".yaml";
if ( $self->{auto_load} && -e $yaml_file ) {
$self->load_from_yamlfile($yaml_file);
}
else {
my ( $signatures, $internal_signatures ) =
_parse_signaturefile( $self->{droid_signature_filename} );
$self->{signatures} = $signatures;
$self->{internal_signatures} = $internal_signatures;

#die;
if ( $self->{auto_store} ) {
$self->save_as_yamlfile($yaml_file);
}
} ## end else [ if ( $self->{auto_load...})]
foreach my $s ( keys %{ $self->{signatures} } ) {
my $puid = $self->{signatures}->{$s}->{puid};
if ( defined $puid && length($puid) > 0 ) {
$self->{puids}->{$puid} = $s;
}
}
}
);

sub save_as_yamlfile {
my $self = shift;
my $filename = shift;
my @res;
push @res, $self->{signatures};
push @res, $self->{internal_signatures};
YAML::XS::DumpFile( "$filename", @res );
} ## end sub save_as_yamlfile

sub load_from_yamlfile {
my $self = shift;
my $filename = shift;
my ( $sig, $int ) = YAML::XS::LoadFile($filename);
$self->{signatures} = $sig;
$self->{internal_signatures} = $int;
} ## end sub load_from_yamlfile

has 'auto_store' => (
is => 'ro',
default => 1,
);

has 'auto_load' => (
is => 'ro',
default => 1,
);

sub get_all_signature_ids {
my $self = shift;
my @sigs = sort { $a <=> $b } keys %{ $self->{signatures} };
return @sigs;
}

sub get_signature_id_by_puid {
my $self = shift;
my $puid = shift;
my $sig = $self->{puids}->{$puid};
no warnings;
return $sig;
use warnings;
}

sub get_internal_ids_by_puid {
my $self = shift;
my $puid = shift;
my $sig = $self->get_signature_id_by_puid($puid);
my @ids = ();
if ( defined $sig ) {
@ids = grep { defined $_ }
@{ $self->{signatures}->{$sig}->{internal_signatures} };
}
return @ids;
}

sub get_file_endings_by_puid {
my $self = shift;
my $puid = shift;
my $sig = $self->get_signature_id_by_puid($puid);
my @endings = ();
if ( defined $sig ) {
@endings = $self->{signatures}->{$sig}->{extensions};
}
return @endings;
}

sub get_all_internal_ids {
my $self = shift;
my @ids = sort { $a <=> $b } keys %{ $self->{internal_signatures} };
foreach my $id (@ids) {
if ( !defined $id ) { confess("$id not defined") }
}
return @ids;
}

sub get_all_puids {
my $self = shift;
my @ids =
sort grep { defined $_ }
map { $self->{signatures}->{$_}->{puid}; }
grep { defined $_ } $self->get_all_signature_ids();
return @ids;
}

sub get_regular_expressions_by_internal_id {
my $self = shift;
my $internalid = shift;
if ( !defined $internalid ) { confess("internalid must exists!"); }
my @rx = @{ $self->{internal_signatures}->{$internalid}->{regex} };
return @rx;
}

sub get_all_regular_expressions {
my $self = shift;
my @ids = $self->get_all_internal_ids();
my @regexes = ();
foreach my $id (@ids) {
my @rx = $self->get_regular_expressions_by_internal_id($id);
push @regexes, @rx;
}
my @ret = sort @regexes;
return @ret;
}

sub get_qualities_by_internal_id {
my $self = shift;
my $internalid = shift;
if ( !defined $internalid ) { confess("internalid must exists!"); }
my $value = $self->{internal_signatures}->{$internalid}->{quality};
if ( defined $value ) {
return @{$value};
}
return;
}

sub get_signature_id_by_internal_id {
my $self = shift;
my $internalid = shift;
if ( !defined $internalid ) { confess("internalid must exists!"); }
return $self->{internal_signatures}->{$internalid}->{signature};
}

sub get_name_by_signature_id {
my $self = shift;
my $signature = shift;
return $self->{signatures}->{$signature}->{name};
}

sub get_puid_by_signature_id {
my $self = shift;
my $signature = shift;
return $self->{signatures}->{$signature}->{puid};
}

sub get_puid_by_internal_id {
my $self = shift;
my $internalid = shift;
if ( !defined $internalid ) { confess("internalid must exists!"); }
my $signature = $self->get_signature_id_by_internal_id($internalid);
return $self->get_puid_by_signature_id($signature);
}

sub get_quality_sorted_internal_ids {
my $self = shift;
my @ids = sort {

# sort by regexes
my @a_rxq = @{ $self->{internal_signatures}->{$a}->{quality} };
my @b_rxq = @{ $self->{internal_signatures}->{$b}->{quality} };
my $aq = 0;
foreach my $as (@a_rxq) { $aq += $as; }
my $bq = 0;
foreach my $bs (@b_rxq) { $bq += $bs; }

#use Data::Printer;
#p( $a );
#p( $aq );
$aq <=> $bq;
} $self->get_all_internal_ids();
return @ids;
}

sub get_combined_regex_by_puid {
my $self = shift;
my $puid = shift;
my @internals = $self->get_internal_ids_by_puid($puid);

#use Data::Printer;
#p( $puid );
#p( @internals );
my @regexes = map {
my @regexes_per_internal =
$self->get_regular_expressions_by_internal_id($_);
my $combined =
File::FormatIdentification::Regex::and_combine(@regexes_per_internal);

#p( $combined );
$combined;
} @internals;
my $result = File::FormatIdentification::Regex::or_combine(@regexes);

#p( $result );
return $result;
}

sub _prepare_statistics {
my $self = shift;
my $results;

# count of PUIDs
# count of internal ids (IDs per PUID)
# count of regexes
# count of file endings only
# count of internal ids without PUID
# larges and shortest regex
# complex and simple regex
# common regexes
#say "stat";
my @puids = $self->get_all_puids();
my $puids = scalar(@puids);
my @internals = $self->get_all_internal_ids();
my $internals = scalar(@internals);
my $regexes = 0;
my $fileendingsonly = 0;
my @fileendingsonly = ();
my $fileendings = 0;
my $int_per_puid = 0;
my $internal_without_puid = 0;
my @internal_without_puid = ();
my @quality_sorted_internal_ids = $self->get_quality_sorted_internal_ids();
my %uniq_regexes;

foreach my $internalid (@internals) {
my @regexes =
$self->get_regular_expressions_by_internal_id($internalid);
foreach my $rx (@regexes) {
my @tmp = ();
if ( exists $uniq_regexes{$rx} ) {
@tmp = @{ $uniq_regexes{$rx} };
}
push @tmp, $internalid;
$uniq_regexes{$rx} = \@tmp;
}

$regexes += scalar(@regexes);
my $sigid = $self->get_signature_id_by_internal_id($internalid);
if ( !defined $sigid ) {
$internal_without_puid++;
push @internal_without_puid, $internalid;
}
}
foreach my $puid (@puids) {
my @ints = $self->get_internal_ids_by_puid($puid);
my @fileendings = $self->get_file_endings_by_puid($puid);
if ( 0 == scalar(@ints) ) {
$fileendingsonly++;
push @fileendingsonly, $puid;
}
else {
$fileendings += scalar(@fileendings);
$int_per_puid += scalar(@ints);
}
}
foreach my $i (@quality_sorted_internal_ids) {
my $regex =
join( "#", $self->get_regular_expressions_by_internal_id($i) );
my $quality = join( " ", $self->get_qualities_by_internal_id($i) );

}

$results->{filename} = $self->get_droid_signature_filename();
$results->{count_of_puids} = $puids;
$results->{count_of_internal_ids} = $internals;
$results->{count_of_regular_expressions} = $regexes;
$results->{count_of_fileendings} = $fileendings;
$results->{count_of_puid_with_fileendings_only} = $fileendingsonly;
$results->{puids_with_fileendings_only} = \@fileendingsonly;
$results->{count_of_orphaned_internal_ids} = $internal_without_puid;
$results->{internal_ids_without_puids} = \@internal_without_puid;
no warnings;

for ( my $i = 0 ; $i <= 4 ; $i++ ) {
my $best_quality_internal = pop @quality_sorted_internal_ids;
if ( defined $best_quality_internal ) {
my $best_quality = join( ";",
$self->get_qualities_by_internal_id($best_quality_internal) );
my $best_puid =
$self->get_puid_by_internal_id($best_quality_internal);
my $best_name =
$self->get_name_by_signature_id(
$self->get_signature_id_by_internal_id($best_quality_internal)
);
my $best_regex = $self->get_combined_regex_by_puid($best_puid);
$results->{nth_best_quality}->[$i]->{internal_id} =
$best_quality_internal;
$results->{nth_best_quality}->[$i]->{puid} = $best_puid;
$results->{nth_best_quality}->[$i]->{name} = $best_name;
$results->{nth_best_quality}->[$i]->{quality} = $best_quality;
$results->{nth_best_quality}->[$i]->{combined_regex} = $best_regex;
}
}
for ( my $i = 0 ; $i <= 4 ; $i++ ) {
my $worst_quality_internal = shift @quality_sorted_internal_ids;
if ( defined $worst_quality_internal ) {
my $worst_quality = join( ";",
$self->get_qualities_by_internal_id($worst_quality_internal) );
my $worst_puid =
$self->get_puid_by_internal_id($worst_quality_internal);
my $worst_name =
$self->get_name_by_signature_id(
$self->get_signature_id_by_internal_id($worst_quality_internal)
);
my $worst_regex = $self->get_combined_regex_by_puid($worst_puid);
$results->{nth_worst_quality}->[$i]->{internal_id} =
$worst_quality_internal;
$results->{nth_worst_quality}->[$i]->{puid} = $worst_puid;
$results->{nth_worst_quality}->[$i]->{name} = $worst_name;
$results->{nth_worst_quality}->[$i]->{quality} = $worst_quality;
$results->{nth_worst_quality}->[$i]->{combined_regex} =
$worst_regex;
}
}
my @multiple_used_regex = grep {
my $tmp = $uniq_regexes{$_};
my @tmp = @{$tmp};
scalar(@tmp) > 1
} sort keys %uniq_regexes;
$results->{count_of_multiple_used_regex} = scalar(@multiple_used_regex);
for ( my $i = 0 ; $i <= $#multiple_used_regex ; $i++ ) {
$results->{multiple_used_regex}->[$i]->{regex} =
$multiple_used_regex[$i];
my @ids = join( ",", @{ $uniq_regexes{ $multiple_used_regex[$i] } } );
$results->{multiple_used_regex}->[$i]->{internal_ids} = \@ids;
}
return $results;
}

sub print_csv_statistics {
my $self = shift;
my $results = $self->_prepare_statistics();
my $version = $results->{filename};
$version =~ s/DROID_SignatureFile_V(\d+)\.xml/$1/;
$results->{version} = $version;
$results->{best_quality_puid} = $results->{nth_best_quality}->[0]->{puid};
$results->{best_quality_internal_id} =
$results->{nth_best_quality}->[0]->{internal_id};
$results->{best_quality_quality} =
$results->{nth_best_quality}->[0]->{quality};
$results->{best_quality_combined_regex} =
$results->{nth_best_quality}->[0]->{combined_regex};
$results->{worst_quality_puid} = $results->{nth_worst_quality}->[0]->{puid};
$results->{worst_quality_internal_id} =
$results->{nth_worst_quality}->[0]->{internal_id};
$results->{worst_quality_quality} =
$results->{nth_worst_quality}->[0]->{quality};
$results->{worst_quality_combined_regex} =
$results->{nth_worst_quality}->[0]->{combined_regex};

my @headers =
qw(version filename count_of_puids count_of_internal_ids count_of_regular_expressions count_of_fileendings count_of_puid_with_fileendings_only count_of_orphaned_internal_ids count_of_multiple_used_regex best_quality_puid best_quality_internal_id best_quality_quality best_quality_combined_regex worst_quality_puid worst_quality_internal_id worst_quality_quality worst_quality_combined_regex);
say "#", join( ",", @headers );
say join(
",",
map {
my $result = $results->{$_};
if ( !defined $result ) { $result = ""; }
$result;
} @headers
);
}

sub print_statistics {
my $self = shift;
my $verbose = shift;
my $results = $self->_prepare_statistics();

say "Statistics of file $results->{filename}";
say "=======================================";
say "";
say "Countings";
say "---------------------------------------";
say "Count of PUIDs: $results->{count_of_puids}";
say
" internal IDs: $results->{count_of_internal_ids}";
say
" regular expressions: $results->{count_of_regular_expressions}";
say
" file endings: $results->{count_of_fileendings}";
say
" PUIDs with file endings only: $results->{count_of_puid_with_fileendings_only}";

if ( defined $verbose ) {
say " (",
join( ",", @{ $results->{internal_ids_without_puids} } ), ")";
}
say
" orphaned internal IDs: $results->{count_of_orphaned_internal_ids}";
if ( defined $verbose ) {
say " (",
join( ",", @{ $results->{internal_ids_without_puids} } ), ")";
}
say "";
say "Quality of internal IDs";
say "---------------------------------------";

my $nth = 1;
foreach my $n ( @{ $results->{nth_best_quality} } ) {
say
"$nth-best quality internal ID (PUID, name): $n->{internal_id} ($n->{puid}, $n->{name}) -> $n->{quality}";
if ( defined $verbose ) {
say " combined regex: ", $n->{combined_regex};
}
$nth++;
}
say "";
$nth = 1;
foreach my $n ( @{ $results->{nth_worst_quality} } ) {
say
"$nth-worst quality internal ID (PUID, name): $n->{internal_id} ($n->{puid}, $n->{name}) -> $n->{quality}";
if ( defined $verbose ) {
say " combined regex: ", $n->{combined_regex};
}
$nth++;
}
say "";

say "";
say "Regular expressions";
say "---------------------------------------";
say
"Count of multiple used regular expressions: $results->{count_of_multiple_used_regex}";
if ( defined $verbose ) {
for ( my $i = 0 ; $i < $results->{count_of_multiple_used_regex} ; $i++ )
{
say " common regex group no $i:";
say " regex='"
. $results->{multiple_used_regex}->[$i]->{regex} . "'";
say " internal IDs: ",
join( ",",
@{ $results->{multiple_used_regex}->[$i]->{internal_ids} } );
}
}
say "";

#my @rx = $self->get_all_regular_expressions();
#use Data::Printer;
#p( %uniq_regexes );
}

1;

no Moose;
__PACKAGE__->meta->make_immutable;

__END__
# Below is stub documentation for your module. You'd better edit it!

=head1 NAME

File::FormatIdentification::Pronom - Perl extension for parsing Pronom-Signatures using DROID-Signature file

=head1 SYNOPSIS

use File::FormatIdentification::Pronom;
my $pronomfile = "Droid-Signature.xml";
my ( $signatures, $internals ) = parse_signaturefile($pronomfile);


=head1 DESCRIPTION

Stub documentation for File::FormatIdentification::Pronom, created by h2xs. It looks like the
author of the extension was negligent enough to leave the stub
unedited.

Blah blah blah.

=head2 EXPORT

None by default.



=head1 SEE ALSO

Mention other useful documentation such as the documentation of
related modules or operating system documentation (such as man pages
in UNIX), or any relevant external documentation such as RFCs or
standards.

If you have a mailing list set up for your module, mention it here.

If you have a web site set up for your module, mention it here.

=head1 AUTHOR

art1, E<lt>art1@E<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2018 by art1

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.24.1 or,
at your option, any later version of Perl 5 you may have available.


=cut

+ 251
- 0
lib/File/FormatIdentification/Regex.pm View File

@@ -0,0 +1,251 @@
package File::FormatIdentification::Regex;

use 5.024001;
use strict;
use warnings;
use diagnostics;
use String::LCSS;

use Regexp::Assemble;
use Regexp::Optimizer;
use Carp;
use Exporter 'import'; # gives you Exporter's import() method directly
our @EXPORT =
qw(and_combine or_combine calc_quality simplify_two_or_combined_regex peep_hole_optimizer )
; # symbols to export on request
our @EXPORT_OK = qw( hex_replace_from_bracket hex_replace_to_bracket );

our $VERSION = '0.01';


sub and_combine (@) {
my @rx_groups = map {
my $rx = $_;
my $rxfill = "";
my $ret = '';
if ( $rx =~ m#^\^$# ) { $ret = $rx; }
elsif ( $rx =~ m#^\$$# ) { $ret = $rx; }
else {
if ( $rx =~ m#\$$# ) {
$rxfill = ".*";
}
$ret = "(?=$rxfill$rx)";
}
$ret;
} @_;
my $combined = join( "", @rx_groups );

#my $rx = Regexp::Assemble->new;
#$rx->add( $combined );
#return $rx->as_string;
#my $o = Regexp::Optimizer->new;
#my $rcomb = qr/$combined/;
#return $o->as_string($rcomb);
return $combined;
}

sub or_combine (@) {
my $ro = Regexp::Assemble->new;
foreach my $rx (@_) {
$ro->add($rx);
}
return $ro->as_string;
}

sub simplify_two_or_combined_regex($$) {
my $rx1 = shift;
my $rx2 = shift;
my $common = "";
if (
# ($rx1 =~ m#\(([A-Za-z0-9]*)|(\\x[0-9A-F]{2})*\)#) &&
# ($rx2 =~ m#\(([A-Za-z0-9]*)|(\\x[0-9A-F]{2})*\)#)
( $rx1 =~ m#\(([A-Za-z0-9]*)\)# )
&& ( $rx2 =~ m#\(([A-Za-z0-9]*)\)# )

)
{
# only left simplify supported yet
my $common = String::LCSS::lcss( $rx1, $rx2 );

#say "";
#say "Found common='$common' of rx1='$rx1' rx2='$rx2'";
#say "";
}
return $common;
}

sub hex_replace_to_bracket {
my $regex = shift;
$regex =~ s#\\x([0-9A-F]{2})#\\x{$1}#g;
return $regex;
}

sub hex_replace_from_bracket {
my $regex = shift;
$regex =~ s#\\x\{([0-9A-F]{2})\}#\\x$1#g;
return $regex;
}

sub peep_hole_optimizer ($) {
my $regex = shift
; # only works if special Regexes within File::FormatIdentification:: used
$regex = hex_replace_to_bracket($regex);
my $oldregex = $regex;
##### first optimize bracket-groups
my $subrg =
qr#(?:[A-Za-z0-9])|(?:\\x\{[0-9A-F]{2}\})#; # matches: \x00-\xff or text
#my $subrg = qr#(?:\($subra\))#;
my $subre = qr#(?:\($subrg(?:\|$subrg)+\))|(?:$subrg)#
; # matches (…|…) or (…|…|…) ...
#$regex =~ s#\(\(($subra*)\)\)(?!\|)#(\1\)#g; # matches ((…))
$regex =~ s#\(\(($subre+)\)\)#($1)#g;
$regex =~ s#\(\((\([^)|]*\)(\|\([^)|]*\))+)\)\)#($1)#g;
##### optimize common subsequences
##### part1, combine bar|baz -> ba(r|z)
#say "BEFORE: regex=$regex";
while ($regex =~ m#\(($subrg*)\)\|\(($subrg*)\)#
|| $regex =~ m#($subrg*)\|($subrg*)# )
{
my $rx1 = $1;
my $rx2 = $2;

#say "common subseq: $regex -> rx1=$rx1 rx2=$rx2";

my $common = String::LCSS::lcss( $rx1, $rx2 );
if ( !defined $common || length($common) == 0 ) { last; }
if ( $common !~ m#^$subrg+$# ) { last; }

#say "!ok: $regex -> common=$common";

# common prefix
if ( $rx1 =~ m#^(.*)$common$# && $rx2 =~ m#^(.*)$common$# ) {

#say "suffix found";
$rx1 =~ m#^(.*)$common$#;
my $rx1_prefix = $1;
$rx2 =~ m#^(.*)$common$#;
my $rx2_prefix = $1;
my $subst = "($rx1_prefix|$rx2_prefix)$common";
if ( $regex =~ m#\(($subrg*)\)\|\(($subrg*)\)# ) {
$regex =~ s#\($subrg*\)\|\($subrg*\)#$subst#g;
}
elsif ( $regex =~ m#($subrg*)\|($subrg*)# ) {
$regex =~ s#$subrg*\|$subrg*#$subst#g;
}
}

# common suffix
elsif ( $rx1 =~ m#^$common(.*)$# && $rx2 =~ m#^$common(.*)$# ) {

#say "prefix found";
$rx1 =~ m#^$common(.*)$#;
my $rx1_suffix = $1;
$rx2 =~ m#^$common(.*)$#;
my $rx2_suffix = $1;
my $subst = "$common($rx1_suffix|$rx2_suffix)";

#say "subst=$subst";
if ( $regex =~ m#\(($subrg*)\)\|\(($subrg*)\)# ) {
$regex =~ s#\($subrg*\)\|\($subrg*\)#$subst#g;
}
elsif ( $regex =~ m#($subrg*)\|($subrg*)# ) {
$regex =~ s#$subrg*\|$subrg*#$subst#g;
}

#say "regex=$regex";
}
else {
last;
}
}
##### part2, combine barbara -> (bar){2}a
while ( $regex =~ m#($subrg{3,}?)(\1+)(?!$subrg*\})# ) {
my $sub = $1;
if ( $sub =~ m#^($subrg)\1+$# ) {
last;
}
my $l1 = length($1);
my $l2 = length($2);
my $matches = 1 + ( $l2 / $l1 );

#say "Found1 in regex='$regex' sub='$sub' with \$2=$2 l1=$l1 l2=$l2 matches=$matches";

if ( $sub =~ m#^$subrg$# ) {
$regex =~ s#($subrg{3,}?)\1+(?!$subrg*\})#$sub\{$matches\}#;
}
else {
$regex =~ s#($subrg{3,}?)\1+(?!$subrg*\})#($sub)\{$matches\}#;
}
}
##### part2, combine toooor -> to{4}r
while ( $regex =~ m#($subrg)(\1{3,})(?!$subrg*\})# ) {
my $sub = $1;
my $l1 = length($1);
my $l2 = length($2);
my $matches = 1 + ( $l2 / $l1 );

#say "Found2 in regex='$regex' sub='$sub' with \$2=$2 l1=$l1 l2=$l2 matches=$matches";

if ( $sub =~ m#^$subrg$# ) {
$regex =~ s#($subrg)\1{3,}(?!$subrg*\})#$sub\{$matches\}#;
}
else {
$regex =~ s#($subrg)\1{3,}(?!$subrg*\})#($sub)\{$matches\}#;
}
}
##### part2, combine foooo -> fo{4}
#while ($regex =~ m#($subrg)\1{3,}(?!$subrg*\})#) {
# my $sub = $1;
# my $matches = $#+; $matches++;
# say "Found in regex='$regex' sub='$sub' with matches=$matches";
# $regex =~ s#($subrg)\1{3,}(?!$subrg*\}#$sub\{$matches\}#;
#}
#### restore \x{ff} to \xff
$regex = hex_replace_from_bracket($regex);
if ( $regex =~ m#\\x0\{# ) {
confess "wrong substitution of oldregex = \n\t'", $oldregex,
"'\n -> \n\t'", $regex, "'";
}
return $regex;
}

sub calc_quality ($) {
my $regex = shift;

# replace all \xff with #
# replace all . with ( | | | | )
# replace all [abc] with (a|b|c)
# replace all [^abc] with (d|e|f|..|)
# then $len = count of # and $or = count of |
# divide it with $len / (1+$or)
my $len = 0;
my $alt = 0;
while ( $regex =~ s/\\x[0-9a-f]{2}// ) {
$len++;
}
while ( $regex =~ s/\[\^(.*?)\]// ) {
$alt += ( 256 - length($1) );
$len++;
}
while ( $regex =~ s/\[(.*?)\]// ) {
$alt += length($1);
$len++;
}
while ( $regex =~ s/\.// ) {
$alt += 256;
$len++;
}
while ( $regex =~ s/[A-Za-z0-9 ]// ) {
$len++;
}
my $tmp = $len / ( 1 + $alt );

my $quality = ( $tmp == 0 ) ? 0 : int( 1000 * log($tmp) ) / 1000;

#say "rest: $regex len=$len alt=$alt quality=$quality ($tmp)";
return $quality;
}

# see https://stackoverflow.com/questions/869809/combine-regexp#870506

1;

+ 42544
- 0
t/DROID_SignatureFile_V93.xml
File diff suppressed because it is too large
View File


+ 49
- 0
t/File/FormatIdentification/Pronom.t View File

@@ -0,0 +1,49 @@
#!/usr/bin/perl -w
use strict;
use warnings;
use diagnostics;

use Test::More tests => 9;
use Test::Exception;
use Test::File;
use Path::Tiny;
my $sigfile = path("t/DROID_SignatureFile_V93.xml");
my $ymlfile = path("t/DROID_SignatureFile_V93.xml.yaml");

### tests
BEGIN { use_ok("File::FormatIdentification::Pronom"); }
new_ok(
"File::FormatIdentification::Pronom" =>
[ { "droid_signature_filename" => $sigfile->absolute } ],
"object 1"
);
file_exists_ok( $ymlfile->absolute, "'object 1' has auto stored file" );
$ymlfile->remove;
new_ok(
"File::FormatIdentification::Pronom" => [
{
"droid_signature_filename" => $sigfile->absolute,
"auto_store" => 0
}
],
"object 2"
);
file_not_exists_ok( $ymlfile->absolute, "'object 2' has no auto stored file" );
my $obj3a = new_ok(
"File::FormatIdentification::Pronom" => [
{
"droid_signature_filename" => $sigfile->absolute,
"auto_store" => 1
}
],
"object 3a"
);
file_exists_ok( $ymlfile->absolute, "'object 3a' has auto stored file" );
my $obj3b = new_ok(
"File::FormatIdentification::Pronom" =>
[ { "droid_signature_filename" => $sigfile->absolute } ],
"object 3b'"
);
$ymlfile->remove;
is_deeply( $obj3b, $obj3a, "ensure 'obj3a' equals 'obj3b'" );
1;

+ 111
- 0
t/File/FormatIdentification/Regex.t View File

@@ -0,0 +1,111 @@
#!/usr/bin/perl -w
use strict;
use warnings;
use diagnostics;

use Test::More tests => 45;
use Test::Exception;
### tests
BEGIN { use_ok("File::FormatIdentification::Regex"); }
is( and_combine( '^', '$' ), '^$', 'and_combine(\'^\', \'$\')' );

# example from https://stackoverflow.com/questions/869809/combine-regexp#870506
is( and_combine( '^abc', 'xyz$' ),
'(?=^abc)(?=.*xyz$)', 'and_combine(\'^abc\', \'xyz$\')' );

# unsure if this will be correct:
# is(and_combine('abc', '.b.'), 'abc', "and_combine('abc', '.b.')");
# using this instead:
is( and_combine( 'abc', '.b.' ), '(?=abc)(?=.b.)',
"and_combine('abc', '.b.')" );

# usure if we should detect this:
# throws_ok( sub{and_combine('abc', 'xyz')}, qr(not combineable), "and_combine('abc', 'xyz') does not work");
# better to use this:
is( and_combine( 'foo', 'bar' ), "(?=foo)(?=bar)",
"and_combine('foo', 'bar')" );
is(
and_combine( 'foo', 'bar', 'baz' ),
"(?=foo)(?=bar)(?=baz)",
"and_combine('foo', 'bar', 'baz')"
);

# because Regex::Assemble changes order, following does not work:
# (or_combine('foo', 'bar'), '(?:foo|bar)', "or_combine('foo', 'bar')");
# using this instead:
is( or_combine( 'foo', 'bar' ), '(?:bar|foo)', "or_combine('foo', 'bar')" );
is( or_combine( 'foo', 'bar', 'baz' ),
'(?:ba[rz]|foo)', "or_combine('foo', 'bar', 'baz')" );
###
use File::FormatIdentification::Regex
qw( hex_replace_from_bracket hex_replace_to_bracket );
is(
hex_replace_to_bracket('\x00\x00\x00\x00\x00'),
'\x{00}\x{00}\x{00}\x{00}\x{00}',
'hex_replace_to_bracket(\'\x00\x00\x00\x00\x00\')'
);
is( hex_replace_from_bracket('\x{00}\x{00}\x{00}\x{00}\x{00}'),
'\x00\x00\x00\x00\x00',
'hex_replace_from_bracket(\'\x{00}\x{00}\x{00}\x{00}\x{00}\')' );
###
is( peep_hole_optimizer("foo"), "foo", "peep_hole_optimizer('foo')" );
is( peep_hole_optimizer("^foo"), "^foo", "peep_hole_optimizer('^foo')" );
is( peep_hole_optimizer("^(foo)"), "^(foo)", "peep_hole_optimizer('^(foo)')" );
is( peep_hole_optimizer("^((foo))"),
"^(foo)", "peep_hole_optimizer('^((foo))')" );
is( peep_hole_optimizer("^((foo)|(bar))"),
"^((foo)|(bar))", "peep_hole_optimizer('^((foo)|(bar))')" );
is( peep_hole_optimizer("^(((foo)|(bar)))"),
"^((foo)|(bar))", "peep_hole_optimizer('^(((foo)|(bar)))')" );
is( peep_hole_optimizer("^(((foo))|(bar))"),
"^((foo)|(bar))", "peep_hole_optimizer('^(((foo))|(bar))')" );
is( peep_hole_optimizer("^((foo)|((bar)))"),
"^((foo)|(bar))", "peep_hole_optimizer('^((foo)|((bar)))')" );
is( peep_hole_optimizer("(bar|baz)"),
"(ba(r|z))", "peep_hole_optimizer('(bar|baz)')" );
is( peep_hole_optimizer('(\x42|\x43)'),
'(\x42|\x43)', 'peep_hole_optimizer(\'(\x42|\x43)\')' );
is( peep_hole_optimizer('(\x34|\x44)'),
'(\x34|\x44)', 'peep_hole_optimizer(\'(\x34|\x44)\')' );
is( peep_hole_optimizer('(\x344|\x444)'),
'(\x344|\x444)', 'peep_hole_optimizer(\'(\x344|\x444)\')' );
is( peep_hole_optimizer("((bar)|(baz))"),
"(ba(r|z))", "peep_hole_optimizer('((bar)|(baz))')" );
is( peep_hole_optimizer("(barf|bazaar)"),
"(ba(rf|zaar))", "peep_hole_optimizer('(barf|bazaar)')" );
is( peep_hole_optimizer("(raf|saf)"),
"((r|s)af)", "peep_hole_optimizer('(raf|saf)')" );
is( peep_hole_optimizer("(braf|asaf)"),
"((br|as)af)", "peep_hole_optimizer('(braf|asaf)')" );
is( peep_hole_optimizer("(rag|saf)"),
"(rag|saf)", "peep_hole_optimizer('(rag|saf)')" );
is( peep_hole_optimizer("barbara"),
"(bar){2}a", "peep_hole_optimizer('barbara')" );
is( peep_hole_optimizer("toooor"), "to{4}r", "peep_hole_optimizer('toooor')" );
is( peep_hole_optimizer("toooooooooooor"),
"to{12}r", "peep_hole_optimizer('toooooooooor')" );
is( peep_hole_optimizer('\x00\x00\x00\x00\x00'),
'\x00{5}', 'peep_hole_optimizer(\'\x00\x00\x00\x00\x00\')' );
is(
peep_hole_optimizer(
'\A(\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xC2)'),
'\A(\x00{12}\xC2)',
'peep_hole_optimizer(\'\A(\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xC2)\')'
);
is( peep_hole_optimizer('\x0000000007006\x20'),
'\x000{7}7006\x20', 'peep_hole_optimizer(\'\x0000000007006\x20\')' );
is( peep_hole_optimizer("rhabarbarbarabarbara"),
"rha(bar){3}a(bar){2}a", "peep_hole_optimizer('rhabarbarbarabarbara')" );
is( peep_hole_optimizer("a{100000}"),
"a{100000}", "peep_hole_optimizer('a{100000}')" );
###
is( calc_quality('^'), 0, "calc_quality('^')" );
is( calc_quality('foo'), 1.098, "calc_quality('foo')" );
is( calc_quality('fo{2}'), 1.098, "calc_quality('fo{2}'" );
is( calc_quality('fo{2,}'), 1.098, "calc_quality('fo{2,}'" );
is( calc_quality('^foo'), 1.098, "calc_quality('^foo')" );
is( calc_quality('[fo]o'), -0.405, "calc_quality('[fo]o')" );
is( calc_quality('[^fo]o'), -4.848, "calc_quality('[^fo]o')" );
is( calc_quality('.o'), -4.855, "calc_quality('.o')" );
is( calc_quality('foobarbaz'), 2.197, "calc_quality('foobarbaz')" );
is( calc_quality('.........'), -5.545, "calc_quality('.........')" );

Loading…
Cancel
Save