Browse Source

- added comments

- bugfix in peep_hole_optimizer(): \x00 should not be used; \x{00} is preferred to avoid misinterpretation
master
Andreas Romeyke 2 months ago
parent
commit
49fd167e2f
2 changed files with 11 additions and 11 deletions
  1. +3
    -3
      lib/File/FormatIdentification/Pronom.pm
  2. +8
    -8
      lib/File/FormatIdentification/Regex.pm

+ 3
- 3
lib/File/FormatIdentification/Pronom.pm View File

@@ -12,7 +12,7 @@ use YAML::XS;
use File::FormatIdentification::Regex;
use Moose;

our $VERSION = '0.01';
our $VERSION = '0.02';

# Preloaded methods go here.
# flattens a regex-structure to a regex-string, expects a signature-pattern and a list of regex-structures
@@ -109,10 +109,10 @@ sub _expand_pattern ($) {
my $pattern = $_[0];
$pattern =~ s/(?<=\[)!/^/g;
$pattern =~ s/(?<=[0-9A-F]{2}):(?=[0-9A-F]{2})\]/-]/g;
$pattern =~ s/(?=[0-9A-F]{2})/\\x/g;
$pattern =~ s/([0-9A-F]{2})/\\x{$1}/g;

# substitute hex with printable ASCII-Output
$pattern =~ s#\\x(3[0-9]|[46][1-9A-F]|[57][0-9A])#chr( hex($1) );#egs;
$pattern =~ s#\\x\{(3[0-9]|[46][1-9A-F]|[57][0-9A])\}#chr( hex($1) );#egs;
return $pattern;
} ## end sub _expand_pattern ($)



+ 8
- 8
lib/File/FormatIdentification/Regex.pm View File

@@ -1,5 +1,5 @@
package File::FormatIdentification::Regex;
# helper module to combine and optimize regexes
use 5.024001;
use strict;
use warnings;
@@ -87,7 +87,11 @@ sub hex_replace_from_bracket {

sub peep_hole_optimizer ($) {
my $regex = $_[0]; # only works if special Regexes within File::FormatIdentification:: used
$regex = hex_replace_to_bracket($regex);

#$regex = hex_replace_to_bracket($regex);
if ($regex =~ m/\\x[0-9]+/) {
confess "regex '$regex' has invalid \\x sequences, use \\x{} instead!";
}
my $oldregex = $regex;
##### first optimize bracket-groups
my $subrg =
@@ -198,15 +202,11 @@ sub peep_hole_optimizer ($) {
# say "Found in regex='$regex' sub='$sub' with matches=$matches";
# $regex =~ s#($subrg)\1{3,}(?!$subrg*\}#$sub\{$matches\}#;
#}
#### restore \x{ff} to \xff
$regex = hex_replace_from_bracket($regex);
if ( $regex =~ m#\\x0\{# ) {
confess "wrong substitution of oldregex = \n\t'", $oldregex,
"'\n -> \n\t'", $regex, "'";
}

return $regex;
}

# calculate regex quality; the more specific the regex, the higher the quality
sub calc_quality ($) {
my $regex = shift;



Loading…
Cancel
Save