Perl-module which allows to use Droid/PRONOM signatures and to convert it to Perl regular expressions, analyze files using wxHexEditor tags to display matches and calc statistics. For PRONOM see https://www.nationalarchives.gov.uk/PRONOM/
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

252 lines
7.2 KiB

  1. package File::FormatIdentification::Regex;
  2. use 5.024001;
  3. use strict;
  4. use warnings;
  5. use diagnostics;
  6. use String::LCSS;
  7. use Regexp::Assemble;
  8. use Regexp::Optimizer;
  9. use Carp;
  10. use Exporter 'import'; # gives you Exporter's import() method directly
  11. our @EXPORT =
  12. qw(and_combine or_combine calc_quality simplify_two_or_combined_regex peep_hole_optimizer )
  13. ; # symbols to export on request
  14. our @EXPORT_OK = qw( hex_replace_from_bracket hex_replace_to_bracket );
  15. our $VERSION = '0.01';
  16. sub and_combine (@) {
  17. my @rx_groups = map {
  18. my $rx = $_;
  19. my $rxfill = "";
  20. my $ret = '';
  21. if ( $rx =~ m#^\^$# ) { $ret = $rx; }
  22. elsif ( $rx =~ m#^\$$# ) { $ret = $rx; }
  23. else {
  24. if ( $rx =~ m#\$$# ) {
  25. $rxfill = ".*";
  26. }
  27. $ret = "(?=$rxfill$rx)";
  28. }
  29. $ret;
  30. } @_;
  31. my $combined = join( "", @rx_groups );
  32. #my $rx = Regexp::Assemble->new;
  33. #$rx->add( $combined );
  34. #return $rx->as_string;
  35. #my $o = Regexp::Optimizer->new;
  36. #my $rcomb = qr/$combined/;
  37. #return $o->as_string($rcomb);
  38. return $combined;
  39. }
  40. sub or_combine (@) {
  41. my $ro = Regexp::Assemble->new;
  42. foreach my $rx (@_) {
  43. $ro->add($rx);
  44. }
  45. return $ro->as_string;
  46. }
  47. sub simplify_two_or_combined_regex($$) {
  48. my $rx1 = shift;
  49. my $rx2 = shift;
  50. my $common = "";
  51. if (
  52. # ($rx1 =~ m#\(([A-Za-z0-9]*)|(\\x[0-9A-F]{2})*\)#) &&
  53. # ($rx2 =~ m#\(([A-Za-z0-9]*)|(\\x[0-9A-F]{2})*\)#)
  54. ( $rx1 =~ m#\(([A-Za-z0-9]*)\)# )
  55. && ( $rx2 =~ m#\(([A-Za-z0-9]*)\)# )
  56. )
  57. {
  58. # only left simplify supported yet
  59. my $common = String::LCSS::lcss( $rx1, $rx2 );
  60. #say "";
  61. #say "Found common='$common' of rx1='$rx1' rx2='$rx2'";
  62. #say "";
  63. }
  64. return $common;
  65. }
  66. sub hex_replace_to_bracket {
  67. my $regex = shift;
  68. $regex =~ s#\\x([0-9A-F]{2})#\\x{$1}#g;
  69. return $regex;
  70. }
  71. sub hex_replace_from_bracket {
  72. my $regex = shift;
  73. $regex =~ s#\\x\{([0-9A-F]{2})\}#\\x$1#g;
  74. return $regex;
  75. }
  76. sub peep_hole_optimizer ($) {
  77. my $regex = shift
  78. ; # only works if special Regexes within File::FormatIdentification:: used
  79. $regex = hex_replace_to_bracket($regex);
  80. my $oldregex = $regex;
  81. ##### first optimize bracket-groups
  82. my $subrg =
  83. qr#(?:[A-Za-z0-9])|(?:\\x\{[0-9A-F]{2}\})#; # matches: \x00-\xff or text
  84. #my $subrg = qr#(?:\($subra\))#;
  85. my $subre = qr#(?:\($subrg(?:\|$subrg)+\))|(?:$subrg)#
  86. ; # matches (…|…) or (…|…|…) ...
  87. #$regex =~ s#\(\(($subra*)\)\)(?!\|)#(\1\)#g; # matches ((…))
  88. $regex =~ s#\(\(($subre+)\)\)#($1)#g;
  89. $regex =~ s#\(\((\([^)|]*\)(\|\([^)|]*\))+)\)\)#($1)#g;
  90. ##### optimize common subsequences
  91. ##### part1, combine bar|baz -> ba(r|z)
  92. #say "BEFORE: regex=$regex";
  93. while ($regex =~ m#\(($subrg*)\)\|\(($subrg*)\)#
  94. || $regex =~ m#($subrg*)\|($subrg*)# )
  95. {
  96. my $rx1 = $1;
  97. my $rx2 = $2;
  98. #say "common subseq: $regex -> rx1=$rx1 rx2=$rx2";
  99. my $common = String::LCSS::lcss( $rx1, $rx2 );
  100. if ( !defined $common || length($common) == 0 ) { last; }
  101. if ( $common !~ m#^$subrg+$# ) { last; }
  102. #say "!ok: $regex -> common=$common";
  103. # common prefix
  104. if ( $rx1 =~ m#^(.*)$common$# && $rx2 =~ m#^(.*)$common$# ) {
  105. #say "suffix found";
  106. $rx1 =~ m#^(.*)$common$#;
  107. my $rx1_prefix = $1;
  108. $rx2 =~ m#^(.*)$common$#;
  109. my $rx2_prefix = $1;
  110. my $subst = "($rx1_prefix|$rx2_prefix)$common";
  111. if ( $regex =~ m#\(($subrg*)\)\|\(($subrg*)\)# ) {
  112. $regex =~ s#\($subrg*\)\|\($subrg*\)#$subst#g;
  113. }
  114. elsif ( $regex =~ m#($subrg*)\|($subrg*)# ) {
  115. $regex =~ s#$subrg*\|$subrg*#$subst#g;
  116. }
  117. }
  118. # common suffix
  119. elsif ( $rx1 =~ m#^$common(.*)$# && $rx2 =~ m#^$common(.*)$# ) {
  120. #say "prefix found";
  121. $rx1 =~ m#^$common(.*)$#;
  122. my $rx1_suffix = $1;
  123. $rx2 =~ m#^$common(.*)$#;
  124. my $rx2_suffix = $1;
  125. my $subst = "$common($rx1_suffix|$rx2_suffix)";
  126. #say "subst=$subst";
  127. if ( $regex =~ m#\(($subrg*)\)\|\(($subrg*)\)# ) {
  128. $regex =~ s#\($subrg*\)\|\($subrg*\)#$subst#g;
  129. }
  130. elsif ( $regex =~ m#($subrg*)\|($subrg*)# ) {
  131. $regex =~ s#$subrg*\|$subrg*#$subst#g;
  132. }
  133. #say "regex=$regex";
  134. }
  135. else {
  136. last;
  137. }
  138. }
  139. ##### part2, combine barbara -> (bar){2}a
  140. while ( $regex =~ m#($subrg{3,}?)(\1+)(?!$subrg*\})# ) {
  141. my $sub = $1;
  142. if ( $sub =~ m#^($subrg)\1+$# ) {
  143. last;
  144. }
  145. my $l1 = length($1);
  146. my $l2 = length($2);
  147. my $matches = 1 + ( $l2 / $l1 );
  148. #say "Found1 in regex='$regex' sub='$sub' with \$2=$2 l1=$l1 l2=$l2 matches=$matches";
  149. if ( $sub =~ m#^$subrg$# ) {
  150. $regex =~ s#($subrg{3,}?)\1+(?!$subrg*\})#$sub\{$matches\}#;
  151. }
  152. else {
  153. $regex =~ s#($subrg{3,}?)\1+(?!$subrg*\})#($sub)\{$matches\}#;
  154. }
  155. }
  156. ##### part2, combine toooor -> to{4}r
  157. while ( $regex =~ m#($subrg)(\1{3,})(?!$subrg*\})# ) {
  158. my $sub = $1;
  159. my $l1 = length($1);
  160. my $l2 = length($2);
  161. my $matches = 1 + ( $l2 / $l1 );
  162. #say "Found2 in regex='$regex' sub='$sub' with \$2=$2 l1=$l1 l2=$l2 matches=$matches";
  163. if ( $sub =~ m#^$subrg$# ) {
  164. $regex =~ s#($subrg)\1{3,}(?!$subrg*\})#$sub\{$matches\}#;
  165. }
  166. else {
  167. $regex =~ s#($subrg)\1{3,}(?!$subrg*\})#($sub)\{$matches\}#;
  168. }
  169. }
  170. ##### part2, combine foooo -> fo{4}
  171. #while ($regex =~ m#($subrg)\1{3,}(?!$subrg*\})#) {
  172. # my $sub = $1;
  173. # my $matches = $#+; $matches++;
  174. # say "Found in regex='$regex' sub='$sub' with matches=$matches";
  175. # $regex =~ s#($subrg)\1{3,}(?!$subrg*\}#$sub\{$matches\}#;
  176. #}
  177. #### restore \x{ff} to \xff
  178. $regex = hex_replace_from_bracket($regex);
  179. if ( $regex =~ m#\\x0\{# ) {
  180. confess "wrong substitution of oldregex = \n\t'", $oldregex,
  181. "'\n -> \n\t'", $regex, "'";
  182. }
  183. return $regex;
  184. }
  185. sub calc_quality ($) {
  186. my $regex = shift;
  187. # replace all \xff with #
  188. # replace all . with ( | | | | )
  189. # replace all [abc] with (a|b|c)
  190. # replace all [^abc] with (d|e|f|..|)
  191. # then $len = count of # and $or = count of |
  192. # divide it with $len / (1+$or)
  193. my $len = 0;
  194. my $alt = 0;
  195. while ( $regex =~ s/\\x[0-9a-f]{2}// ) {
  196. $len++;
  197. }
  198. while ( $regex =~ s/\[\^(.*?)\]// ) {
  199. $alt += ( 256 - length($1) );
  200. $len++;
  201. }
  202. while ( $regex =~ s/\[(.*?)\]// ) {
  203. $alt += length($1);
  204. $len++;
  205. }
  206. while ( $regex =~ s/\.// ) {
  207. $alt += 256;
  208. $len++;
  209. }
  210. while ( $regex =~ s/[A-Za-z0-9 ]// ) {
  211. $len++;
  212. }
  213. my $tmp = $len / ( 1 + $alt );
  214. my $quality = ( $tmp == 0 ) ? 0 : int( 1000 * log($tmp) ) / 1000;
  215. #say "rest: $regex len=$len alt=$alt quality=$quality ($tmp)";
  216. return $quality;
  217. }
  218. # see https://stackoverflow.com/questions/869809/combine-regexp#870506
  219. 1;