Perl-module which allows to use Droid/PRONOM signatures and to convert it to Perl regular expressions, analyze files using wxHexEditor tags to display matches and calc statistics. For PRONOM see https://www.nationalarchives.gov.uk/PRONOM/
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

335 lines
9.4 KiB

  1. #!/usr/bin/env perl
  2. #===============================================================================
  3. #
  4. # FILE: pronom2wxhexeditor.pl
  5. #
  6. # USAGE: ./pronom2wxhexeditor.pl
  7. #
  8. # DESCRIPTION: perl ./pronom2wxhexeditor.pl <DROIDSIGNATURE-FILE> <BINARYFILE>
  9. #
  10. # OPTIONS: ---
  11. # REQUIREMENTS: ---
  12. # BUGS: ---
  13. # NOTES: ---
  14. # AUTHOR: Andreas Romeyke,
  15. # CREATED: 28.08.2018 14:26:43
  16. # REVISION: ---
  17. #===============================================================================
  18. use strict;
  19. use warnings 'FATAL';
  20. use utf8;
  21. use v5.24;
  22. use Fcntl qw(:seek);
  23. use Digest::CRC qw( crc8 );
  24. use Scalar::Util;
  25. use File::Map qw(:map :extra);
  26. use File::FormatIdentification::Pronom;
  27. # calc a random color
  28. sub rndcolor {
  29. my $rgb = int( rand( 256 * 256 * 256 ) );
  30. return sprintf( "#%06x", $rgb );
  31. }
  32. sub puidcolor {
  33. my $puid = shift;
  34. my $crc = crc8($puid);
  35. return sprintf( "#%02x%02x%02x", $crc, $crc, $crc );
  36. }
  37. sub dircolor {
  38. my $direction = shift;
  39. my $pos = shift;
  40. if ( $direction > 0 ) {
  41. return sprintf( "#ff%04x", $pos * $pos );
  42. }
  43. elsif ( $direction < 0 ) {
  44. return sprintf( "#%04xff", $pos * $pos );
  45. }
  46. else {
  47. return sprintf( "#%02xff%02x", $pos, $pos );
  48. }
  49. } ## end sub dircolor
  50. # helper function to collect all things needed for output and adds to a given buffer
  51. sub push_output ($$$$$$$$$$) {
  52. my %tmp;
  53. $tmp{puid} = shift;
  54. $tmp{name} = shift;
  55. $tmp{begin} = shift;
  56. $tmp{end} = shift;
  57. $tmp{regex} = shift;
  58. #$tmp{hexdump} = shift;
  59. $tmp{position} = shift;
  60. $tmp{signature} = shift;
  61. $tmp{internal_signature} = shift;
  62. $tmp{bytesequence} = shift;
  63. my $ref_buffer = shift;
  64. push @{$ref_buffer}, \%tmp;
  65. return;
  66. } ## end sub push_output
  67. #render HTML output
  68. sub render_for_html {
  69. my $ref_buffer = shift;
  70. my $fh = shift;
  71. my $binaryfile = shift;
  72. my @tmp = sort { $a->{begin} <=> $b->{begin} } ( @{$ref_buffer} );
  73. say $fh <<HEAD;
  74. <html><head />
  75. <body>
  76. <h1> Result for "$binaryfile"</h1>
  77. HEAD
  78. foreach my $tagid ( 0 .. $#tmp ) {
  79. my $pos = $tmp[$tagid]->{position};
  80. my $begin = $tmp[$tagid]->{begin};
  81. my $end = $tmp[$tagid]->{end};
  82. my $puid = $tmp[$tagid]->{puid};
  83. my $name = $tmp[$tagid]->{name};
  84. my $regex = $tmp[$tagid]->{regex};
  85. my $internal = $tmp[$tagid]->{internal_signature};
  86. my $bytesequence = $tmp[$tagid]->{bytesequence};
  87. my $partial = get_partial_regex( $pos, $regex );
  88. #my $hexdump = $tmp[$tagid]->{hexdump};
  89. #if ( length($hexdump) > 10 ) {
  90. # $hexdump = substr( $hexdump, 0, 10 ) . "...";
  91. #}
  92. my $fgcolor = puidcolor($puid);
  93. #my $bgcolor = dircolor( $begin <=> $end, $pos );
  94. my $bgcolor = rndcolor();
  95. say $fh "
  96. <h2>$puid</h2>
  97. <p>Internal Signature: $internal</p>
  98. <p>Byte Sequence: $bytesequence</p>
  99. <p>Bytes $begin - $end</p>
  100. <p>$name</p>
  101. <p>regex=$regex</p>
  102. <p>matching $pos-th partial regex: $partial</p>
  103. "
  104. #<p>pos=$pos</p>
  105. #<p>hexdump:<br />$hexdump</p>"
  106. } ## end foreach my $tagid ( 0 .. $#tmp)
  107. say $fh <<FOOT;
  108. </body>
  109. </html>
  110. FOOT
  111. return;
  112. } ## end sub render_for_html
  113. # render output for wxhexeditor
  114. sub render_for_wxhexeditor {
  115. my $ref_buffer = shift;
  116. my $fh = shift;
  117. my $binaryfile = shift;
  118. my @tmp = sort {
  119. if ( $a->{begin} == $b->{begin} ) {
  120. return ( $a->{end} <=> $b->{end} );
  121. }
  122. else {
  123. return ( $a->{begin} <=> $b->{begin} );
  124. }
  125. } ( @{$ref_buffer} );
  126. say $fh <<HEAD;
  127. <?xml version="1.0" encoding="UTF-8"?>
  128. <wxHexEditor_XML_TAG>
  129. <filename path="$binaryfile">
  130. HEAD
  131. foreach my $tagid ( 0 .. $#tmp ) {
  132. my $pos = $tmp[$tagid]->{position};
  133. my $begin = $tmp[$tagid]->{begin};
  134. my $end = $tmp[$tagid]->{end};
  135. my $puid = $tmp[$tagid]->{puid};
  136. my $name = $tmp[$tagid]->{name};
  137. my $regex = $tmp[$tagid]->{regex};
  138. my $internal = $tmp[$tagid]->{internal_signature};
  139. my $bytesequence = $tmp[$tagid]->{bytesequence};
  140. #my $hexdump = $tmp[$tagid]->{hexdump};
  141. #if ( length($hexdump) > 10 ) {
  142. # $hexdump = substr( $hexdump, 0, 10 ) . "...";
  143. #}
  144. my $fgcolor = puidcolor($puid);
  145. #my $bgcolor = dircolor( $begin <=> $end, $pos );
  146. my $bgcolor = rndcolor();
  147. my $partial = get_partial_regex( $pos, $regex );
  148. say $fh "
  149. <TAG id='$tagid'>
  150. <start_offset>$begin</start_offset>
  151. <end_offset>$end</end_offset>
  152. <tag_text>$puid
  153. $name
  154. at Bytes($begin, $end)
  155. $regex
  156. matching $pos-th partial regex: $partial
  157. Internal Signature: $internal
  158. Byte Sequence: $bytesequence
  159. </tag_text>
  160. <font_colour>$fgcolor</font_colour>
  161. <note_colour>$bgcolor</note_colour>
  162. </TAG>";
  163. } ## end foreach my $tagid ( 0 .. $#tmp)
  164. say $fh <<FOOT;
  165. </filename>
  166. </wxHexEditor_XML_TAG>
  167. FOOT
  168. return;
  169. } ## end sub render_for_wxhexeditor
  170. sub get_partial_regex($$) {
  171. my $position = shift;
  172. my $regex = shift;
  173. if ( $regex =~ m/\({$position}(.{20})/ ) { return "'$1'..."; }
  174. return "";
  175. }
  176. ################################################################################
  177. # main
  178. ################################################################################
  179. my $pronomfile = shift @ARGV;
  180. my $binaryfile = shift @ARGV;
  181. if ( !defined $pronomfile ) {
  182. say "you need at least a pronom signature file";
  183. }
  184. if ( !defined $binaryfile ) {
  185. say "you need an binaryfile";
  186. }
  187. # write basic main.osd
  188. open( my $filehandle, "<", "$binaryfile" );
  189. binmode($filehandle);
  190. seek( $filehandle, 0, SEEK_END );
  191. my $eof = tell($filehandle);
  192. my $pronom = File::FormatIdentification::Pronom->new(
  193. "droid_signature_filename" => $pronomfile );
  194. my @output_buffer;
  195. #my $pathobj = path($binaryfile);
  196. #my $filestream = $pathobj->slurp_raw;
  197. map_file my $filestream, $binaryfile, "<";
  198. advise( $filestream, 'random' );
  199. foreach my $internalid ( $pronom->get_all_internal_ids() ) {
  200. my $sig = $pronom->get_signature_id_by_internal_id($internalid);
  201. if ( !defined $sig ) { next; }
  202. my $puid = $pronom->get_puid_by_signature_id($sig);
  203. my $name = $pronom->get_name_by_signature_id($sig);
  204. my @regexes = $pronom->get_regular_expressions_by_internal_id($internalid);
  205. my @res;
  206. my $timer = time;
  207. #print "internalid=$internalid";
  208. foreach my $regex (@regexes) {
  209. # MATCHed?
  210. #warn "$internalid, regex='$regex'\n";
  211. if ( !defined $regex ) {
  212. warn "No regex found for internalid $internalid\n";
  213. }
  214. #say "REGEX='$regex'";
  215. if ( $filestream =~ /$regex/saa ) {
  216. my $tmp;
  217. $tmp->{matched} = 1;
  218. $tmp->{regex} = $regex;
  219. #$tmp->{groups};
  220. #use Data::Printer;
  221. #p( @+ );
  222. #p( @- );
  223. my %groups;
  224. for ( my $match = 0 ; $match <= $#- ; $match++ ) {
  225. if ( defined $-[$match] && defined $+[$match] ) {
  226. my $matches;
  227. my $begin = $-[$match];
  228. my $end = $+[$match];
  229. $matches->{begin} = $begin;
  230. $matches->{end} = $end;
  231. $matches->{pos} = $match;
  232. $groups{ ( $begin, $end ) } = $matches;
  233. }
  234. }
  235. my @uniqgroups = values %groups;
  236. #use Data::Printer;
  237. #p( @uniqgroups );
  238. $tmp->{groups} = \@uniqgroups;
  239. #p( $tmp->{groups} );
  240. #die "matched '$_'";
  241. push @res, $tmp;
  242. }
  243. else {
  244. last; # break for loop
  245. }
  246. }
  247. if ( ( scalar @res ) == ( scalar @regexes ) ) { # all matches successfull
  248. # my %tmp;
  249. # $tmp{puid} = shift;
  250. # $tmp{name} = shift;
  251. # $tmp{begin} = shift;
  252. # $tmp{end} = shift;
  253. # $tmp{regex} = shift;
  254. # #$tmp{hexdump} = shift;
  255. # $tmp{position} = shift;
  256. # $tmp{signature} = shift;
  257. # my $ref_buffer = shift;
  258. for ( my $receiptidx = 0 ; $receiptidx <= $#res ; $receiptidx++ ) {
  259. my $receipt = $res[$receiptidx];
  260. foreach my $group ( @{ $receipt->{groups} } ) {
  261. push_output(
  262. $puid,
  263. $name,
  264. $group->{begin},
  265. $group->{end},
  266. $receipt->{regex},
  267. $group->{pos},
  268. $sig,
  269. $internalid,
  270. $receiptidx,
  271. \@output_buffer
  272. );
  273. }
  274. }
  275. }
  276. #say " ... time=", (time - $timer), "s";
  277. } ## end foreach my $internal ( keys...)
  278. open( my $OUT, ">", "$binaryfile.tags" );
  279. open( my $HTML, ">", "$binaryfile.html" );
  280. render_for_wxhexeditor(
  281. #filter_matches_by_signature_priority( $signatures, \@output_buffer ),
  282. \@output_buffer,
  283. $OUT, $binaryfile
  284. );
  285. render_for_html(
  286. #filter_matches_by_signature_priority( $signatures, \@output_buffer ),
  287. \@output_buffer,
  288. $HTML, $binaryfile
  289. );
  290. close $HTML;
  291. close $OUT;
  292. 1;