diff -rbc FuzzyOcr-2.3f/CHANGES FuzzyOcr-2.3g/CHANGES *** FuzzyOcr-2.3f/CHANGES Tue Sep 12 11:56:52 2006 --- FuzzyOcr-2.3g/CHANGES Thu Sep 14 12:04:34 2006 *************** *** 1,3 **** --- 1,59 ---- + version 2.3g: + Added: + Option: focr_keep_bad_images + The default value for this option is zero(0). + When set to 1, the plugin will not remove a tempdir whenever it registers + an error or timeout from any of the 'helper' apps. + When set to 2, the plugin will always keep the tempdir. Beware that on heavily + loaded systems, this might fill your /tmp partition. + + Util: fuzzy-cleantmp + This utility can be used to remove tempdirs left behind if the plugin was + configured to save them. It takes one parameter: hours to keep (12 by default) + This can safely be placed inside CRON to prune /tmp. + + Util: gif2anim + This utility (from ImageMagick) extracts images from animated gifs as well + as giving information regarding delays and image sizes. Requires identify and + convert to work (these are required, so not a problem). + + Fixed: + Bug: 'convert' + An invalid parameter was specified when using 'convert' to assemble animated gifs + resulting in an error message, and the image was not scanned. + + Bug: 'safe_db' + When checking for images in safe_db hash, because we score them as zero (0), + we did not 'short circuit' correctly. This has now been fixed. + + Changed: + known_img_hash + This procedure was called with two parameters: $digest and $score. + $digest was not used, so it has been removed. Also, just in the off chance + that $score is zero, it uses $Score{base} to score the image. + + fuzzyocr_check + Added code to better determine the name of the attachment. Sometimes, the name + is hidden in the 'content-id' header of the image/* MIME part, so we extract + it from there if no name is given when this header is available. Also it makes + sure that problematic characters are changed so as to not give PERL any more + grief. 
+ + A copy of the original message is now saved in the tempdir created, so that + when we instruct the plugin to keep the created tempdir, we have a copy of the + original message to further assist in troubleshooting problems. + + A file is created in tempdir containing all the expanded commands used to + process the images. This can help to troubleshoot invalid command errors. + + Removed some debuglog lines to reduce the lines logged. + + Uses gif2anim (if available) to extract images from animated gifs. + TODO: + I will try to use the generated anim file to root out animated gif spam where + the spam message is not in the largest frame, or is in the frame with the + largest delay, as well as other tricks... + version 2.3f: Fixed: Properly initialized $h and $w to zero so that when getting the height and width diff -rbc FuzzyOcr-2.3f/FuzzyOcr.cf FuzzyOcr-2.3g/FuzzyOcr.cf *** FuzzyOcr-2.3f/FuzzyOcr.cf Tue Sep 12 11:43:48 2006 --- FuzzyOcr-2.3g/FuzzyOcr.cf Thu Sep 14 10:35:24 2006 *************** *** 14,23 **** #### Logging options ##### # Verbosity level (see manual) Attention: Don't set to 0, but to 0.0 for quiet operation. (Default value: 1) ! #focr_verbose 1 # ! # Logfile (make sure it is writable by the plugin) (Default value: stderr) ! #focr_logfile stderr ########################## ##### Wordlists ##### --- 14,23 ---- #### Logging options ##### # Verbosity level (see manual) Attention: Don't set to 0, but to 0.0 for quiet operation. (Default value: 1) ! #focr_verbose 2 # ! # Logfile (make sure it is writable by the plugin) (Default value: /etc/mail/spamassassin/FuzzyOcr.log) ! #focr_logfile /etc/mail/spamassassin/FuzzyOcr.log ########################## ##### Wordlists ##### *************** *** 47,85 **** #focr_bin_convert /usr/bin/convert #focr_bin_identify /usr/bin/identify #focr_bin_gocr /usr/bin/gocr ! ! 
#Search the following path for bin utils above #focr_path_bin /usr/local/netpbm/bin:/usr/local/bin:/usr/bin ############################################################################################ ! ##### Scansets, comma seperated (Default value: $gocr -i -, $gocr -l 180 -d 2 -i -) ##### # Each scanset consists of one or more commands which make text out of pnm input. # Each scanset is run seperately on the PNM data, results are combined in scoring. ! #focr_scansets $gocr -i $pfile, $gocr -l 180 -d 2 -i $pfile # # To use only one scan with default values, uncomment the next line instead ! #focr_scansets $gocr -i $pfile # # Some example for more advanced sets ! # This one uses the first the standard scan, then a scanset which first reduces the image ! # to 3 colors and then scans it with custom settings and then it scans again only with ! # these custom settings # NOTE: This is for advanced users only, if you have questions how to use this, ask on the ML or on IRC ! #focr_scansets $gocr -i $pfile, pnmnorm $pfile 2>$efile| pnmquant 3 2>>$efile | pnmnorm 2>>$efile | $gocr -l 180 -d 2 -i -, $gocr -l 180 -d 2 -i $pfile ######################################################################################### ##### Various Score/Scan settings ##### # Timeout for the plugin, in seconds. (Maximum runtime of the plugin) (Default value: 10) ! #focr_timeout 10 # # Default detection treshold (see manual) (Default value: 0.3) (Can be changed on a per word basis in the wordlist). #focr_threshold 0.3 # ! # This is the score for a hit after focr_counts_required matches ! #focr_base_score 4 # # This is the additional score for every additional match after focr_counts_required matches (Default value: 1) ! #focr_add_score 1 # # This is the score to give for a wrong content-type (e.g. 
JPEG image but content type says GIF) (Default value: 1.5) #focr_wrongctype_score 1.5 --- 47,87 ---- #focr_bin_convert /usr/bin/convert #focr_bin_identify /usr/bin/identify #focr_bin_gocr /usr/bin/gocr ! # ! # Use this option to search for all of the above utilities using the following search order (path like): ! # This reduces typing errors, and allows to have more than one version installed in the system, and ! # give priority to selected dirs. #focr_path_bin /usr/local/netpbm/bin:/usr/local/bin:/usr/bin + # ############################################################################################ ! ##### Scansets, comma separated (Default value: $gocr -i $pfile, $gocr -l 180 -d 2 -i $pfile) ##### # Each scanset consists of one or more commands which make text out of pnm input. # Each scanset is run seperately on the PNM data, results are combined in scoring. ! #focr_scansets $gocr -i $pfile, $gocr -l 180 -d 2 -i $pfile, $gocr -l 140 -d 2 -i $pfile # # To use only one scan with default values, uncomment the next line instead ! #focr_scansets $gocr -i - # # Some example for more advanced sets ! # This one uses the first the standard scan, then a scanset which first reduces the image to 3 colors and then scans it with custom settings ! # and then it scans again only with these custom settings # NOTE: This is for advanced users only, if you have questions how to use this, ask on the ML or on IRC ! #focr_scansets $gocr -i -, pnmnorm 2>$errfile | pnmquant 3 2>>$errfile | pnmnorm 2>>$errfile | $gocr -l 180 -d 2 -i -, $gocr -l 180 -d 2 -i - ######################################################################################### ##### Various Score/Scan settings ##### # Timeout for the plugin, in seconds. (Maximum runtime of the plugin) (Default value: 10) ! #focr_timeout 15 # # Default detection treshold (see manual) (Default value: 0.3) (Can be changed on a per word basis in the wordlist). #focr_threshold 0.3 # ! 
# This is the score for a hit after focr_counts_required matches (Default value: 5) ! #focr_base_score 5 # # This is the additional score for every additional match after focr_counts_required matches (Default value: 1) ! #focr_add_score 0.375 # # This is the score to give for a wrong content-type (e.g. JPEG image but content type says GIF) (Default value: 1.5) #focr_wrongctype_score 1.5 *************** *** 91,97 **** #focr_corrupt_unfixable_score 5 # # This is used to disable the OCR engine if the message has already more points than this value (Default value: 10) ! #focr_autodisable_score 10 # # Number of minimum matches before the rule scores (Default value: 2) #focr_counts_required 2 --- 93,99 ---- #focr_corrupt_unfixable_score 5 # # This is used to disable the OCR engine if the message has already more points than this value (Default value: 10) ! #focr_autodisable_score 25 # # Number of minimum matches before the rule scores (Default value: 2) #focr_counts_required 2 *************** *** 112,134 **** # # If the image hash database feature is enabled, specify the file here to use as database # (Default value: /etc/mail/spamassassin/FuzzyOcr.hashdb) ! #focr_digest_db /etc/mail/spamassassin/FuzzyOcr.hashdb # # If the image hash db feature is enabled, specify the file here to use as database # (Default value: /etc/mail/spamassassin/FuzzyOcr.db) ! #focr_db_hash /etc/mail/spamassassin/FuzzyOcr.db # # If the image hash db feature is enabled, specify the file here to use as database ! # (Default value: /etc/mail/spamassassin/FuzzyOcr.db) ! #focr_db_safe /etc/mail/spamassassin/FuzzyOcr.safe.db # # Expire records from focr_digest_db after (Default: 35) days ! #focr_db_max_days 35 # # Automatically add hashes of spam images recognized by OCR to the Image Hash database, to disable, set to 0.0 (Default value: 1) #focr_hashing_learn_scanned 1 # ! # If enabled, this will save image files in tempdir when errors occur to further debug the plugin. (Default: 0.0). ! 
#focr_keep_bad_images 0.0 ! # ###################################################################### --- 114,138 ---- # # If the image hash database feature is enabled, specify the file here to use as database # (Default value: /etc/mail/spamassassin/FuzzyOcr.hashdb) ! #focr_digest_hash /etc/mail/spamassassin/FuzzyOcr.hashdb # # If the image hash db feature is enabled, specify the file here to use as database # (Default value: /etc/mail/spamassassin/FuzzyOcr.db) ! #focr_hash_db /etc/mail/spamassassin/FuzzyOcr.db # # If the image hash db feature is enabled, specify the file here to use as database ! # (Default value: /etc/mail/spamassassin/FuzzyOcr.safe.db) ! #focr_safe_db /etc/mail/spamassassin/FuzzyOcr.safe.db # # Expire records from focr_digest_db after (Default: 35) days ! #focr_db_max_days 15 # # Automatically add hashes of spam images recognized by OCR to the Image Hash database, to disable, set to 0.0 (Default value: 1) #focr_hashing_learn_scanned 1 # ! # Keep files that generate errors ! # 0 = always cleanup ! # 1 = keep only if error ! # 2 = always keep ! #focr_keep_bad_images 1 ###################################################################### diff -rbc FuzzyOcr-2.3f/FuzzyOcr.pm FuzzyOcr-2.3g/FuzzyOcr.pm *** FuzzyOcr-2.3f/FuzzyOcr.pm Tue Sep 12 12:10:31 2006 --- FuzzyOcr-2.3g/FuzzyOcr.pm Thu Sep 14 11:18:41 2006 *************** *** 83,92 **** our $pms; our @scansets; ! our @bin_utils = qw/giffix giftext gifasm gifinter giftopnm jpegtopnm pngtopnm bmptopnm tifftopnm ppmhist convert identify gocr/; ! our @pgm_scores = qw/base add corrupt corrupt_unfixable wrongctype autodisable/; our @pgm_opts = qw/personal_wordlist global_wordlist logfile threshold counts_required verbose timeout gif_max_frames --- 83,93 ---- our $pms; our @scansets; ! our @bin_utils = qw/giffix giftext gifasm gifinter giftopnm gif2anim jpegtopnm pngtopnm bmptopnm tifftopnm ppmhist convert identify gocr/; ! our @pgm_scores = qw/base add corrupt corrupt_unfixable wrongctype ! 
autodisable/; our @pgm_opts = qw/personal_wordlist global_wordlist logfile threshold counts_required verbose timeout gif_max_frames *************** *** 286,293 **** untie %DB; } load_global_words( $Option{global_wordlist} ); - my $w = scalar(keys %words); - debuglog("Loaded <$w> words from \"$Option{global_wordlist}\""); unless (@scansets) { @scansets = ( '$gocr -i $pfile', '$gocr -l 180 -d 2 -i $pfile'); --- 287,292 ---- *************** *** 306,311 **** --- 305,311 ---- handle_error( $err_msges[3], ( $_[0] ) ); return; } + my $cnt = 0; open WORDLIST, "<$_[0]"; while(my $w = ) { chomp($w); *************** *** 317,326 **** ($w, $wt) = (lc($1), $2); $wt = $Option{threshold} unless ($wt =~ m/[\d\.]+/); } ! $words{$w} = $wt; } close WORDLIST; ! return 1; } sub load_personal_words { --- 317,326 ---- ($w, $wt) = (lc($1), $2); $wt = $Option{threshold} unless ($wt =~ m/[\d\.]+/); } ! $words{$w} = $wt; $cnt++; } close WORDLIST; ! debuglog("Loaded <$cnt> words from \"$_[0]\""); } sub load_personal_words { *************** *** 334,339 **** --- 334,340 ---- ); return; } + my $cnt = 0; open WORDLIST, "<$_[0]"; while(my $w = ) { chomp($w); *************** *** 345,353 **** ($w, $wt) = ($1, $2); $wt = $Option{threshold} unless ($wt =~ m/[\d\.]+/); } ! $words{$w} = $wt; } close WORDLIST; } sub max { --- 346,355 ---- ($w, $wt) = ($1, $2); $wt = $Option{threshold} unless ($wt =~ m/[\d\.]+/); } ! $words{$w} = $wt; $cnt++; } close WORDLIST; + debuglog("Updated Word List with $cnt words from $_[0]"); } sub max { *************** *** 375,380 **** --- 377,383 ---- return(0) unless $rh; return(0) unless $rw; return(0) unless $rcn; + return(0) unless $rkey; return(0) if ((abs($ds - $rs ) / $rs ) > $Threshold{s}); return(0) if ((abs($dh - $rh ) / $rh ) > $Threshold{h}); return(0) if ((abs($dw - $rw ) / $rw ) > $Threshold{w}); *************** *** 444,450 **** if (defined $DB{$key}) { $dbm = $DB{$key}; if ($img eq $dbm->{basic}) { ! 
$ret = $dbm->{score}; $dbm->{fname} = $fname; $dbm->{ctype} = $ctype; debuglog("Updating $txt info File:'$fname' Type:'$ctype'"); --- 447,453 ---- if (defined $DB{$key}) { $dbm = $DB{$key}; if ($img eq $dbm->{basic}) { ! $ret = $dbm->{score} || 0.001; $dbm->{fname} = $fname; $dbm->{ctype} = $ctype; debuglog("Updating $txt info File:'$fname' Type:'$ctype'"); *************** *** 455,461 **** my $now = time - ($Option{db_max_days}*86400); foreach my $k (keys %DB) { $dbm = $DB{$k}; ! $hash = $dbm->{basic} . '::' . $k; if (within_threshold($digest,$hash)) { $ret = $dbfile eq $Option{db_hash} ? $dbm->{score} : $dbm->{match}; $txt = 'Approx'; $new = $k; --- 458,464 ---- my $now = time - ($Option{db_max_days}*86400); foreach my $k (keys %DB) { $dbm = $DB{$k}; ! $hash = $dbm->{basic} ? $dbm->{basic} : '0:0:0:0' . '::' . $k; if (within_threshold($digest,$hash)) { $ret = $dbfile eq $Option{db_hash} ? $dbm->{score} : $dbm->{match}; $txt = 'Approx'; $new = $k; *************** *** 516,521 **** --- 519,525 ---- return; } debuglog("Adding Hash with score($score) to \"$dbfile\""); + debuglog("Digest: $digest"); my ($img,$key) = split('::',$digest,2); my $dbm = $DB{$key}; *************** *** 660,667 **** } sub known_img_hash { ! my $digest = shift; ! my $score = shift; for my $set ( 0 .. 3 ) { $pms->{conf}->{scoreset}->[$set]->{"FUZZY_OCR_KNOWN_HASH"} = sprintf( "%0.3f", $score ); --- 664,670 ---- } sub known_img_hash { ! my $score = shift || $Score{base}; for my $set ( 0 .. 3 ) { $pms->{conf}->{scoreset}->[$set]->{"FUZZY_OCR_KNOWN_HASH"} = sprintf( "%0.3f", $score ); *************** *** 672,677 **** --- 675,681 ---- sub removedir { my $dir = $_[0]; + return unless -d $dir; opendir D, $dir; my @files = readdir D; closedir D; *************** *** 679,691 **** next if $f eq '.'; next if $f eq '..'; my $ff = Mail::SpamAssassin::Util::untaint_file_path("$dir/$f"); ! my $cnt = unlink $ff; ! unless ($cnt) { debuglog("Cannot remove: $ff"); } } unless(rmdir $dir) { ! 
debuglog("Cannot remove DIR:$dir"); } } --- 683,695 ---- next if $f eq '.'; next if $f eq '..'; my $ff = Mail::SpamAssassin::Util::untaint_file_path("$dir/$f"); ! unless (unlink $ff) { debuglog("Cannot remove: $ff"); } } + debuglog("Remove DIR: $dir"); unless(rmdir $dir) { ! debuglog("Cannot remove DIR: $dir"); } } *************** *** 704,710 **** my $imgerr = 0; my $homedir = (getpwuid($<))[7]; ! debuglog("Starting FuzzyOcr..."); #debuglog("Attempting to load personal wordlist..."); if ($homedir) { load_personal_words( $homedir . "/$Option{personal_wordlist}" ); --- 708,714 ---- my $imgerr = 0; my $homedir = (getpwuid($<))[7]; ! #debuglog("Starting FuzzyOcr..."); #debuglog("Attempting to load personal wordlist..."); if ($homedir) { load_personal_words( $homedir . "/$Option{personal_wordlist}" ); *************** *** 715,725 **** } foreach my $p ( ! $pms->{msg}->find_parts(qr/^image\b/i), $pms->{msg}->find_parts(qr(Application/Octet-Stream)i) ) { my $ctype = $p->{'type'}; my $fname = $p->{'name'} || 'unknown'; my $test = 0; $test++ if ($ctype =~ /image/i); $test++ if ($fname =~ /(gif|jpg|jpeg|png|bmp|tiff)$/i); --- 719,736 ---- } foreach my $p ( ! $pms->{msg}->find_parts(qr(^image\b)i), $pms->{msg}->find_parts(qr(Application/Octet-Stream)i) ) { my $ctype = $p->{'type'}; my $fname = $p->{'name'} || 'unknown'; + if (($fname eq 'unknown') and + (defined $p->{'headers'}->{'content-id'}) + ){ + $fname = join('',@{$p->{'headers'}->{'content-id'}}); + $fname =~ s/[<>]//g; + $fname =~ tr/\@\$\%\&/_/s; + } my $test = 0; $test++ if ($ctype =~ /image/i); $test++ if ($fname =~ /(gif|jpg|jpeg|png|bmp|tiff)$/i); *************** *** 734,743 **** debuglog("Scan canceled, cannot create Image TMPDIR."); return 0; } ! my $imgfilename = $imgdir . "/" . $fname; my $unique = 0; while (-e $imgfilename) { ! $imgfilename = $imgdir . "/" . chr(65+$unique) . "." . 
$fname; } unless (open PICT, ">$imgfilename") { --- 745,769 ---- debuglog("Scan canceled, cannot create Image TMPDIR."); return 0; } ! #keep raw email for debugging later ! my $imgfilename = $imgdir . "/raw.eml"; ! unless (-e $imgfilename) { ! if (open RAW, ">$imgfilename") { ! print RAW $pms->{msg}->get_pristine(); ! close RAW; ! debuglog("Saved: $imgfilename"); ! } ! } ! ! $fname =~ tr{a-zA-Z0-9\.}{_}cs; ! $imgfilename = Mail::SpamAssassin::Util::untaint_file_path( ! $imgdir . "/" . $fname ! ); my $unique = 0; while (-e $imgfilename) { ! $imgfilename = Mail::SpamAssassin::Util::untaint_file_path( ! $imgdir . "/" . chr(65+$unique) . "." . $fname ! ); } unless (open PICT, ">$imgfilename") { *************** *** 755,767 **** } if ($cnt == 0) { ! debuglog("Skipping OCR, no image files found..."); return 0; } debuglog("Found: $cnt images"); $cnt = 0; my $t = Mail::SpamAssassin::Timeout->new({ secs => $Option{timeout} }); my $retcode; IMAGE: foreach my $file (keys %imgfiles) { my $pic = $imgfiles{$file}; --- 781,797 ---- } if ($cnt == 0) { ! #debuglog("Skipping OCR, no image files found..."); ! removedir($imgdir) if defined $imgdir; return 0; } debuglog("Found: $cnt images"); $cnt = 0; my $t = Mail::SpamAssassin::Timeout->new({ secs => $Option{timeout} }); my $retcode; + my $haserr = open RAWERR, ">$imgdir/raw.err"; + debuglog("Errors to: $imgdir/raw.err") if ($haserr>0); + IMAGE: foreach my $file (keys %imgfiles) { my $pic = $imgfiles{$file}; *************** *** 772,778 **** my $ptype = 0; my $tfile = $file; my $pfile = $file . ".pnm"; ! my $efile = $file . ".stderr"; if ( substr($$pic{header},0,3) eq "\x47\x49\x46" ) { debuglog("Found GIF header name=\"$$pic{fname}\""); --- 802,808 ---- my $ptype = 0; my $tfile = $file; my $pfile = $file . ".pnm"; ! my $efile = $file . 
".err"; if ( substr($$pic{header},0,3) eq "\x47\x49\x46" ) { debuglog("Found GIF header name=\"$$pic{fname}\""); *************** *** 815,830 **** } else { debuglog("Image is single non-interlaced..."); ! $tfile = $$pic{fname} . ".fixed"; $retcode = $t->run_and_catch(sub { ! qx($App{giffix} $file >$tfile 2>$efile); }); if ($retcode) { chomp $retcode; debuglog("$App{giffix}: Timed out [$retcode], skipping..."); ++$imgerr if $Option{keep_bad_images}; next; } ! if (open ERR,$efile) { @stderr_data = ; close ERR; foreach (@stderr_data) { --- 845,862 ---- } else { debuglog("Image is single non-interlaced..."); ! $tfile .= ".fixed"; ! printf RAWERR "## $App{giffix} $file >$tfile 2>>$efile\n" if ($haserr>0); $retcode = $t->run_and_catch(sub { ! qx($App{giffix} $file >$tfile 2>>$efile); }); if ($retcode) { chomp $retcode; debuglog("$App{giffix}: Timed out [$retcode], skipping..."); + printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); ++$imgerr if $Option{keep_bad_images}; next; } ! if (open ERR, $efile) { @stderr_data = ; close ERR; foreach (@stderr_data) { *************** *** 853,885 **** if ($image_count gt 1) { debuglog("File contains more than one image..."); - my $cfile = $tfile . ".asm"; if ($image_count lt $Option{gif_max_frames}) { debuglog("Assembling images..."); $retcode = $t->run_and_catch(sub { ! qx($App{convert} $tfile +append >$cfile 2>$efile); }); if ($retcode) { chomp $retcode; debuglog("$App{convert}: Timed out [$retcode], skipping..."); ++$imgerr if $Option{keep_bad_images}; next; } - $tfile = $cfile; } else { debuglog("Image count exceeds limit, skipping some..."); $retcode = $t->run_and_catch(sub{ ! qx($App{gifasm} -d $imgdir/out $tfile 2>$efile); }); if ($retcode) { chomp $retcode; ! debuglog("$App{gifasm}: Timed out [$retcode], skipping..."); ! foreach my $n (0 .. $image_count) { ! my $f = sprintf "%s/out%02d",$imgdir,$n; ! unlink $f if (-e $f); ! } ++$imgerr if $Option{keep_bad_images}; next; }; my $fs = 0; foreach my $n (0 .. 
$image_count) { my $f = Mail::SpamAssassin::Util::untaint_file_path( sprintf("%s/out%02d",$imgdir,$n) --- 885,926 ---- if ($image_count gt 1) { debuglog("File contains more than one image..."); if ($image_count lt $Option{gif_max_frames}) { debuglog("Assembling images..."); + my $cfile = $tfile; $tfile .= ".gif"; + printf RAWERR qq(## $App{convert} $cfile -append >$tfile 2>>$efile\n) if ($haserr>0); $retcode = $t->run_and_catch(sub { ! qx($App{convert} $cfile -append >$tfile 2>>$efile); }); if ($retcode) { chomp $retcode; debuglog("$App{convert}: Timed out [$retcode], skipping..."); + printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); ++$imgerr if $Option{keep_bad_images}; next; } } else { debuglog("Image count exceeds limit, skipping some..."); + my $app = $App{gifasm}; + if (-x $App{gif2anim}) { + $app = $App{gif2anim}; $tfile .= ".gif"; + printf RAWERR qq(## $app $tfile\n) if ($haserr>0); $retcode = $t->run_and_catch(sub{ ! qx($app $tfile); }); + } else { + printf RAWERR qq(## $app -d $imgdir/out $tfile 2>>$efile\n) if ($haserr>0); + $retcode = $t->run_and_catch(sub{ + qx($app -d $imgdir/out $tfile 2>>$efile); + }); + } if ($retcode) { chomp $retcode; ! debuglog("$app: Timed out [$retcode], skipping..."); ! printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); ++$imgerr if $Option{keep_bad_images}; next; }; my $fs = 0; + if ($app eq $App{gifasm}) { foreach my $n (0 .. $image_count) { my $f = Mail::SpamAssassin::Util::untaint_file_path( sprintf("%s/out%02d",$imgdir,$n) *************** *** 887,914 **** my $s = -s $f || 0; $tfile = $f if ($fs < $s); } } } if ($interlaced_gif) { ! my $cfile = $tfile . ".non"; $retcode = $t->run_and_catch(sub{ ! qx($App{gifinter} $tfile >$cfile 2>$efile); }); if ($retcode) { chomp $retcode; debuglog("$App{gifinter}: Timed out [$retcode], skipping..."); ++$imgerr if $Option{keep_bad_images}; next; } - $tfile = $cfile; } $retcode = $t->run_and_catch(sub { ! 
qx($App{giftopnm} $tfile >$pfile 2>$efile); }); if ($retcode) { chomp $retcode; debuglog("$App{giftopnm}: Timed out [$retcode], skipping..."); ++$imgerr if $Option{keep_bad_images}; next; } } --- 928,969 ---- my $s = -s $f || 0; $tfile = $f if ($fs < $s); } + } else { + my $base = $file; $base =~ s/\.\S+//; + opendir TMP, $imgdir; + my @files = grep {m/^${base}_\d{4}.gif$/i} readdir TMP; + closedir TMP; + foreach my $f (@files) { + my $uf = Mail::SpamAssassin::Util::untaint_file_path($f); + my $s = -s $uf || 0; + $tfile = $f if ($fs < $s); + } + } } } if ($interlaced_gif) { ! my $cfile = $tfile; $tfile .= ".non"; ! printf RAWERR qq(## $App{gifinter} $cfile >$tfile 2>>$efile\n) if ($haserr>0); $retcode = $t->run_and_catch(sub{ ! qx($App{gifinter} $cfile >$tfile 2>>$efile); }); if ($retcode) { chomp $retcode; debuglog("$App{gifinter}: Timed out [$retcode], skipping..."); + printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); ++$imgerr if $Option{keep_bad_images}; next; } } + printf RAWERR qq(## $App{giftopnm} $tfile >$pfile 2>>$efile\n) if ($haserr>0); $retcode = $t->run_and_catch(sub { ! qx($App{giftopnm} $tfile >$pfile 2>>$efile); }); if ($retcode) { chomp $retcode; debuglog("$App{giftopnm}: Timed out [$retcode], skipping..."); + printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); ++$imgerr if $Option{keep_bad_images}; next; } } *************** *** 924,931 **** next IMAGE; } } $retcode = $t->run_and_catch(sub { ! qx($App{jpegtopnm} $file >$pfile 2>$efile); }); if ($retcode) { chomp $retcode; --- 979,987 ---- next IMAGE; } } + printf RAWERR qq(## $App{jpegtopnm} $file >$pfile 2>>$efile\n) if ($haserr>0); $retcode = $t->run_and_catch(sub { ! qx($App{jpegtopnm} $file >$pfile 2>>$efile); }); if ($retcode) { chomp $retcode; *************** *** 945,956 **** next IMAGE; } } $retcode = $t->run_and_catch(sub { ! 
qx($App{pngtopnm} $file >$pfile 2>$efile); }); if ($retcode) { chomp $retcode; debuglog("$App{pngtopnm}: Timed out [$retcode], skipping..."); ++$imgerr if $Option{keep_bad_images}; next; } } --- 1001,1014 ---- next IMAGE; } } + printf RAWERR qq(## $App{pngtopnm} $file >$pfile 2>>$efile\n) if ($haserr>0); $retcode = $t->run_and_catch(sub { ! qx($App{pngtopnm} $file >$pfile 2>>$efile); }); if ($retcode) { chomp $retcode; debuglog("$App{pngtopnm}: Timed out [$retcode], skipping..."); + printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); ++$imgerr if $Option{keep_bad_images}; next; } } *************** *** 966,977 **** next IMAGE; } } $retcode = $t->run_and_catch(sub { ! qx($App{bmptopnm} $file >$pfile 2>$efile); }); if ($retcode) { chomp $retcode; debuglog("$App{bmptopnm}: Timed out [$retcode], skipping..."); ++$imgerr if $Option{keep_bad_images}; next; } } --- 1024,1037 ---- next IMAGE; } } + printf RAWERR qq(## $App{bmptopnm} $file >$pfile 2>>$efile\n) if ($haserr>0); $retcode = $t->run_and_catch(sub { ! qx($App{bmptopnm} $file >$pfile 2>>$efile); }); if ($retcode) { chomp $retcode; debuglog("$App{bmptopnm}: Timed out [$retcode], skipping..."); + printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); ++$imgerr if $Option{keep_bad_images}; next; } } *************** *** 990,1001 **** next IMAGE; } } $retcode = $t->run_and_catch(sub { ! qx($App{convert} tiff:$file pnm:$pfile 2>$efile); }); if ($retcode) { chomp $retcode; debuglog("$App{convert}: Timed out [$retcode], skipping..."); ++$imgerr if $Option{keep_bad_images}; next; } } --- 1050,1063 ---- next IMAGE; } } + printf RAWERR qq(## $App{convert} tiff:$file pnm:$pfile 2>>$efile\n) if ($haserr>0); $retcode = $t->run_and_catch(sub { ! qx($App{convert} tiff:$file pnm:$pfile 2>>$efile); }); if ($retcode) { chomp $retcode; debuglog("$App{convert}: Timed out [$retcode], skipping..."); + printf RAWERR "?? 
Timed out > $retcode\n" if ($haserr>0); ++$imgerr if $Option{keep_bad_images}; next; } } *************** *** 1011,1017 **** debuglog("Error calculating the image hash, skipping hash check..."); } else { if (my $score = check_image_hash_db($digest, $Option{db_hash}, $$pic{fname}, $$pic{ctype})) { ! known_img_hash($digest, $score); debuglog("Message is SPAM. Scoring with known old score and ending..."); removedir($imgdir); return 0; --- 1073,1079 ---- debuglog("Error calculating the image hash, skipping hash check..."); } else { if (my $score = check_image_hash_db($digest, $Option{db_hash}, $$pic{fname}, $$pic{ctype})) { ! known_img_hash($score); debuglog("Message is SPAM. Scoring with known old score and ending..."); removedir($imgdir); return 0; *************** *** 1033,1051 **** my $scan = $scanset; $scan =~ s/\$gocr/$App{gocr}/; $scan =~ s/\$pfile/$pfile/; ! $scan =~ s/\$efile/$efile/g; ! unlink $efile if -e $efile; debuglog("Trying: $scanset"); my @ocrdata; $retcode = $t->run_and_catch(sub { @ocrdata = qx($scan 2>>$efile); }); if ($retcode) { chomp $retcode; ! open ERR,"<$efile"; my @stderr = ; close ERR; ! debuglog(join( '', $retcode,@stderr )); debuglog("Skipping scanset \"$scanset\" because of errors, trying next..."); next; } --- 1095,1116 ---- my $scan = $scanset; $scan =~ s/\$gocr/$App{gocr}/; $scan =~ s/\$pfile/$pfile/; ! $scan =~ s/\$efile/$efile/; ! #unlink $efile if (-e $efile); debuglog("Trying: $scanset"); my @ocrdata; + printf RAWERR qq(## $scan 2>>$efile\n) if ($haserr>0); $retcode = $t->run_and_catch(sub { @ocrdata = qx($scan 2>>$efile); }); if ($retcode) { chomp $retcode; ! open ERR,$efile; my @stderr = ; close ERR; ! my $errstr = join( '', $retcode,@stderr ); ! debuglog($errstr); ! 
printf RAWERR qq($errstr\n) if ($haserr>0); debuglog("Skipping scanset \"$scanset\" because of errors, trying next..."); next; } *************** *** 1088,1093 **** --- 1153,1159 ---- push(@hashes, $info); } } + close RAWERR if ($haserr>0); if ( $cnt >= $Option{counts_required} ) { my $score = sprintf "%0.3f",$Score{base} + (( $cnt - $Option{counts_required} ) * $Score{add} ); *************** *** 1125,1131 **** } else { debuglog("Ignoring ".scalar(@hashes)." hashes, ".$cnt." words found!"); } ! removedir($imgdir) unless $imgerr; debuglog("FuzzyOcr ending successfully..."); return 0; } --- 1191,1199 ---- } else { debuglog("Ignoring ".scalar(@hashes)." hashes, ".$cnt." words found!"); } ! if ($imgerr == 0 and $Option{keep_bad_images}<2) { ! removedir($imgdir); ! } debuglog("FuzzyOcr ending successfully..."); return 0; }