diff -rbc FuzzyOcr-2.3e/CHANGES FuzzyOcr-2.3f/CHANGES *** FuzzyOcr-2.3e/CHANGES Tue Sep 12 08:38:42 2006 --- FuzzyOcr-2.3f/CHANGES Tue Sep 12 11:56:52 2006 *************** *** 1,3 **** --- 1,17 ---- + version 2.3f: + Fixed: + Properly initialized $h and $w to zero so that when getting the height and width + from an image, if the size parameters cannot be parsed, they can get properly tested. + + Fixed: + Hashing now works. $digest was getting reset because it went out of scope. grrr. + + Fixed: + $efile was only being replaced for first occurrence in complex scansets. + + Fixed: + Various bugs where: Use of uninitialized values were reported. + version 2.3e: Fixed: Option: 'focr_db_safe' diff -rbc FuzzyOcr-2.3e/FuzzyOcr.cf FuzzyOcr-2.3f/FuzzyOcr.cf *** FuzzyOcr-2.3e/FuzzyOcr.cf Tue Sep 12 08:28:36 2006 --- FuzzyOcr-2.3f/FuzzyOcr.cf Tue Sep 12 11:43:48 2006 *************** *** 128,131 **** --- 128,134 ---- # Automatically add hashes of spam images recognized by OCR to the Image Hash database, to disable, set to 0.0 (Default value: 1) #focr_hashing_learn_scanned 1 # + # If enabled, this will save image files in tempdir when errors occur to further debug the plugin. (Default: 0.0). + #focr_keep_bad_images 0.0 + # ###################################################################### diff -rbc FuzzyOcr-2.3e/FuzzyOcr.pm FuzzyOcr-2.3f/FuzzyOcr.pm *** FuzzyOcr-2.3e/FuzzyOcr.pm Tue Sep 12 08:28:48 2006 --- FuzzyOcr-2.3f/FuzzyOcr.pm Tue Sep 12 12:10:31 2006 *************** *** 90,96 **** our @pgm_opts = qw/personal_wordlist global_wordlist logfile threshold counts_required verbose timeout gif_max_frames ! db_safe db_hash db_max_days path_bin scansets keep_bad_images enable_image_hashing digest_db hashing_learn_scanned/; our @paths = qw(/usr/local/netpbm/bin /usr/local/bin /usr/bin); --- 90,96 ---- our @pgm_opts = qw/personal_wordlist global_wordlist logfile threshold counts_required verbose timeout gif_max_frames ! 
db_hash db_safe db_max_days path_bin scansets keep_bad_images enable_image_hashing digest_db hashing_learn_scanned/; our @paths = qw(/usr/local/netpbm/bin /usr/local/bin /usr/bin); *************** *** 501,514 **** } sub add_image_hash_db { ! my $digest = shift; ! my $score = shift; my $ret = 0; if ($Option{enable_image_hashing} == 2) { ! my $dbfile = shift || $Option{db_hash}; ! my $fname = shift; ! my $ctype = shift; my %DB = (); tie %DB, 'MLDBM', $dbfile or $ret++; if ($ret>0) { --- 501,514 ---- } sub add_image_hash_db { ! my $digest = $_[0]; ! my $score = $_[1]; my $ret = 0; if ($Option{enable_image_hashing} == 2) { ! my $dbfile = $_[2] || $Option{db_hash}; ! my $fname = $_[3]; ! my $ctype = $_[4]; my %DB = (); tie %DB, 'MLDBM', $dbfile or $ret++; if ($ret>0) { *************** *** 549,555 **** sub calc_image_hash { my $pfile = $_[0]; ! my ($rcode, $hash, $h, $w); my $s = -s $pfile; foreach my $a (qw/identify ppmhist/) { --- 549,555 ---- sub calc_image_hash { my $pfile = $_[0]; ! my ($rcode, $hash); my $s = -s $pfile; foreach my $a (qw/identify ppmhist/) { *************** *** 567,574 **** if ($rcode) { chomp $rcode; debuglog("$App{identify}: Timed out [$rcode], skipping..."); ! return (1, $hash); } foreach (@stdout_data) { if ($_ =~ /(\d+)x(\d+)/) { $h = $1; --- 567,576 ---- if ($rcode) { chomp $rcode; debuglog("$App{identify}: Timed out [$rcode], skipping..."); ! return (1, ''); } + + my ($h,$w) = (0,0); foreach (@stdout_data) { if ($_ =~ /(\d+)x(\d+)/) { $h = $1; *************** *** 578,584 **** } if ($h == 0 or $w == 0) { debuglog("Unable to determine size of image, skipping..."); ! return(1,$hash); } $rcode = $t->run_and_catch(sub { @stdout_data = qx($App{ppmhist} -noheader $pfile 2>/dev/null); --- 580,586 ---- } if ($h == 0 or $w == 0) { debuglog("Unable to determine size of image, skipping..."); ! 
return(1,''); } $rcode = $t->run_and_catch(sub { @stdout_data = qx($App{ppmhist} -noheader $pfile 2>/dev/null); *************** *** 586,592 **** if ($rcode) { chomp $rcode; debuglog("$App{ppmhist}: Timed out [$rcode], skipping..."); ! return (1, $hash); } my $cnt = 0; my $c = scalar(@stdout_data); --- 588,594 ---- if ($rcode) { chomp $rcode; debuglog("$App{ppmhist}: Timed out [$rcode], skipping..."); ! return (1, ''); } my $cnt = 0; my $c = scalar(@stdout_data); *************** *** 617,623 **** sub wrong_ctype { my ( $format, $ctype ) = @_; ! if ($Score{wrongctype}) { my $debuginfo = ""; if ( $Option{verbose} > 0 ) { $debuginfo = --- 619,625 ---- sub wrong_ctype { my ( $format, $ctype ) = @_; ! if ($Score{wctypescore}) { my $debuginfo = ""; if ( $Option{verbose} > 0 ) { $debuginfo = *************** *** 626,634 **** } for my $set ( 0 .. 3 ) { $pms->{conf}->{scoreset}->[$set]->{"FUZZY_OCR_WRONG_CTYPE"} = ! sprintf( "%0.3f", $Score{wrongctype} ); } ! $pms->_handle_hit( "FUZZY_OCR_WRONG_CTYPE", $Score{wrongctype}, "BODY: ", $pms->{conf}->{descriptions}->{FUZZY_OCR_WRONG_CTYPE} . "\n$debuginfo" ); } } --- 628,636 ---- } for my $set ( 0 .. 3 ) { $pms->{conf}->{scoreset}->[$set]->{"FUZZY_OCR_WRONG_CTYPE"} = ! sprintf( "%0.3f", $Score{wctypescore} ); } ! $pms->_handle_hit( "FUZZY_OCR_WRONG_CTYPE", $Score{wctypescore}, "BODY: ", $pms->{conf}->{descriptions}->{FUZZY_OCR_WRONG_CTYPE} . 
"\n$debuginfo" ); } } *************** *** 693,705 **** debuglog("Scan canceled, message has already more than $Score{autodisable} points."); return 0; } - my $imgdir = Mail::SpamAssassin::Util::secure_tmpdir(); - unless ($imgdir) { - debuglog("Scan canceled, cannot create Image TMPDIR."); - return 0; - } - debuglog("Using: $imgdir"); my %imgfiles = (); my @found = (); my @hashes = (); --- 695,702 ---- debuglog("Scan canceled, message has already more than $Score{autodisable} points."); return 0; } + my $imgdir; my %imgfiles = (); my @found = (); my @hashes = (); *************** *** 732,737 **** --- 729,739 ---- next; } + $imgdir = Mail::SpamAssassin::Util::secure_tmpdir() unless ($imgdir); + unless ($imgdir) { + debuglog("Scan canceled, cannot create Image TMPDIR."); + return 0; + } my $imgfilename = $imgdir . "/" . $fname; my $unique = 0; while (-e $imgfilename) { *************** *** 742,763 **** debuglog("Cannot write \"$imgfilename\", skipping..."); next; } - binmode PICT; my $pdata = $p->decode(); print PICT $pdata; close PICT; $imgfiles{$imgfilename}{header} = substr($pdata,0,6); $imgfiles{$imgfilename}{ctype} = $ctype; $imgfiles{$imgfilename}{fname} = $fname; } ! unless (keys %imgfiles) { ! rmdir $imgdir; debuglog("Skipping OCR, no image files found..."); return 0; } ! ! my $t = Mail::SpamAssassin::Timeout->new({ secs => $Option{timeout} }); my $retcode; --- 744,764 ---- debuglog("Cannot write \"$imgfilename\", skipping..."); next; } my $pdata = $p->decode(); + binmode PICT; print PICT $pdata; close PICT; + debuglog("Wrote: $imgfilename"); $cnt++; $imgfiles{$imgfilename}{header} = substr($pdata,0,6); $imgfiles{$imgfilename}{ctype} = $ctype; $imgfiles{$imgfilename}{fname} = $fname; } ! if ($cnt == 0) { debuglog("Skipping OCR, no image files found..."); return 0; } ! 
debuglog("Found: $cnt images"); $cnt = 0; my $t = Mail::SpamAssassin::Timeout->new({ secs => $Option{timeout} }); my $retcode; *************** *** 771,777 **** my $ptype = 0; my $tfile = $file; my $pfile = $file . ".pnm"; ! my $efile = $file . ".err"; if ( substr($$pic{header},0,3) eq "\x47\x49\x46" ) { debuglog("Found GIF header name=\"$$pic{fname}\""); --- 772,778 ---- my $ptype = 0; my $tfile = $file; my $pfile = $file . ".pnm"; ! my $efile = $file . ".stderr"; if ( substr($$pic{header},0,3) eq "\x47\x49\x46" ) { debuglog("Found GIF header name=\"$$pic{fname}\""); *************** *** 823,829 **** debuglog("$App{giffix}: Timed out [$retcode], skipping..."); ++$imgerr if $Option{keep_bad_images}; next; } ! open ERR, "< $efile"; @stderr_data = <ERR>; close ERR; foreach (@stderr_data) { --- 824,830 ---- debuglog("$App{giffix}: Timed out [$retcode], skipping..."); ++$imgerr if $Option{keep_bad_images}; next; } ! if (open ERR,$efile) { @stderr_data = <ERR>; close ERR; foreach (@stderr_data) { *************** *** 833,838 **** --- 834,840 ---- } } } + } if ($corrupt) { if ($interlaced_gif or ($image_count gt 1)) { *************** *** 879,886 **** }; my $fs = 0; foreach my $n (0 .. $image_count) { ! my $f = sprintf "%s/out%02d",$imgdir,$n; ! $tfile = $f if ($fs < -s $f); } } } --- 881,891 ---- }; my $fs = 0; foreach my $n (0 .. $image_count) { ! my $f = Mail::SpamAssassin::Util::untaint_file_path( ! sprintf("%s/out%02d",$imgdir,$n) ! ); ! my $s = -s $f || 0; ! $tfile = $f if ($fs < $s); } } } *************** *** 1001,1008 **** if($Option{enable_image_hashing}) { debuglog("Calculating the image hash..."); ! my ($rcode, $digest) = calc_image_hash($pfile); ! if ($rcode) { debuglog("Error calculating the image hash, skipping hash check..."); } else { if (my $score = check_image_hash_db($digest, $Option{db_hash}, $$pic{fname}, $$pic{ctype})) { --- 1006,1013 ---- if($Option{enable_image_hashing}) { debuglog("Calculating the image hash..."); ! 
($corrupt, $digest) = calc_image_hash($pfile); ! if ($corrupt) { debuglog("Error calculating the image hash, skipping hash check..."); } else { if (my $score = check_image_hash_db($digest, $Option{db_hash}, $$pic{fname}, $$pic{ctype})) { *************** *** 1028,1034 **** my $scan = $scanset; $scan =~ s/\$gocr/$App{gocr}/; $scan =~ s/\$pfile/$pfile/; ! $scan =~ s/\$efile/$efile/; unlink $efile if -e $efile; debuglog("Trying: $scanset"); my @ocrdata; --- 1033,1039 ---- my $scan = $scanset; $scan =~ s/\$gocr/$App{gocr}/; $scan =~ s/\$pfile/$pfile/; ! $scan =~ s/\$efile/$efile/g; unlink $efile if -e $efile; debuglog("Trying: $scanset"); my @ocrdata;