Only in FuzzyOcr-2.3f: CHANGES diff -rbc FuzzyOcr-2.3d/FuzzyOcr.cf FuzzyOcr-2.3f/FuzzyOcr.cf *** FuzzyOcr-2.3d/FuzzyOcr.cf Fri Sep 8 10:43:45 2006 --- FuzzyOcr-2.3f/FuzzyOcr.cf Tue Sep 12 11:43:48 2006 *************** *** 55,70 **** ##### Scansets, comma seperated (Default value: $gocr -i -, $gocr -l 180 -d 2 -i -) ##### # Each scanset consists of one or more commands which make text out of pnm input. # Each scanset is run seperately on the PNM data, results are combined in scoring. ! #focr_scansets $gocr -i -, $gocr -l 180 -d 2 -i - # # To use only one scan with default values, uncomment the next line instead ! #focr_scansets $gocr -i - # # Some example for more advanced sets ! # Thisone uses the first the standard scan, then a scanset which first reduces the image to 3 colors and then scans it with custom settings ! # and then it scans again only with these custom settings # NOTE: This is for advanced users only, if you have questions how to use this, ask on the ML or on IRC ! #focr_scansets $gocr -i -, pnmnorm 2>$errfile | pnmquant 3 2>>$errfile | pnmnorm 2>>$errfile | $gocr -l 180 -d 2 -i -, $gocr -l 180 -d 2 -i - ######################################################################################### ##### Various Score/Scan settings ##### --- 55,71 ---- ##### Scansets, comma seperated (Default value: $gocr -i -, $gocr -l 180 -d 2 -i -) ##### # Each scanset consists of one or more commands which make text out of pnm input. # Each scanset is run seperately on the PNM data, results are combined in scoring. ! #focr_scansets $gocr -i $pfile, $gocr -l 180 -d 2 -i $pfile # # To use only one scan with default values, uncomment the next line instead ! #focr_scansets $gocr -i $pfile # # Some example for more advanced sets ! # This one uses the first the standard scan, then a scanset which first reduces the image ! # to 3 colors and then scans it with custom settings and then it scans again only with ! # these custom settings # NOTE: This is for advanced users only, if you have questions how to use this, ask on the ML or on IRC ! #focr_scansets $gocr -i $pfile, pnmnorm $pfile 2>$efile| pnmquant 3 2>>$efile | pnmnorm 2>>$efile | $gocr -l 180 -d 2 -i -, $gocr -l 180 -d 2 -i $pfile ######################################################################################### ##### Various Score/Scan settings ##### *************** *** 127,130 **** --- 128,134 ---- # Automatically add hashes of spam images recognized by OCR to the Image Hash database, to disable, set to 0.0 (Default value: 1) #focr_hashing_learn_scanned 1 # + # If enabled, this will save image files in tempdir when errors occur to further debug the plugin. (Default: 0.0). + #focr_keep_bad_images 0.0 + # ###################################################################### diff -rbc FuzzyOcr-2.3d/FuzzyOcr.pm FuzzyOcr-2.3f/FuzzyOcr.pm *** FuzzyOcr-2.3d/FuzzyOcr.pm Mon Sep 11 11:08:03 2006 --- FuzzyOcr-2.3f/FuzzyOcr.pm Tue Sep 12 12:10:31 2006 *************** *** 90,96 **** our @pgm_opts = qw/personal_wordlist global_wordlist logfile threshold counts_required verbose timeout gif_max_frames ! db_hash db_max_days path_bin scansets keep_bad_images enable_image_hashing digest_db hashing_learn_scanned/; our @paths = qw(/usr/local/netpbm/bin /usr/local/bin /usr/bin); --- 90,96 ---- our @pgm_opts = qw/personal_wordlist global_wordlist logfile threshold counts_required verbose timeout gif_max_frames ! db_hash db_safe db_max_days path_bin scansets keep_bad_images enable_image_hashing digest_db hashing_learn_scanned/; our @paths = qw(/usr/local/netpbm/bin /usr/local/bin /usr/bin); *************** *** 358,477 **** else { return $_[0] } } - sub reorder { - my $tmp = join( '', @_ ); - return split( '\n', $tmp ); - } - - sub pipe_io { - $SIG{PIPE} = 'IGNORE'; - my $pipecmd = shift; - my $input = shift; - my $filecount = 0; - my $silent = 0; - my $ignerror = 0; - my $tmpdir; - my @stdout = (); - my @stderr = (); - my ( $tmpfile, $tfilepath ) = Mail::SpamAssassin::Util::secure_tmpfile(); - my ( $errfile, $efilepath ) = Mail::SpamAssassin::Util::secure_tmpfile(); - close($tmpfile); - close($errfile); - if ($tmpfile eq $errfile) { - debuglog("Got same tmpfile twice! Aborting pipe_io() to avoid deadlocking"); - return ( 1, \@stdout, \@stderr ); - unlink($tmpfile); - } - - if($pipecmd =~ /\$tmpdir/) { - $tmpdir = Mail::SpamAssassin::Util::secure_tmpdir(); - $pipecmd =~ s/\$tmpdir/$tmpdir/g; - $filecount = shift; - } else { - $silent = shift; - $ignerror = shift; - } - - $pipecmd =~ s/\$errfile/$errfile/g; - my $pipe_pid = open( PIPE_IN, "| $pipecmd 1>$tmpfile 2>>$errfile" ); - - unless ($pipe_pid) { - unless($silent) { - handle_error( $err_msges[0], ( $pipecmd, $? >> 8, $!, $tmpfile ) ); - } - unlink($tmpfile); - unlink($errfile); - return ( $?, \@stdout, \@stderr ); - } - flock( PIPE_IN, LOCK_EX ); - print PIPE_IN $input; - flock( PIPE_IN, LOCK_UN ); - close(PIPE_IN); - if ($? and not $ignerror) { - unless($silent) { - handle_error( $err_msges[1], ( $pipecmd, $? >> 8, $!, $tmpfile ) ); - } - unlink($tmpfile); - unlink($errfile); - return ( $?, \@stdout, \@stderr ); - } - if ($filecount) { - my $tsize = 0; - my $tcount = 0; - foreach my $nr (0..$filecount-1) { - my $filesize = 0; - if ($nr < 10) { - $filesize = -s "$tmpdir/out0$nr.gif"; - } else { - $filesize = -s "$tmpdir/out$nr.gif"; - } - if ($filesize > $tsize) { - $tsize = $filesize; - $tcount = $nr; - } - } - if ($tcount < 10) { - open( PIPE_OUT, "< $tmpdir/out0$tcount.gif" ); - } else { - open( PIPE_OUT, "< $tmpdir/out$tcount.gif" ); - } - flock( PIPE_OUT, LOCK_EX ); - @stdout = ; - flock( PIPE_OUT, LOCK_UN ); - close PIPE_OUT; - foreach my $nr (0..$filecount) { - if ($nr < 10) { - unlink("$tmpdir/out0$nr.gif"); - } else { - unlink("$tmpdir/out$nr.gif"); - } - } - rmdir($tmpdir); - } else { - unless (open( PIPE_OUT, "< $tmpfile" ) - and open( PIPE_ERR, "< $errfile" ) ) - { - unless($silent) { - handle_error( $err_msges[1], ( $pipecmd, $? >> 8, $!, $tmpfile ) ); - } - unlink($tmpfile); - unlink($errfile); - return ( $?, \@stdout, \@stderr ); - } - flock( PIPE_OUT, LOCK_EX ); - flock( PIPE_ERR, LOCK_EX ); - @stdout = ; - @stderr = ; - flock( PIPE_OUT, LOCK_UN ); - flock( PIPE_ERR, LOCK_UN ); - close(PIPE_OUT); - close(PIPE_ERR); - } - unlink($tmpfile) if (-e $tmpfile); - unlink($errfile) if (-e $errfile); - return ( 0, \@stdout, \@stderr ); - } - sub handle_error { my ( $err_msg, @var_vals ) = @_; debuglog(sprintf( $err_msg, @var_vals )); --- 358,363 ---- *************** *** 615,628 **** } sub add_image_hash_db { ! my $digest = shift; ! my $score = shift; my $ret = 0; if ($Option{enable_image_hashing} == 2) { ! my $dbfile = shift || $Option{db_hash}; ! my $fname = shift; ! my $ctype = shift; my %DB = (); tie %DB, 'MLDBM', $dbfile or $ret++; if ($ret>0) { --- 501,514 ---- } sub add_image_hash_db { ! my $digest = $_[0]; ! my $score = $_[1]; my $ret = 0; if ($Option{enable_image_hashing} == 2) { ! my $dbfile = $_[2] || $Option{db_hash}; ! my $fname = $_[3]; ! my $ctype = $_[4]; my %DB = (); tie %DB, 'MLDBM', $dbfile or $ret++; if ($ret>0) { *************** *** 663,669 **** sub calc_image_hash { my $pfile = $_[0]; ! my ($rcode, $hash, $h, $w); my $s = -s $pfile; foreach my $a (qw/identify ppmhist/) { --- 549,555 ---- sub calc_image_hash { my $pfile = $_[0]; ! my ($rcode, $hash); my $s = -s $pfile; foreach my $a (qw/identify ppmhist/) { *************** *** 681,688 **** if ($rcode) { chomp $rcode; debuglog("$App{identify}: Timed out [$rcode], skipping..."); ! return (1, $hash); } foreach (@stdout_data) { if ($_ =~ /(\d+)x(\d+)/) { $h = $1; --- 567,576 ---- if ($rcode) { chomp $rcode; debuglog("$App{identify}: Timed out [$rcode], skipping..."); ! return (1, ''); } + + my ($h,$w) = (0,0); foreach (@stdout_data) { if ($_ =~ /(\d+)x(\d+)/) { $h = $1; *************** *** 692,698 **** } if ($h == 0 or $w == 0) { debuglog("Unable to determine size of image, skipping..."); ! return(1,$hash); } $rcode = $t->run_and_catch(sub { @stdout_data = qx($App{ppmhist} -noheader $pfile 2>/dev/null); --- 580,586 ---- } if ($h == 0 or $w == 0) { debuglog("Unable to determine size of image, skipping..."); ! return(1,''); } $rcode = $t->run_and_catch(sub { @stdout_data = qx($App{ppmhist} -noheader $pfile 2>/dev/null); *************** *** 700,706 **** if ($rcode) { chomp $rcode; debuglog("$App{ppmhist}: Timed out [$rcode], skipping..."); ! return (1, $hash); } my $cnt = 0; my $c = scalar(@stdout_data); --- 588,594 ---- if ($rcode) { chomp $rcode; debuglog("$App{ppmhist}: Timed out [$rcode], skipping..."); ! return (1, ''); } my $cnt = 0; my $c = scalar(@stdout_data); *************** *** 807,819 **** debuglog("Scan canceled, message has already more than $Score{autodisable} points."); return 0; } - my $imgdir = Mail::SpamAssassin::Util::secure_tmpdir(); - unless ($imgdir) { - debuglog("Scan canceled, cannot create Image TMPDIR."); - return 0; - } - debuglog("Using: $imgdir"); my %imgfiles = (); my @found = (); my @hashes = (); --- 695,702 ---- debuglog("Scan canceled, message has already more than $Score{autodisable} points."); return 0; } + my $imgdir; my %imgfiles = (); my @found = (); my @hashes = (); *************** *** 846,851 **** --- 729,739 ---- next; } + $imgdir = Mail::SpamAssassin::Util::secure_tmpdir() unless ($imgdir); + unless ($imgdir) { + debuglog("Scan canceled, cannot create Image TMPDIR."); + return 0; + } my $imgfilename = $imgdir . "/" . $fname; my $unique = 0; while (-e $imgfilename) { *************** *** 856,877 **** debuglog("Cannot write \"$imgfilename\", skipping..."); next; } - binmode PICT; my $pdata = $p->decode(); print PICT $pdata; close PICT; $imgfiles{$imgfilename}{header} = substr($pdata,0,6); $imgfiles{$imgfilename}{ctype} = $ctype; $imgfiles{$imgfilename}{fname} = $fname; } ! unless (keys %imgfiles) { ! rmdir $imgdir; debuglog("Skipping OCR, no image files found..."); return 0; } ! ! my $t = Mail::SpamAssassin::Timeout->new({ secs => $Option{timeout} }); my $retcode; --- 744,764 ---- debuglog("Cannot write \"$imgfilename\", skipping..."); next; } my $pdata = $p->decode(); + binmode PICT; print PICT $pdata; close PICT; + debuglog("Wrote: $imgfilename"); $cnt++; $imgfiles{$imgfilename}{header} = substr($pdata,0,6); $imgfiles{$imgfilename}{ctype} = $ctype; $imgfiles{$imgfilename}{fname} = $fname; } ! if ($cnt == 0) { debuglog("Skipping OCR, no image files found..."); return 0; } ! debuglog("Found: $cnt images"); $cnt = 0; my $t = Mail::SpamAssassin::Timeout->new({ secs => $Option{timeout} }); my $retcode; *************** *** 937,943 **** debuglog("$App{giffix}: Timed out [$retcode], skipping..."); ++$imgerr if $Option{keep_bad_images}; next; } ! open ERR, "< $efile"; @stderr_data = ; close ERR; foreach (@stderr_data) { --- 824,830 ---- debuglog("$App{giffix}: Timed out [$retcode], skipping..."); ++$imgerr if $Option{keep_bad_images}; next; } ! if (open ERR,$efile) { @stderr_data = ; close ERR; foreach (@stderr_data) { *************** *** 947,952 **** --- 834,840 ---- } } } + } if ($corrupt) { if ($interlaced_gif or ($image_count gt 1)) { *************** *** 993,1000 **** }; my $fs = 0; foreach my $n (0 .. $image_count) { ! my $f = sprintf "%s/out%02d",$imgdir,$n; ! $tfile = $f if ($fs < -s $f); } } } --- 881,891 ---- }; my $fs = 0; foreach my $n (0 .. $image_count) { ! my $f = Mail::SpamAssassin::Util::untaint_file_path( ! sprintf("%s/out%02d",$imgdir,$n) ! ); ! my $s = -s $f || 0; ! $tfile = $f if ($fs < $s); } } } *************** *** 1115,1122 **** if($Option{enable_image_hashing}) { debuglog("Calculating the image hash..."); ! my ($rcode, $digest) = calc_image_hash($pfile); ! if ($rcode) { debuglog("Error calculating the image hash, skipping hash check..."); } else { if (my $score = check_image_hash_db($digest, $Option{db_hash}, $$pic{fname}, $$pic{ctype})) { --- 1006,1013 ---- if($Option{enable_image_hashing}) { debuglog("Calculating the image hash..."); ! ($corrupt, $digest) = calc_image_hash($pfile); ! if ($corrupt) { debuglog("Error calculating the image hash, skipping hash check..."); } else { if (my $score = check_image_hash_db($digest, $Option{db_hash}, $$pic{fname}, $$pic{ctype})) { *************** *** 1142,1151 **** my $scan = $scanset; $scan =~ s/\$gocr/$App{gocr}/; $scan =~ s/\$pfile/$pfile/; debuglog("Trying: $scanset"); my @ocrdata; $retcode = $t->run_and_catch(sub { ! @ocrdata = qx($scan 2>$efile); }); if ($retcode) { chomp $retcode; --- 1033,1044 ---- my $scan = $scanset; $scan =~ s/\$gocr/$App{gocr}/; $scan =~ s/\$pfile/$pfile/; + $scan =~ s/\$efile/$efile/g; + unlink $efile if -e $efile; debuglog("Trying: $scanset"); my @ocrdata; $retcode = $t->run_and_catch(sub { ! @ocrdata = qx($scan 2>>$efile); }); if ($retcode) { chomp $retcode;