# FuzzyOcr plugin, version 2.3j # Changelog: # version 2.0 # Replaced imagemagick with netpbm # Invoke giffix to fix broken gifs before conversion # Support png images # Analyze the file to detect the format without content-type # Added several configuration parameters # version 2.1 # Added scoring for wrong content-type # Added scoring for broken gif images # Added configuration for helper applications # Added autodisable_score feature to disable the OCR engine if the message has already enough points # version 2.1b # Rule bugfix to avoid warnings # version 2.1c # Applied patch provided by Howard Kash to fix problems with spamassassin + Mailscanner + FuzzyOcr # Removed '-' from jpegtopnm arguments to provide backwards compatibility for older netpbm versions # Fixed typo (treshold -> threshold) # version 2.2 # Small bugfix in content-type check for jpeg (jpg was not matching), thanks to Matthias Keller # Added more error handling # Removed debug files, added logfile instead # More messages with verbose = 2 # version 2.3 # Multiple scans with different pnm preprocessing and gocr arguments possible # Support for interlaced gifs # Support for animated gifs # Temporary file handling reorganized # External wordlist support # Personalized wordlist support # Spaces are now stripped from wordlist words and OCR results before matching # Experimental MD5 Database feature # version 2.3b # MD5 Database replaced by different feature database # Corrupted images are now handled better # Added a timeout function to avoid lockups # Added threshold overriding on word basis in wordlist # Various bugfixes # # written by Christian Holler decoder_at_own-hero_dot_net # patch by Jorge Valdes jorge_at_joval_info package FuzzyOcr; use strict; use warnings; use Mail::SpamAssassin; use Mail::SpamAssassin::Logger; use Mail::SpamAssassin::Util; use Mail::SpamAssassin::Timeout; use Mail::SpamAssassin::Plugin; use String::Approx 'adistr'; use Image::Magick; use MLDBM qw(DB_File Storable); use FileHandle; use Fcntl ':flock'; our @ISA = qw (Mail::SpamAssassin::Plugin); our %App = (); our %Option = (); our %Score = (); our %Threshold = (); our %words = (); our $self; our $pms; our @scansets; our @bin_utils = qw/giffix giftext gifinter giftopnm jpegtopnm pngtopnm bmptopnm ppmhist gocr/; our @pgm_scores = qw/base add corrupt corrupt_unfixable wrongctype autodisable/; our @pgm_opts = qw/personal_wordlist global_wordlist logfile threshold counts_required verbose timeout gif_max_frames db_hash db_safe db_max_days path_bin scansets keep_bad_images anim_delay anim_max_frames score_ham enable_image_hashing digest_db hashing_learn_scanned/; our @paths = qw(/usr/local/netpbm/bin /usr/local/bin /usr/bin); # Default values $Option{threshold} = 0.3; $Option{counts_required} = 2; $Option{verbose} = 1; $Option{timeout} = 10; $Option{gif_max_frames} = 5; $Option{logfile} = "stderr"; $Option{enable_image_hashing} = 0; $Option{hashing_learn_scanned} = 1; $Option{digest_db} = "/etc/mail/spamassassin/FuzzyOcr.hashdb"; $Option{global_wordlist} = "/etc/mail/spamassassin/FuzzyOcr.words"; $Option{personal_wordlist} = ".spamassassin/fuzzyocr.words"; $Option{db_hash} = "/etc/mail/spamassassin/FuzzyOcr.db"; $Option{db_safe} = "/etc/mail/spamassassin/FuzzyOcr.safe.db"; $Option{db_max_days} = 35; $Option{keep_bad_images} = 0; $Option{anim_delay} = 100; $Option{anim_max_frames} = 2; $Option{score_ham} = 0; # Default scores $Score{base} = 4; $Score{add} = 1; $Score{corrupt} = 2.5; $Score{corrupt_unfixable} = 5; $Score{wrongctype} = 1.5; $Score{autodisable} = 10; # Default thresolds $Threshold{s} = $Threshold{h} = $Threshold{w} = $Threshold{cn} = 0.01; $Threshold{c} = 5; $Threshold{max_hash} = 5; # constructor: register the eval rule sub new { my ( $class, $mailsa ) = @_; $class = ref($class) || $class; my $self = $class->SUPER::new($mailsa); bless( $self, $class ); $self->register_eval_rule("fuzzyocr_check"); $self->register_eval_rule("dummy_check"); return $self; } sub parse_config { my ( $self, $opts ) = @_; if ( $opts->{key} =~ /^focr_bin_/i ) { my $p = lc $opts->{key}; $p =~ s/focr_bin_//; if (grep {m/$p/} @bin_utils) { $App{$p} = $opts->{value}; debuglog("App{$p} => $App{$p}"); } else { debuglog("unknown App: $opts->{key}"); } } elsif ( $opts->{key} =~ m/_score$/i ) { my $o = lc $opts->{key}; $o =~ s/focr_//; $o =~ s/_score//; if (grep {m/$o/} @pgm_scores) { $Score{$o} = $opts->{value}; debuglog("Score{$o} = $Score{$o}"); } else { debuglog("unknown Score: $opts->{key}"); } } else { my $o = lc $opts->{key}; $o =~ s/focr_//; if (grep {m/$o/} @pgm_opts) { if ($o eq 'scansets') { @scansets = (); # remove foreach my $s (split(',',$opts->{value})) { $s =~ s/^\s*//; $s =~ s/\s*$//; push @scansets,$s; debuglog("Found scan: $s"); } } elsif ($o eq 'path_bin') { @paths = (); # remove foreach my $p (split(':',$opts->{value})) { next unless -d $p; push @paths,$p; debuglog("Valid search path: $p"); } } else { $Option{$o} = $opts->{value}; debuglog("Option $o = $Option{$o}"); } } else { debuglog("unknown Option: $opts->{key}"); } } 1; } sub finish_parsing_end { if ($Option{logfile} !~ m/stderr/i) { if(!Mail::SpamAssassin::Logger::add(method=>'file', filename=>$Option{logfile})) { debuglog("Could not log into $Option{logfile}"); } else { Mail::SpamAssassin::Logger::add_facilities('file'); debuglog("Now logging to \"$Option{logfile}\""); } } unless (@paths) { foreach my $p ( '/usr/local/netpbm/bin', '/usr/local/bin', '/usr/bin' ) { if (-d $p) { push @paths, $p; debuglog("Searching in: $p"); } } } foreach my $a (@bin_utils) { if (defined $App{$a} and ! -x $App{$a}) { debuglog("cannot exec $a, removing..."); delete $App{$a}; } foreach my $p (@paths) { my $f = "$p/$a"; if (! defined $App{$a} and -x $f) { $App{$a} = $f; last; } } if (defined $App{$a}) { debuglog("Using $a => $App{$a}"); } else { debuglog("Cannot find executable for $a"); } } if ($Option{enable_image_hashing} == 2 and -r $Option{digest_db}) { my %DB; my $dbm; my $err = 0; my $now = time - ($Option{db_max_days}*86400); tie %DB, 'MLDBM', $Option{db_hash} or $err++; if ($err) { debuglog("Could not open \"$Option{db_hash}\""); } else { my $hash = 0; debuglog("Expiring records prior to: ".scalar(localtime($now))); foreach my $k (keys %DB) { my $db = $DB{$k}; if ($db->{check} < $now) { debuglog("Expire: <$k> Reason: $db->{check} < $now"); delete $DB{$k}; $hash++; } } debuglog("Expired <$hash> Image Hashes after $Option{db_max_days} day(s)") if ($hash>0); $hash = 0; open HASH, $Option{digest_db}; while () { chomp; my($score,$basic,$key) = split('::',$_,3); next if (defined $DB{$key}); $dbm = $DB{$key}; $dbm->{score} = $score; $dbm->{basic} = $basic; $dbm->{input} = $dbm->{check} = time; $dbm->{match} = 1; $DB{$key} = $dbm; $hash++; } close HASH; debuglog("Imported <$hash> Image Hashes from \"$Option{digest_db}\"") if ($hash>0); $hash = scalar(keys %DB); debuglog("<$hash> Known BAD Image Hashes Available"); } untie %DB; $err = 0; tie %DB, 'MLDBM', $Option{db_safe} or $err++; if ($err) { debuglog("Could not open \"$Option{db_safe}\""); } else { my $hash = 0; foreach my $k (keys %DB) { my $db = $DB{$k}; if ($db->{check} < $now) { debuglog("Expire: <$k> Reason: $db->{check} < $now"); delete $DB{$k}; $hash++; } } debuglog("Expired <$hash> Image Hashes after $Option{db_max_days} day(s)") if ($hash>0); $hash = scalar(keys %DB); debuglog("<$hash> Known GOOD Image Hashes Available"); } untie %DB; } load_global_words( $Option{global_wordlist} ); unless (@scansets) { @scansets = ( '$gocr -i $pfile', '$gocr -l 180 -d 2 -i $pfile'); } foreach (@scansets) { debuglog("Using scan: $_"); } } sub dummy_check { return 0; } sub load_global_words { unless ( -r $_[0] ) { debuglog("Cannot read Global wordlist: \"$_[0]\"\n Please check file path and permissions are correct."); return; } my $cnt = 0; open WORDLIST, "<$_[0]"; while(my $w = ) { chomp($w); $w =~ s/\s*//; $w =~ s/#(.*)//; next unless $w; my $wt = $Option{threshold}; if ($w =~ /^(.*?)::(0(\.\d+){0,1})/) { ($w, $wt) = (lc($1), $2); $wt = $Option{threshold} unless ($wt =~ m/[\d\.]+/); } $words{$w} = $wt; $cnt++; } close WORDLIST; debuglog("Loaded <$cnt> words from \"$_[0]\""); } sub load_personal_words { unless ( -e $_[0] ) { #debuglog("Personal wordlist <$_[0]> not found, skipping..."); return; } unless ( -r $_[0] ) { debuglog("Cannot read from wordlist \"$_[0]\"\n Please make sure that permissions are correct." ); return; } my $cnt = 0; open WORDLIST, "<$_[0]"; while(my $w = ) { chomp($w); $w =~ s/\s*//; $w =~ s/#(.*)//; next unless $w; my $wt = $Option{threshold}; if ($w =~ /^(.*?)::(0(\.\d+){0,1})/) { ($w, $wt) = ($1, $2); $wt = $Option{threshold} unless ($wt =~ m/[\d\.]+/); } $words{$w} = $wt; $cnt++; } close WORDLIST; debuglog("Updated Word List with <$cnt> words from \"$_[0]\""); } sub max { unless ( defined( $_[0] ) and defined( $_[1] ) ) { return 0 } unless ( defined( $_[0] ) ) { return $_[1] } unless ( defined( $_[1] ) ) { return $_[0] } if ( $_[0] < $_[1] ) { return $_[1] } else { return $_[0] } } sub within_threshold { my $digest = shift; my $record = shift; my ($dimg,$dkey) = split('::',$digest); my ($rimg,$rkey) = split('::',$record); my ($ds, $dh, $dw, $dcn) = split(':',$dimg); my ($rs, $rh, $rw, $rcn) = split(':',$rimg); return(0) unless $rs; return(0) unless $rh; return(0) unless $rw; return(0) unless $rcn; return(0) unless $rkey; return(0) if ((abs($ds - $rs ) / $rs ) > $Threshold{s}); return(0) if ((abs($dh - $rh ) / $rh ) > $Threshold{h}); return(0) if ((abs($dw - $rw ) / $rw ) > $Threshold{w}); return(0) if ((abs($dcn - $rcn) / $rcn) > $Threshold{cn}); my @rcf = split('::',$rkey); my @dcf = split('::',$dkey); my (@dcfs, @rcfs); foreach (@dcf) { push @dcfs,split(':',$_); } foreach (@rcf) { push @rcfs,split(':',$_); } my $total = scalar(@rcfs); if ($total == scalar(@dcfs)) { my $match = 0; foreach (0 .. ($total-1)) { $match++ if (abs($dcfs[$_] - $rcfs[$_]) <= $Threshold{c}); } debuglog("image matched <$match> of <$total> colors"); return(1) if ($match == $total); } return(0); } sub fmt_time { my $when = time - $_[0]; my $ret; if ($when>86400) { my $d = int($when/86400); $when -= $d*86400; $ret = "$d days,"; } if ($when>3600) { my $h = int($when/3600); $when -= $h*3600; $ret .= " $h hours"; } if ($when>60) { my $m = int($when/60); $when -= $m*60; $ret .= " $m minutes"; } if ($when>0) { $ret .= " $when seconds"; } $ret .= " ago"; return $ret; } sub check_image_hash_db { my $digest = $_[0]; my $dbfile = $_[1] || $Option{db_hash}; my $fname = $_[2]; my $ctype = $_[3]; my ($img, $key) = split('::', $digest,2); my $hash = $digest; my $ret = 0; my $txt = 'Exact'; my $dinfo; my %DB = (); my $dbm; my $new = $key; if ($Option{enable_image_hashing} == 2) { tie %DB, 'MLDBM', $dbfile, O_RDWR or $ret++; if ($ret>0) { debuglog("No Image Hash database found at \"$dbfile\", or permissions wrong."); return (0,''); } if (defined $DB{$key}) { $dbm = $DB{$key}; if ($img eq $dbm->{basic}) { $ret = $dbm->{score} || 0.001; $dinfo = $dbm->{dinfo} || ''; $dbm->{fname} = $fname; $dbm->{ctype} = $ctype; debuglog("Updating $txt info File:'$fname' Type:'$ctype'"); $DB{$key} = $dbm; } } if ($ret == 0) { my $now = time - ($Option{db_max_days}*86400); foreach my $k (keys %DB) { $dbm = $DB{$k}; $hash = $dbm->{basic} ? $dbm->{basic} : '0:0:0:0' . '::' . $k; if (within_threshold($digest,$hash)) { $ret = $dbfile eq $Option{db_hash} ? $dbm->{score} : $dbm->{match}; $txt = 'Approx'; $new = $k; $dinfo = $dbm->{dinfo} || ''; debuglog("Found in: <$dbfile>"); last; } # Has the record expired?? $dbm->{check} = $now - 1 unless defined $dbm->{check}; if ($dbm->{check} < $now) { debuglog("Expiring <$k> older than $Option{db_max_days} days"); delete $DB{$k}; } } } if ($ret>0) { $dbm->{match}++; if ($dbfile eq $Option{db_hash}) { $ret = sprintf("%0.3f",$dbm->{score}); debuglog("Found Score <$ret> for $txt Image Hash"); } debuglog("Matched [$dbm->{match}] time(s). Prev match: ".fmt_time($dbm->{check})); $dbm->{check} = time; $DB{$new} = $dbm; } untie %DB; return ($ret,$dinfo); } elsif ($Option{enable_image_hashing} == 1) { $ret = open HASH, $Option{digest_db}; unless($ret) { debuglog("No Image Hash database found at \"$Option{digest_db}\", or permissions wrong."); return (0,''); } while () { chomp; ($ret,$hash) = split('::',$_,2); if (within_threshold($digest,$hash)) { debuglog("Found Score <$ret> for Hash <$hash>"); return ($ret,''); } } close HASH; return (0,''); } } sub add_image_hash_db { my $digest = $_[0]; my $score = $_[1]; my $ret = 0; if ($Option{enable_image_hashing} == 2) { my $dbfile = $_[2] || $Option{db_hash}; my %DB = (); tie %DB, 'MLDBM', $dbfile or $ret++; if ($ret>0) { debuglog("Unable to open/create Image Hash database at \"$dbfile\", check permissions."); return; } debuglog("Adding Hash to \"$dbfile\""); my ($img,$key) = split('::',$digest,2); my $dbm = $DB{$key}; $dbm->{fname} = $_[3]; $dbm->{ctype} = $_[4]; $dbm->{dinfo} = $_[5]; $dbm->{basic} = $img; $dbm->{score} = $score; $dbm->{input} = $dbm->{check} = time; $dbm->{match} = $dbfile eq $Option{db_hash} ? 0 : 1; $DB{$key} = $dbm; untie %DB; } elsif ($Option{enable_image_hashing} == 1) { if (-e $Option{digest_db}) { $ret = open DB, ">>$Option{digest_db}"; } else { $ret = open DB, ">$Option{digest_db}"; } unless ($ret) { debuglog("Unable to open/create Image Hash database at \"$Option{digest_db}\", check permissions."); return; } debuglog("Adding Hash to \"$Option{digest_db}\""); flock( DB, LOCK_EX ); seek( DB, 0, 2 ); print DB "${score}::${digest}\n"; flock( DB, LOCK_UN ); close(DB); } debuglog("Digest: $digest"); } sub calc_image_hash { my $pfile = $_[0]; my ($rcode, $hash); foreach my $a (qw/ppmhist/) { unless (defined $App{$a}) { info("FuzzyOcr: calc_image_hash cannot exec $a"); return (1, ''); } } my $img = new Image::Magick; my ($w,$h,$s,$t) = $img->ping($pfile); $t = Mail::SpamAssassin::Timeout->new({ secs => $Option{timeout} }); my @stdout_data; $rcode = $t->run_and_catch(sub { @stdout_data = qx($App{ppmhist} -noheader $pfile 2>/dev/null); }); if ($rcode) { chomp $rcode; debuglog("$App{ppmhist}: Timed out [$rcode], skipping..."); return (1, ''); } my $cnt = 0; my $c = scalar(@stdout_data); $hash = sprintf "%d:%d:%d:%d",$s,$h,$w,$c; if ($Threshold{max_hash}) { foreach (@stdout_data) { $_ =~ s/ +/ /g; my(@d) = split(' ', $_); $hash .= sprintf("::%d:%d:%d:%d:%d",@d); if ($cnt++ ge $Threshold{max_hash}) { last; } } } debuglog("Got: <$hash>"); return(0, $hash); } sub debuglog { my @lines = split('\n',$_[0]); my $limit = $_[1] || 1; if ( $Option{verbose} > $limit ) { foreach (@lines) { info("FuzzyOcr: $_"); } } else { foreach (@lines) { dbg ("FuzzyOcr: $_"); } } } sub wrong_ctype { my ( $format, $ctype ) = @_; if ($Score{wrongctype}) { my $debuginfo = ""; if ( $Option{verbose} > 0 ) { $debuginfo = ("Image has format \"$format\" but content-type is \"$ctype\""); debuglog($debuginfo); } for my $set ( 0 .. 3 ) { $pms->{conf}->{scoreset}->[$set]->{"FUZZY_OCR_WRONG_CTYPE"} = sprintf( "%0.3f", $Score{wrongctype} ); } $pms->_handle_hit( "FUZZY_OCR_WRONG_CTYPE", $Score{wrongctype}, "BODY: ", $pms->{conf}->{descriptions}->{FUZZY_OCR_WRONG_CTYPE} . "\n$debuginfo" ); } } sub corrupt_img { my ($score, $err) = @_; if ($score>0) { my $debuginfo = ""; if ( $Option{verbose} > 0 ) { chomp($err); $debuginfo = ("Corrupt image: $err"); debuglog($debuginfo); } for my $set ( 0 .. 3 ) { $pms->{conf}->{scoreset}->[$set]->{"FUZZY_OCR_CORRUPT_IMG"} = sprintf( "%0.3f", $score ); } $pms->_handle_hit( "FUZZY_OCR_CORRUPT_IMG", $score, "BODY: ", $pms->{conf}->{descriptions}->{FUZZY_OCR_CORRUPT_IMG} . "\n$debuginfo" ); } } sub known_img_hash { my $score = $_[0] || $Score{base}; my $dinfo = $_[1] ? "\n$_[1]" : ''; for my $set ( 0 .. 3 ) { $pms->{conf}->{scoreset}->[$set]->{"FUZZY_OCR_KNOWN_HASH"} = sprintf( "%0.3f", $score ); } $pms->_handle_hit( "FUZZY_OCR_KNOWN_HASH", $score, "BODY: ", $pms->{conf}->{descriptions}->{FUZZY_OCR_KNOWN_HASH} . $dinfo ); } sub removedir { my $dir = $_[0]; return unless -d $dir; opendir D, $dir; my @files = readdir D; closedir D; foreach my $f (@files) { next if $f eq '.'; next if $f eq '..'; my $ff = Mail::SpamAssassin::Util::untaint_file_path("$dir/$f"); unless (unlink $ff) { debuglog("Cannot remove: $ff"); } } debuglog("Remove DIR: $dir"); unless(rmdir $dir) { debuglog("Cannot remove DIR: $dir"); } } sub fuzzyocr_check { ( $self, $pms ) = @_; if ( $pms->get_score() > $Score{autodisable} ) { debuglog("Scan canceled, message has already more than $Score{autodisable} points."); return 0; } my $imgdir; my %imgfiles = (); my @found = (); my @hashes = (); my $cnt = 0; my $imgerr = 0; #debuglog("Starting FuzzyOcr..."); #debuglog("Attempting to load personal wordlist..."); if ($Option{personal_wordlist} =~ m/^\//) { load_personal_words( $Option{personal_wordlist} ); } else { my $homedir = (getpwuid($<))[7]; if ($homedir) { load_personal_words( $homedir . "/$Option{personal_wordlist}" ); } elsif (defined($ENV{HOME})) { load_personal_words( $ENV{HOME} . "/$Option{personal_wordlist}" ); } else { debuglog("Variable \$ENV{HOME} not defined and getpwuid failed, personal wordlist function not available..."); } } foreach my $p ( $pms->{msg}->find_parts(qr(^image\b)i), $pms->{msg}->find_parts(qr(Application/Octet-Stream)i) ) { my $ctype = $p->{'type'}; my $fname = $p->{'name'} || 'unknown'; if (($fname eq 'unknown') and (defined $p->{'headers'}->{'content-id'}) ){ $fname = join('',@{$p->{'headers'}->{'content-id'}}); $fname =~ s/[<>]//g; $fname =~ tr/\@\$\%\&/_/s; } my $test = 0; $test++ if ($ctype =~ /image/i); $test++ if ($fname =~ /(gif|jpg|jpeg|png|bmp|tiff)$/i); if ($test == 0) { debuglog("Skipping file with content-type=\"$ctype\" name=\"$fname\""); next; } $imgdir = Mail::SpamAssassin::Util::secure_tmpdir() unless ($imgdir); unless ($imgdir) { debuglog("Scan canceled, cannot create Image TMPDIR."); return 0; } #keep raw email for debugging later my $imgfilename = $imgdir . "/raw.eml"; unless (-e $imgfilename) { if (open RAW, ">$imgfilename") { print RAW $pms->{msg}->get_pristine(); close RAW; debuglog("Saved: $imgfilename"); } } $fname =~ tr{a-zA-Z0-9\.}{_}cs; $imgfilename = Mail::SpamAssassin::Util::untaint_file_path( $imgdir . "/" . $fname ); my $unique = 0; while (-e $imgfilename) { $imgfilename = Mail::SpamAssassin::Util::untaint_file_path( $imgdir . "/" . chr(65+$unique) . "." . $fname ); $unique++; } unless (open PICT, ">$imgfilename") { debuglog("Cannot write \"$imgfilename\", skipping..."); next; } my $pdata = $p->decode(); binmode PICT; print PICT $pdata; close PICT; debuglog("Wrote: $imgfilename"); $cnt++; $imgfiles{$imgfilename}{header} = substr($pdata,0,6); $imgfiles{$imgfilename}{ctype} = $ctype; $imgfiles{$imgfilename}{fname} = $fname; } if ($cnt == 0) { #debuglog("Skipping OCR, no image files found..."); removedir($imgdir) if (defined($imgdir) and ($Option{keep_bad_images}<2)); return 0; } debuglog("Found: $cnt images"); $cnt = 0; my $t = Mail::SpamAssassin::Timeout->new({ secs => $Option{timeout} }); my $retcode; my $haserr = open RAWERR, ">$imgdir/raw.err"; debuglog("Errors to: $imgdir/raw.err") if ($haserr>0); IMAGE: foreach my $file (keys %imgfiles) { my $pic = $imgfiles{$file}; debuglog("Analyzing file with content-type=\"$$pic{ctype}\""); my @used_scansets = (); my $corrupt = 0; my $digest; my $ptype = 0; my $tfile = $file; my $pfile = $file . ".pnm"; my $efile = $file . ".err"; debuglog("pfile => $pfile"); debuglog("efile => $efile"); if ( substr($$pic{header},0,3) eq "\x47\x49\x46" ) { debuglog("Found GIF header name=\"$$pic{fname}\""); $ptype = 1; if ( $$pic{ctype} !~ /gif/i ) { wrong_ctype( "GIF", $$pic{ctype} ); } my $interlaced_gif = 0; my $image_count = 0; foreach my $a (qw/giftext giffix gifinter giftopnm/) { unless (defined $App{$a}) { debuglog("Cannot exec $a, skipping image"); next IMAGE; } } my @stdout_data; my @stderr_data; $retcode = $t->run_and_catch(sub { @stdout_data = qx($App{giftext} $file); }); if ($retcode) { chomp $retcode; debuglog("$App{giftext} Timed out [$retcode], skipping..."); ++$imgerr if $Option{keep_bad_images}>0; next; } foreach (@stdout_data) { unless ($interlaced_gif) { if ( $_ =~ /Image is Interlaced/i ) { $interlaced_gif = 1; } } if ( $_ =~ /^Image #/ ) { $image_count++; } } if ($interlaced_gif or ($image_count gt 1)) { debuglog("Image is interlaced or animated..."); } else { debuglog("Image is single non-interlaced..."); $tfile .= "-fixed.gif"; printf RAWERR "## $App{giffix} $file >$tfile 2>>$efile\n" if ($haserr>0); $retcode = $t->run_and_catch(sub { qx($App{giffix} $file >$tfile 2>>$efile); }); if ($retcode) { chomp $retcode; debuglog("$App{giffix}: Timed out [$retcode], skipping..."); printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); ++$imgerr if $Option{keep_bad_images}>0; next; } if (open ERR, $efile) { @stderr_data = ; close ERR; foreach (@stderr_data) { if ( $_ =~ /GIF-LIB error/i ) { $corrupt = $_; last; } } } } if ($corrupt) { if ($interlaced_gif or ($image_count gt 1)) { debuglog("Skipping corrupted interlaced image..."); corrupt_img($Score{corrupt_unfixable}, $corrupt); next; } if (-z $tfile) { debuglog("Uncorrectable corruption detected, skipping non-interlaced image..."); corrupt_img($Score{corrupt_unfixable}, $corrupt); next; } debuglog("Image is corrupt, but seems fixable, continuing..."); corrupt_img($Score{corrupt}, $corrupt); } if ($image_count gt 1) { debuglog("File contains <$image_count> images..."); my $img = new Image::Magick; my $read_err = $img->Read($tfile); if ($read_err) { debuglog ("$read_err, skipping..."); next; } if ($image_count gt $Option{gif_max_frames}) { debuglog("Image count exceeds limit, skipping some..."); my $num = $#{$img}; my %size = (); my %delay = (); my %imgs = (); foreach my $n (0 .. $num) { ($delay{$n},$size{$n}) = $img->[$n]->Get('delay','filesize'); $imgs{$n} = 0; } foreach my $k (keys %delay) { $imgs{$k}++ if ($delay{$k} ge $Option{anim_delay}); } my $cnt = 1; foreach my $k (sort {$size{$b} <=> $size{$a}} keys %size) { $imgs{$k}++; last if (++$cnt>$Option{anim_max_frames}); } foreach my $n (0 .. $num) { next if $imgs{$n} > 0; undef $img->[$n]; # Remove unwanted frames; $image_count--; } } if ($image_count>0) { debuglog("Assembling <$image_count> images..."); my $img2 = $img->Append(); my $tfile2 = $tfile; if ($tfile2 =~ m/\.gif$/i) { $tfile2 =~ s/\.gif$/-multi.gif/i; } else { $tfile2 .= ".gif"; } open IMAGE, ">$tfile2"; my $write_err = $img2->Write(file=>\*IMAGE, filename=>$tfile2); close IMAGE; if ($write_err) { debuglog($write_err); } else { $tfile = $tfile2; } } else { debuglog("No frames left, skipping..."); corrupt_img($Score{corrupt}, "No valid frames left"); next; } } if ($interlaced_gif) { debuglog("Processing interlaced_gif $tfile..."); my $cfile = $tfile; if ($tfile =~ m/\.gif$/i) { $tfile =~ s/\.gif$/-fixed.gif/i; } else { $tfile .= ".gif"; } printf RAWERR qq(## $App{gifinter} $cfile >$tfile 2>>$efile\n) if ($haserr>0); $retcode = $t->run_and_catch(sub{ qx($App{gifinter} $cfile >$tfile 2>>$efile); }); if ($retcode) { chomp $retcode; debuglog("$App{gifinter}: Timed out [$retcode], skipping..."); printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); ++$imgerr if $Option{keep_bad_images}>0; next; } } printf RAWERR qq(## $App{giftopnm} $tfile >$pfile 2>>$efile\n) if ($haserr>0); $retcode = $t->run_and_catch(sub { qx($App{giftopnm} $tfile >$pfile 2>>$efile); }); if ($retcode) { chomp $retcode; debuglog("$App{giftopnm}: Timed out [$retcode], skipping..."); printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); ++$imgerr if $Option{keep_bad_images}>0; next; } } elsif ( substr($$pic{header},0,2) eq "\xff\xd8" ) { debuglog("Found JPEG header name=\"$$pic{fname}\""); $ptype = 2; if ( $$pic{ctype} !~ /(jpeg|jpg)/i ) { wrong_ctype( "JPEG", $$pic{ctype} ); } foreach my $a (qw/jpegtopnm/) { unless (defined $App{$a}) { debuglog("Cannot exec $a, skipping image"); next IMAGE; } } printf RAWERR qq(## $App{jpegtopnm} $file >$pfile 2>>$efile\n) if ($haserr>0); $retcode = $t->run_and_catch(sub { qx($App{jpegtopnm} $file >$pfile 2>>$efile); }); if ($retcode) { chomp $retcode; debuglog("$App{jpegtopnm}: Timed out [$retcode], skipping..."); ++$imgerr if $Option{keep_bad_images}>0; next; } } elsif ( substr($$pic{header},0,4) eq "\x89\x50\x4e\x47" ) { debuglog("Found PNG header name=\"$$pic{fname}\""); $ptype = 3; if ( $$pic{ctype} !~ /png/i ) { wrong_ctype( "PNG", $$pic{ctype} ); } foreach my $a (qw/pngtopnm/) { unless (defined $App{$a}) { debuglog("Cannot exec $a, skipping image"); next IMAGE; } } printf RAWERR qq(## $App{pngtopnm} $file >$pfile 2>>$efile\n) if ($haserr>0); $retcode = $t->run_and_catch(sub { qx($App{pngtopnm} $file >$pfile 2>>$efile); }); if ($retcode) { chomp $retcode; debuglog("$App{pngtopnm}: Timed out [$retcode], skipping..."); printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); ++$imgerr if $Option{keep_bad_images}>0; next; } } elsif ( substr($$pic{header},0,2) eq "BM" ) { debuglog("Found BMP header name=\"$$pic{fname}\""); $ptype = 4; if ( $$pic{ctype} !~ /bmp/i ) { wrong_ctype( "BMP", $$pic{ctype} ); } foreach my $a (qw/bmptopnm/) { unless (defined $App{$a}) { debuglog("Cannot exec $a, skipping image"); next IMAGE; } } printf RAWERR qq(## $App{bmptopnm} $file >$pfile 2>>$efile\n) if ($haserr>0); $retcode = $t->run_and_catch(sub { qx($App{bmptopnm} $file >$pfile 2>>$efile); }); if ($retcode) { chomp $retcode; debuglog("$App{bmptopnm}: Timed out [$retcode], skipping..."); printf RAWERR "?? Timed out > $retcode\n" if ($haserr>0); ++$imgerr if $Option{keep_bad_images}>0; next; } } elsif ( (substr($$pic{header},0,4) eq "\x4d\x4d\x00\x2a") or (substr($$pic{header},0,4) eq "\x49\x49\x2a\x00") ) { debuglog("Found TIFF header name=\"$$pic{fname}\""); $ptype = 5; if ( $$pic{ctype} !~ /tiff/i ) { wrong_ctype( "TIFF", $$pic{ctype} ); } my $img = new Image::Magick; $img->Read($file); unless (open PNM, ">$pfile") { debuglog("Cannot create $pfile"); ++$imgerr if $Option{keep_bad_images}>0; next; } my $write_err = $img->Write(file=>\*PNM, filename=>$pfile); close PNM; if ($write_err) { debuglog($write_err); next; } } else { debuglog("Image type not recognized, unknown format. Skipping this image..."); next; } if($Option{enable_image_hashing}) { debuglog("Calculating the image hash: $pfile"); ($corrupt, $digest) = calc_image_hash($pfile); if ($corrupt) { debuglog("Error calculating the image hash, skipping hash check..."); } else { my ($score, $dinfo); ($score,$dinfo) = check_image_hash_db($digest, $Option{db_hash}, $$pic{fname}, $$pic{ctype}); if ($score > 0) { known_img_hash($score,$dinfo); debuglog("Message is SPAM. $dinfo"); removedir($imgdir); return 0; } ($score,$dinfo) = check_image_hash_db($digest, $Option{db_safe}, $$pic{fname}, $$pic{ctype}); if ($score > 0) { debuglog("Image in KNOWN_GOOD. Skipping OCR checks..."); next IMAGE; } } if ($digest eq '') { debuglog("Empty Hash, skipping..."); next IMAGE; } } else { debuglog("Image hashing disabled in configuration, skipping..."); } my @ocr_results = (); foreach my $scanset (@scansets) { my $scan = $scanset; $scan =~ s/\$gocr/$App{gocr}/; $scan =~ s/\$pfile/$pfile/; $scan =~ s/\$efile/$efile/g; #unlink $efile if (-e $efile); debuglog("Trying: $scanset"); my @ocrdata; printf RAWERR qq(## $scan 2>>$efile\n) if ($haserr>0); $retcode = $t->run_and_catch(sub { @ocrdata = qx($scan 2>>$efile); }); if ($retcode) { chomp $retcode; open ERR,$efile; my @stderr = ; close ERR; my $errstr = join( '', $retcode,@stderr ); debuglog($errstr); printf RAWERR qq($errstr\n) if ($haserr>0); debuglog("Skipping scanset \"$scanset\" because of errors, trying next..."); next; } push( @ocr_results, [@ocrdata] ); push( @used_scansets, $scanset ); } my $mcnt = 0; foreach my $ww (keys %words) { my $w = lc $ww; $w =~ s/[^a-z]//g; my $wcnt = 0; my $gcnt = 0; foreach my $ocr_set (@ocr_results) { my $cwcnt = 0; foreach (@$ocr_set) { tr/!;|081/iiioal/; s/[^a-zA-Z]//g; $_ = lc; my $matched = abs(adistr( $w, $_ )); if ( $matched < $words{$ww} ) { $cwcnt++; debuglog( "Found word \"$w\" in line\n \"$_\" \n with fuzz of " . sprintf("%0.4f",$matched) . " scanned with scanset $used_scansets[$gcnt]" ); } } $wcnt = max( $wcnt, $cwcnt ); $gcnt++; } $cnt += $wcnt; $mcnt += $wcnt; if ( ( $Option{verbose} > 0 ) and ($wcnt) ) { push( @found, "\"$w\" in $wcnt lines" ); } } if ($Option{enable_image_hashing}) { my $info = join('::',$mcnt,$$pic{fname},$$pic{ctype},$digest); push(@hashes, $info); } } close RAWERR if ($haserr>0); if ($cnt == 0) { if ($Option{enable_image_hashing} == 2 and @hashes) { debuglog("Message is ham, saving..."); foreach my $h (@hashes) { my ($mcnt,$fname,$ctype,$digest) = split('::',$h,4); next if $mcnt; add_image_hash_db($digest,0,$Option{db_safe},$fname,$ctype); } } } else { my $score = '0.000'; my $debuginfo = ( "Words found:\n" . join( "\n", @found ) . "\n($cnt word occurrences found)" ); if ($cnt >= $Option{counts_required}) { $score = sprintf "%0.3f", $Score{base} + (( $cnt - $Option{counts_required} ) * $Score{add} ); debuglog("Message is spam, score = $score"); } else { $score = sprintf("%0.3f", $Score{add} * $cnt) if $Option{score_ham}; debuglog("Message is ham, score = $score"); } if ($Option{enable_image_hashing} and $Option{hashing_learn_scanned} and $score > 0) { foreach my $h (@hashes) { my ($mcnt,$fname,$ctype,$digest) = split('::',$h,4); next unless $mcnt; add_image_hash_db($digest,$score,$Option{db_hash},$fname,$ctype,$debuginfo); } } if ( $Option{verbose} > 0 ) { debuglog($debuginfo); } for my $set ( 0 .. 3 ) { $pms->{conf}->{scoreset}->[$set]->{"FUZZY_OCR"} = $score; } $pms->_handle_hit( "FUZZY_OCR", $score, "BODY: ", $pms->{conf}->{descriptions}->{FUZZY_OCR} . "\n$debuginfo" ); } if ($imgerr == 0 and $Option{keep_bad_images}<2) { removedir($imgdir); } debuglog("FuzzyOcr ending successfully..."); return 0; } 1;