Main Page | Compound List | File List | Compound Members | File Members

MailHeader.C

00001 
00002 /*
00003 
00004   This email filter was written by Ian Kaplan, Bear Products
00005   International.  It is copyrighted by Ian Kaplan, 2004,
00006   www.bearcave.com.
00007 
00008   You have permission to use this software without restriction on two
00009   conditions:
00010 
00011     1. You must preserve this copyright notice in this software and
00012        any software derived from it.
00013 
00014     2. You accept any risk entailed in using this software.  By
00015        using this software, you acknowledge that you have a
00016        sophisticated background in software engineering and 
00017        understand the way this software functions.  You further
00018        acknowledge that using this software may result in the
00019        irretrievable loss of important e-email and you alone
00020        are responsible for this loss.
00021 
00022   If either of these conditions are unacceptable, you may not use any
00023   part of this software.
00024 
00025  */
00026 
#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>

#include <vector>

#include "Logger.h"
#include "SpamUtil.h"
#include "MailHeader.h"
00037 
00038 using namespace std;
00039 
00040 
00041 MailHeader::MailHeader(SpamParameters &p,
00042                        HeaderInfo &info) : mParams( p ), mHeadInfo( info )
00043 {
00044   log = pLogger->getLogger("MailHeader");
00045   pPushBack = 0;
00046   pushBackBuf[0] = '\0';
00047   boundaryStr[0] = '\0';
00048   foundValidAddress = false;
00049 }
00050 
00051 
00068 MailFilter::classification MailHeader::checkSubject(const char *buf, FILE *fp)
00069 {
00070   log.log(Logger::DEBUG, "checkSubject", "enter");
00071 
00072   char msg[256];
00073   char subject[256];
00074   char foundStr[128];
00075 
00076   // convert to lower case
00077   SpamUtil().toLower(subject, buf, sizeof(subject));
00078 
00079   foundStr[0] = '\0';
00080   MailFilter::classification klass = SpamUtil().checkLine(subject, 
00081                                                           mParams, 
00082                                                           foundStr, 
00083                                                           sizeof(foundStr));
00084   
00085   if (klass == MailFilter::SUSPECT || klass == MailFilter::GARBAGE) {
00086     if (klass == MailFilter::SUSPECT) {
00087       sprintf(msg, "Found \"spam\" word \"%s\", email marked as SUSPECT", 
00088               foundStr );
00089     }
00090     else if (klass == MailFilter::GARBAGE) {
00091       mHeadInfo.reason( foundStr );
00092       sprintf(msg, "Found \"kill\" word \"%s\", email marked as GARBAGE", 
00093               foundStr );
00094     }
00095     log.log(Logger::DEBUG, "checkSubject", msg );
00096   }
00097 
00098   pPushBack = 0;
00099   char *pBuf;
00100   if ((pBuf = fgets(pushBackBuf, sizeof(pushBackBuf), fp)) != 0) {
00101     if (SpamUtil().isBlankLine(pBuf) || SpamUtil().findColon(pBuf) != 0) {
00102       pPushBack = pBuf;
00103     }
00104   }
00105   
00106   log.log(Logger::DEBUG, "checkSubject", "exit");
00107   return klass;
00108 } // checkSubject
00109 
00110 
00123 MailFilter::classification MailHeader::checkFrom(const char *buf)
00124 {
00125   MailFilter::classification klass = MailFilter::UNKNOWN;
00126   log.log(Logger::DEBUG, "checkFrom", "enter");
00127 
00128   vector<const char *> fromAddrs = mParams.getSection(SpamParameters::from_address);
00129   vector<const char *> killAddrs = mParams.getSection(SpamParameters::from_kill);
00130 
00131   char msg[128];
00132   char from[256];
00133 
00134   SpamUtil().toLower(from, buf, sizeof(from));
00135 
00136   size_t len;
00137   len = killAddrs.size();
00138   for (size_t i = 0; i < len; i++) {
00139     if (strstr(from, killAddrs[i]) != 0) {
00140       sprintf(msg, "Found address \"%s\", email marked as GARBAGE", 
00141               killAddrs[i] );
00142       mHeadInfo.reason( msg );
00143       log.log(Logger::DEBUG, "checkFrom", msg );
00144       klass = MailFilter::GARBAGE;
00145       break;
00146     }
00147   }
00148 
00149   if (klass == MailFilter::UNKNOWN) {
00150     len = fromAddrs.size();
00151     for (size_t i = 0; i < len; i++) {
00152       if (strstr(from, fromAddrs[i]) != 0) {
00153         sprintf(msg, "Found \"from address\" \"%s\", email marked as EMAIL", 
00154                 fromAddrs[i] );
00155         log.log(Logger::DEBUG, "checkFrom", msg );
00156         klass = MailFilter::EMAIL;
00157         break;
00158       }
00159     } // for
00160   }
00161 
00162   log.log(Logger::DEBUG, "checkFrom", "exit");
00163   return klass;
00164 } // checkFrom
00165 
00166 
00167 
00188 bool MailHeader::addrContinues(const char *buf)
00189 {
00190   bool rslt = false;
00191 
00192   log.log(Logger::DEBUG, "addrContinues", "enter");
00193   if (buf) {
00194     int end = strlen( buf );
00195 
00196     if (end > 0) {
00197       end--;
00198       const char *endPtr;
00199       for (endPtr = &buf[end]; endPtr >= buf && isspace(*endPtr); endPtr--)
00200         /* nada */;
00201       if (*endPtr == ',') {
00202         rslt = true;
00203       }
00204       else if (*endPtr == '"') {
00205         if (endPtr > buf && *(endPtr-1) == '\'') {
00206           rslt = true;
00207         }
00208       }
00209     }
00210   }
00211 
00212   char msgbuf[128];
00213   sprintf( msgbuf, "returns %s", (rslt) ? "TRUE" : "FALSE" );
00214   log.log(Logger::DEBUG, "addrContinues", msgbuf );
00215 
00216   log.log(Logger::DEBUG, "addrContinues", "exit");
00217   return rslt;
00218 } // addrContinues
00219 
00220 
00221 
00235 MailFilter::classification MailHeader::checkDomainAddrs(const char *domainName,
00236                                                         const char *pBuf)
00237 {
00238   log.log(Logger::DEBUG, "checkDomainAddrs", "enter");
00239 
00240   assert( ((domainName != 0) && (pBuf != 0)) );
00241 
00242   vector<const char *> validUsers = mParams.getSection(SpamParameters::valid_users);
00243   const size_t numUsers = validUsers.size();
00244   MailFilter::classification klass = MailFilter::UNKNOWN;
00245   
00246   bool done = false;
00247   size_t domainNameLen = strlen( domainName );
00248   const char *domainPtr = strstr(pBuf, domainName );
00249   while (domainPtr) {
00250     if (domainPtr > pBuf+2) {
00251       domainPtr--;
00252       if (*domainPtr == '@') {
00253         // find the start and end of the user name
00254         const char *endPtr = domainPtr;
00255         domainPtr--;
00256         const char *beginPtr = domainPtr;
00257         while (beginPtr >= pBuf && isalnum( *beginPtr ))
00258           beginPtr--;
00259         if (!isalnum(*beginPtr)) {
00260           beginPtr++;
00261         }
00262 
00263         // Now check to see if the user name is in the valid_users list
00264         // Note that this function is used for both To: and Cc:, so 
00265         // foundValidAddress could have been set in a previous call.
00266         bool foundInList = false;
00267         for (size_t i = 0; i < numUsers; i++) {
00268           const char *word = validUsers[i];
00269           if (SpamUtil().match(beginPtr, endPtr, word)) {
00270             foundValidAddress = true;
00271             foundInList = true;
00272           }
00273         } // for
00274 
00275         if (!foundInList) {
00276           char msg[128];
00277           char user[128];
00278           size_t ix = 0;
00279           for (const char *pCh = beginPtr; pCh < endPtr; pCh++, ix++) {
00280             user[ix] = *pCh;
00281           }
00282           user[ix] = '\0';
00283           sprintf(msg, "Non-valid user \"%s\", email marked as GARBAGE", 
00284                   user );
00285           mHeadInfo.reason( msg );
00286           log.log(Logger::DEBUG, "checkDomainAddrs", msg );
00287           klass = MailFilter::GARBAGE;
00288         }
00289         // endPtr points to the '@'
00290         pBuf = (endPtr + 1);
00291       }
00292     }
00293     if (klass == MailFilter::UNKNOWN) {
00294       pBuf = pBuf + domainNameLen;
00295       domainPtr = strstr(pBuf, domainName);
00296     }
00297     else {
00298       break;  // exit the while loop
00299     }
00300   } // while
00301 
00302   log.log(Logger::DEBUG, "checkDomainAddrs", "exit");
00303 
00304   return klass;
00305 } // checkDomainAddrs
00306 
00307 
00308 
00334 MailFilter::classification MailHeader::checkAddressSection(const char *buf, FILE *fp)
00335 {
00336   log.log(Logger::DEBUG, "checkAddressSection", "enter");
00337 
00338   MailFilter::classification klass = MailFilter::UNKNOWN;
00339   vector<const char *> toAddrs = mParams.getSection(SpamParameters::to_list);
00340   vector<const char *> myDomain = mParams.getSection(SpamParameters::my_domain);
00341 
00342   const char *domainName = 0;
00343   if (myDomain.size() > 0)
00344     domainName = myDomain[0];
00345   
00346   const char *pBuf = buf;
00347   const size_t toAddrLen = toAddrs.size();
00348 
00349   char localBuf[1024];
00350   size_t i;
00351   bool done;
00352   do {
00353     done = true;
00354 
00355     SpamUtil().toLower(localBuf, pBuf, sizeof(localBuf));
00356     for (i = 0; i < toAddrLen; i++) {
00357       if (strstr(localBuf, toAddrs[i]) != 0) {
00358         const char *hit = toAddrs[i];
00359         char msg[128];
00360         sprintf(msg, "found \"%s\", marked as EMAIL", hit );
00361         log.log(Logger::DEBUG, "checkAddressSection", msg );
00362         klass = MailFilter::EMAIL;
00363         break;
00364       }
00365     } // for
00366 
00367     if (klass == MailFilter::UNKNOWN && domainName != 0) {
00368       klass = checkDomainAddrs( domainName, localBuf );
00369     }
00370 
00371     if (klass == MailFilter::UNKNOWN) {
00372       if (addrContinues(localBuf)) {
00373         if ((pBuf = fgets(localBuf, sizeof(localBuf), fp)) != 0) {
00374           done = false;
00375         }
00376       }
00377     }
00378 
00379   } while (!done);
00380 
00381   log.log(Logger::DEBUG, "checkAddressSection", "exit");
00382 
00383   return klass;
00384 } // checkAddressSection
00385 
00386 
00416 MailFilter::classification MailHeader::checkReceived(const char *buf,
00417                                                      FILE *fp,
00418                                                      size_t line)
00419 {
00420   log.log(Logger::DEBUG, "checkReceived", "enter");
00421   MailFilter::classification klass = MailFilter::UNKNOWN;
00422 
00423   const char *pColon;
00424   do {
00425     pPushBack = fgets(pushBackBuf, sizeof(pushBackBuf), fp);
00426     if (pPushBack != 0) {
00427       if ((klass == MailFilter::UNKNOWN) && (strstr(pPushBack, "forged") != 0)) {
00428         klass = MailFilter::SUSPECT;
00429       }
00430       pColon = SpamUtil().findColon( pPushBack );
00431     }
00432   } while (pPushBack != 0 && !pColon);
00433 
00434   log.log(Logger::DEBUG, "checkReceived", "exit");
00435   return klass;
00436 } // checkReceived
00437 
00438 
00456 void MailHeader::saveBoundary(const char *pBound)
00457 {
00458   log.log(Logger::DEBUG, "saveBoundary", "enter");
00459 
00460   if (*pBound == '=') {
00461     pBound++;
00462     // skip any white space between the "=" and the quote
00463     pBound = SpamUtil().skipWhiteSpace( pBound );
00464     if (*pBound == '"') {
00465       pBound++;
00466     }
00467     const size_t len = sizeof(boundaryStr) - 1;
00468     size_t ix = 0;
00469     while (*pBound && 
00470            ix < len && 
00471            (! isspace(*pBound)) &&
00472            *pBound != '"') {
00473       boundaryStr[ix] = *pBound;
00474       pBound++;
00475       ix++;
00476     }
00477     boundaryStr[ix] = '\0';
00478 
00479     if (ix > 0) {
00480       char msg[128];
00481       sprintf(msg, "boundary str. = \"%s\"", boundaryStr );
00482       log.log(Logger::DEBUG, "saveBoundary", msg);
00483     }
00484   }
00485   else {
00486     log.log(Logger::ERROR, "saveBoundary", "'=' expected");
00487   }
00488 
00489   log.log(Logger::DEBUG, "saveBoundary", "exit");
00490 } // saveBoundary
00491 
00492 
00561 MailFilter::classification MailHeader::parseContentType(FILE* fp,
00562                                                         char *contentBuf )
00563 {
00564 
00565 
00566   const char *BOUNDARY = "boundary";
00567   static size_t BOUNDARY_LEN = strlen( BOUNDARY );
00568   const char *CONTENT_ENCODE = "Content-Transfer-Encoding";
00569   static size_t CONTENT_ENCODE_LEN = strlen(CONTENT_ENCODE);
00570 
00571   log.log(Logger::DEBUG, "parseContentType", "enter");
00572 
00573   MailFilter::classification klass = MailFilter::UNKNOWN;
00574 
00575   SpamUtil::contentType type = SpamUtil().classifySection( contentBuf );
00576 
00577   {
00578     char *pBound = strstr(contentBuf, BOUNDARY);
00579     if (pBound && type != SpamUtil::TEXT) {
00580       saveBoundary( pBound + BOUNDARY_LEN );
00581     }
00582   }
00583   
00584   pPushBack = 0;
00585   char *pBuf;
00586   if ((pBuf = fgets(pushBackBuf, sizeof(pushBackBuf), fp)) != 0) {
00587     // If the line that follows the Content-Type does not have a colon
00588     // (e.g., findColon() does not return a pointer) and it is not a 
00589     // blank line, then it may be a boundary definition or a charset
00590     // definition.  If it is a boundary we want to pick it up.  Otherwise
00591     // we want to skip it.
00592     if (SpamUtil().isBlankLine(pBuf) || SpamUtil().findColon(pBuf) != 0) {
00593       pPushBack = pBuf;
00594     }
00595     else {
00596       char *pBound = strstr(pBuf, BOUNDARY);
00597       if (pBound && type != SpamUtil::TEXT) {
00598         saveBoundary( pBound + BOUNDARY_LEN );
00599       }
00600       // get the next line
00601       pBuf = fgets(pushBackBuf, sizeof(pushBackBuf), fp);
00602       pPushBack = pBuf;
00603     }
00604 
00605     // Check for the Content-Transfer-Encoding line and see if it is base64
00606     char *pEncode;
00607     if ((pEncode = strstr(pBuf, CONTENT_ENCODE)) != 0) {
00608       pPushBack = 0;
00609       if (strstr(pEncode + CONTENT_ENCODE_LEN, "base64")) {
00610         type = SpamUtil::BASE64;
00611       }
00612     }
00613 
00614   }
00615 
00616   if (type == SpamUtil::BASE64) {
00617     if (mParams.hasFlag("kill_base64")) {
00618       mHeadInfo.reason("found base64 encoded information");
00619       klass = MailFilter::GARBAGE;
00620     }
00621     else {
00622       klass = MailFilter::SUSPECT;
00623     }
00624   }
00625   else if (type == SpamUtil::HTML) {
00626       klass = MailFilter::SUSPECT;
00627   }
00628   else if (type == SpamUtil::IMAGE || type == SpamUtil::AUDIO) {
00629       klass = MailFilter::SUSPECT;
00630   }
00631 
00632   const char *typeName;
00633   typeName = SpamUtil().typeToStr( type );
00634 
00635   char msg[128];
00636   sprintf(msg, "mail type = %s", typeName );
00637   log.log(Logger::DEBUG, "parseContentType", msg );
00638 
00639   log.log(Logger::DEBUG, "parseContentType", "exit");
00640   return klass;
00641 } // parseContentType
00642 
00643 
00644 
00659 void MailHeader::fillInSections(FILE *fp)
00660 {
00661   static const char *TO = "to:";
00662   static const size_t TO_LEN = strlen( TO );
00663   static const char *FROM = "from:";
00664   static const size_t FROM_LEN = strlen( FROM );
00665   static const char *SUBJECT = "subject:";
00666   static const size_t SUBJECT_LEN = strlen( SUBJECT );
00667   static const char *DATE = "date:";
00668   static const size_t DATE_LEN = strlen( DATE );
00669   char buf[1024];
00670   char *pBuf = 0;
00671   size_t bufSize = 0;
00672 
00673   log.log(Logger::DEBUG, "fillInSections", "enter");
00674 
00675   do {
00676     if (pPushBack == 0) {
00677       pushBackBuf[0] = '0';
00678       bufSize = sizeof(buf);
00679       pBuf = fgets(buf, bufSize, fp);
00680     }
00681     else {
00682       pBuf = pPushBack;
00683       bufSize = sizeof( pushBackBuf );
00684       pPushBack = 0;
00685     }
00686     if (pBuf != 0) {
00687       if (! SpamUtil().isBlankLine( pBuf )) {
00688         char *pCopy = 0;
00689         if (SpamUtil().match(pBuf, TO_LEN, TO)) {
00690           pCopy = pBuf + TO_LEN;
00691           mHeadInfo.to( pCopy );
00692         }
00693         else if (SpamUtil().match(pBuf, FROM_LEN, FROM)) {
00694           pCopy = pBuf + FROM_LEN;
00695           mHeadInfo.from( pCopy );
00696         }
00697         else if (SpamUtil().match(pBuf, SUBJECT_LEN, SUBJECT)) {
00698           pCopy = pBuf + SUBJECT_LEN;
00699           mHeadInfo.subject( pCopy );
00700         }
00701         else if (SpamUtil().match(pBuf, DATE_LEN, DATE)) {
00702           pCopy = pBuf + DATE_LEN;
00703           mHeadInfo.date( pCopy );
00704         }
00705       }
00706       else {
00707         // found a blank line which follows the mail header
00708         break;
00709       }
00710     }
00711   } while (pBuf != 0);
00712 
00713   if (pBuf == 0) {
00714     log.log(Logger::DEBUG, "fillInSections", "end-of-file reached");
00715   }
00716   log.log(Logger::DEBUG, "fillInSections", "exit");
00717 } // fillInSections
00718 
00719 
00804 MailFilter::classification MailHeader::checkHeader(FILE *fp )
00805 {
00806   log.log(Logger::DEBUG, "checkHeader", "enter");
00807   MailFilter::classification klass = MailFilter::BAD_VALUE;
00808   if (!feof(fp)) {
00809     klass = MailFilter::UNKNOWN;
00810     char *pBuf;
00811 
00812     // Skip any blank lines which start the e-mail message
00813     while ((pPushBack = fgets(pushBackBuf, sizeof(pushBackBuf), fp)) != 0) {
00814       if (! SpamUtil().isBlankLine( pPushBack )) {
00815         break;
00816       }
00817     } // while
00818 
00819     if (pPushBack != 0) {  // Loop through the e-mail header
00820       size_t line = 1;
00821       const char *RECEIVED = "received";
00822       static const size_t RECEIVED_LEN = strlen(RECEIVED);
00823       const char *SUBJECT = "subject";
00824       static const size_t SUBJECT_LEN = strlen(SUBJECT);
00825       const char *FROM = "from";
00826       static const size_t FROM_LEN = strlen(FROM);
00827       const char *CONTENT_TYPE = "content-type";
00828       const char *TO = "to";
00829       static const size_t TO_LEN = strlen(TO);
00830       const char *CC = "cc";
00831       const char *DATE = "date";
00832       const char *pBound = 0;
00833       const char *pColon = 0;
00834       char buf[1024];
00835       do { // DO
00836         if (pPushBack == 0) {
00837           pushBackBuf[0] = '\0';
00838           pBuf = fgets(buf, sizeof(buf), fp);
00839         }
00840         else {
00841           pBuf = pPushBack;
00842           pPushBack = 0;
00843         }
00844         if (!pBuf) {
00845           break;
00846         }
00847 
00848         if (! SpamUtil().isBlankLine( pBuf )) {
00849           pColon = SpamUtil().findColon( pBuf );
00850           if (SpamUtil().match(pBuf, FROM_LEN, FROM)) {
00851             pColon = pBuf + FROM_LEN;
00852             if (*pColon == ':') {
00853               pColon++;
00854               mHeadInfo.from( pColon );
00855             }
00856             else {
00857               mHeadInfo.fromNoColon( pColon );
00858             }
00859             klass = checkFrom( pColon );
00860           }
00861           else if (pColon != 0) {
00862             if (SpamUtil().match(pBuf, RECEIVED_LEN, RECEIVED)) {
00863               pColon = pBuf + RECEIVED_LEN + 1;
00864               klass = checkReceived( pColon, fp, line );
00865             } else if (SpamUtil().match(pBuf, SUBJECT_LEN, SUBJECT)) {
00866               pColon = pBuf + SUBJECT_LEN + 1;
00867               mHeadInfo.subject(pColon);
00868               klass = checkSubject( pColon, fp );
00869             }
00870             else if (SpamUtil().match(pBuf, pColon, CONTENT_TYPE)) {
00871               pColon++;
00872               klass = parseContentType(fp, pBuf);
00873             }
00874             else if (SpamUtil().match(pBuf, TO_LEN, TO)) {
00875               pColon = pBuf + TO_LEN + 1;
00876               mHeadInfo.to( pColon );
00877               klass = checkAddressSection(pColon, fp);
00878             }
00879             else if (SpamUtil().match(pBuf, pColon, CC)) {
00880               pColon++;
00881               klass = checkAddressSection(pColon, fp);
00882             }
00883             else if (SpamUtil().match(pBuf, pColon, DATE)) {
00884               pColon++;
00885               mHeadInfo.date(pColon);
00886             }
00887           } // has a colon (pColon != 0
00888         } // is blank line
00889         else {
00890           // found a blank line
00891           break;
00892         }
00893       } while (klass == MailFilter::UNKNOWN && pBuf != 0);
00894 
00895       // if we have not finished on a black line, fill in any sections that
00896       // have not been encountered yet.
00897       if (! SpamUtil().isBlankLine( pBuf )) {
00898         fillInSections(fp);
00899       }
00900 
00901       // If the email was not addressed to a known mailing list and
00902       // and address in the SpamFilterParams section my_address is
00903       // not found, then it is classified as SPAM.
00904       if (klass == MailFilter::UNKNOWN && (! foundValidAddress)) {
00905         log.log(Logger::DEBUG, "checkHeader", "Did not find a valid To: or Cc: address");
00906         klass = MailFilter::SUSPECT;
00907       }
00908     } // if pBuf
00909   }
00910 
00911   mHeadInfo.klass( klass );
00912 
00913   char msg[128];
00914   sprintf(msg, "return value = %s", SpamUtil().classificationToStr( klass ));
00915   log.log(Logger::DEBUG, "checkHeader", msg );
00916   log.log(Logger::DEBUG, "checkHeader", "exit");
00917   return klass;
00918 } // checkHeader

Generated on Sat Mar 27 13:07:38 2004 for Mail Filter by doxygen 1.3.3