00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>

#include <vector>

#include "Logger.h"
#include "SpamUtil.h"
#include "MailHeader.h"
00037
00038 using namespace std;
00039
00040
00041 MailHeader::MailHeader(SpamParameters &p,
00042 HeaderInfo &info) : mParams( p ), mHeadInfo( info )
00043 {
00044 log = pLogger->getLogger("MailHeader");
00045 pPushBack = 0;
00046 pushBackBuf[0] = '\0';
00047 boundaryStr[0] = '\0';
00048 foundValidAddress = false;
00049 }
00050
00051
00068 MailFilter::classification MailHeader::checkSubject(const char *buf, FILE *fp)
00069 {
00070 log.log(Logger::DEBUG, "checkSubject", "enter");
00071
00072 char msg[256];
00073 char subject[256];
00074 char foundStr[128];
00075
00076
00077 SpamUtil().toLower(subject, buf, sizeof(subject));
00078
00079 foundStr[0] = '\0';
00080 MailFilter::classification klass = SpamUtil().checkLine(subject,
00081 mParams,
00082 foundStr,
00083 sizeof(foundStr));
00084
00085 if (klass == MailFilter::SUSPECT || klass == MailFilter::GARBAGE) {
00086 if (klass == MailFilter::SUSPECT) {
00087 sprintf(msg, "Found \"spam\" word \"%s\", email marked as SUSPECT",
00088 foundStr );
00089 }
00090 else if (klass == MailFilter::GARBAGE) {
00091 mHeadInfo.reason( foundStr );
00092 sprintf(msg, "Found \"kill\" word \"%s\", email marked as GARBAGE",
00093 foundStr );
00094 }
00095 log.log(Logger::DEBUG, "checkSubject", msg );
00096 }
00097
00098 pPushBack = 0;
00099 char *pBuf;
00100 if ((pBuf = fgets(pushBackBuf, sizeof(pushBackBuf), fp)) != 0) {
00101 if (SpamUtil().isBlankLine(pBuf) || SpamUtil().findColon(pBuf) != 0) {
00102 pPushBack = pBuf;
00103 }
00104 }
00105
00106 log.log(Logger::DEBUG, "checkSubject", "exit");
00107 return klass;
00108 }
00109
00110
00123 MailFilter::classification MailHeader::checkFrom(const char *buf)
00124 {
00125 MailFilter::classification klass = MailFilter::UNKNOWN;
00126 log.log(Logger::DEBUG, "checkFrom", "enter");
00127
00128 vector<const char *> fromAddrs = mParams.getSection(SpamParameters::from_address);
00129 vector<const char *> killAddrs = mParams.getSection(SpamParameters::from_kill);
00130
00131 char msg[128];
00132 char from[256];
00133
00134 SpamUtil().toLower(from, buf, sizeof(from));
00135
00136 size_t len;
00137 len = killAddrs.size();
00138 for (size_t i = 0; i < len; i++) {
00139 if (strstr(from, killAddrs[i]) != 0) {
00140 sprintf(msg, "Found address \"%s\", email marked as GARBAGE",
00141 killAddrs[i] );
00142 mHeadInfo.reason( msg );
00143 log.log(Logger::DEBUG, "checkFrom", msg );
00144 klass = MailFilter::GARBAGE;
00145 break;
00146 }
00147 }
00148
00149 if (klass == MailFilter::UNKNOWN) {
00150 len = fromAddrs.size();
00151 for (size_t i = 0; i < len; i++) {
00152 if (strstr(from, fromAddrs[i]) != 0) {
00153 sprintf(msg, "Found \"from address\" \"%s\", email marked as EMAIL",
00154 fromAddrs[i] );
00155 log.log(Logger::DEBUG, "checkFrom", msg );
00156 klass = MailFilter::EMAIL;
00157 break;
00158 }
00159 }
00160 }
00161
00162 log.log(Logger::DEBUG, "checkFrom", "exit");
00163 return klass;
00164 }
00165
00166
00167
00188 bool MailHeader::addrContinues(const char *buf)
00189 {
00190 bool rslt = false;
00191
00192 log.log(Logger::DEBUG, "addrContinues", "enter");
00193 if (buf) {
00194 int end = strlen( buf );
00195
00196 if (end > 0) {
00197 end--;
00198 const char *endPtr;
00199 for (endPtr = &buf[end]; endPtr >= buf && isspace(*endPtr); endPtr--)
00200 ;
00201 if (*endPtr == ',') {
00202 rslt = true;
00203 }
00204 else if (*endPtr == '"') {
00205 if (endPtr > buf && *(endPtr-1) == '\'') {
00206 rslt = true;
00207 }
00208 }
00209 }
00210 }
00211
00212 char msgbuf[128];
00213 sprintf( msgbuf, "returns %s", (rslt) ? "TRUE" : "FALSE" );
00214 log.log(Logger::DEBUG, "addrContinues", msgbuf );
00215
00216 log.log(Logger::DEBUG, "addrContinues", "exit");
00217 return rslt;
00218 }
00219
00220
00221
00235 MailFilter::classification MailHeader::checkDomainAddrs(const char *domainName,
00236 const char *pBuf)
00237 {
00238 log.log(Logger::DEBUG, "checkDomainAddrs", "enter");
00239
00240 assert( ((domainName != 0) && (pBuf != 0)) );
00241
00242 vector<const char *> validUsers = mParams.getSection(SpamParameters::valid_users);
00243 const size_t numUsers = validUsers.size();
00244 MailFilter::classification klass = MailFilter::UNKNOWN;
00245
00246 bool done = false;
00247 size_t domainNameLen = strlen( domainName );
00248 const char *domainPtr = strstr(pBuf, domainName );
00249 while (domainPtr) {
00250 if (domainPtr > pBuf+2) {
00251 domainPtr--;
00252 if (*domainPtr == '@') {
00253
00254 const char *endPtr = domainPtr;
00255 domainPtr--;
00256 const char *beginPtr = domainPtr;
00257 while (beginPtr >= pBuf && isalnum( *beginPtr ))
00258 beginPtr--;
00259 if (!isalnum(*beginPtr)) {
00260 beginPtr++;
00261 }
00262
00263
00264
00265
00266 bool foundInList = false;
00267 for (size_t i = 0; i < numUsers; i++) {
00268 const char *word = validUsers[i];
00269 if (SpamUtil().match(beginPtr, endPtr, word)) {
00270 foundValidAddress = true;
00271 foundInList = true;
00272 }
00273 }
00274
00275 if (!foundInList) {
00276 char msg[128];
00277 char user[128];
00278 size_t ix = 0;
00279 for (const char *pCh = beginPtr; pCh < endPtr; pCh++, ix++) {
00280 user[ix] = *pCh;
00281 }
00282 user[ix] = '\0';
00283 sprintf(msg, "Non-valid user \"%s\", email marked as GARBAGE",
00284 user );
00285 mHeadInfo.reason( msg );
00286 log.log(Logger::DEBUG, "checkDomainAddrs", msg );
00287 klass = MailFilter::GARBAGE;
00288 }
00289
00290 pBuf = (endPtr + 1);
00291 }
00292 }
00293 if (klass == MailFilter::UNKNOWN) {
00294 pBuf = pBuf + domainNameLen;
00295 domainPtr = strstr(pBuf, domainName);
00296 }
00297 else {
00298 break;
00299 }
00300 }
00301
00302 log.log(Logger::DEBUG, "checkDomainAddrs", "exit");
00303
00304 return klass;
00305 }
00306
00307
00308
00334 MailFilter::classification MailHeader::checkAddressSection(const char *buf, FILE *fp)
00335 {
00336 log.log(Logger::DEBUG, "checkAddressSection", "enter");
00337
00338 MailFilter::classification klass = MailFilter::UNKNOWN;
00339 vector<const char *> toAddrs = mParams.getSection(SpamParameters::to_list);
00340 vector<const char *> myDomain = mParams.getSection(SpamParameters::my_domain);
00341
00342 const char *domainName = 0;
00343 if (myDomain.size() > 0)
00344 domainName = myDomain[0];
00345
00346 const char *pBuf = buf;
00347 const size_t toAddrLen = toAddrs.size();
00348
00349 char localBuf[1024];
00350 size_t i;
00351 bool done;
00352 do {
00353 done = true;
00354
00355 SpamUtil().toLower(localBuf, pBuf, sizeof(localBuf));
00356 for (i = 0; i < toAddrLen; i++) {
00357 if (strstr(localBuf, toAddrs[i]) != 0) {
00358 const char *hit = toAddrs[i];
00359 char msg[128];
00360 sprintf(msg, "found \"%s\", marked as EMAIL", hit );
00361 log.log(Logger::DEBUG, "checkAddressSection", msg );
00362 klass = MailFilter::EMAIL;
00363 break;
00364 }
00365 }
00366
00367 if (klass == MailFilter::UNKNOWN && domainName != 0) {
00368 klass = checkDomainAddrs( domainName, localBuf );
00369 }
00370
00371 if (klass == MailFilter::UNKNOWN) {
00372 if (addrContinues(localBuf)) {
00373 if ((pBuf = fgets(localBuf, sizeof(localBuf), fp)) != 0) {
00374 done = false;
00375 }
00376 }
00377 }
00378
00379 } while (!done);
00380
00381 log.log(Logger::DEBUG, "checkAddressSection", "exit");
00382
00383 return klass;
00384 }
00385
00386
00416 MailFilter::classification MailHeader::checkReceived(const char *buf,
00417 FILE *fp,
00418 size_t line)
00419 {
00420 log.log(Logger::DEBUG, "checkReceived", "enter");
00421 MailFilter::classification klass = MailFilter::UNKNOWN;
00422
00423 const char *pColon;
00424 do {
00425 pPushBack = fgets(pushBackBuf, sizeof(pushBackBuf), fp);
00426 if (pPushBack != 0) {
00427 if ((klass == MailFilter::UNKNOWN) && (strstr(pPushBack, "forged") != 0)) {
00428 klass = MailFilter::SUSPECT;
00429 }
00430 pColon = SpamUtil().findColon( pPushBack );
00431 }
00432 } while (pPushBack != 0 && !pColon);
00433
00434 log.log(Logger::DEBUG, "checkReceived", "exit");
00435 return klass;
00436 }
00437
00438
00456 void MailHeader::saveBoundary(const char *pBound)
00457 {
00458 log.log(Logger::DEBUG, "saveBoundary", "enter");
00459
00460 if (*pBound == '=') {
00461 pBound++;
00462
00463 pBound = SpamUtil().skipWhiteSpace( pBound );
00464 if (*pBound == '"') {
00465 pBound++;
00466 }
00467 const size_t len = sizeof(boundaryStr) - 1;
00468 size_t ix = 0;
00469 while (*pBound &&
00470 ix < len &&
00471 (! isspace(*pBound)) &&
00472 *pBound != '"') {
00473 boundaryStr[ix] = *pBound;
00474 pBound++;
00475 ix++;
00476 }
00477 boundaryStr[ix] = '\0';
00478
00479 if (ix > 0) {
00480 char msg[128];
00481 sprintf(msg, "boundary str. = \"%s\"", boundaryStr );
00482 log.log(Logger::DEBUG, "saveBoundary", msg);
00483 }
00484 }
00485 else {
00486 log.log(Logger::ERROR, "saveBoundary", "'=' expected");
00487 }
00488
00489 log.log(Logger::DEBUG, "saveBoundary", "exit");
00490 }
00491
00492
00561 MailFilter::classification MailHeader::parseContentType(FILE* fp,
00562 char *contentBuf )
00563 {
00564
00565
00566 const char *BOUNDARY = "boundary";
00567 static size_t BOUNDARY_LEN = strlen( BOUNDARY );
00568 const char *CONTENT_ENCODE = "Content-Transfer-Encoding";
00569 static size_t CONTENT_ENCODE_LEN = strlen(CONTENT_ENCODE);
00570
00571 log.log(Logger::DEBUG, "parseContentType", "enter");
00572
00573 MailFilter::classification klass = MailFilter::UNKNOWN;
00574
00575 SpamUtil::contentType type = SpamUtil().classifySection( contentBuf );
00576
00577 {
00578 char *pBound = strstr(contentBuf, BOUNDARY);
00579 if (pBound && type != SpamUtil::TEXT) {
00580 saveBoundary( pBound + BOUNDARY_LEN );
00581 }
00582 }
00583
00584 pPushBack = 0;
00585 char *pBuf;
00586 if ((pBuf = fgets(pushBackBuf, sizeof(pushBackBuf), fp)) != 0) {
00587
00588
00589
00590
00591
00592 if (SpamUtil().isBlankLine(pBuf) || SpamUtil().findColon(pBuf) != 0) {
00593 pPushBack = pBuf;
00594 }
00595 else {
00596 char *pBound = strstr(pBuf, BOUNDARY);
00597 if (pBound && type != SpamUtil::TEXT) {
00598 saveBoundary( pBound + BOUNDARY_LEN );
00599 }
00600
00601 pBuf = fgets(pushBackBuf, sizeof(pushBackBuf), fp);
00602 pPushBack = pBuf;
00603 }
00604
00605
00606 char *pEncode;
00607 if ((pEncode = strstr(pBuf, CONTENT_ENCODE)) != 0) {
00608 pPushBack = 0;
00609 if (strstr(pEncode + CONTENT_ENCODE_LEN, "base64")) {
00610 type = SpamUtil::BASE64;
00611 }
00612 }
00613
00614 }
00615
00616 if (type == SpamUtil::BASE64) {
00617 if (mParams.hasFlag("kill_base64")) {
00618 mHeadInfo.reason("found base64 encoded information");
00619 klass = MailFilter::GARBAGE;
00620 }
00621 else {
00622 klass = MailFilter::SUSPECT;
00623 }
00624 }
00625 else if (type == SpamUtil::HTML) {
00626 klass = MailFilter::SUSPECT;
00627 }
00628 else if (type == SpamUtil::IMAGE || type == SpamUtil::AUDIO) {
00629 klass = MailFilter::SUSPECT;
00630 }
00631
00632 const char *typeName;
00633 typeName = SpamUtil().typeToStr( type );
00634
00635 char msg[128];
00636 sprintf(msg, "mail type = %s", typeName );
00637 log.log(Logger::DEBUG, "parseContentType", msg );
00638
00639 log.log(Logger::DEBUG, "parseContentType", "exit");
00640 return klass;
00641 }
00642
00643
00644
00659 void MailHeader::fillInSections(FILE *fp)
00660 {
00661 static const char *TO = "to:";
00662 static const size_t TO_LEN = strlen( TO );
00663 static const char *FROM = "from:";
00664 static const size_t FROM_LEN = strlen( FROM );
00665 static const char *SUBJECT = "subject:";
00666 static const size_t SUBJECT_LEN = strlen( SUBJECT );
00667 static const char *DATE = "date:";
00668 static const size_t DATE_LEN = strlen( DATE );
00669 char buf[1024];
00670 char *pBuf = 0;
00671 size_t bufSize = 0;
00672
00673 log.log(Logger::DEBUG, "fillInSections", "enter");
00674
00675 do {
00676 if (pPushBack == 0) {
00677 pushBackBuf[0] = '0';
00678 bufSize = sizeof(buf);
00679 pBuf = fgets(buf, bufSize, fp);
00680 }
00681 else {
00682 pBuf = pPushBack;
00683 bufSize = sizeof( pushBackBuf );
00684 pPushBack = 0;
00685 }
00686 if (pBuf != 0) {
00687 if (! SpamUtil().isBlankLine( pBuf )) {
00688 char *pCopy = 0;
00689 if (SpamUtil().match(pBuf, TO_LEN, TO)) {
00690 pCopy = pBuf + TO_LEN;
00691 mHeadInfo.to( pCopy );
00692 }
00693 else if (SpamUtil().match(pBuf, FROM_LEN, FROM)) {
00694 pCopy = pBuf + FROM_LEN;
00695 mHeadInfo.from( pCopy );
00696 }
00697 else if (SpamUtil().match(pBuf, SUBJECT_LEN, SUBJECT)) {
00698 pCopy = pBuf + SUBJECT_LEN;
00699 mHeadInfo.subject( pCopy );
00700 }
00701 else if (SpamUtil().match(pBuf, DATE_LEN, DATE)) {
00702 pCopy = pBuf + DATE_LEN;
00703 mHeadInfo.date( pCopy );
00704 }
00705 }
00706 else {
00707
00708 break;
00709 }
00710 }
00711 } while (pBuf != 0);
00712
00713 if (pBuf == 0) {
00714 log.log(Logger::DEBUG, "fillInSections", "end-of-file reached");
00715 }
00716 log.log(Logger::DEBUG, "fillInSections", "exit");
00717 }
00718
00719
00804 MailFilter::classification MailHeader::checkHeader(FILE *fp )
00805 {
00806 log.log(Logger::DEBUG, "checkHeader", "enter");
00807 MailFilter::classification klass = MailFilter::BAD_VALUE;
00808 if (!feof(fp)) {
00809 klass = MailFilter::UNKNOWN;
00810 char *pBuf;
00811
00812
00813 while ((pPushBack = fgets(pushBackBuf, sizeof(pushBackBuf), fp)) != 0) {
00814 if (! SpamUtil().isBlankLine( pPushBack )) {
00815 break;
00816 }
00817 }
00818
00819 if (pPushBack != 0) {
00820 size_t line = 1;
00821 const char *RECEIVED = "received";
00822 static const size_t RECEIVED_LEN = strlen(RECEIVED);
00823 const char *SUBJECT = "subject";
00824 static const size_t SUBJECT_LEN = strlen(SUBJECT);
00825 const char *FROM = "from";
00826 static const size_t FROM_LEN = strlen(FROM);
00827 const char *CONTENT_TYPE = "content-type";
00828 const char *TO = "to";
00829 static const size_t TO_LEN = strlen(TO);
00830 const char *CC = "cc";
00831 const char *DATE = "date";
00832 const char *pBound = 0;
00833 const char *pColon = 0;
00834 char buf[1024];
00835 do {
00836 if (pPushBack == 0) {
00837 pushBackBuf[0] = '\0';
00838 pBuf = fgets(buf, sizeof(buf), fp);
00839 }
00840 else {
00841 pBuf = pPushBack;
00842 pPushBack = 0;
00843 }
00844 if (!pBuf) {
00845 break;
00846 }
00847
00848 if (! SpamUtil().isBlankLine( pBuf )) {
00849 pColon = SpamUtil().findColon( pBuf );
00850 if (SpamUtil().match(pBuf, FROM_LEN, FROM)) {
00851 pColon = pBuf + FROM_LEN;
00852 if (*pColon == ':') {
00853 pColon++;
00854 mHeadInfo.from( pColon );
00855 }
00856 else {
00857 mHeadInfo.fromNoColon( pColon );
00858 }
00859 klass = checkFrom( pColon );
00860 }
00861 else if (pColon != 0) {
00862 if (SpamUtil().match(pBuf, RECEIVED_LEN, RECEIVED)) {
00863 pColon = pBuf + RECEIVED_LEN + 1;
00864 klass = checkReceived( pColon, fp, line );
00865 } else if (SpamUtil().match(pBuf, SUBJECT_LEN, SUBJECT)) {
00866 pColon = pBuf + SUBJECT_LEN + 1;
00867 mHeadInfo.subject(pColon);
00868 klass = checkSubject( pColon, fp );
00869 }
00870 else if (SpamUtil().match(pBuf, pColon, CONTENT_TYPE)) {
00871 pColon++;
00872 klass = parseContentType(fp, pBuf);
00873 }
00874 else if (SpamUtil().match(pBuf, TO_LEN, TO)) {
00875 pColon = pBuf + TO_LEN + 1;
00876 mHeadInfo.to( pColon );
00877 klass = checkAddressSection(pColon, fp);
00878 }
00879 else if (SpamUtil().match(pBuf, pColon, CC)) {
00880 pColon++;
00881 klass = checkAddressSection(pColon, fp);
00882 }
00883 else if (SpamUtil().match(pBuf, pColon, DATE)) {
00884 pColon++;
00885 mHeadInfo.date(pColon);
00886 }
00887 }
00888 }
00889 else {
00890
00891 break;
00892 }
00893 } while (klass == MailFilter::UNKNOWN && pBuf != 0);
00894
00895
00896
00897 if (! SpamUtil().isBlankLine( pBuf )) {
00898 fillInSections(fp);
00899 }
00900
00901
00902
00903
00904 if (klass == MailFilter::UNKNOWN && (! foundValidAddress)) {
00905 log.log(Logger::DEBUG, "checkHeader", "Did not find a valid To: or Cc: address");
00906 klass = MailFilter::SUSPECT;
00907 }
00908 }
00909 }
00910
00911 mHeadInfo.klass( klass );
00912
00913 char msg[128];
00914 sprintf(msg, "return value = %s", SpamUtil().classificationToStr( klass ));
00915 log.log(Logger::DEBUG, "checkHeader", msg );
00916 log.log(Logger::DEBUG, "checkHeader", "exit");
00917 return klass;
00918 }