#include <MailBody.h>
Public Member Functions | |
MailBody (SpamParameters &param, HeaderInfo &headInfo) | |
MailFilter::classification | checkBody (const char *boundary, FILE *fp) |
Private Types | |
enum | lineType { BAD_VALUE, EndOfFile, LINE, BOUNDARY } |
Private Member Functions | |
MailBody (const MailBody &rhs) | |
lineType | getLine (char *buf, const size_t bufSize, FILE *fp, const char *boundary) |
lineType | getHtmlLine (char *buf, const size_t bufSize, FILE *fp, const char *boundary) |
void | mailBodyMsg (MailFilter::classification klass, const char *foundStr, const char *funcName) |
lineType | findSection (const char *boundary, FILE *fp) |
SpamUtil::contentType | classifyMailSection (FILE *fp) |
MailFilter::classification | processBySection (const char *boundary, FILE *fp) |
MailFilter::classification | processTextBody (FILE *fp) |
Private Attributes | |
SpamParameters & | mParams |
HeaderInfo & | mHeadInfo |
Logger | log |
Definition at line 51 of file MailBody.h.
|
The MailBody constructor takes a reference to the SpamParameters object (which is initialized from the SpamFilterParms file) and a reference to the HeaderInfo object, and stores both. A Logger object is initialized with the class name. Definition at line 45 of file MailBody.C. References Logger::getLogger().
00045 : 00046 mParams( param ), mHeadInfo( headInfo ) 00047 { 00048 log = pLogger->getLogger("MailBody"); 00049 } // MailBody |
|
If the email header had a boundary definition, process the mail by section. Otherwise the structure of the email is simpler and just contains a text section, so just process the text body. Definition at line 521 of file MailBody.C. References processBySection(), and processTextBody(). Referenced by MailFilter::checkMail().
00522 { 00523 MailFilter::classification klass = MailFilter::UNKNOWN; 00524 if (! feof(fp)) { 00525 if (boundary != 0 && boundary[0] != '\0') { 00526 klass = processBySection( boundary, fp ); 00527 } 00528 else { 00529 klass = processTextBody( fp ); 00530 } 00531 } 00532 return klass; 00533 } // checkBody |
|
Return the section type for a section of a multipart email. Find the start of the section (by calling findSection, above). Assuming that we find a section (something beginning with a boundary), check for the content type. If the content is text/plain, check to see if the next line is a charset line. Look for a Windows character set (since this is a Linux email filter, Windows character set emails are marked as suspect). Other character sets could be checked for here as well (for example, Asian character sets). Check to see if the Content-Transfer-Encoding line which follows is base64. Definition at line 204 of file MailBody.C. References Logger::log(). Referenced by processBySection().
00205 { 00206 SpamUtil::contentType type = SpamUtil::UNKNOWN; 00207 char buf[128]; 00208 char *bufPtr; 00209 00210 // get the line following the boundary line (may be blank) 00211 if ((bufPtr = fgets(buf, sizeof(buf), fp)) != 0) { 00212 if (SpamUtil().isBlankLine(buf)) { 00213 type = SpamUtil::BLANK; 00214 } 00215 else { 00216 static const char *CONTENT_TYPE = "Content-Type:"; 00217 static const size_t content_typeLen = strlen( CONTENT_TYPE ); 00218 const char *ptr; 00219 00220 // see if we can find Content-Type:, if not, its an empty section 00221 if ((ptr = strstr(buf, CONTENT_TYPE)) != 0) { 00222 type = SpamUtil().classifySection( ptr+content_typeLen ); 00223 // get the line after Content-Type 00224 if ((bufPtr = fgets(buf, sizeof(buf), fp)) != 0) { 00225 // if there is a colon, it's probably a Content-Transfer-Encoding line 00226 if (SpamUtil().findColon( bufPtr ) == 0) { 00227 // OK, no colon. If the type was TEXT look for Windows char set. 00228 if (type == SpamUtil::TEXT) { 00229 static const char *CHARSET = "charset"; 00230 static const size_t charsetLen = strlen( CHARSET ); 00231 if ((ptr = strstr(buf, "charset")) != 0) { 00232 if (strstr(ptr+charsetLen, "Windows") != 0) { 00233 type = SpamUtil::WINDOZ; 00234 } // Windows string 00235 } // charset string 00236 } // type == TEXT 00237 bufPtr = fgets(buf, sizeof(buf), fp); 00238 } // find colon 00239 if (bufPtr != 0) { 00240 // look for the base64 in Content-Transfer-Encoding: base64 00241 if (strstr(buf, "base64") != 0) { 00242 type = SpamUtil::BASE64; 00243 } 00244 } 00245 } // fgets != 0 00246 } // Content-Type string 00247 } 00248 } // fgets != 0 00249 else { 00250 log.log(Logger::ERROR, "classifyMailSection", "line expected after boundary"); 00251 } 00252 00253 return type; 00254 } // classifyMailSection |
|
Find the start (boundary line) of a boundary separated section in a multipart email.
Definition at line 170 of file MailBody.C. References getLine(). Referenced by processBySection().
00171 { 00172 00173 const size_t BUF_SIZE = 128; 00174 char buf[ BUF_SIZE ]; 00175 00176 lineType ty; 00177 00178 while ((ty = getLine(buf, sizeof(buf), fp, boundary )) != MailBody::EndOfFile) { 00179 if (ty == BOUNDARY) { 00180 break; 00181 } 00182 } 00183 00184 return ty; 00185 } // findSection |
|
This function operates like getLine, above, but it filters out HTML tags. Note that in the case where the boundary line is found, buf will be returned with a zero length string. Definition at line 112 of file MailBody.C. Referenced by processBySection().
00116 { 00117 char *tmpBuf = new char[ bufSize ]; 00118 lineType type = EndOfFile; 00119 00120 *buf = '\0'; 00121 00122 char *pStr; 00123 // skip any blank lines 00124 do { 00125 pStr = fgets(tmpBuf, bufSize, fp); 00126 } while (pStr != 0 && SpamUtil().isBlankLine(tmpBuf)); 00127 00128 if (pStr != 0) { 00129 00130 type = LINE; 00131 if (boundary != 0) { 00132 if (tmpBuf[0] == '-' && tmpBuf[1] == '-') { 00133 if (strstr(tmpBuf+2, boundary) != 0) { 00134 type = BOUNDARY; 00135 } 00136 } 00137 } 00138 00139 if (type != BOUNDARY) { 00140 size_t i = 0; // buf index 00141 bool copyChar = true; 00142 for (size_t j = 0; j < bufSize-1 && tmpBuf[j] != '\0'; j++) { 00143 char ch = tmpBuf[j]; 00144 if (ch == '<') { 00145 copyChar = false; 00146 } 00147 else if (ch == '>') { 00148 copyChar = true; 00149 } 00150 else if (copyChar) { 00151 buf[i] = tolower(ch); 00152 i++; 00153 } 00154 } // for 00155 buf[i] = '\0'; 00156 } 00157 } 00158 delete [] tmpBuf; 00159 00160 return type; 00161 } // getHtmlLine |
|
Get a non-blank line.
The LINE value will be returned if a non-blank line is read. The BOUNDARY value will be returned if a boundary line is found. Definition at line 72 of file MailBody.C. Referenced by findSection(), processBySection(), and processTextBody().
00076 { 00077 lineType type = EndOfFile; 00078 00079 char *pStr; 00080 // skip any blank lines 00081 do { 00082 pStr = fgets(buf, bufSize, fp); 00083 } while (pStr != 0 && SpamUtil().isBlankLine(buf)); 00084 00085 if (pStr != 0) { 00086 00087 type = LINE; 00088 if (boundary != 0) { 00089 if (buf[0] == '-' && buf[1] == '-') { 00090 if (strstr(buf+2, boundary) != 0) { 00091 type = BOUNDARY; 00092 } 00093 } 00094 } 00095 00096 if (type != BOUNDARY) { 00097 SpamUtil().toLower(buf, buf, bufSize); 00098 } 00099 } 00100 00101 return type; 00102 } // getLine; |
|
Write out a log file message that records the result of processing the email body. This log file message is a DEBUG level message. Definition at line 262 of file MailBody.C. References Logger::log(). Referenced by processBySection(), and processTextBody().
00265 { 00266 char msg[256]; 00267 char startMsg[64]; 00268 const char *pMsg = msg; 00269 00270 const char *klassStr = "email"; 00271 if (klass == MailFilter::SUSPECT) { 00272 klassStr = "suspect"; 00273 } 00274 else if (klass == MailFilter::GARBAGE) { 00275 klassStr = "garbage"; 00276 } 00277 sprintf(startMsg, "email classified as %s", klassStr); 00278 if (foundStr[0] != '\0') { 00279 sprintf(msg, "%s, found \"%s\"", startMsg, foundStr ); 00280 } 00281 else { 00282 pMsg = startMsg; 00283 } 00284 log.log(Logger::DEBUG, funcName, pMsg ); 00285 } // mailBodyMsg |
|
Process a boundary separated e-mail body. The rules used by this function are:
Definition at line 346 of file MailBody.C. References classifyMailSection(), findSection(), getHtmlLine(), getLine(), SpamParameters::hasFlag(), Logger::log(), mailBodyMsg(), and HeaderInfo::reason(). Referenced by checkBody().
00348 { 00349 MailFilter::classification klass = MailFilter::UNKNOWN; 00350 log.log(Logger::DEBUG, "processBySection", "enter"); 00351 00352 log.log(Logger::DEBUG, "processBySection", "processing first section"); 00353 00354 if (findSection(boundary, fp) == BOUNDARY) { 00355 char foundStr[128]; 00356 foundStr[0] = '\0'; 00357 00358 log.log(Logger::DEBUG, "processBySection", "found boundary"); 00359 00360 SpamUtil::contentType type = classifyMailSection(fp); 00361 if (type == SpamUtil::TEXT) { 00362 char buf[4096]; 00363 00364 bool foundNonBlankLine = false; 00365 MailBody::lineType lineTy; 00366 while ((lineTy = getLine(buf, sizeof(buf), fp, boundary)) == LINE) { 00367 klass = SpamUtil().checkLine(buf, 00368 mParams, 00369 foundStr, 00370 sizeof(foundStr)); 00371 foundNonBlankLine = true; 00372 if (klass != MailFilter::UNKNOWN) { 00373 mHeadInfo.reason(foundStr); 00374 break; 00375 } 00376 } // while 00377 00378 // The text section was empty, so it is probably spam, since 00379 // legitimate email usually provides a text version and an HTML 00380 // version. 00381 if (! foundNonBlankLine) { 00382 log.log(Logger::DEBUG, "processBySection", "found blank text section"); 00383 klass = MailFilter::SUSPECT; 00384 } 00385 else { 00386 char msg[128]; 00387 sprintf(msg, "first section type = %s", SpamUtil().typeToStr( type )); 00388 log.log(Logger::DEBUG, "processBySection", msg ); 00389 } 00390 00391 // if we don't know it's spam or garbage, check the type of the 00392 // section that follows the text section (it may, for example, be 00393 // a base64 section). 
00394 if (lineTy == BOUNDARY) { 00395 log.log(Logger::DEBUG, "processBySection", "found second section boundary" ); 00396 SpamUtil::contentType secondSecType = classifyMailSection(fp); 00397 type = secondSecType; 00398 char msg[128]; 00399 sprintf(msg, "second section type = %s", SpamUtil().typeToStr( secondSecType )); 00400 log.log(Logger::DEBUG, "processBySection", msg ); 00401 00402 if (secondSecType == SpamUtil::HTML && (foundNonBlankLine)) { 00403 // There was a non-blank text section. Some spammers will fill 00404 // in text from on-line books in this section and then include 00405 // the spam in the HTML section in an attempt to fool spam filters 00406 // (especially baysian filters). So check for spam words here as well. 00407 log.log(Logger::DEBUG, "processBySection", "processing HTML section" ); 00408 while ((lineTy = getHtmlLine(buf, sizeof(buf), fp, boundary)) == LINE) { 00409 klass = SpamUtil().checkLine(buf, 00410 mParams, 00411 foundStr, 00412 sizeof(foundStr)); // true means HTML section 00413 if (klass != MailFilter::UNKNOWN) { 00414 mHeadInfo.reason(foundStr); 00415 break; 00416 } 00417 } // while 00418 } // if section is HTML 00419 00420 } 00421 } 00422 else if (type == SpamUtil::HTML || 00423 type == SpamUtil::BLANK || 00424 type == SpamUtil::WINDOZ) { 00425 // We know it's suspect, the only question now is, is it garbage? 
00426 klass = MailFilter::SUSPECT; 00427 if (type == SpamUtil::HTML) { 00428 log.log(Logger::DEBUG, "processBySection", "begins with HTML section"); 00429 } 00430 else if (type == SpamUtil::BLANK) { 00431 log.log(Logger::DEBUG, "processBySection", "found boundary, no Content-Type"); 00432 } 00433 if (findSection(boundary, fp) == BOUNDARY) { 00434 type = classifyMailSection(fp); 00435 } 00436 } 00437 00438 if (type == SpamUtil::WINDOZ || 00439 type == SpamUtil::IMAGE || 00440 type == SpamUtil::AUDIO || 00441 type == SpamUtil::MULTIPART) { 00442 char msg[128]; 00443 sprintf(msg, "found section type %s", SpamUtil().typeToStr( type ) ); 00444 log.log(Logger::DEBUG, "processBySection", msg ); 00445 klass = MailFilter::SUSPECT; 00446 } 00447 else if (type == SpamUtil::BASE64) { 00448 log.log(Logger::DEBUG, "processBySection", "found base64 section"); 00449 if (mParams.hasFlag("kill_base64")) { 00450 klass = MailFilter::GARBAGE; 00451 mHeadInfo.reason("found base64 encoding"); 00452 } 00453 else { 00454 klass = MailFilter::SUSPECT; 00455 } 00456 } 00457 // if the email class is still "UNKNOWN" then we assume it is 00458 // email. 00459 if (klass == MailFilter::UNKNOWN) { 00460 klass = MailFilter::EMAIL; 00461 } 00462 00463 mailBodyMsg( klass, foundStr, "processBySection"); 00464 } 00465 else { 00466 log.log(Logger::ERROR, "processBySection", "boundary not found"); 00467 } 00468 00469 log.log(Logger::DEBUG, "processBySection", "exit"); 00470 return klass; 00471 } // processBySection |
|
Check each line in the email to see if it contains spam words or kill words. If there are no matches, it is assumed to be valid email. Definition at line 480 of file MailBody.C. References getLine(), Logger::log(), mailBodyMsg(), and HeaderInfo::reason(). Referenced by checkBody().
00481 { 00482 MailFilter::classification klass = MailFilter::UNKNOWN; 00483 log.log(Logger::DEBUG, "processTextBody", "enter"); 00484 char buf[256]; 00485 char foundStr[128]; 00486 00487 foundStr[0] = '\0'; 00488 MailBody::lineType lineTy; 00489 while ((lineTy = getLine(buf, sizeof(buf), fp, 0)) == LINE) { 00490 klass = SpamUtil().checkLine(buf, 00491 mParams, 00492 foundStr, 00493 sizeof(foundStr)); 00494 if (klass != MailFilter::UNKNOWN) { 00495 break; 00496 } 00497 } // while 00498 00499 if (klass != MailFilter::UNKNOWN) { 00500 mHeadInfo.reason( foundStr ); 00501 } 00502 00503 // if the email class is still "UNKNOWN" then we assume it is 00504 // email. 00505 if (klass == MailFilter::UNKNOWN) { 00506 klass = MailFilter::EMAIL; 00507 } 00508 mailBodyMsg( klass, foundStr, "processTextBody"); 00509 00510 log.log(Logger::DEBUG, "processTextBody", "exit"); 00511 return klass; 00512 } // processTextBody |