#include <SpamUtil.h>
Public Types | |
enum | contentType { BAD_CONT, UNKNOWN, BLANK, HTML, TEXT, MULTIPART, AUDIO, IMAGE, APPLICATION, WINDOZ, BASE64 } |
Public Member Functions | |
const char * | classificationToStr (MailFilter::classification klass) |
Static Public Member Functions | |
const int | getFileSize (FILE *fp, const char *fileName, char *msgbuf) |
const char * | trimEnd (char *str) |
void | trim (const char *str) |
const char * | findColon (const char *buf) |
const char * | skipWhiteSpace (const char *pBuf) |
bool | isBlankLine (const char *buf) |
void | strncpy (char *pDest, const char *pSrc, size_t destSize) |
void | toLower (char *dest, const char *src) |
void | toLower (char *dest, const char *src, size_t size) |
bool | match (const char *bufStart, const size_t len, const char *word) |
bool | match (const char *bufStart, const char *bufEnd, const char *word) |
contentType | classifySection (const char *buf) |
const char * | typeToStr (contentType type) |
MailFilter::classification | checkLine (const char *buf, SpamParameters &params, char *foundStr, const size_t foundStrSize) |
Private Member Functions | |
SpamUtil (const SpamUtil &rhs) |
SpamUtil().trim( myString );
Definition at line 41 of file SpamUtil.h.
Constructor & Destructor Documentation
|
Disallow the copy constructor. Referenced by classifySection(). |
|
Check the line in buf against the spam words and the kill words. If a spam word is found, the function returns SUSPECT. If a kill word is found the function returns GARBAGE. Important Note: the input line must be converted into lower case, or the matches against the spam and kill words may fail when they should not. If this function is called on lines read from the HTML section of an email, the HTML tags should be stripped out. Since HTML tags are not stripped in the text section, if the HTMLtag or BODYtag strings are found, the email is marked as suspect (spammers will sometimes include HTML in a text section and count on the flexibility of email software to convert it).
Definition at line 406 of file SpamUtil.C. References SpamParameters::getSection(), and strncpy().
00410 { 00411 const char *HTMLtag = "<html>"; // remember, input line is in lower case 00412 const char *BODYtag = "<body"; // body may have attributes, so the closing 00413 // angle bracket is omitted. 00414 MailFilter::classification klass = MailFilter::UNKNOWN; 00415 std::vector<const char *> spamWords = params.getSection(SpamParameters::spam_words); 00416 std::vector<const char *> killWords = params.getSection(SpamParameters::kill_words); 00417 00418 foundStr[0] = '\0'; 00419 char *pHtmlTag = 0; 00420 00421 if ((pHtmlTag = strstr(buf, HTMLtag)) == 0) { 00422 pHtmlTag = strstr(buf, BODYtag); 00423 } 00424 00425 if (pHtmlTag != 0) { 00426 strncpy(foundStr, pHtmlTag, foundStrSize); 00427 klass = MailFilter::SUSPECT; 00428 } 00429 else { 00430 const size_t len = spamWords.size(); 00431 for (size_t i = 0; i < len; i++) { 00432 if (strstr(buf, spamWords[i]) != 0) { 00433 strncpy(foundStr, spamWords[i], foundStrSize); 00434 klass = MailFilter::SUSPECT; 00435 break; 00436 } 00437 } // for 00438 00439 if (klass == MailFilter::UNKNOWN) { 00440 const size_t len = killWords.size(); 00441 for (size_t i = 0; i < len; i++) { 00442 if (strstr(buf, killWords[i]) != 0) { 00443 strncpy(foundStr, killWords[i], foundStrSize); 00444 klass = MailFilter::GARBAGE; 00445 break; 00446 } 00447 } 00448 } 00449 } 00450 return klass; 00451 } // checkLine |
|
Classify a Content-Type section Definition at line 272 of file SpamUtil.C. References SpamUtil().
00273 { 00274 contentType type = UNKNOWN; 00275 char tempBuf[128]; 00276 00277 SpamUtil().toLower(tempBuf, buf, sizeof(tempBuf)); 00278 00279 const char *typeName = "unknown"; 00280 if (strstr(buf, "multipart") != 0) { 00281 type = MULTIPART; 00282 } 00283 else if (strstr(buf, "html") != 0) { 00284 type = HTML; 00285 } 00286 else if (strstr(buf, "text") != 0) { 00287 type = TEXT; 00288 } 00289 else if (strstr(buf, "image") != 0) { 00290 type = IMAGE; 00291 } 00292 else if (strstr(buf, "audio") != 0) { 00293 type = AUDIO; 00294 } 00295 else if (strstr(buf, "application") != 0) { 00296 type = APPLICATION; 00297 } 00298 00299 return type; 00300 } // classifySection |
|
Return the file size, in bytes or -1 if there is an error. Definition at line 45 of file SpamUtil.C.
/**
 * Return the size of an open file in bytes, or -1 if there is an error.
 *
 * @param fp        open stream to query. A null pointer yields -1 and leaves
 *                  msgbuf untouched.
 * @param fileName  name used only in the error message; may be null.
 * @param msgbuf    receives an error message when fstat() fails or when the
 *                  size does not fit in an int. May be null, in which case no
 *                  message is written. The message is produced with
 *                  sprintf(), so the caller must supply a buffer large
 *                  enough for the text plus fileName and the system error
 *                  string.
 * @return the file size, or -1 on error.
 */
const int getFileSize(FILE *fp, const char *fileName, char *msgbuf)
{
    if (fp == 0) {
        return -1;
    }

    const char *shownName = (fileName != 0) ? fileName : "(unknown)";

    struct stat s;
    const int fd = fileno( fp );
    if (fstat( fd, &s ) != 0) {
        if (msgbuf != 0) {
            const char *err_reason = strerror( errno );
            sprintf( msgbuf, "SpamUtil::getFileSize: could not open %s. Reason: %s",
                     shownName, err_reason );
        }
        return -1;
    }

    // st_size is an off_t and may be wider than int; report an error rather
    // than returning a silently truncated value.
    const int fileSize = static_cast<int>( s.st_size );
    if (fileSize < 0 || static_cast<off_t>( fileSize ) != s.st_size) {
        if (msgbuf != 0) {
            sprintf( msgbuf, "SpamUtil::getFileSize: size of %s does not fit in an int",
                     shownName );
        }
        return -1;
    }
    return fileSize;
} // getFileSize
|
Return true if the line is blank (only white space) or null terminated at the start. False otherwise. Definition at line 149 of file SpamUtil.C.
/**
 * Return true if the line is blank (only white space), empty, or a null
 * pointer. Return false as soon as a non-white-space character is found.
 *
 * @param buf  null terminated string, or null.
 */
bool isBlankLine(const char *buf)
{
    if (buf == 0) {
        return true;
    }
    for (; *buf != '\0'; buf++) {
        // isspace() requires a value representable as unsigned char; a plain
        // char with the high bit set would otherwise be passed as negative.
        if (! isspace( static_cast<unsigned char>( *buf ) )) {
            return false;
        }
    }
    return true;
} // isBlankLine
|
Check whether a region of a string (defined by bufStart and bufEnd) matches the contents of the string in "word". If there is a match, return true, otherwise return false. Definition at line 196 of file SpamUtil.C.
/**
 * Check whether the region [bufStart, bufEnd) matches the string in "word".
 *
 * The region characters are converted to lower case before comparison; the
 * characters of word are compared as they are, so word must be supplied in
 * lower case. The region and word must have exactly the same, non-zero
 * length for a match.
 *
 * @param bufStart  first character of the region (must not be null).
 * @param bufEnd    one past the last character of the region. A null
 *                  pointer or a pointer not beyond bufStart yields false.
 * @param word      null terminated lower-case word (must not be null).
 * @return true if the region equals word ignoring the region's case.
 */
bool match(const char *bufStart, const char *bufEnd, const char *word)
{
    assert( bufStart != 0 && word != 0);

    if (bufEnd == 0 || bufEnd <= bufStart) {
        return false;   // empty or inverted region never matches
    }

    const size_t regionLen = static_cast<size_t>( bufEnd - bufStart );
    if (strlen( word ) != regionLen) {
        return false;
    }

    for (size_t i = 0; i < regionLen; i++) {
        // Compare both sides as unsigned char values so that characters
        // with the high bit set are handled consistently.
        const int regionCh = tolower( static_cast<unsigned char>( bufStart[i] ) );
        if (regionCh != static_cast<unsigned char>( word[i] )) {
            return false;
        }
    }
    return true;
} // match
|
Check whether the start of the string matches another string (word). If they match, return true, otherwise return false. Note that this function does the match in lower case. Definition at line 170 of file SpamUtil.C.
/**
 * Check whether the first len characters of bufStart match the string in
 * "word". The characters of bufStart are converted to lower case before
 * comparison; word is compared as it is and so must be in lower case.
 *
 * The comparison stops early, reporting a match, if both strings end at the
 * same position before len characters have been compared. Neither string is
 * ever read past its terminating null.
 *
 * @param bufStart  null terminated string to test (must not be null).
 * @param len       number of characters to compare; zero yields false.
 * @param word      null terminated lower-case word (must not be null).
 * @return true if the compared characters are equal, false otherwise.
 */
bool match(const char *bufStart, const size_t len, const char *word)
{
    assert( bufStart != 0 && word != 0);

    if (len == 0) {
        return false;
    }

    for (size_t i = 0; i < len; i++) {
        const int bufCh = tolower( static_cast<unsigned char>( bufStart[i] ) );
        if (bufCh != static_cast<unsigned char>( word[i] )) {
            return false;
        }
        if (word[i] == '\0') {
            break;   // both strings ended here; nothing further to compare
        }
    }
    return true;
} // match
|
Skip white spaces at the start of a string. The function returns a pointer to the first non-white space character. Definition at line 89 of file SpamUtil.C. Referenced by trim().
/**
 * Skip white space at the start of a string.
 *
 * @param pBuf  null terminated string, or null.
 * @return a pointer to the first non-white-space character of pBuf (which is
 *         the terminating null if the string is all white space), or null if
 *         pBuf is null.
 */
const char *skipWhiteSpace(const char *pBuf)
{
    if (pBuf == 0) {
        return 0;
    }
    // The cast keeps isspace() defined for characters with the high bit set.
    while (*pBuf != '\0' && isspace( static_cast<unsigned char>( *pBuf ) )) {
        pBuf++;
    }
    return pBuf;
} // skipWhiteSpace
|
Copy pSrc into pDest until a null char is encountered or until destSize-1 characters are copied. Null terminate the string, even if the source is longer than the dest. This is slightly different behavior than the POSIX strncpy, which will not always null terminate. Also, this function only inserts a single null. In contrast, POSIX strncpy will pad the dest with nulls when the length of src is less than dest. Finally, another difference is that this function does not return a char* pointer. Definition at line 74 of file SpamUtil.C. Referenced by checkLine().
/**
 * Copy pSrc into pDest until a null char is encountered or until destSize-1
 * characters have been copied, then null terminate pDest. Unlike the
 * standard strncpy, the result is always terminated, only one null is
 * written (no padding) and nothing is returned.
 *
 * @param pDest     destination buffer. Nothing is done if it is null.
 * @param pSrc      null terminated source. A null pointer is treated as an
 *                  empty string.
 * @param destSize  size of pDest in bytes. Nothing is written when this is
 *                  zero, since there is no room even for the terminator.
 */
void strncpy(char *pDest, const char *pSrc, size_t destSize)
{
    // destSize - 1 would wrap around to a huge value for destSize == 0.
    if (pDest == 0 || destSize == 0) {
        return;
    }

    size_t ix = 0;
    if (pSrc != 0) {
        const size_t maxCopy = destSize - 1;
        for (; ix < maxCopy && pSrc[ix] != '\0'; ix++) {
            pDest[ix] = pSrc[ix];
        }
    }
    pDest[ix] = '\0';
} // strncpy
|
Convert a string to lower case. The destination and source may be the same address. The size argument is the size of the destination. The source is assumed to be a null terminated string. Definition at line 242 of file SpamUtil.C.
/**
 * Convert a string to lower case, writing at most size-1 characters plus a
 * terminating null into dest. The destination and source may be the same
 * address.
 *
 * @param dest  destination buffer. Nothing is done if it is null.
 * @param src   null terminated source. A null pointer is treated as an
 *              empty string.
 * @param size  size of dest in bytes. Nothing is written when this is zero.
 */
void toLower(char *dest, const char *src, size_t size)
{
    // size - 1 would wrap around to a huge value for size == 0.
    if (dest == 0 || size == 0) {
        return;
    }

    size_t j = 0;
    if (src != 0) {
        for (; j < size - 1 && src[j] != '\0'; j++) {
            // tolower() requires a value representable as unsigned char.
            dest[j] = static_cast<char>( tolower( static_cast<unsigned char>( src[j] ) ) );
        }
    }
    dest[j] = '\0';
}
|
Convert a string to lower case. The destination and source may be the same address. The source is assumed to be a null terminated string. Definition at line 258 of file SpamUtil.C.
/**
 * Convert a string to lower case. The destination and source may be the
 * same address. No size limit is applied, so dest must be large enough to
 * hold all of src including its terminating null.
 *
 * @param dest  destination buffer. Nothing is done if it is null.
 * @param src   null terminated source. A null pointer is treated as an
 *              empty string.
 */
void toLower(char *dest, const char *src)
{
    if (dest == 0) {
        return;
    }

    size_t j = 0;
    if (src != 0) {
        for (; src[j] != '\0'; j++) {
            // tolower() requires a value representable as unsigned char.
            dest[j] = static_cast<char>( tolower( static_cast<unsigned char>( src[j] ) ) );
        }
    }
    dest[j] = '\0';
}
|
This function modifies the contents of the string pointed to by the str argument. The result is a string from which the leading and trailing white space have been removed. Definition at line 119 of file SpamUtil.C. References skipWhiteSpace().
/**
 * Remove leading and trailing white space from a string, in place.
 *
 * Although the parameter is declared const char*, the string it points to
 * IS modified, so it must refer to writable storage (never a string
 * literal). The remaining text is moved to the start of the buffer and null
 * terminated. A string made up only of white space becomes the empty string.
 *
 * @param str  null terminated, writable string. Nothing is done if null.
 */
void trim(const char *str)
{
    if (str == 0) {
        return;
    }

    char *dest = const_cast<char *>( str );

    // Find the first non-white-space character. The casts keep isspace()
    // defined for characters with the high bit set.
    const char *start = str;
    while (*start != '\0' && isspace( static_cast<unsigned char>( *start ) )) {
        start++;
    }

    // Walk back from the terminator over trailing white space.
    const char *end = start + strlen( start );
    while (end > start && isspace( static_cast<unsigned char>( end[-1] ) )) {
        end--;
    }

    const size_t trimLen = static_cast<size_t>( end - start );
    if (start != dest && trimLen > 0) {
        memmove( dest, start, trimLen );   // regions may overlap
    }
    dest[trimLen] = '\0';
} // trim
|
Remove white space from the end of a string. Definition at line 101 of file SpamUtil.C.
/**
 * Remove white space from the end of a string, in place.
 *
 * @param str  null terminated, writable string, or null. The string is only
 *             written to when there is trailing white space to remove.
 * @return str (the same pointer that was passed in).
 */
const char *trimEnd(char *str)
{
    if (str != 0) {
        const size_t fullLen = strlen( str );
        size_t len = fullLen;
        // The cast keeps isspace() defined for characters with the high bit
        // set.
        while (len > 0 && isspace( static_cast<unsigned char>( str[len - 1] ) )) {
            len--;
        }
        if (len != fullLen) {
            str[len] = '\0';
        }
    }
    return str;
} // trimEnd
|
Return the character string that corresponds to the contentType enumeration. Definition at line 307 of file SpamUtil.C.
00308 { 00309 const char *typeName; 00310 00311 switch (type) { 00312 case BAD_CONT: 00313 typeName = "bad content"; 00314 break; 00315 case UNKNOWN: 00316 typeName = "unknown"; 00317 break; 00318 case BLANK: 00319 typeName = "blank section"; 00320 break; 00321 case HTML: 00322 typeName = "html"; 00323 break; 00324 case TEXT: 00325 typeName = "text"; 00326 break; 00327 case MULTIPART: 00328 typeName = "multipart"; 00329 break; 00330 case IMAGE: 00331 typeName = "image"; 00332 break; 00333 case APPLICATION: 00334 typeName = "application"; 00335 break; 00336 case AUDIO: 00337 typeName = "audio"; 00338 break; 00339 case WINDOZ: 00340 typeName = "windows chars"; 00341 break; 00342 case BASE64: 00343 typeName = "base64"; 00344 break; 00345 default: 00346 typeName = "bad content type"; 00347 break; 00348 } 00349 return typeName; 00350 } // typeToStr |