Main Page | Compound List | File List | Compound Members | File Members

SpamUtil Class Reference

#include <SpamUtil.h>

List of all members.

Public Types

enum contentType {
  BAD_CONT, UNKNOWN, BLANK, HTML,
  TEXT, MULTIPART, AUDIO, IMAGE,
  APPLICATION, WINDOZ, BASE64
}

Public Member Functions

const char * classificationToStr (MailFilter::classification klass)

Static Public Member Functions

const int getFileSize (FILE *fp, const char *fileName, char *msgbuf)

const char * trimEnd (char *str)

void trim (const char *str)

const char * findColon (const char *buf)

const char * skipWhiteSpace (const char *pBuf)

bool isBlankLine (const char *buf)

void strncpy (char *pDest, const char *pSrc, size_t destSize)

void toLower (char *dest, const char *src)

void toLower (char *dest, const char *src, size_t size)

bool match (const char *bufStart, const size_t len, const char *word)

bool match (const char *bufStart, const char *bufEnd, const char *word)

contentType classifySection (const char *buf)

const char * typeToStr (contentType type)

MailFilter::classification checkLine (const char *buf, SpamParameters &params, char *foundStr, const size_t foundStrSize)

Private Member Functions

SpamUtil (const SpamUtil &rhs)

Detailed Description

Common utility methods for the spam filter. These are all stateless, static functions. The SpamUtil class is also stateless. The functions in SpamUtil can be called by creating a class temporary, since they are all statics. For example:


       SpamUtil().trim( myString );

Definition at line 41 of file SpamUtil.h.

Constructor & Destructor Documentation

SpamUtil::SpamUtil ( const SpamUtil & rhs ) [private]

disallow the copy constructor
Referenced by classifySection().

Member Function Documentation

MailFilter::classification SpamUtil::checkLine ( const char * buf,

SpamParameters & params,

char * foundStr,

const size_t foundStrSize

) [static]

Check the line in buf against the spam words and the kill words. If a spam word is found, the function returns SUSPECT. If a kill word is found the function returns GARBAGE.
Important Note: the input line must be converted into lower case, or the matches against the spam and kill words may fail when they should not.
If this function is called on lines read from the HTML section of an email, the HTML tags should be stripped out. Since HTML tags are not stripped in the text section, if the HTMLtag or BODYtag strings are found, the email is marked as suspect (spammers will sometimes include HTML in a text section and count on the flexibility of email software to convert it).

Parameters:

buf a line in which all the characters are in lower case (call SpamUtil::toLower to convert the line to lower case).

params a reference to the SpamParameters object which encapsulates the vectors for spam and kill words.

foundStr a character array that will be used to return the spam word that was found.

foundStrSize the size of foundStr.

Definition at line 406 of file SpamUtil.C.
References SpamParameters::getSection(), and strncpy().

/**
 * Check one line of mail text against the spam words and the kill words.
 *
 * The line is first searched for an "<html>" or "<body" tag; if one is
 * present the line is SUSPECT.  Otherwise the spam words are tried, and
 * only when none of them matches are the kill words tried.
 *
 * All comparisons use strstr(), so the line must already be in lower case.
 *
 * @param buf          the lower-case line to examine; a null pointer is
 *                     treated as a line that matches nothing.
 * @param params       supplies the spam_words and kill_words sections.
 * @param foundStr     receives the text that triggered the result: the
 *                     matched word, or the line starting at the HTML tag.
 *                     Set to the empty string when nothing matches.  May be
 *                     null, in which case nothing is reported.
 * @param foundStrSize size of foundStr in bytes; 0 disables reporting.
 * @return SUSPECT for an HTML tag or a spam word, GARBAGE for a kill word,
 *         UNKNOWN otherwise.
 */
MailFilter::classification SpamUtil::checkLine(const char *buf,
                                               SpamParameters &params,
                                               char *foundStr,
                                               const size_t foundStrSize)
{
    const char *HTMLtag = "<html>"; // remember, input line is in lower case
    const char *BODYtag = "<body";  // body may have attributes, so the
                                    // closing angle bracket is omitted.
    MailFilter::classification klass = MailFilter::UNKNOWN;

    // Bind by const reference: the word lists are only read here, so there
    // is no reason to copy both vectors for every line of every message.
    const std::vector<const char *> &spamWords =
        params.getSection(SpamParameters::spam_words);
    const std::vector<const char *> &killWords =
        params.getSection(SpamParameters::kill_words);

    // Never write through a null pointer or into a zero-sized buffer.
    const bool canReport = (foundStr != 0 && foundStrSize > 0);
    if (canReport) {
        foundStr[0] = '\0';
    }
    if (buf == 0) {
        return klass;
    }

    // strstr() on a const char* yields a const char*.
    const char *pHtmlTag = strstr(buf, HTMLtag);
    if (pHtmlTag == 0) {
        pHtmlTag = strstr(buf, BODYtag);
    }

    if (pHtmlTag != 0) {
        if (canReport) {
            strncpy(foundStr, pHtmlTag, foundStrSize);
        }
        return MailFilter::SUSPECT;
    }

    const size_t spamCount = spamWords.size();
    for (size_t i = 0; i < spamCount; i++) {
        if (strstr(buf, spamWords[i]) != 0) {
            if (canReport) {
                strncpy(foundStr, spamWords[i], foundStrSize);
            }
            return MailFilter::SUSPECT;
        }
    } // for

    // Kill words are consulted only when no spam word matched.
    const size_t killCount = killWords.size();
    for (size_t i = 0; i < killCount; i++) {
        if (strstr(buf, killWords[i]) != 0) {
            if (canReport) {
                strncpy(foundStr, killWords[i], foundStrSize);
            }
            klass = MailFilter::GARBAGE;
            break;
        }
    } // for

    return klass;
} // checkLine

SpamUtil::contentType SpamUtil::classifySection ( const char * buf ) [static]

Classify a Content-Type section
Definition at line 272 of file SpamUtil.C.
References SpamUtil().

/**
 * Classify a Content-Type section.
 *
 * The text is lower-cased into a local buffer and then searched for the
 * type names in a fixed order: multipart, html, text, image, audio,
 * application.  The first name found decides the result.
 *
 * Only the first 127 characters of buf are examined, because that is all
 * the local buffer holds.
 *
 * @param buf the Content-Type text, in any letter case; a null pointer
 *            yields UNKNOWN.
 * @return the matching contentType, or UNKNOWN when no name is found.
 */
SpamUtil::contentType SpamUtil::classifySection(const char *buf)
{
    contentType type = UNKNOWN;
    if (buf == 0) {
        return type;
    }

    char tempBuf[128];
    SpamUtil().toLower(tempBuf, buf, sizeof(tempBuf));

    // Search the lower-cased copy, not the original text; otherwise a
    // header such as "Text/HTML" would never be recognised.
    if (strstr(tempBuf, "multipart") != 0) {
        type = MULTIPART;
    }
    else if (strstr(tempBuf, "html") != 0) {
        type = HTML;
    }
    else if (strstr(tempBuf, "text") != 0) {
        type = TEXT;
    }
    else if (strstr(tempBuf, "image") != 0) {
        type = IMAGE;
    }
    else if (strstr(tempBuf, "audio") != 0) {
        type = AUDIO;
    }
    else if (strstr(tempBuf, "application") != 0) {
        type = APPLICATION;
    }

    return type;
} // classifySection

const int SpamUtil::getFileSize ( FILE * fp,

const char * fileName,

char * msgbuf

) [static]

Return the file size in bytes, or -1 if there is an error.
Definition at line 45 of file SpamUtil.C.

/**
 * Return the size in bytes of the file behind an open stream.
 *
 * @param fp       an open stream; when null the function returns -1 and
 *                 leaves msgbuf untouched.
 * @param fileName name used only in the error message; may be null.
 * @param msgbuf   receives an error message on failure.  Its size is not
 *                 passed in, so the caller must supply a buffer large
 *                 enough for the message plus the file name.  May be null,
 *                 in which case no message is written.
 * @return the file size, or -1 on error (null stream, fstat failure, or a
 *         size that cannot be represented in an int).
 */
const int SpamUtil::getFileSize(FILE *fp, const char *fileName, char *msgbuf)
{
    int fileSize = -1;
    if (fp != 0) {
        // Passing a null pointer for %s is undefined, so substitute a name.
        const char *name = (fileName != 0) ? fileName : "(unknown)";
        struct stat s;
        const int fd = fileno( fp );
        if (fstat( fd, &s ) == 0) {
            // Round-trip through int to detect sizes that do not fit,
            // rather than silently returning a truncated value.
            const int asInt = static_cast<int>( s.st_size );
            if (asInt >= 0 && static_cast<off_t>( asInt ) == s.st_size) {
                fileSize = asInt;
            }
            else if (msgbuf != 0) {
                sprintf( msgbuf,
                         "SpamUtil::getFileSize: size of %s does not fit in an int",
                         name );
            }
        }
        else {
            // Capture errno before any other library call can change it.
            const int savedErrno = errno;
            if (msgbuf != 0) {
                // The stream is already open; the call that failed is fstat.
                sprintf( msgbuf,
                         "SpamUtil::getFileSize: could not stat %s. Reason: %s",
                         name, strerror( savedErrno ) );
            }
        }
    }
    return fileSize;
} // getFileSize

bool SpamUtil::isBlankLine ( const char * buf ) [static]

Return true if the line is blank (only white space) or null terminated at the start. False otherwise.
Definition at line 149 of file SpamUtil.C.

/**
 * Report whether a line contains nothing but white space.
 *
 * @param buf a null terminated string, or null.
 * @return true when buf is null, empty, or made up only of characters for
 *         which isspace() is true; false as soon as any other character
 *         is seen.
 */
bool SpamUtil::isBlankLine(const char *buf)
{
    if (buf == 0) {
        return true;
    }
    for (const char *p = buf; *p != '\0'; p++) {
        // isspace() is only defined for unsigned char values and EOF, so
        // bytes above 0x7f must not be passed as a (possibly negative) char.
        if (! isspace( static_cast<unsigned char>( *p ) )) {
            return false;
        }
    } // for
    return true;
} // isBlankLine

bool SpamUtil::match ( const char * bufStart,

const char * bufEnd,

const char * word

) [static]

Check whether a region of a string (defined by bufStart and bufEnd) matches the contents of the string in "word". If there is a match, return true, otherwise return false.
Definition at line 196 of file SpamUtil.C.

/**
 * Check whether the region [bufStart, bufEnd) equals the string in word.
 *
 * Each character of the region is converted to lower case before it is
 * compared, so word is expected to be in lower case.
 *
 * @param bufStart first character of the region; must not be null.
 * @param bufEnd   one past the last character of the region.  A null
 *                 pointer, or one that lies before bufStart, gives false.
 * @param word     the null terminated string to compare with; must not be
 *                 null.
 * @return true only when word is non-empty, has exactly the length of the
 *         region, and every character matches.
 */
bool SpamUtil::match(const char *bufStart, const char *bufEnd, const char *word)
{
    assert( bufStart != 0 && word != 0);

    if (bufEnd == 0 || bufEnd < bufStart) {
        return false;
    }

    // Keep both lengths unsigned and full width so that they can be
    // compared with each other and with the loop index without narrowing.
    const size_t regionLen = static_cast<size_t>( bufEnd - bufStart );
    const size_t wordLen = strlen( word );
    if (wordLen == 0 || wordLen != regionLen) {
        return false;
    }

    for (size_t i = 0; i < wordLen; i++) {
        // tolower() requires an unsigned char value; compare in the same
        // domain so that bytes above 0x7f can still match.
        const int folded = tolower( static_cast<unsigned char>( bufStart[i] ) );
        if (folded != static_cast<unsigned char>( word[i] )) {
            return false;
        }
    } // for

    return true;
} // match

bool SpamUtil::match ( const char * bufStart,

const size_t len,

const char * word

) [static]

Check whether the start of the string matches another string (word). If they match, return true, otherwise return false. Note that this function does the match in lower case.
Definition at line 170 of file SpamUtil.C.

/**
 * Check whether the first len characters of bufStart equal the first len
 * characters of word.
 *
 * Each character of bufStart is converted to lower case before it is
 * compared, so word is expected to be in lower case.
 *
 * @param bufStart the text to test; must not be null.
 * @param len      number of characters to compare; 0 gives false.
 * @param word     the null terminated string to compare with; must not be
 *                 null.  If word is shorter than len the result is false.
 * @return true when all len characters match, false otherwise.
 */
bool SpamUtil::match(const char *bufStart, const size_t len, const char *word)
{
    assert( bufStart != 0 && word != 0);

    if (len == 0) {
        return false;
    }

    for (size_t i = 0; i < len; i++) {
        // Stop at the end of word: without this check two strings that
        // both end before len would be compared past their terminators.
        if (word[i] == '\0') {
            return false;
        }
        // tolower() requires an unsigned char value.
        const int folded = tolower( static_cast<unsigned char>( bufStart[i] ) );
        if (folded != static_cast<unsigned char>( word[i] )) {
            return false;
        }
    } // for

    return true;
} // match

const char * SpamUtil::skipWhiteSpace ( const char * pBuf ) [static]

Skip white spaces at the start of a string. The function returns a pointer to the first non-white space character.
Definition at line 89 of file SpamUtil.C.
Referenced by trim().

/**
 * Skip white space at the start of a string.
 *
 * @param pBuf a null terminated string, or null.
 * @return a pointer to the first character for which isspace() is false;
 *         this is the terminating null when the string is all white
 *         space.  A null argument is returned unchanged.
 */
const char * SpamUtil::skipWhiteSpace(const char *pBuf)
{
    if (pBuf == 0) {
        return pBuf;
    }
    // isspace() requires an unsigned char value; it is false for '\0', so
    // the loop also stops at the end of the string.
    while (*pBuf != '\0' && isspace( static_cast<unsigned char>( *pBuf ) )) {
        pBuf++;
    }
    return pBuf;
} // skipWhiteSpace

void SpamUtil::strncpy ( char * pDest,

const char * pSrc,

size_t destSize

) [static]

Copy pSrc into pDest until a null char is encountered or until destSize-1 characters are copied. Null terminate the string, even if the source is longer than the dest. This is slightly different behavior than the POSIX strncpy, which will not always null terminate. Also, this function only inserts a single null. In contrast, POSIX strncpy will pad the dest with nulls when the length of src is less than dest. Finally, another difference is that this function does not return a char* pointer.
Definition at line 74 of file SpamUtil.C.
Referenced by checkLine().

/**
 * Copy a string into a fixed-size buffer and always null terminate it.
 *
 * At most destSize-1 characters are copied, followed by exactly one null
 * character; the rest of the destination is left untouched.
 *
 * @param pDest    destination buffer; nothing is done when it is null.
 * @param pSrc     null terminated source; a null pointer is treated as an
 *                 empty string.
 * @param destSize size of pDest in bytes; nothing is done when it is 0.
 */
void SpamUtil::strncpy(char *pDest, const char *pSrc, size_t destSize)
{
    // With destSize == 0 the expression destSize-1 would wrap around to
    // the largest size_t value and the copy would run without a bound.
    if (pDest == 0 || destSize == 0) {
        return;
    }

    size_t ix = 0;
    if (pSrc != 0) {
        const size_t maxCopy = destSize - 1;
        while (ix < maxCopy && pSrc[ix] != '\0') {
            pDest[ix] = pSrc[ix];
            ix++;
        }
    }
    pDest[ix] = '\0';
} // strncpy

void SpamUtil::toLower ( char * dest,

const char * src,

size_t size

) [static]

Convert a string to lower case. The destination and source may be the same address. The size argument is the size of the destination. The source is assumed to be a null terminated string.
Definition at line 242 of file SpamUtil.C.

/**
 * Convert a string to lower case, writing at most size-1 characters and
 * then a terminating null.
 *
 * dest and src may be the same address.
 *
 * @param dest destination buffer; nothing is done when it is null.
 * @param src  null terminated source; a null pointer is treated as an
 *             empty string.
 * @param size size of dest in bytes; nothing is done when it is 0.
 */
void SpamUtil::toLower(char *dest, const char *src, size_t size)
{
    // With size == 0 the expression size-1 would wrap around to the
    // largest size_t value and the loop would run without a bound.
    if (dest == 0 || size == 0) {
        return;
    }

    size_t j = 0;
    if (src != 0) {
        const size_t maxCopy = size - 1;
        for (; j < maxCopy && src[j] != '\0'; j++) {
            // tolower() requires an unsigned char value.
            dest[j] = static_cast<char>( tolower( static_cast<unsigned char>( src[j] ) ) );
        } // for
    }
    dest[j] = '\0';
}

void SpamUtil::toLower ( char * dest,

const char * src

) [static]

Convert a string to lower case. The destination and source may be the same address. The source is assumed to be a null terminated string.
Definition at line 258 of file SpamUtil.C.

/**
 * Convert a null terminated string to lower case.
 *
 * dest and src may be the same address.  No size is passed, so dest must
 * have room for strlen(src)+1 characters.
 *
 * @param dest destination buffer; nothing is done when it is null.
 * @param src  null terminated source; a null pointer is treated as an
 *             empty string.
 */
void SpamUtil::toLower(char *dest, const char *src)
{
    if (dest == 0) {
        return;
    }

    size_t j = 0;
    if (src != 0) {
        for (; src[j] != '\0'; j++) {
            // tolower() requires an unsigned char value.
            dest[j] = static_cast<char>( tolower( static_cast<unsigned char>( src[j] ) ) );
        } // for
    }
    dest[j] = '\0';
}

void SpamUtil::trim ( const char * str ) [static]

This function modifies the contents of the string pointed to by the str argument. The result is a string from which the leading and trailing white space have been removed.
Definition at line 119 of file SpamUtil.C.
References skipWhiteSpace().

00120 { 00121 if (str != 0) { 00122 char *start = (char *)skipWhiteSpace( str ); 00123 00124 char *ptr = (char *)str; 00125 if (*start != '\0') { 00126 size_t len = strlen( str ); 00127 char *end = (char *)&str[len-1]; 00128 00129 while (end > start && isspace( *end )) { 00130 end--; 00131 } 00132 size_t trimLen = (end - start) + 1; 00133 for (size_t i = 0; i < trimLen; i++) { 00134 *ptr = *start; 00135 start++; 00136 ptr++; 00137 } 00138 } 00139 *ptr = '\0'; 00140 } 00141 } // trim

const char * SpamUtil::trimEnd ( char * str ) [static]

Remove white space from the end of a string.
Definition at line 101 of file SpamUtil.C.

00102 { 00103 if (str != 0 && str[0] != '\0') { 00104 int len = strlen( str ); 00105 int i; 00106 for (i = len-1; i >=0 && isspace(str[i]); i--) 00107 /* nada */; 00108 str[i+1] = '\0'; 00109 } 00110 return str; 00111 } // trimEnd

const char * SpamUtil::typeToStr ( contentType type ) [static]

Return the character string that corresponds to the contentType enumeration.
Definition at line 307 of file SpamUtil.C.

/**
 * Map a contentType value to a printable name.
 *
 * @param type the enumeration value to describe.
 * @return a pointer to a string literal naming the type, or
 *         "bad content type" for a value outside the enumeration.
 */
const char * SpamUtil::typeToStr(contentType type)
{
    switch (type) {
    case BAD_CONT:    return "bad content";
    case UNKNOWN:     return "unknown";
    case BLANK:       return "blank section";
    case HTML:        return "html";
    case TEXT:        return "text";
    case MULTIPART:   return "multipart";
    case IMAGE:       return "image";
    case APPLICATION: return "application";
    case AUDIO:       return "audio";
    case WINDOZ:      return "windows chars";
    case BASE64:      return "base64";
    default:          break;
    }
    // Reached only for a value that is not one of the enumerators.
    return "bad content type";
} // typeToStr

The documentation for this class was generated from the following files:

Generated on Sat Mar 27 13:07:38 2004 for Mail Filter by

1.3.3


Public Types
enum	contentType { BAD_CONT, UNKNOWN, BLANK, HTML, TEXT, MULTIPART, AUDIO, IMAGE, APPLICATION, WINDOZ, BASE64 }
Public Member Functions
const char *	classificationToStr (MailFilter::classification klass)
Static Public Member Functions
const int	getFileSize (FILE *fp, const char *fileName, char *msgbuf)
const char *	trimEnd (char *str)
void	trim (const char *str)
const char *	findColon (const char *buf)
const char *	skipWhiteSpace (const char *pBuf)
bool	isBlankLine (const char *buf)
void	strncpy (char *pDest, const char *pSrc, size_t destSize)
void	toLower (char *dest, const char *src)
void	toLower (char *dest, const char *src, size_t size)
bool	match (const char *bufStart, const size_t len, const char *word)
bool	match (const char *bufStart, const char *bufEnd, const char *word)
contentType	classifySection (const char *buf)
const char *	typeToStr (contentType type)
MailFilter::classification	checkLine (const char *buf, SpamParameters &params, char *foundStr, const size_t foundStrSize)
Private Member Functions
	SpamUtil (const SpamUtil &rhs)