Advertisement
Guest User

Classification.py

a guest
Jan 5th, 2012
146
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.79 KB | None | 0 0
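'''Classify a mail as spam or non-spam.

Given a mail, split it into words on whitespace, then add up the log
probability of each word under the spam model. Add the log of the spam prior
to that sum (adding logs is equivalent to multiplying the probabilities).
Do the same thing for non-spam. Whichever total is higher wins. Let's start.
'''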
  4. import sys,os
  5. from math import *
  6. def makeDict(f):
  7.     temp = {}
  8.     data = open(f,'r')
  9.     for line in data:
  10.         prob = line.split(" ")
  11.         temp[prob[0]] = prob[1]
  12.     return temp
  13. def predict(basepath,f):
  14.     toClassify = open(os.path.join(basepath,f),'r')
  15.     for line in toClassify:
  16.         words = line.split(" ")
  17.         #print 'words are',words
  18.         spamP = 0
  19.         nonspamP = 0
  20.         for w in words:
  21.             try:
  22.                 spamP = spamP + float(spamProbs[w].strip("\n"))
  23.             except:
  24.                 continue
  25.                
  26.             try:
  27.                 nonspamP = nonspamP + float(nonspamProbs[w].strip("\n"))
  28.             except:
  29.                 continue
  30.                
  31.     totalSpamP = spamP + log ( 0.5 )
  32.     totalnonSpamP = nonspamP + log ( 0.5 )
  33.     #print 'TOtal spam and non-spam probs are ',totalSpamP,totalnonSpamP
  34.     if(totalSpamP > totalnonSpamP):
  35.         return True
  36.     else:
  37.         return False
  38.  
  39. spamProbs = makeDict(sys.argv[1]) #Pass the spam log probs here
  40. nonspamProbs = makeDict(sys.argv[2]) #Pass the non-spam log probs here
  41. #print spamProbs
  42. spamCount = 0
  43. nonspamCount = 0
  44. print 'No of files in spam is',len(os.listdir(sys.argv[3]))
  45. for f in os.listdir(sys.argv[3]):
  46.    
  47.     if(predict(sys.argv[3],f)):
  48.         spamCount = spamCount + 1
  49.     else:
  50.         nonspamCount = nonspamCount + 1
  51. print 'No. of spam in ',sys.argv[3],' is ',str(spamCount),' no. of non-spam',str(nonspamCount)
  52. print 'No of files in non-spam is',len(os.listdir(sys.argv[4]))
  53. spamCount = 0
  54. nonspamCount =0
  55. for f in os.listdir(sys.argv[4]):
  56.    
  57.     if(predict(sys.argv[4],f)):
  58.         spamCount = spamCount + 1
  59.     else:
  60.         nonspamCount = nonspamCount + 1
  61. print 'No. of spam in ',sys.argv[4],' is ',spamCount,' no. of non-spam',nonspamCount
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement