Advertisement
Guest User

Untitled

a guest
Oct 6th, 2015
149
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 7.53 KB | None | 0 0
  1. import pickle
  2. import glob
  3. import time
  4. from multiprocessing.pool import Pool
  5. import enchant
  6. import smtplib
  7. from email.mime.text import MIMEText
  8.  
  9. def classify(year):
  10.     if (str(year).isdigit()==False):
  11.         print "year error"
  12.         return -1        
  13.     binn = ['1860','1880','1900','1920','1940','1960','1980']
  14.     i=0
  15.     think = 0
  16.     notfound = True
  17.     if int(year)<=1860:
  18.         return '1860'
  19.     for i in binn:
  20.         if ((int(year)-int(i))<19):
  21.             return i
  22.     return '1980'
  23.  
  24. # ARFF (.arff) encoding
  25. def getheader(firstrow):
  26.     header = "@RELATION docdata\n"
  27.    
  28.     for i in firstrow.split(','):
  29.         if (i=='year_interval'):
  30.             header+= "\n@ATTRIBUTE "+str(i)+" {1860s,1880s,1900s,1920s,1940s,1960s,1980s}"
  31.         else:
  32.             header+= "\n@ATTRIBUTE "+str(i)+" NUMERIC"
  33.        
  34.    
  35.     header+="\n\n@DATA\n"
  36.    
  37.     return header
  38.  
  39.  
  40. def writerow(rowdata,cleansedvector):
  41.     data="\n"
  42.     spell = enchant.Dict("en_US")
  43.     oldhash = rowdata[2]
  44.     newhash = dict()
  45.     for jtem in oldhash.keys():
  46.         word = jtem.replace('.','').replace('!','').replace(',','').lower()
  47.         hexcheck = True
  48.         if (word.isdigit()==False):
  49.             if (((len(word)>1)or(word.lower()=="i"))or(word.lower()=="a")):
  50.                 newhash[word] = oldhash[jtem]    
  51.     year = classify(str(rowdata[-1]))
  52.     try:
  53.         if (int(year)>0):
  54.             #data+=str(year)+"s,"+str(rowdata[0])+","+str(rowdata[1])+","+str(rowdata[3])+","+str(rowdata[4])+","+str(rowdata[5])
  55.             data+=str(rowdata[0])+","+str(rowdata[1])+","+str(rowdata[3])+","+str(rowdata[4])+","+str(rowdata[5])
  56.         else:
  57.             print "row error"
  58.             return ""
  59.     except:
  60.         print "bigger row error"
  61.         return ""
  62.     for item in data.split(','):
  63.         try:
  64.             float(item)
  65.         except:
  66.             print "row error"
  67.             return ""
  68.     #vector stuff
  69.     #for jtem in cleansedvector.keys():
  70.         #data+=","
  71.         #try:
  72.             #data+=str(newhash[jtem])
  73.         #except:
  74.             #data+="0"
  75.     data+=","+str(year)+"s"
  76.     return data
  77.  
  78. def makemastervector():
  79.     hashlist = pickle.load(open('hashlist.backup','rb'))
  80.     overallvector = dict()
  81.     for item in hashlist:
  82.         word = item.replace('.','').replace('!','').replace(',','')
  83.         if (word.isdigit()==False):
  84.             if (((len(word)>1)or(word.lower()=="i"))or(word.lower()=="a")):
  85.                 overallvector[word.lower()] = 1
  86.     return len(overallvector)
  87.     #pickle.dump(overallvector,open('mastervector','wb'))
  88.  
  89. def getmastervector():
  90.     return pickle.load(open('mastervector','rb'))
  91.  
  92. def getrowlist():
  93.     return pickle.load(open('rowlist.backup','rb'))
  94.  
  95.  
  96. if __name__ == "__main__":
  97.     print "Getting master vector..."
  98.     overallvector = getmastervector()
  99.     unicount = 0
  100.     cleansedvector = dict()
  101.     #for item in overallvector.keys()[:2000]:
  102.         #try:
  103.             #item.decode('ascii')
  104.             #if (item.isalpha()):
  105.                 #cleansedvector[item.replace("'",'')] = 1
  106.         #except:
  107.             #unicount+=1        
  108.        
  109.        
  110.     print "Importing rowlist..."
  111.     firstline = "raw_fogindex,raw_wordcount,money_refs,raw_sentcount,raw_length"
  112.     #for item in cleansedvector.keys():
  113.         #firstline+=","+item
  114.     firstline+=",year_interval"
  115.     header = getheader(firstline)
  116.     print header.split('\n')[-1]
  117.     with open('dataset.arff','w') as f:
  118.         f.write(header)
  119.         f.close()
  120.    
  121.     rowlist = getrowlist()
  122.     print "rowlist imported!"
  123.     pool = Pool(processes = 4)
  124.     worker0 = pool.apply_async(writerow, [rowlist[0],cleansedvector])
  125.     worker1 = pool.apply_async(writerow, [rowlist[1],cleansedvector])
  126.     worker2 = pool.apply_async(writerow, [rowlist[2],cleansedvector])
  127.     worker3 = pool.apply_async(writerow, [rowlist[3],cleansedvector])
  128.     i = 4
  129.     ts = time.time()
  130.     avgtime = 4
  131.     sumtime = 0
  132.     totalcount = len(rowlist)
  133.     while(i<totalcount):
  134.         if (worker0.ready()==True):
  135.             estimate = (totalcount-i+1)*avgtime  
  136.             duration = estimate/60
  137.             if (duration<3):
  138.                 duration = str(duration*60)[:4]+" seconds."
  139.             else:
  140.                 duration = str(duration)[:4]+" minutes."
  141.             with open('dataset.arff','a') as f:
  142.                 f.write(worker0.get())
  143.                 f.close()
  144.             print "Processing file "+str(i+1)+" of "+str(totalcount)+". Estimated time to completion: "+duration
  145.             worker0 = pool.apply_async(writerow, [rowlist[i],cleansedvector])
  146.             i+=1
  147.             sumtime+=time.time()-ts
  148.             avgtime = float(sumtime)/(i+1)            
  149.             ts = time.time()                                        
  150.         if (worker1.ready()==True):
  151.             estimate = (totalcount-i+1)*avgtime  
  152.             duration = estimate/60
  153.             if (duration<3):
  154.                 duration = str(duration*60)[:4]+" seconds."
  155.             else:
  156.                 duration = str(duration)[:4]+" minutes."
  157.             with open('dataset.arff','a') as f:
  158.                 f.write(worker1.get())
  159.                 f.close()
  160.             print "Processing file "+str(i+1)+" of "+str(totalcount)+". Estimated time to completion: "+duration
  161.             worker1 = pool.apply_async(writerow, [rowlist[i],cleansedvector])
  162.             i+=1
  163.             sumtime+=time.time()-ts
  164.             avgtime = float(sumtime)/(i+1)            
  165.             ts = time.time()                                    
  166.         if (worker2.ready()==True):
  167.             estimate = (totalcount-i+1)*avgtime  
  168.             duration = estimate/60
  169.             if (duration<3):
  170.                 duration = str(duration*60)[:4]+" seconds."
  171.             else:
  172.                 duration = str(duration)[:4]+" minutes."
  173.             with open('dataset.arff','a') as f:
  174.                 f.write(worker2.get())
  175.                 f.close()
  176.             print "Processing file "+str(i+1)+" of "+str(totalcount)+". Estimated time to completion: "+duration
  177.             worker2 = pool.apply_async(writerow, [rowlist[i],cleansedvector])
  178.             i+=1
  179.             sumtime+=time.time()-ts
  180.             avgtime = float(sumtime)/(i+1)            
  181.             ts = time.time()                                                                        
  182.         if (worker3.ready()==True):
  183.             estimate = (totalcount-i+1)*avgtime  
  184.             duration = estimate/60
  185.             if (duration<3):
  186.                 duration = str(duration*60)[:4]+" seconds."
  187.             else:
  188.                 duration = str(duration)[:4]+" minutes."
  189.             with open('dataset.arff','a') as f:
  190.                 f.write(worker3.get())
  191.                 f.close()
  192.             print "Processing file "+str(i+1)+" of "+str(totalcount)+". Estimated time to completion: "+duration
  193.             worker3 = pool.apply_async(writerow, [rowlist[i],cleansedvector])
  194.             i+=1
  195.             sumtime+=time.time()-ts
  196.             avgtime = float(sumtime)/(i+1)            
  197.             ts = time.time()    
  198.    
  199.  
  200.     msg = MIMEText("The program has finished extracting the dataset.")
  201.     msg['Subject'] = 'Dataset Extraction Complete'
  202.     msg['From'] = 'reniat314@gmail.com'
  203.     msg['To'] = '5159433976@messaging.sprintpcs.com'
  204.     s = smtplib.SMTP( "smtp.gmail.com", 587 )
  205.     s.starttls()
  206.     s.login( 'reniat314@gmail.com', 'iowatrombone2718' )
  207.     s.sendmail('reniat314@gmail.com', ['5159433976@messaging.sprintpcs.com'], msg.as_string())
  208.     s.quit()          
  209.        
  210.     print "done"
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement