import os
import pickle
import time
from multiprocessing.pool import Pool
import smtplib
from email.mime.text import MIMEText
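# Reads a pickled list of per-document feature rows ('rowlist.backup') and
# writes them out as a Weka ARFF dataset ('dataset.arff').  Rows are encoded
# in parallel by a four-process pool, and an email notification is sent when
# the run completes.  The per-word vector columns are present but disabled
# throughout (see the commented-out "vector stuff" blocks).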
def classify(year):
    """Map a publication year onto one of seven 20-year bins, '1860'..'1980'."""
    if not str(year).isdigit():
        print("year error")
        return -1
    bins = ['1860', '1880', '1900', '1920', '1940', '1960', '1980']
    if int(year) <= 1860:
        return '1860'
    for b in bins:
        # < 20 keeps the full 20-year span in its own bin (a strict < 19
        # would push boundary years such as 1879 into the next bin)
        if int(year) - int(b) < 20:
            return b
    return '1980'
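# Illustration of the binning (example calls, not part of the original paste):
#   classify(1875)  -> '1860'   (the 1860-1879 bin)
#   classify(1995)  -> '1980'   (years past 1999 also fall back to '1980')
#   classify("n/a") -> -1       (non-numeric input is rejected)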
# .arff encoding
def getheader(firstrow):
    """Build the ARFF header: one NUMERIC attribute per column, plus the
    nominal year_interval class attribute."""
    header = "@RELATION docdata\n"
    for name in firstrow.split(','):
        if name == 'year_interval':
            header += "\n@ATTRIBUTE " + name + " {1860s,1880s,1900s,1920s,1940s,1960s,1980s}"
        else:
            header += "\n@ATTRIBUTE " + name + " NUMERIC"
    header += "\n\n@DATA\n"
    return header
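# For the firstline assembled in __main__ below, getheader returns:
#
#   @RELATION docdata
#
#   @ATTRIBUTE raw_fogindex NUMERIC
#   @ATTRIBUTE raw_wordcount NUMERIC
#   @ATTRIBUTE money_refs NUMERIC
#   @ATTRIBUTE raw_sentcount NUMERIC
#   @ATTRIBUTE raw_length NUMERIC
#   @ATTRIBUTE year_interval {1860s,1880s,1900s,1920s,1940s,1960s,1980s}
#
#   @DATA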
def writerow(rowdata, cleansedvector):
    """Turn one pickled row into a comma-separated ARFF data line, or return
    "" if the row fails validation.  rowdata holds the five numeric features
    at indices 0, 1, 3, 4, 5, a word-count dict at index 2, and the
    publication year last."""
    data = "\n"
    # Clean the per-document word counts; these feed the (currently disabled)
    # word-vector columns below.
    oldhash = rowdata[2]
    newhash = dict()
    for jtem in oldhash.keys():
        word = jtem.replace('.', '').replace('!', '').replace(',', '').lower()
        if not word.isdigit():
            if len(word) > 1 or word == "i" or word == "a":
                newhash[word] = oldhash[jtem]
    year = classify(str(rowdata[-1]))
    try:
        if int(year) > 0:
            data += ",".join(str(rowdata[k]) for k in (0, 1, 3, 4, 5))
        else:
            print("row error")
            return ""
    except (TypeError, ValueError):
        print("bigger row error")
        return ""
    # Every feature column must parse as a number.
    for item in data.split(','):
        try:
            float(item)
        except ValueError:
            print("row error")
            return ""
    # vector stuff
    # for jtem in cleansedvector.keys():
    #     data += ","
    #     try:
    #         data += str(newhash[jtem])
    #     except KeyError:
    #         data += "0"
    data += "," + str(year) + "s"
    return data
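# Hypothetical example (field values invented for illustration; layout follows
# the index pattern used above: word-count dict at [2], year last):
#   writerow((11.2, 4820, {'the': 310}, 3, 214, 26000, 1874), {})
#     -> "\n11.2,4820,3,214,26000,1860s"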
def makemastervector():
    """Rebuild the master word vector from the pickled hash list and report
    its size.  (The dump of the rebuilt vector is left disabled, as below.)"""
    with open('hashlist.backup', 'rb') as f:
        hashlist = pickle.load(f)
    overallvector = dict()
    for item in hashlist:
        word = item.replace('.', '').replace('!', '').replace(',', '')
        if not word.isdigit():
            if len(word) > 1 or word.lower() == "i" or word.lower() == "a":
                overallvector[word.lower()] = 1
    return len(overallvector)
    # pickle.dump(overallvector, open('mastervector', 'wb'))

def getmastervector():
    with open('mastervector', 'rb') as f:
        return pickle.load(f)

def getrowlist():
    with open('rowlist.backup', 'rb') as f:
        return pickle.load(f)
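# Assumed pickle contents, inferred from usage (not documented in the paste):
#   hashlist.backup - iterable of raw word strings
#   mastervector    - dict mapping cleaned word -> 1
#   rowlist.backup  - list of rows in the layout writerow expects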
if __name__ == "__main__":
    print("Getting master vector...")
    overallvector = getmastervector()
    unicount = 0
    cleansedvector = dict()
    # The per-word columns are disabled, so cleansedvector stays empty and
    # only the five numeric features plus the class label are written.
    # for item in overallvector.keys()[:2000]:
    #     try:
    #         item.decode('ascii')
    #         if item.isalpha():
    #             cleansedvector[item.replace("'", '')] = 1
    #     except:
    #         unicount += 1
    print("Importing rowlist...")
    firstline = "raw_fogindex,raw_wordcount,money_refs,raw_sentcount,raw_length"
    # for item in cleansedvector.keys():
    #     firstline += "," + item
    firstline += ",year_interval"
    header = getheader(firstline)
    print(header.split('\n')[-2])  # sanity check: last non-blank header line, @DATA
    with open('dataset.arff', 'w') as f:
        f.write(header)
    rowlist = getrowlist()
    print("rowlist imported!")
    pool = Pool(processes=4)
    nworkers = min(4, len(rowlist))
    workers = [pool.apply_async(writerow, [rowlist[k], cleansedvector])
               for k in range(nworkers)]
    i = nworkers
    ts = time.time()
    avgtime = 4        # seed value for the per-row time estimate, in seconds
    sumtime = 0
    totalcount = len(rowlist)
    while i < totalcount:
        for k in range(nworkers):
            if i >= totalcount or not workers[k].ready():
                continue
            # Project the remaining runtime from the running per-row average.
            estimate = (totalcount - i + 1) * avgtime
            duration = estimate / 60.0
            if duration < 3:
                duration = str(duration * 60)[:4] + " seconds."
            else:
                duration = str(duration)[:4] + " minutes."
            with open('dataset.arff', 'a') as f:
                f.write(workers[k].get())
            print("Processing file " + str(i + 1) + " of " + str(totalcount) +
                  ". Estimated time to completion: " + duration)
            workers[k] = pool.apply_async(writerow, [rowlist[i], cleansedvector])
            i += 1
            sumtime += time.time() - ts
            avgtime = float(sumtime) / (i + 1)
            ts = time.time()
        time.sleep(0.01)  # avoid spinning while all slots are still busy
    # Collect the last in-flight jobs so the final rows are written too.
    for w in workers:
        with open('dataset.arff', 'a') as f:
            f.write(w.get())
    pool.close()
    pool.join()
    # Email a completion notice.  Credentials come from the environment
    # (the SMTP_USER / SMTP_PASS names here are placeholders); with Gmail,
    # the password must be an app password, never a hard-coded one.
    sender = os.environ['SMTP_USER']
    recipient = '5159433976@messaging.sprintpcs.com'
    msg = MIMEText("The program has finished extracting the dataset.")
    msg['Subject'] = 'Dataset Extraction Complete'
    msg['From'] = sender
    msg['To'] = recipient
    s = smtplib.SMTP("smtp.gmail.com", 587)
    s.starttls()
    s.login(sender, os.environ['SMTP_PASS'])
    s.sendmail(sender, [recipient], msg.as_string())
    s.quit()
    print("done")