import os
import pickle
import time
from multiprocessing.pool import Pool
import smtplib
from email.mime.text import MIMEText
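# Reads a pickled list of per-document feature rows ('rowlist.backup') and
# writes them out as a Weka ARFF dataset ('dataset.arff').  Rows are encoded
# in parallel by a four-process pool, and an email notification is sent when
# the run completes.  The per-word vector columns are present but disabled
# throughout (see the commented-out "vector stuff" blocks).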
def classify(year):
    """Map a publication year onto one of seven 20-year bins, '1860'..'1980'."""
    if not str(year).isdigit():
        print("year error")
        return -1
    bins = ['1860', '1880', '1900', '1920', '1940', '1960', '1980']
    if int(year) <= 1860:
        return '1860'
    for b in bins:
        # < 20 keeps the full 20-year span in its own bin (a strict < 19
        # would push boundary years such as 1879 into the next bin)
        if int(year) - int(b) < 20:
            return b
    return '1980'
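# Illustration of the binning (example calls, not part of the original paste):
#   classify(1875)  -> '1860'   (the 1860-1879 bin)
#   classify(1995)  -> '1980'   (years past 1999 also fall back to '1980')
#   classify("n/a") -> -1       (non-numeric input is rejected)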
# .arff encoding
def getheader(firstrow):
    """Build the ARFF header: one NUMERIC attribute per column, plus the
    nominal year_interval class attribute."""
    header = "@RELATION docdata\n"
    for name in firstrow.split(','):
        if name == 'year_interval':
            header += "\n@ATTRIBUTE " + name + " {1860s,1880s,1900s,1920s,1940s,1960s,1980s}"
        else:
            header += "\n@ATTRIBUTE " + name + " NUMERIC"
    header += "\n\n@DATA\n"
    return header
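# For the firstline assembled in __main__ below, getheader returns:
#
#   @RELATION docdata
#
#   @ATTRIBUTE raw_fogindex NUMERIC
#   @ATTRIBUTE raw_wordcount NUMERIC
#   @ATTRIBUTE money_refs NUMERIC
#   @ATTRIBUTE raw_sentcount NUMERIC
#   @ATTRIBUTE raw_length NUMERIC
#   @ATTRIBUTE year_interval {1860s,1880s,1900s,1920s,1940s,1960s,1980s}
#
#   @DATA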
def writerow(rowdata, cleansedvector):
    """Turn one pickled row into a comma-separated ARFF data line, or return
    "" if the row fails validation.  rowdata holds the five numeric features
    at indices 0, 1, 3, 4, 5, a word-count dict at index 2, and the
    publication year last."""
    data = "\n"
    # Clean the per-document word counts; these feed the (currently disabled)
    # word-vector columns below.
    oldhash = rowdata[2]
    newhash = dict()
    for jtem in oldhash.keys():
        word = jtem.replace('.', '').replace('!', '').replace(',', '').lower()
        if not word.isdigit():
            if len(word) > 1 or word == "i" or word == "a":
                newhash[word] = oldhash[jtem]
    year = classify(str(rowdata[-1]))
    try:
        if int(year) > 0:
            data += ",".join(str(rowdata[k]) for k in (0, 1, 3, 4, 5))
        else:
            print("row error")
            return ""
    except (TypeError, ValueError):
        print("bigger row error")
        return ""
    # Every feature column must parse as a number.
    for item in data.split(','):
        try:
            float(item)
        except ValueError:
            print("row error")
            return ""
    # vector stuff
    # for jtem in cleansedvector.keys():
    #     data += ","
    #     try:
    #         data += str(newhash[jtem])
    #     except KeyError:
    #         data += "0"
    data += "," + str(year) + "s"
    return data
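# Hypothetical example (field values invented for illustration; layout follows
# the index pattern used above: word-count dict at [2], year last):
#   writerow((11.2, 4820, {'the': 310}, 3, 214, 26000, 1874), {})
#     -> "\n11.2,4820,3,214,26000,1860s"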
def makemastervector():
    """Rebuild the master word vector from the pickled hash list and report
    its size.  (The dump of the rebuilt vector is left disabled, as below.)"""
    with open('hashlist.backup', 'rb') as f:
        hashlist = pickle.load(f)
    overallvector = dict()
    for item in hashlist:
        word = item.replace('.', '').replace('!', '').replace(',', '')
        if not word.isdigit():
            if len(word) > 1 or word.lower() == "i" or word.lower() == "a":
                overallvector[word.lower()] = 1
    return len(overallvector)
    # pickle.dump(overallvector, open('mastervector', 'wb'))

def getmastervector():
    with open('mastervector', 'rb') as f:
        return pickle.load(f)

def getrowlist():
    with open('rowlist.backup', 'rb') as f:
        return pickle.load(f)
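# Assumed pickle contents, inferred from usage (not documented in the paste):
#   hashlist.backup - iterable of raw word strings
#   mastervector    - dict mapping cleaned word -> 1
#   rowlist.backup  - list of rows in the layout writerow expects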
if __name__ == "__main__":
    print("Getting master vector...")
    overallvector = getmastervector()
    unicount = 0
    cleansedvector = dict()
    # The per-word columns are disabled, so cleansedvector stays empty and
    # only the five numeric features plus the class label are written.
    # for item in overallvector.keys()[:2000]:
    #     try:
    #         item.decode('ascii')
    #         if item.isalpha():
    #             cleansedvector[item.replace("'", '')] = 1
    #     except:
    #         unicount += 1
    print("Importing rowlist...")
    firstline = "raw_fogindex,raw_wordcount,money_refs,raw_sentcount,raw_length"
    # for item in cleansedvector.keys():
    #     firstline += "," + item
    firstline += ",year_interval"
    header = getheader(firstline)
    print(header.split('\n')[-2])  # sanity check: last non-blank header line, @DATA
    with open('dataset.arff', 'w') as f:
        f.write(header)
    rowlist = getrowlist()
    print("rowlist imported!")
    pool = Pool(processes=4)
    nworkers = min(4, len(rowlist))
    workers = [pool.apply_async(writerow, [rowlist[k], cleansedvector])
               for k in range(nworkers)]
    i = nworkers
    ts = time.time()
    avgtime = 4        # seed value for the per-row time estimate, in seconds
    sumtime = 0
    totalcount = len(rowlist)
    while i < totalcount:
        for k in range(nworkers):
            if i >= totalcount or not workers[k].ready():
                continue
            # Project the remaining runtime from the running per-row average.
            estimate = (totalcount - i + 1) * avgtime
            duration = estimate / 60.0
            if duration < 3:
                duration = str(duration * 60)[:4] + " seconds."
            else:
                duration = str(duration)[:4] + " minutes."
            with open('dataset.arff', 'a') as f:
                f.write(workers[k].get())
            print("Processing file " + str(i + 1) + " of " + str(totalcount) +
                  ". Estimated time to completion: " + duration)
            workers[k] = pool.apply_async(writerow, [rowlist[i], cleansedvector])
            i += 1
            sumtime += time.time() - ts
            avgtime = float(sumtime) / (i + 1)
            ts = time.time()
        time.sleep(0.01)  # avoid spinning while all slots are still busy
    # Collect the last in-flight jobs so the final rows are written too.
    for w in workers:
        with open('dataset.arff', 'a') as f:
            f.write(w.get())
    pool.close()
    pool.join()
    # Email a completion notice.  Credentials come from the environment
    # (the SMTP_USER / SMTP_PASS names here are placeholders); with Gmail,
    # the password must be an app password, never a hard-coded one.
    sender = os.environ['SMTP_USER']
    recipient = '5159433976@messaging.sprintpcs.com'
    msg = MIMEText("The program has finished extracting the dataset.")
    msg['Subject'] = 'Dataset Extraction Complete'
    msg['From'] = sender
    msg['To'] = recipient
    s = smtplib.SMTP("smtp.gmail.com", 587)
    s.starttls()
    s.login(sender, os.environ['SMTP_PASS'])
    s.sendmail(sender, [recipient], msg.as_string())
    s.quit()
    print("done")