Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #This is the main file of the project, and it is from here that you set all variables from a UI
- import tweetsRunner
- import bitlyRunner
- import UpdateClicks
- import htmlToText
- import json
- import bitlydatahandler
- import naiveBayesPipeline
- import Queue as NormalQueue
- import time
- import os
- from multiprocessing import Process, Pool, Manager
- #from memory_profiler import profile
- #global turnSplit
- #turnSplit = 4
def b(tweetsPath, lock):
    """Run ``bitlyRunner.run`` on *tweetsPath* while holding *lock*.

    Args:
        tweetsPath: path handed straight to ``bitlyRunner.run``.
        lock: object exposing ``acquire()`` / ``release()``; the caller in
            this file passes a ``Manager().Lock()`` that is shared with the
            update process.

    Returns:
        Whatever ``bitlyRunner.run`` returns (the caller in this file treats
        it as a file path).
    """
    lock.acquire()
    try:
        res = bitlyRunner.run(tweetsPath)
    finally:
        # Always hand the lock back, otherwise a failure in bitlyRunner
        # would leave the update process blocked on acquire() forever.
        lock.release()
    print("b has been released")
    return res
def _split_hourly_file(path, res):
    """Rewrite the file at *path* and create its sibling "24h" file.

    Every line of *path* that contains the ``short_url`` of an entry in
    *res* is routed by that entry's ``global_clicks``: below ``TRESHHOLD``
    it goes to the 24h file, otherwise it stays in *path*. Lines that match
    no entry are dropped from both files.
    """
    hourlydata = ''
    dailydata = ''
    with open(path, "r") as f:
        content = f.readlines()
    # res is a list (of dict-like entries, judging by the key access below).
    for line in res:
        for con in content:
            print("ASFASOIJFOIAS")
            print(type(line['global_clicks']))
            if int(line['global_clicks']) < TRESHHOLD:
                print("DAILYYY")
                print(type(line['global_clicks']))
                if str(line['short_url']) in con:
                    print("TRUE " + str(line['global_clicks']) + "<" + str(TRESHHOLD))
                    print(str(line['global_clicks']))
                    print(str(line['long_url']))
                    print("daily url appended")
                    dailydata = dailydata + con
            else:
                print("HOURHLYLY")
                print(type(line['global_clicks']))
                if str(line['short_url']) in con:
                    print("TRUE " + str(line['global_clicks']) + " > " + str(TRESHHOLD))
                    print(str(line['global_clicks']))
                    print(str(line['long_url']))
                    print("hourly url appended")
                    hourlydata = hourlydata + con
    print("dailydata: " + dailydata)
    print("hourlydata: " + hourlydata)
    with open(path, "w") as hourlywriter:
        hourlywriter.write(hourlydata)
    # Dropping the last 7 characters removes an "01h.txt" suffix.
    dailypath = path[:-7] + "24h.txt"
    with open(dailypath, "w") as o:
        o.write(dailydata)


def u(q, sleep, turns, lock, runBlockSplitInto1h24h,):
    """Update loop: periodically refresh click data for queued news files.

    New file paths are read from *q*. Each path is scheduled for *turns*
    updates, spaced *sleep* seconds apart. When an update is due,
    ``bitlydatahandler.updateClicks(path)`` is called under *lock* and
    ``str()`` of its result is written to a time-stamped copy next to
    *path*. The loop ends once an update has run and no further update time
    is scheduled.

    Args:
        q: queue of file paths; only ``empty()`` and ``get()`` are used.
        sleep: seconds between two consecutive updates of the same path.
        turns: number of updates to schedule per path (integer).
        lock: object with ``acquire()`` / ``release()``, shared with ``b``.
        runBlockSplitInto1h24h: accepted but not used by this function.

    Returns:
        The string ``"DONE"``.
    """
    # (due time, path) pairs, and the same due times on their own; the
    # second queue is what drives timeOfNextUpdate.
    updatingQueue = NormalQueue.PriorityQueue()
    timeOfUpdateQueue = NormalQueue.PriorityQueue()
    timeOfNextUpdate = 0  # 0 means "nothing scheduled yet"
    turnCounter = 0       # accumulates over the whole run, never reset
    print("Starting update process")
    while True:
        if timeOfNextUpdate == 0:
            # Long sleep: nothing has been scheduled so far.
            time.sleep(2)
        elif time.time() < timeOfNextUpdate:
            # Short sleep: waiting for the next due time.
            time.sleep(1)
        else:
            path = updatingQueue.get()[1]
            print("turns: " + str(turns))
            print("path: " + path)
            startGetting = time.time()
            lock.acquire()
            try:
                startBitly = time.time()
                res = bitlydatahandler.updateClicks(path)
            finally:
                # Release even if the update raises, so b() is never
                # blocked forever.
                lock.release()

            # Count files in ./data/ whose name contains this path's stem
            # and "1h_" (the time-stamped files written below for a path
            # ending in "1h.txt" match this pattern).
            stem = path[:-4]
            for file_name in os.listdir("./data/"):
                if stem in "./data/" + file_name and "1h_" in file_name:
                    turnCounter = turnCounter + 1
            # NOTE(review): the original indentation was lost; this check is
            # assumed to run once after the directory scan - confirm against
            # the original file.
            if turnCounter == 3 and startRoundTwo(path):
                _split_hourly_file(path, res)

            time_for_filename = time.strftime("%Y-%m-%d_%H%M%S")
            updatePath = stem + "_(" + time_for_filename + ").txt"
            with open(updatePath, "w") as f:
                f.write(str(res))
            print("bitlys updated in %.2f seconds" % (time.time() - startBitly))
            print("Done with all IO after %.2f seconds, for now." % (time.time() - startGetting))

            if timeOfUpdateQueue.empty():
                print("Done updating")
                return "DONE"
            timeOfNextUpdate = timeOfUpdateQueue.get()
            print("Next update is in %.2f seconds." % (timeOfNextUpdate - time.time()))

        if not q.empty():
            newsPath = q.get()
            # Both kinds of file currently use the same spacing factor.
            timeConstant = 1
            if not newsPath.endswith("1h.txt"):
                print("24 HOUR")
            t = time.time()
            # Idea noted by the original authors: use a different interval
            # for some turns by queueing a modified/split newsPath here.
            for i in range(1, turns + 1):
                updateTime = t + (sleep * timeConstant * i)
                updatingQueue.put((updateTime, newsPath))
                timeOfUpdateQueue.put(updateTime)
                if timeOfNextUpdate == 0:
                    timeOfNextUpdate = timeOfUpdateQueue.get()
                    print("First update time added. It is in %.2f seconds." % (timeOfNextUpdate - time.time()))
# Click-count cut-off compared against each entry's 'global_clicks' in u().
TRESHHOLD = 200

# Every path for which startRoundTwo() has already answered True.
alreadyDone = []


def startRoundTwo(path):
    """Return True the first time *path* is passed in, False afterwards.

    A first-time path is recorded in ``alreadyDone`` and echoed to stdout.
    """
    is_new = path not in alreadyDone
    if is_new:
        alreadyDone.append(path)
        # Original idea: split() the text documents here, based on the
        # updatePath available in u().
        print("PRINT123 " + path)
    return is_new
- #@profile
def runfunc(sleeplength, timeout, runs, runBitly, runHtmlExtractor, runNaiveBayes, saveClicks):
    """Run the collection pipeline and the optional post-processing steps.

    Starts ``u`` in a separate process, then *runs* times collects tweets
    and, if requested, runs the bitly step and queues the resulting
    "01h"/"24h" files for ``u``. The optional steps run afterwards.

    Args:
        sleeplength: seconds between click updates, passed to ``u``; the
            update process is only joined when this is greater than 0.
        timeout: passed to ``tweetsRunner.collectTweets``.
        runs: number of collection rounds (integer).
        runBitly: "y" to run the bitly step after each collection round.
        runHtmlExtractor: "y" to call ``htmlToText.run()`` at the end.
        runNaiveBayes: "y" to call ``naiveBayesPipeline.run()`` at the end.
        saveClicks: "y" to call ``UpdateClicks.main()`` at the end.

    Note:
        ``turns`` is not a parameter. It is read from module scope, where
        the ``__main__`` block of this file assigns it.
    """
    runBlockSplitInto1h24h = True
    manager = Manager()
    q = manager.Queue()
    lock = manager.Lock()
    p = Process(target=u, args=(q, sleeplength, turns, lock, runBlockSplitInto1h24h))
    p.start()
    print(p.pid)
    # The pool receives no tasks in this version (the apply_async variant is
    # disabled); it is still created and shut down as before.
    pool = Pool(processes=1, maxtasksperchild=2)
    counter = 0
    for _ in range(runs):
        tweetsPath = tweetsRunner.collectTweets(timeout)
        print(tweetsPath)
        if runBitly == "y":
            print("running bitlys: " + str(counter))
            print("Running bitlys")
            bitlyPath = b(tweetsPath, lock)
            hourlyPath = bitlyPath[:-4] + "01h.txt"
            dailyPath = bitlyPath[:-4] + "24h.txt"
            with open(bitlyPath, "r") as f:
                string = f.read()
            # Everything after the final '\n' is discarded, exactly as the
            # original split('\n')[:-1] did.
            with open(hourlyPath, "w") as hourly:
                hourly.writelines(line + '\n' for line in string.split('\n')[:-1])
            # The 24h file starts out empty; u() fills it later.
            with open(dailyPath, "w"):
                pass
            q.put(hourlyPath)
            q.put(dailyPath)
    pool.close()
    pool.join()
    if sleeplength > 0:
        p.join()
    # NOTE(review): indentation was lost in the source; the liveness check is
    # assumed to sit outside the `if` above, so that an un-joined update
    # process is terminated - confirm against the original file.
    print("P should have joined now")
    print(p.is_alive())
    if p.is_alive():
        p.terminate()
    if runHtmlExtractor == "y":
        print("Runing html")
        htmlToText.run()
    if runNaiveBayes == "y":
        print("Runing Bayes")
        naiveBayesPipeline.run()
    if saveClicks == "y":
        print("Runing updates")
        UpdateClicks.main()
    print("FINISHED")
if __name__ == '__main__':
    # Hard-coded answers to what used to be interactive prompts.
    timeout = 20            # seconds spent collecting tweets per run
    runs = 2                # number of collection runs (used to be 288)
    runBitly = "y"          # extract bitly info too? (y/n)
    runHtmlExtractor = "y"  # extract articles from identified links? (y/n)
    runNaiveBayes = "y"     # classify extracted articles? (y/n)
    saveClicks = "y"        # save clicks to the excel file? (y/n)

    sleeplength = 600
    if saveClicks == "y":
        sleeplength = 10    # seconds between click updates
        turns = 6           # number of updates (used to be 84); read by runfunc

    # Erase the contents of the working .txt files before starting.
    for stale_file in (
        './data/seenShortURLs.txt',
        './data/expanded.txt',
        './data/runme.txt',
        './data/links/UnknownArticlesToBeExtracted.txt',
        './data/links/articleURLAndTitle.txt',
        './data/news/classifications.txt',
    ):
        open(stale_file, 'w').close()

    runfunc(sleeplength, timeout, runs, runBitly, runHtmlExtractor, runNaiveBayes, saveClicks)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement