Advertisement
Guest User

Untitled

a guest
Mar 22nd, 2018
74
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 12.26 KB | None | 0 0
  1. # This is the main file of the project; it is from here that all variables are set from a UI.
  2. import tweetsRunner
  3. import bitlyRunner
  4. import UpdateClicks
  5. import htmlToText
  6. import json
  7. import bitlydatahandler
  8. import naiveBayesPipeline
  9. import Queue as NormalQueue
  10. import time
  11. import os
  12. from multiprocessing import Process, Pool, Manager
  13. #from memory_profiler import profile
  14. #global turnSplit
  15. #turnSplit = 4
  16. def b(tweetsPath, lock):
  17. lock.acquire()
  18. res = bitlyRunner.run(tweetsPath)
  19. lock.release()
  20. print "b has been released"
  21.  
  22. return res
  23.  
  24. def u(q, sleep, turns, lock, runBlockSplitInto1h24h,):
  25. bajs= None
  26. updatingQueue = NormalQueue.PriorityQueue()
  27. timeOfUpdateQueue = NormalQueue.PriorityQueue()
  28. timeOfNextUpdate = 0
  29. run = True
  30. turnCounter =0
  31.  
  32. print "Starting update process"
  33. while run:
  34.  
  35. turns1=turns
  36. k=1
  37. # timeOfNextUpdate never becomes not null.
  38. # print "The time of the next update is: %.2f" % timeOfNextUpdate
  39. if(timeOfNextUpdate != 0):
  40. # print "The next update is in: %.2f" % (timeOfNextUpdate - time.time())
  41. if time.time() >= timeOfNextUpdate:
  42. path = updatingQueue.get()[1]
  43.  
  44. print("turns: " +str(turns))
  45. print("path: " + path)
  46. #print(updatingQueue.queue)
  47. #for line in updatingQueue.queue:
  48. #print("time: " + str(line[0]-time.time())+" path: " + str(line[1]))
  49.  
  50.  
  51. #print "The next news-file is: " + str(path)
  52. startGetting = time.time()
  53. lock.acquire()
  54. startBitly = time.time()
  55. # Make faster/getmore threads
  56. ## uppstyckning av fil
  57. ## gor en tupel med newspath och flagga,
  58. ## eller flera saker i priokon
  59.  
  60. #ifsats som kollar om det ar en onehourpath, samt om de ar 12e uppdateringen
  61. #om sa ar fallet kolla i jamfor path med path gor uppdateringen av clicks
  62. # om globalclicks ar over threshold sa flytta line fran 1h till 24.
  63. # ta bort raden fran 1h
  64.  
  65. res = bitlydatahandler.updateClicks(path)
  66. lock.release()
  67.  
  68. for file_name in os.listdir("./data/"):
  69. if path[:(len(path) - 4)] in "./data/" + file_name and "1h_" in file_name:
  70. turnCounter = turnCounter + 1
  71. if turnCounter == 3 and startRoundTwo(path):
  72. hourlydata=''
  73. dailydata=''
  74. with open(path,"r") as f:
  75. content = f.readlines()
  76. f.close()
  77. ##res ar en lista
  78. for line in res:
  79. for con in content:
  80. print("ASFASOIJFOIAS")
  81. print(type(line['global_clicks']))
  82. if(int(line['global_clicks']) < TRESHHOLD):
  83. print("DAILYYY")
  84. print(type(line['global_clicks']))
  85. if(str(line['short_url']) in con):
  86. print("TRUE " + str(line['global_clicks']) + "<" +str(TRESHHOLD))
  87. print(str(line['global_clicks']))
  88. print(str(line['long_url']))
  89. print("daily url appended")
  90. dailydata=dailydata + con
  91. else:
  92. print("HOURHLYLY")
  93. print(type(line['global_clicks']))
  94.  
  95. if(str(line['short_url']) in con):
  96. print("TRUE " + str(line['global_clicks']) + " > " +str(TRESHHOLD))
  97. print(str(line['global_clicks']))
  98. print(str(line['long_url']))
  99. print("hourly url appended")
  100. hourlydata= hourlydata + con
  101.  
  102. # for line in res:
  103. # for con in content:
  104. # if(int(line['global_clicks']) > TRESHHOLD):
  105.  
  106.  
  107. print("dailydata: " + dailydata)
  108. print("hourlydata: " + hourlydata)
  109.  
  110. hourlywriter = open(path, "w")
  111. hourlywriter.write(hourlydata)
  112. hourlywriter.close()
  113. dailypath = path[:-7]+"24h.txt"
  114. # print("dailypath: " + dailypath)
  115.  
  116. o = open(dailypath,"w")
  117. o.write(dailydata)
  118. o.close()
  119. dailydata=''
  120. hourlydata=''
  121. content =[]
  122.  
  123. time_for_filename = time.strftime("%Y-%m-%d_%H%M%S")
  124.  
  125. #print "u has been released, %.2f seconds after requesting lock" % (time.time() - startGetting)
  126. updatePath = path[:(len(path)-4)] + "_(" + time_for_filename + ").txt"
  127. f = open(updatePath, "w")
  128. f.write(str(res))
  129. f.close()
  130.  
  131.  
  132.  
  133.  
  134.  
  135. print "bitlys updated in %.2f seconds" % (time.time() - startBitly)
  136. #print ("updatepath: " +updatePath)
  137.  
  138. print "Done with all IO after %.2f seconds, for now." % (time.time() - startGetting)
  139. #print "Is the time queue empty: " , timeOfUpdateQueue.empty()
  140. #print "Is the updating queue empty: " , updatingQueue.empty()
  141. if timeOfUpdateQueue.empty() is False:
  142. timeOfNextUpdate = timeOfUpdateQueue.get()
  143. print "Next update is in %.2f seconds." % (timeOfNextUpdate - time.time())
  144. else:
  145. run = False
  146. print "Done updating"
  147. return "DONE"
  148. else:
  149. #print "Short sleep"
  150. time.sleep(1)
  151. else:
  152. #print "Long sleep"
  153. time.sleep(2)
  154. #print "Is the queue empty: " , q.empty()
  155. counter=1
  156. if q.empty() is False:
  157.  
  158. newsPath = q.get()
  159.  
  160. timeConstant= 0
  161. if(newsPath.endswith("1h.txt")):
  162. timeConstant=1
  163. else:
  164. timeConstant=1
  165. turns=turns
  166. print("24 HOUR")
  167.  
  168. #print ("Path to the original news-file (NEWSPATH): " + newsPath)
  169. #print "Turns to save:", turns
  170. t = time.time()
  171. i = 1
  172. while i <= turns:
  173. ##if sats om vi behover annat intervall, for alla i storre an nat, stoppa in modiferad newspath
  174. ## for mindre far vi skapa nya newspathen och dela upp
  175. updatingQueue.put(((t +(sleep * timeConstant*i)),newsPath))
  176. timeOfUpdateQueue.put(t +(sleep * timeConstant*i))
  177. if timeOfNextUpdate == 0:
  178. timeOfNextUpdate = timeOfUpdateQueue.get()
  179. print "First update time added. It is in %.2f seconds." % (timeOfNextUpdate - time.time())
  180. i = i + 1
  181. turns=turns1
  182.  
  183. return "DONE"
  184. TRESHHOLD = 200
  185. alreadyDone = []
  186. def startRoundTwo(path):
  187. if path in alreadyDone:
  188. return False
  189. else:
  190. alreadyDone.append(path)
  191. #split() pa textdokumenten, utifran updatePath som man kan fa fran u (skicka med hit o kolla pa den, eller kalla pa den fran u med denna funkation som villkor)
  192. print("PRINT123 " + path)
  193. return True
  194. #@profile
  195. def runfunc(sleeplength, timeout, runs, runBitly, runHtmlExtractor, runNaiveBayes, saveClicks):
  196. runBlockSplitInto1h24h = True
  197. manager = Manager()
  198. q = manager.Queue()
  199. lock = manager.Lock()
  200. #if sleeplength > 0:
  201. #print("HEEEEJ JESPER ")
  202. p = Process(target=u, args=(q,sleeplength,turns,lock, runBlockSplitInto1h24h))
  203. p.start()
  204. print p.pid
  205.  
  206. pool = Pool(processes=1,maxtasksperchild=2)
  207. i = 0
  208. counter = 0
  209. while i < runs:
  210. tweetsPath = tweetsRunner.collectTweets(timeout)
  211.  
  212. print(tweetsPath)
  213. i = i + 1
  214. if runBitly == "y":
  215. print ("running bitlys: " + str(counter))
  216. print "Running bitlys"
  217. # Which version do we want here?
  218. #pool.apply_async(b, (tweetsPath,lock,), callback=q.put)
  219. shit= b(tweetsPath,lock)
  220.  
  221. hourlyPath =shit[:-4]+"01h.txt"
  222. dailyPath =shit[:-4]+"24h.txt"
  223. f = open(shit,"r")
  224. string = f.read()
  225.  
  226. hourly = open(hourlyPath,"w+")
  227. daily = open(dailyPath,"w+")
  228.  
  229.  
  230. array = string.split('\n')
  231. for line in array[:-1]:
  232. hourly.write(line+'\n')
  233. # array2=line.split(",")
  234. # string2 = array2[1]
  235. # array3 = string2.split(":")
  236. # if(int(array3[1])>TRESHHOLD):
  237. # print("Global_clicks hourly: " +array3[1])
  238. # #hourlyUpdate.append(line)
  239. # hourly.write(line+"\n")
  240. #
  241. # else:
  242. # print("Global_clicks daily: " +array3[1])
  243. #
  244. # daily.write(line+"\n")
  245.  
  246.  
  247. #hourly.write(str(hourlyUpdate))
  248. #daily.write(str(dailyUpdate))
  249.  
  250. hourly.close()
  251. daily.close()
  252. f.close()
  253. #q.put(hourlyPath)
  254. #q.put(dailyPath)
  255.  
  256. q.put(hourlyPath)
  257. q.put(dailyPath)
  258. ## else:
  259. ## time.sleep(timeout)
  260. ## i = i + 1
  261. pool.close()
  262. pool.join()
  263. # time.sleep(sleeplength)
  264. if sleeplength > 0:
  265. p.join()
  266. print "P should have joined now"
  267. print p.is_alive()
  268. if p.is_alive():
  269. p.terminate()
  270.  
  271. if runHtmlExtractor == "y":
  272. print "Runing html"
  273. htmlToText.run()
  274.  
  275. if runNaiveBayes == "y":
  276. print "Runing Bayes"
  277. naiveBayesPipeline.run()
  278.  
  279. if saveClicks == "y":
  280. print "Runing updates"
  281. UpdateClicks.main()
  282.  
  283. print "FINISHED"
  284.  
  285. if (__name__ == '__main__'):
  286.  
  287.  
  288. sleeplength = 600
  289. timeout = 20 # input("In seconds, for how long would you like to collect tweets? This one is the 10-minute we gather tweets ", )
  290.  
  291. runs = 2 #input("For how many runs would you like to collect tweets? used to be 288", )
  292.  
  293. runBitly = "y" #raw_input("Would you like to extract bitly info too? (y/n)")
  294.  
  295. runHtmlExtractor = "y" #raw_input("Would you like to extract articles from identified links? (y/n)")
  296.  
  297. runNaiveBayes = "y" #raw_input("Would you like to classify extracted articles? (y/n)")
  298.  
  299. saveClicks = "y" #raw_input("Would you like to save clicks to excelfile? (y/n)")
  300. #sleeplength = 3 #int(raw_input("How often would you like to update clicks, answer in seconds "))
  301. #turns = 4 #int(raw_input("How many times would you like to update? used to be 84"))
  302. if saveClicks == "y":
  303. sleeplength = 10 #int(raw_input("How often would you like to update clicks, answer in seconds "))
  304. turns = 6 #int(raw_input("How many times would you like to update? used to be 84"))
  305.  
  306. open('./data/seenShortURLs.txt', 'w').close()
  307. open('./data/expanded.txt', 'w').close()
  308. open('./data/runme.txt', 'w').close()
  309. open('./data/links/UnknownArticlesToBeExtracted.txt', 'w').close()
  310. open('./data/links/articleURLAndTitle.txt', 'w').close()
  311. open('./data/news/classifications.txt', 'w').close()
  312. #Erase contents in some .txt
  313. runfunc(sleeplength, timeout, runs, runBitly, runHtmlExtractor, runNaiveBayes, saveClicks)
  314.  
  315. #done = raw_input("You can close the program now by pressing any key")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement