Advertisement
Guest User

Untitled

a guest
Jan 27th, 2015
161
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.96 KB | None | 0 0
  1. import urllib2
  2. import time
  3. import re
  4. from bs4 import BeautifulSoup
  5. from Tkinter import *
  6. import threading
  7.  
  8.  
  9. class What_is_p:
  10.     def __init__(self):
  11.         self.window = Tk()
  12.         self.window.title("What is P?")
  13.  
  14.         self.scrollbar = Scrollbar(self.window)
  15.         self.scrollbar.pack(side=RIGHT, fill=Y)
  16.  
  17.         self.firstTime = True
  18.  
  19.         self.textbox = Text(self.window,
  20.                             height = 6, width = 50,
  21.                             state='disabled',
  22.                             font = ('Courier', 10),
  23.                             bg = 'black',
  24.                             fg = 'green',
  25.                             wrap = 'word',
  26.                             yscrollcommand=self.scrollbar.set)
  27.         self.textbox.pack()
  28.  
  29.         self.scrollbar.config(command=self.textbox.yview)
  30.         self.writeLog("Program Opened. Search begun")
  31.  
  32.         self.journal = 34
  33.         self.issue = 49
  34.              
  35.         self.run()
  36.  
  37.  
  38.  
  39.     def run(self):
  40.         threads = []
  41.         if self.firstTime: # just to allow GUI to open before hard searching begins
  42.             self.firstTime = False
  43.             print "First loop over"
  44.         else:        
  45.             pages = self.get_pages(str(self.journal), str(self.issue))
  46.             print "Got Pages"
  47.             if isinstance(pages, int): #returned a 404 error or such, rather than a normal list, because we came to last issue
  48.                 print "Got to end of journal"
  49.                 self.journal += 1
  50.                 self.issue = 1
  51.             else:
  52.                 for page in pages:
  53.                     print "active threads =" + str(threading.active_count())
  54.                     print "Working Article", str(self.journal), str(self.issue), page
  55.                     t = threading.Thread(target=self.workArticle, args = (str(self.journal),str(self.issue), page))
  56.                     t.daemon = True
  57.                     #threads.append(t)
  58.                     t.start()
  59.                     print "active threads =" + str(threading.active_count())
  60.                     #NON THREADED VERSION
  61.                     #self.workArticle(str(self.journal), str(self.issue), page)
  62.                     #self.textbox.update_idletasks()
  63.                 self.issue += 1 #when done with that issue, step to next
  64.                 print "NEW ISSUE"
  65.             if self.journal >= 35 and self.issue > 2: #if we reach the current issue
  66.                 self.writeLog("Finished!!!!!")
  67.                 print "FINISHED!!!!"
  68.                 return
  69.        
  70.         while threading.active_count() > 3:
  71.             #print self.checkThread(threads)
  72.             print "active threads =" + str(threading.active_count())
  73.             time.sleep(0.5)
  74.  
  75.         self.window.after(100, self.run)
  76.  
  77.     def checkThread(self, threads):
  78.         num_alive_threads = 0
  79.         for t in threads:
  80.             if t.isAlive():
  81.                 num_alive_threads += 1
  82.         return num_alive_threads
  83.  
  84.     def get_pages(self, journal, issue):
  85.         self.writeLog("Working on Journal "+ journal + "/" + issue)
  86.  
  87.         url = "http://www.jneurosci.org/content/" + journal + "/" + issue + ".toc"
  88.  
  89.         req = urllib2.Request(url, headers={ 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.91 Safari/537.36' })
  90.  
  91.         try:
  92.             #page = urllib2.urlopen(url)
  93.             page = urllib2.urlopen(req).read()
  94.         except urllib2.HTTPError, e:
  95.             print e.code
  96.             return e.code
  97.          
  98.         #soup = BeautifulSoup(page.read())
  99.         soup = BeautifulSoup(page)
  100.         articles = soup.find_all("a", {"rel": "full-text"})
  101.         pages = []
  102.         for article in articles:
  103.             page_start_ind =  article['href'].rfind("/") + 1
  104.             page_end_ind = article['href'].rfind(".full")
  105.             page = article['href'][page_start_ind:page_end_ind]
  106.             if page.isdigit():
  107.                 if isinstance(page, unicode):
  108.                     pages.append( page.encode("utf-8") )
  109.                 else:
  110.                     pages.append( page )
  111.         return pages
  112.  
  113.     def workArticle(self, journal, issue, page):
  114.         article = self.getPage(journal, issue, page)
  115.         p_list = self.find_p(article)
  116.         self.write_p_to_file(p_list, journal, issue, page)
  117.         print "1 thread done"
  118.  
  119.     def getPage(self, journal, issue, page):
  120.         url = "http://www.jneurosci.org/content/" + journal + "/" + issue + "/" + page + ".full"
  121.  
  122.         req = urllib2.Request(url, headers={ 'User-Agent': 'Mozilla/5.0' })
  123.         try:
  124.             page = urllib2.urlopen(req).read()
  125.         except urllib2.HTTPError, e:
  126.             print e.code
  127.        
  128.         #page=urllib2.urlopen(url, timeout=10)
  129.         #soup = BeautifulSoup(page.read())
  130.         soup = BeautifulSoup(page)
  131.         sections = soup.find_all("div", {"class": "section"})
  132.         article = ""
  133.         for section in sections:
  134.             article += section.get_text()
  135.         print "Returned Whole Page"
  136.         return article.lower() #return lower case so no need to search for "p =" and "P ="
  137.  
  138.     def find_p(self, article):
  139.         unicode_p_vals = [match.group(0) for match in re.finditer("p ?= ?(0?(\.\d+)+)", article)]
  140.         p_vals = []
  141.         for p_val in unicode_p_vals:
  142.             if isinstance(p_val, unicode):
  143.                 p_vals.append( p_val.encode("utf-8") )
  144.             else:
  145.                 p_vals.append( p_val )
  146.  
  147.         p_greater_05 = ["p>0.05" for match in re.finditer(re.compile("(p>0.05|p > 0.05)"), article)]
  148.         p_less_05 = ["p<0.05" for match in re.finditer(re.compile("(p<0.05|p < 0.05)"), article)]
  149.         p_less_01 = ["p<0.01" for match in re.finditer(re.compile("(p<0.01|p < 0.01)"), article)]
  150.         p_less_005 = ["p<0.005" for match in re.finditer(re.compile("(p<0.005|p < 0.005)"), article)]
  151.         p_less_001 = ["p<0.001" for match in re.finditer(re.compile("(p<0.001|p < 0.001)"), article)]
  152.         p_less_0005 = ["p<0.0005" for match in re.finditer(re.compile("(p<0.0005|p < 0.0005)"), article)]
  153.         p_less_0001 = ["p<0.0001" for match in re.finditer(re.compile("(p<0.0001|p < 0.0001)"), article)]
  154.  
  155.         print p_vals + p_less_05 + p_less_01 + p_less_005 + p_less_001 + p_less_0005 + p_less_0001
  156.  
  157.         return p_vals + p_less_05 + p_less_01 + p_less_005 + p_less_001 + p_less_0005 + p_less_0001
  158.        
  159.     def write_p_to_file(self, p_list, journal, issue, page):
  160.         f = open("p_out.txt", "a")
  161.         for p in p_list:
  162.             f.write(journal + "\t" + issue + "\t" + page + "\t" + p + "\n")
  163.             self.writeLog(journal + "\t" + issue + "\t" + page + "\t" + p)
  164.         f.close()
  165.        
  166.  
  167.     def writeLog(self, msg):
  168.         self.textbox['state'] = 'normal'
  169.         self.textbox.insert('end', "\n"+time.strftime('%H:%M:%S') + " " + msg)
  170.         self.textbox.yview(END)
  171.         self.textbox['state'] = 'disabled'
  172.         self.textbox.update_idletasks()
  173.  
  174. app = What_is_p()
  175. app.window.mainloop()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement