Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import urllib2
- import time
- import re
- from bs4 import BeautifulSoup
- from Tkinter import *
- import threading
class What_is_p:
    """Tkinter GUI that crawls jneurosci.org issue tables of contents and
    harvests reported p-values from every full-text article, appending the
    results to p_out.txt and echoing them to a scrolling log window.

    Written for Python 2 (urllib2 / Tkinter / bs4), but uses only syntax
    that is also valid on Python 3 (print(), 'except ... as e', 'with').
    """

    def __init__(self):
        # Build the log window: a read-only, green-on-black console.
        self.window = Tk()
        self.window.title("What is P?")
        self.scrollbar = Scrollbar(self.window)
        self.scrollbar.pack(side=RIGHT, fill=Y)
        self.firstTime = True  # lets the GUI paint once before crawling starts
        self.textbox = Text(self.window,
                            height=6, width=50,
                            state='disabled',
                            font=('Courier', 10),
                            bg='black',
                            fg='green',
                            wrap='word',
                            yscrollcommand=self.scrollbar.set)
        self.textbox.pack()
        self.scrollbar.config(command=self.textbox.yview)
        self.writeLog("Program Opened. Search begun")
        self.journal = 34  # crawl starts at volume 34 ...
        self.issue = 49    # ... issue 49
        self.run()

    def run(self):
        """Process one journal issue per call, then reschedule itself on the
        Tk event loop (window.after) so the GUI stays mostly responsive."""
        if self.firstTime:
            # Skip real work on the first pass so the window can render
            # before hard searching begins.
            self.firstTime = False
            print("First loop over")
        else:
            pages = self.get_pages(str(self.journal), str(self.issue))
            print("Got Pages")
            if isinstance(pages, int):
                # get_pages returned an HTTP status code (e.g. 404) instead
                # of a page list: we ran past the last issue of this volume.
                print("Got to end of journal")
                self.journal += 1
                self.issue = 1
            else:
                for page in pages:
                    print("active threads =" + str(threading.active_count()))
                    print("Working Article " + str(self.journal) + " "
                          + str(self.issue) + " " + page)
                    t = threading.Thread(target=self.workArticle,
                                         args=(str(self.journal),
                                               str(self.issue), page))
                    t.daemon = True  # never block interpreter exit
                    t.start()
                    print("active threads =" + str(threading.active_count()))
                self.issue += 1  # done with this issue, step to the next
                print("NEW ISSUE")
            if self.journal >= 35 and self.issue > 2:
                # Reached the journal's current issue: stop rescheduling.
                self.writeLog("Finished!!!!!")
                print("FINISHED!!!!")
                return
            # Throttle: block until most worker threads have drained
            # (main thread + Tk internals account for the baseline of 3).
            while threading.active_count() > 3:
                print("active threads =" + str(threading.active_count()))
                time.sleep(0.5)
        self.window.after(100, self.run)

    def checkThread(self, threads):
        """Return the number of threads in *threads* that are still alive."""
        # is_alive() replaces the deprecated isAlive() alias (both exist
        # on Python 2.6+).
        return sum(1 for t in threads if t.is_alive())

    def get_pages(self, journal, issue):
        """Fetch the table of contents for journal/issue and return the list
        of article page numbers, or the integer HTTP status code on error
        (callers use isinstance(..., int) to detect the end of a volume)."""
        self.writeLog("Working on Journal " + journal + "/" + issue)
        url = "http://www.jneurosci.org/content/" + journal + "/" + issue + ".toc"
        # Spoof a browser User-Agent; the site rejects the default one.
        req = urllib2.Request(url, headers={ 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.91 Safari/537.36' })
        try:
            page = urllib2.urlopen(req).read()
        except urllib2.HTTPError as e:
            print(str(e.code))
            return e.code
        soup = BeautifulSoup(page)
        articles = soup.find_all("a", {"rel": "full-text"})
        pages = []
        for article in articles:
            # hrefs look like ".../<page>.full" -- keep the page number only.
            href = article['href']
            candidate = href[href.rfind("/") + 1:href.rfind(".full")]
            if candidate.isdigit():
                if not isinstance(candidate, str):
                    # Python 2: bs4 yields unicode; normalise to utf-8 bytes.
                    candidate = candidate.encode("utf-8")
                pages.append(candidate)
        return pages

    def workArticle(self, journal, issue, page):
        """Worker-thread entry point: fetch one article, mine its p-values,
        and append them to the output file and the GUI log."""
        article = self.getPage(journal, issue, page)
        p_list = self.find_p(article)
        self.write_p_to_file(p_list, journal, issue, page)
        print("1 thread done")

    def getPage(self, journal, issue, page):
        """Download one article's full text and return the concatenated text
        of its <div class="section"> elements, lower-cased (so callers only
        search for "p =", never "P =").  Returns "" on HTTP error."""
        url = "http://www.jneurosci.org/content/" + journal + "/" + issue + "/" + page + ".full"
        req = urllib2.Request(url, headers={ 'User-Agent': 'Mozilla/5.0' })
        try:
            html = urllib2.urlopen(req).read()
        except urllib2.HTTPError as e:
            print(str(e.code))
            # Bug fix: the original fell through here and handed the *page
            # number string* to BeautifulSoup; return an empty article instead.
            return ""
        soup = BeautifulSoup(html)
        sections = soup.find_all("div", {"class": "section"})
        article = ""
        for section in sections:
            article += section.get_text()
        print("Returned Whole Page")
        return article.lower()

    def find_p(self, article):
        """Return every p-value reported in *article* (assumed lower-cased).

        Exactly-stated values ("p = 0.03") are returned verbatim, in document
        order, followed by threshold reports normalised to "p<<threshold>"
        for each threshold in descending order.
        """
        p_vals = []
        for match in re.finditer(r"p ?= ?(0?(\.\d+)+)", article):
            p_val = match.group(0)
            if not isinstance(p_val, str):
                # Python 2: re on unicode text yields unicode; keep utf-8 bytes.
                p_val = p_val.encode("utf-8")
            p_vals.append(p_val)
        # NOTE(review): the original also counted "p > 0.05" matches but never
        # included them in the printed/returned list; that exclusion is
        # preserved here -- confirm whether it was intentional.
        for threshold in ("0.05", "0.01", "0.005", "0.001", "0.0005", "0.0001"):
            # Escape the dots: the original patterns used a bare "." which
            # matched any character (e.g. "p<0x05").
            pattern = re.compile(r"p ?< ?" + threshold.replace(".", r"\."))
            p_vals.extend("p<" + threshold for _ in pattern.finditer(article))
        print(p_vals)
        return p_vals

    def write_p_to_file(self, p_list, journal, issue, page):
        """Append one tab-separated line per found p-value to p_out.txt and
        mirror each line to the GUI log."""
        # 'with' guarantees the handle is closed even if a write raises
        # (the original leaked the handle on error).
        with open("p_out.txt", "a") as f:
            for p in p_list:
                line = journal + "\t" + issue + "\t" + page + "\t" + p
                f.write(line + "\n")
                self.writeLog(line)

    def writeLog(self, msg):
        """Append a timestamped message to the read-only log textbox."""
        # The textbox is kept 'disabled' so users cannot type into it;
        # temporarily enable it to insert, then lock it again.
        self.textbox['state'] = 'normal'
        self.textbox.insert('end', "\n" + time.strftime('%H:%M:%S') + " " + msg)
        self.textbox.yview(END)  # keep the newest line visible
        self.textbox['state'] = 'disabled'
        self.textbox.update_idletasks()
if __name__ == "__main__":
    # Guarded entry point: constructing What_is_p opens the Tk window and
    # schedules the crawl via window.after(); mainloop() blocks until the
    # window is closed.  The guard prevents the GUI/network side effects
    # from firing if this module is ever imported.
    app = What_is_p()
    app.window.mainloop()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement