# Untitled

import urllib2
import time
import re
from bs4 import BeautifulSoup
from Tkinter import *
import threading


class What_is_p:
    """Tk window that scrapes p-value statements from www.jneurosci.org articles.

    Starting at ``journal``/``issue`` (the two numeric components of the
    site's ``/content/<journal>/<issue>`` URLs), the table of contents of each
    issue is downloaded on the GUI thread, every listed article is processed
    on its own daemon thread, and each p-value statement found is appended to
    ``p_out.txt`` and echoed in the on-screen log.

    Only the GUI thread touches Tk widgets: log lines produced by worker
    threads are queued and displayed by a periodic ``after`` callback, and
    waiting for workers is done by polling with ``after`` instead of sleeping
    inside the event loop.
    """

    # The next issue is started once at most this many threads are alive.
    # threading.active_count() includes the main thread.
    _MAX_ACTIVE_THREADS = 3
    # Milliseconds between checks on still-running workers.
    _WORKER_POLL_MS = 500
    # Milliseconds between flushes of worker log lines into the text box.
    _LOG_POLL_MS = 200
    # Thresholds searched for in the literal forms "p<X" and "p < X",
    # in the order their matches appear in find_p()'s result.
    _P_LESS_THRESHOLDS = ("0.05", "0.01", "0.005", "0.001", "0.0005", "0.0001")

    def __init__(self):
        """Build the window, log the start-up message and schedule the search."""
        self.window = Tk()
        self.window.title("What is P?")

        self.scrollbar = Scrollbar(self.window)
        self.scrollbar.pack(side=RIGHT, fill=Y)

        self.firstTime = True

        # State shared with the worker threads.
        self._gui_thread = threading.current_thread()
        self._log_lock = threading.Lock()    # guards _pending_log
        self._pending_log = []               # formatted lines awaiting display
        self._file_lock = threading.Lock()   # serialises appends to p_out.txt

        self.textbox = Text(self.window,
                            height=6, width=50,
                            state='disabled',
                            font=('Courier', 10),
                            bg='black',
                            fg='green',
                            wrap='word',
                            yscrollcommand=self.scrollbar.set)
        self.textbox.pack()

        self.scrollbar.config(command=self.textbox.yview)
        self.writeLog("Program Opened. Search begun")

        self.journal = 34
        self.issue = 49

        # Keep draining worker log lines for as long as the window exists.
        self.window.after(self._LOG_POLL_MS, self._flush_pending_log)

        self.run()

    def run(self):
        """Process one issue, then arrange for the next call of ``run``.

        The very first call does no work so the window can appear before any
        network access happens. Every later call fetches the article list of
        the current issue and starts one daemon thread per article. An
        integer result from ``get_pages`` (an HTTP error code) is taken to
        mean the journal has no further issues, so the search moves on to
        issue 1 of the next journal. The search stops, without rescheduling,
        once ``journal >= 35`` and ``issue > 2``.
        """
        if self.firstTime:  # just to allow GUI to open before hard searching begins
            self.firstTime = False
            print("First loop over")
        else:
            pages = self.get_pages(str(self.journal), str(self.issue))
            print("Got Pages")
            if isinstance(pages, int):
                # An HTTP error code instead of a list: we ran past the last issue.
                print("Got to end of journal")
                self.journal += 1
                self.issue = 1
            else:
                for page in pages:
                    print("active threads =" + str(threading.active_count()))
                    print(" ".join(["Working Article", str(self.journal),
                                    str(self.issue), page]))
                    worker = threading.Thread(
                        target=self.workArticle,
                        args=(str(self.journal), str(self.issue), page))
                    worker.daemon = True
                    worker.start()
                    print("active threads =" + str(threading.active_count()))
                self.issue += 1  # when done with that issue, step to next
                print("NEW ISSUE")
            if self.journal >= 35 and self.issue > 2:  # if we reach the current issue
                self.writeLog("Finished!!!!!")
                print("FINISHED!!!!")
                return

        self._schedule_next_run()

    def _schedule_next_run(self):
        """Schedule ``run`` once few enough worker threads remain alive.

        Polls via ``after`` rather than sleeping, so the Tk event loop keeps
        servicing redraws and queued log lines while articles download.
        """
        active = threading.active_count()
        if active > self._MAX_ACTIVE_THREADS:
            print("active threads =" + str(active))
            self.window.after(self._WORKER_POLL_MS, self._schedule_next_run)
        else:
            self.window.after(100, self.run)

    def checkThread(self, threads):
        """Return how many of the given ``threading.Thread`` objects are alive."""
        return sum(1 for t in threads if t.is_alive())

    def _to_native_str(self, text):
        """Return ``text`` as ``str``, UTF-8 encoding it if it is not one already.

        On Python 2 this turns the ``unicode`` objects handed back by
        BeautifulSoup and ``re`` into byte strings; ``str`` passes through.
        """
        if isinstance(text, str):
            return text
        return text.encode("utf-8")

    def get_pages(self, journal, issue):
        """Return the numeric page ids of the articles listed in one issue.

        Args:
            journal: journal number, as a string.
            issue: issue number, as a string.

        Returns:
            A list of page-id strings taken from the ``rel="full-text"`` links
            of the issue's ``.toc`` page, or the integer HTTP status code if
            the request failed with ``urllib2.HTTPError``.
        """
        self.writeLog("Working on Journal " + journal + "/" + issue)

        url = "http://www.jneurosci.org/content/" + journal + "/" + issue + ".toc"

        req = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.91 Safari/537.36'})

        try:
            html = urllib2.urlopen(req).read()
        except urllib2.HTTPError as e:
            print(e.code)
            return e.code

        soup = BeautifulSoup(html)
        pages = []
        for link in soup.find_all("a", {"rel": "full-text"}):
            href = link['href']
            start = href.rfind("/") + 1
            end = href.rfind(".full")
            if end < start:
                # No ".full" after the last "/": slicing with rfind's -1
                # would silently chop the last character off the page id.
                continue
            page = href[start:end]
            if page.isdigit():
                pages.append(self._to_native_str(page))
        return pages

    def workArticle(self, journal, issue, page):
        """Download one article, extract its p-values and record them.

        Runs on a worker thread (started by ``run``).
        """
        article = self.getPage(journal, issue, page)
        p_list = self.find_p(article)
        self.write_p_to_file(p_list, journal, issue, page)
        print("1 thread done")

    def getPage(self, journal, issue, page):
        """Return the lower-cased text of every ``div.section`` of an article.

        Args:
            journal, issue, page: URL components, as strings.

        Returns:
            The concatenated section text in lower case, or ``""`` when the
            request fails with ``urllib2.HTTPError`` (the status code is
            printed).
        """
        url = "http://www.jneurosci.org/content/" + journal + "/" + issue + "/" + page + ".full"

        req = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        try:
            html = urllib2.urlopen(req).read()
        except urllib2.HTTPError as e:
            print(e.code)
            # Nothing was downloaded: report "no text" instead of running the
            # parser over the page id.
            return ""

        soup = BeautifulSoup(html)
        article = "".join(section.get_text()
                          for section in soup.find_all("div", {"class": "section"}))
        print("Returned Whole Page")
        # Lower case so callers need not search for both "p =" and "P =".
        return article.lower()

    def find_p(self, article):
        """Return every p-value statement found in ``article``.

        Args:
            article: lower-cased article text.

        Returns:
            A list of strings: first each exact ``p = <number>`` match as it
            appears in the text, then one normalised ``"p<X"`` entry per
            occurrence of ``p<X`` / ``p < X`` for each threshold in
            ``_P_LESS_THRESHOLDS``, in that order. The decimal point is
            matched literally and a threshold directly followed by another
            digit (``p<0.0012``) is not counted as the shorter threshold.
        """
        found = [self._to_native_str(match.group(0))
                 for match in re.finditer(r"p ?= ?(0?(\.\d+)+)", article)]

        # NOTE(review): the earlier version also counted "p>0.05" but never
        # added those matches to the result; that output is kept unchanged.
        for threshold in self._P_LESS_THRESHOLDS:
            pattern = "(?:p<|p < )" + re.escape(threshold) + r"(?!\d)"
            occurrences = len(re.findall(pattern, article))
            found.extend(["p<" + threshold] * occurrences)

        print(found)
        return found

    def write_p_to_file(self, p_list, journal, issue, page):
        """Append one tab-separated line per p-value to ``p_out.txt`` and log it.

        The file lock keeps lines from concurrently finishing worker threads
        from interleaving, and ``with`` closes the file even if a write fails.
        """
        prefix = journal + "\t" + issue + "\t" + page + "\t"
        with self._file_lock:
            with open("p_out.txt", "a") as f:
                for p in p_list:
                    f.write(prefix + p + "\n")
        for p in p_list:
            self.writeLog(prefix + p)

    def writeLog(self, msg):
        """Add a time-stamped line to the on-screen log.

        Safe to call from any thread: calls made off the GUI thread only
        queue the line, which the GUI thread displays on its next flush.
        """
        line = "\n" + time.strftime('%H:%M:%S') + " " + msg
        if threading.current_thread() is self._gui_thread:
            self._show_log_line(line)
        else:
            with self._log_lock:
                self._pending_log.append(line)

    def _show_log_line(self, line):
        """Insert ``line`` at the end of the text box (GUI thread only)."""
        self.textbox['state'] = 'normal'
        self.textbox.insert('end', line)
        self.textbox.yview(END)
        self.textbox['state'] = 'disabled'
        self.textbox.update_idletasks()

    def _flush_pending_log(self):
        """Display the lines queued by worker threads, then reschedule itself."""
        with self._log_lock:
            lines, self._pending_log = self._pending_log, []
        for line in lines:
            self._show_log_line(line)
        self.window.after(self._LOG_POLL_MS, self._flush_pending_log)

# Build the GUI; the constructor calls run(), whose first pass only
# schedules the real search via Tk's after().
app = What_is_p()
# Enter the Tk event loop so the scheduled callbacks actually fire.
app.window.mainloop()