Shout Box Search

'''HTML searching for Funny Pro Elites shoutbox
Author: Joe McBobski
Should be in same folder as Parser.py
Call from any shell with support for python with "cd (foldername)", and then "python wordsearch.py".
Python must be downloaded on your computer to run.
Attribution:
Zed Shaw (Writer of Learn Python the Hard Way) taught me how to do it.
Also the user xperroni on stackoverflow for Parser.py'''
### IF YOU FIND ANY ERRORS OR BUGS, PLEASE REPORT THEM TO JOE ###

#Imports necessary modules
import Parser
from urllib import urlopen
import sys

#Search is now a function.
def wordsearch():
    '''Searches for the word.'''

    #sets up variable for words to go in. Also sets up term and page count.
    WORDS = []
    TERMCOUNT = 0
    PAGENUM = 0
    WORDS_PARSED = []
    URLS = []
    URLS_TEXT = []
    UNPARSED = []

    #asks you what to look for, places them in search.
    print "What term to look for?"
    SEARCH = raw_input("> ")
    print "How many pages back?" #Might be removed later, or just do it yourself.
    PAGES = raw_input("> ") #Please do not set pages too high.

    #puts all the pages up in variables.
    for pagenum in range(1, (int(PAGES) + 1)):
        #takes all the pages you want to search, and puts them in a list
        EXTRACT_URL = "http://funnyproelites.com/index.php?action=full_shoutbox&page=%d" % pagenum #"index.php_%d.html" % pagenum #
        URLS.append(EXTRACT_URL)

    #extracts shout data from pages
    PAGES = 0
    for url in URLS:
        URL_OPENED = urlopen(url) #stores the page data in URL_OPENED
        print PAGES
        print
        UNPARSED.append( "Page %d" % (URLS.index(url) + 1)) # adds a note for the page number
        for word in URL_OPENED.readlines(): #begins a loop and performs it on each line in URL_OPENED
            #MAKE THIS A VARIABLE:
            if "<tr id=\'" in word: #Makes sure you're getting just the shouts
                UNPARSED.append(word) #adds the data to UNPARSED
            else:
                pass #AKA do nothing
        PAGES +=1
    #OPERABLE!

    SHOUTS = [] #create new empty set
    for arg in UNPARSED:
        if "<tr id=\'" in arg: #make sure it only tries this on actual shout data
            unbroken = arg.split("</tr>") #breaks up every term
            for arg in unbroken: #adds them together again to make the shouts.
                shout = "".join([arg, "</tr>"])
                SHOUTS.append(shout)
        elif "Page" in arg: #exception for page numbers
            SHOUTS.append(arg)
        else: #do nothing
            pass

    THREEPARTSHOUTS = []
    SHOUTNUM = 0
    for x in SHOUTS:
        WORDS_PARSED.append(Parser.dehtml(x))
    for x in WORDS_PARSED:
        if " - " in x:
            startdate = x.index(" - ")
            date = x[startdate:(startdate + 18)]
            di = x.index(date)
            username = x[0:di]
            shout = x[(di + 18):-1] + x[-1]
            THREEPARTSHOUTS.append([username, date, shout])
        else:
            THREEPARTSHOUTS.append(x)
    for shout in THREEPARTSHOUTS: #Takes this all on
        if isinstance(shout, str):
            WORDS.append(shout)
        elif SEARCH in shout[2]:
                TERMCOUNT += 1
                WORDS.append(shout) #adds shout to WORDS.
        else:
            pass


    #Returns words, with another one of those fancy "message" things.
    print "Words matching search term:", TERMCOUNT
    print "Words:"
    YESTERMS = FALSE
    for word in WORDS:
        if isinstance(word, str) and word != "\n" and (YESTERMS or word == "Page 1"):
            print word
            NEWPAGE = True

        else:
            print word[0], word[1], word[2]
    print "What would you like to do?\n1: try another search\n2: exit"
    NEXT = int(raw_input("> ")) #converts to integer
    if NEXT == 1:
        wordsearch()
    if NEXT == 2:
        exit()
    else:
        print "INVALID. EXITING"
        exit()
wordsearch()