Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- '''HTML searching for Funny Pro Elites shoutbox
- Author: Joe McBobski
- Should be in same folder as Parser.py
- Call from any shell with support for python with "cd (foldername)", and then "python wordsearch.py".
- Python must be downloaded on your computer to run.
- Attribution:
- Zed Shaw (Writer of Learn Python the Hard Way) taught me how to do it.
- Also the user xperroni on stackoverflow for Parser.py'''
- ### IF YOU FIND ANY ERRORS OR BUGS, PLEASE REPORT THEM TO JOE ###
- #Imports necessary modules
- import Parser
- from urllib import urlopen
- import sys
def clearscreen(numlines=100):
    """Clear the terminal screen.

    Delegates to the platform's native clear command when one is known;
    otherwise scrolls the screen by printing blank lines.

    numlines -- number of blank lines used by the fallback path only.
    """
    import os
    # Map os.name to the shell command that clears that platform's screen.
    commands = {"posix": "clear", "nt": "CLS", "dos": "CLS", "ce": "CLS"}
    command = commands.get(os.name)
    if command is not None:
        os.system(command)
    else:
        # Unknown platform: no clear command, so push old output off-screen.
        print('\n' * numlines)
- #Search is now a function.
- def wordsearch():
- '''Searches for the word.'''
- #sets up variable for words to go in. Also sets up term and page count.
- WORDS = []
- PAGENUM = 0
- #asks you what to look for, places them in search.
- SEARCH = raw_input("What term to look for?\n> ")
- START = raw_input("Start?\n> ")
- PAGES = raw_input("How far back?\n> ") #Please do not set pages too high.
- #puts all the pages up in variables.
- URLS = []
- for pagenum in range(int(START), (int(PAGES) + 1)):
- #takes all the pages you want to search, and puts them in a list
- EXTRACT_URL = "http://funnyproelites.com/index.php?action=full_shoutbox&page=%d" % pagenum #"index.php_%d.html" % pagenum #
- URLS.append(EXTRACT_URL)
- #extracts shout data from pages
- UNPARSED = []
- for url in URLS:
- URL_OPENED = urlopen(url) #stores the page data in URL_OPENED
- LINES = URL_OPENED.readlines()
- URL_OPENED.close()
- PAGES = URLS.index(url) + 1
- clearscreen()
- print "Loading page:", PAGES
- UNPARSED.append( "Page %d" % (PAGES)) # adds a note for the page number
- for word in LINES: #begins a loop and performs it on each line in URL_OPENED
- #MAKE THIS A VARIABLE:
- if "<tr id=\'" in word: #Makes sure you're getting just the shouts
- UNPARSED.append(word) #adds the data to UNPARSED
- else:
- pass #AKA do nothing
- PAGES +=1
- #OPERABLE!
- SHOUTS = [] #create new empty set
- for arg in UNPARSED:
- if "<tr id=\'" in arg: #make sure it only tries this on actual shout data
- unbroken = arg.split("</tr>") #breaks up every term
- for arg in unbroken: #adds them together again to make the shouts.
- shout = "".join([arg, "</tr>"])
- SHOUTS.append(shout)
- elif "Page" in arg: #exception for page numbers
- SHOUTS.append(arg)
- else: #do nothing
- pass
- THREEPARTSHOUTS = []
- SHOUTNUM = 0
- WORDS_PARSED = []
- for x in SHOUTS:
- WORDS_PARSED.append(Parser.dehtml(x))
- for x in WORDS_PARSED:
- if " - " in x:
- startdate = x.index(" - ")
- date = x[startdate:(startdate + 18)]
- di = x.index(date)
- username = x[0:di]
- shout = x[(di + 18):-1] + x[-1]
- THREEPARTSHOUTS.append([username, date, shout])
- else:
- THREEPARTSHOUTS.append(x)
- TERMCOUNT = 0
- for shout in THREEPARTSHOUTS: #Takes this all on
- if isinstance(shout, str):
- WORDS.append(shout)
- elif SEARCH in shout[2]:
- TERMCOUNT += 1
- WORDS.append(shout) #adds shout to WORDS.
- else:
- pass
- #Returns words, with another one of those fancy "message" things.
- print "You searched:", SEARCH
- print "Pages searched:", PAGES
- print "Words matching search term:", TERMCOUNT
- print "Words:"
- YESTERMS = False
- for word in WORDS:
- if isinstance(word, list):
- print word[0], word[1], word[2]
- YESTERMS = True
- elif word != "" and WORDS[(WORDS.index(word) + 1)] != "":
- print word
- else:
- pass
- print "What would you like to do?\n1: try another search\n2: exit"
- NEXT = int(raw_input("> ")) #converts to integer
- if NEXT == 1:
- wordsearch()
- if NEXT == 2:
- exit()
- else:
- print "INVALID. EXITING"
- exit()
- wordsearch()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement