Advertisement
Joe_McBobski

Shout Box Search

Feb 17th, 2013
19
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.50 KB | None | 0 0
  1. '''HTML searching for Funny Pro Elites shoutbox
  2. Author: Joe McBobski
  3. Should be in same folder as Parser.py
  4. Run it from any shell: "cd (foldername)", then "python wordsearch.py".
  5. Python 2 must be installed on your computer (the script uses Python 2 syntax).
  6. Attribution:
  7. Zed Shaw (Writer of Learn Python the Hard Way) taught me how to do it.
  8. Also the user xperroni on stackoverflow for Parser.py'''
  9. ### IF YOU FIND ANY ERRORS OR BUGS, PLEASE REPORT THEM TO JOE ###
  10.  
  11. #Imports necessary modules
  12. import Parser
  13. from urllib import urlopen
  14. import sys
  15.  
  16. #Search is now a function.
  17. def wordsearch():
  18.     '''Searches for the word.'''
  19.    
  20.     #sets up variable for words to go in. Also sets up term and page count.
  21.     WORDS = []
  22.     TERMCOUNT = 0
  23.     PAGENUM = 0
  24.     WORDS_PARSED = []
  25.     URLS = []
  26.     URLS_TEXT = []
  27.     UNPARSED = []
  28.    
  29.     #asks you what to look for, places them in search.
  30.     print "What term to look for?"
  31.     SEARCH = raw_input("> ")
  32.     print "How many pages back?" #Might be removed later, or just do it yourself.
  33.     PAGES = raw_input("> ") #Please do not set pages too high.
  34.    
  35.     #puts all the pages up in variables.
  36.     for pagenum in range(1, (int(PAGES) + 1)):
  37.         #takes all the pages you want to search, and puts them in a list
  38.         EXTRACT_URL = "http://funnyproelites.com/index.php?action=full_shoutbox&page=%d" % pagenum #"index.php_%d.html" % pagenum #
  39.         URLS.append(EXTRACT_URL)
  40.    
  41.     #extracts shout data from pages
  42.     PAGES = 0
  43.     for url in URLS:
  44.         URL_OPENED = urlopen(url) #stores the page data in URL_OPENED
  45.         print PAGES
  46.         print
  47.         UNPARSED.append( "Page %d" % (URLS.index(url) + 1)) # adds a note for the page number
  48.         for word in URL_OPENED.readlines(): #begins a loop and performs it on each line in URL_OPENED
  49.             #MAKE THIS A VARIABLE:
  50.             if "<tr id=\'" in word: #Makes sure you're getting just the shouts
  51.                 UNPARSED.append(word) #adds the data to UNPARSED
  52.             else:
  53.                 pass #AKA do nothing
  54.         PAGES +=1  
  55.     #OPERABLE!
  56.    
  57.     SHOUTS = [] #create new empty set
  58.     for arg in UNPARSED:
  59.         if "<tr id=\'" in arg: #make sure it only tries this on actual shout data
  60.             unbroken = arg.split("</tr>") #breaks up every term
  61.             for arg in unbroken: #adds them together again to make the shouts.             
  62.                 shout = "".join([arg, "</tr>"])
  63.                 SHOUTS.append(shout)
  64.         elif "Page" in arg: #exception for page numbers
  65.             SHOUTS.append(arg)
  66.         else: #do nothing
  67.             pass
  68.                        
  69.     THREEPARTSHOUTS = []
  70.     SHOUTNUM = 0
  71.     for x in SHOUTS:
  72.         WORDS_PARSED.append(Parser.dehtml(x))
  73.     for x in WORDS_PARSED:
  74.         if " - " in x:
  75.             startdate = x.index(" - ")
  76.             date = x[startdate:(startdate + 18)]
  77.             di = x.index(date)
  78.             username = x[0:di]
  79.             shout = x[(di + 18):-1] + x[-1]
  80.             THREEPARTSHOUTS.append([username, date, shout])
  81.         else:
  82.             THREEPARTSHOUTS.append(x)
  83.     for shout in THREEPARTSHOUTS: #Takes this all on
  84.         if isinstance(shout, str):
  85.             WORDS.append(shout)
  86.         elif SEARCH in shout[2]:
  87.                 TERMCOUNT += 1
  88.                 WORDS.append(shout) #adds shout to WORDS.
  89.         else:
  90.             pass
  91.  
  92.            
  93.     #Returns words, with another one of those fancy "message" things.
  94.     print "Words matching search term:", TERMCOUNT
  95.     print "Words:"
  96.     YESTERMS = FALSE
  97.     for word in WORDS:
  98.         if isinstance(word, str) and word != "\n" and (YESTERMS or word == "Page 1"):
  99.             print word
  100.             NEWPAGE = True
  101.    
  102.         else:          
  103.             print word[0], word[1], word[2]
  104.     print "What would you like to do?\n1: try another search\n2: exit"
  105.     NEXT = int(raw_input("> ")) #converts to integer
  106.     if NEXT == 1:
  107.         wordsearch()
  108.     if NEXT == 2:
  109.         exit()
  110.     else:
  111.         print "INVALID. EXITING"
  112.         exit()
  113. wordsearch()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement