Advertisement
Joe_McBobski

Shout Box Search 0.1

Feb 23rd, 2013
53
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. '''HTML searching for Funny Pro Elites shoutbox
  2. Author: Joe McBobski
  3. Should be in same folder as Parser.py
  4. Call from any shell with support for python with "cd (foldername)", and then "python wordsearch.py".
  5. Python 2 must be installed on your computer to run (the script relies on Python 2-only features such as raw_input and urllib.urlopen).
  6. Attribution:
  7. Zed Shaw (Writer of Learn Python the Hard Way) taught me how to do it.
  8. Also the user xperroni on stackoverflow for Parser.py'''
  9. ### IF YOU FIND ANY ERRORS OR BUGS, PLEASE REPORT THEM TO JOE ###
  10.  
  11. #Imports necessary modules
  12. import Parser
  13. from urllib import urlopen
  14. import sys
  15.  
  16. def clearscreen(numlines=100):
  17.     import os
  18.     if os.name == "posix":
  19.         # Unix/Linux/MacOS/BSD/etc
  20.         os.system('clear')
  21.     elif os.name in ("nt", "dos", "ce"):
  22.         # DOS/Windows
  23.         os.system('CLS')
  24.     else:
  25.             # Fallback for other operating systems.
  26.         print '\n' * numlines
  27.  
  28. #Search is now a function.
  29. def wordsearch():
  30.     '''Searches for the word.'''
  31.    
  32.     #sets up variable for words to go in. Also sets up term and page count.
  33.     WORDS = []
  34.     PAGENUM = 0
  35.    
  36.     #asks you what to look for, places them in search.
  37.     print "What term to look for?"
  38.     SEARCH = raw_input("> ")
  39.     print "How many pages back?" #Might be removed later, or just do it yourself.
  40.     PAGES = raw_input("> ") #Please do not set pages too high.
  41.    
  42.     #puts all the pages up in variables.
  43.     URLS = []  
  44.     for pagenum in range(1, (int(PAGES) + 1)):
  45.         #takes all the pages you want to search, and puts them in a list
  46.         EXTRACT_URL = "http://funnyproelites.com/index.php?action=full_shoutbox&page=%d" % pagenum #"index.php_%d.html" % pagenum #
  47.         URLS.append(EXTRACT_URL)
  48.    
  49.     #extracts shout data from pages
  50.         UNPARSED = []  
  51.     for url in URLS:
  52.         URL_OPENED = urlopen(url) #stores the page data in URL_OPENED
  53.         LINES = URL_OPENED.readlines()
  54.         URL_OPENED.close()
  55.         PAGES = URLS.index(url) + 1
  56.         clearscreen()
  57.         print "Loading page:", PAGES
  58.         UNPARSED.append( "Page %d" % (PAGES)) # adds a note for the page number
  59.         for word in LINES: #begins a loop and performs it on each line in URL_OPENED
  60.             #MAKE THIS A VARIABLE:
  61.             if "<tr id=\'" in word: #Makes sure you're getting just the shouts
  62.                 UNPARSED.append(word) #adds the data to UNPARSED
  63.             else:
  64.                 pass #AKA do nothing
  65.                
  66.         PAGES +=1  
  67.     #OPERABLE!
  68.    
  69.     SHOUTS = [] #create new empty set
  70.     for arg in UNPARSED:
  71.         if "<tr id=\'" in arg: #make sure it only tries this on actual shout data
  72.             unbroken = arg.split("</tr>") #breaks up every term
  73.             for arg in unbroken: #adds them together again to make the shouts.             
  74.                 shout = "".join([arg, "</tr>"])
  75.                 SHOUTS.append(shout)
  76.         elif "Page" in arg: #exception for page numbers
  77.             SHOUTS.append(arg)
  78.         else: #do nothing
  79.             pass
  80.                        
  81.     THREEPARTSHOUTS = []
  82.     SHOUTNUM = 0
  83.     WORDS_PARSED = []  
  84.     for x in SHOUTS:
  85.         WORDS_PARSED.append(Parser.dehtml(x))
  86.     for x in WORDS_PARSED:
  87.         if " - " in x:
  88.             startdate = x.index(" - ")
  89.             date = x[startdate:(startdate + 18)]
  90.             di = x.index(date)
  91.             username = x[0:di]
  92.             shout = x[(di + 18):-1] + x[-1]
  93.             THREEPARTSHOUTS.append([username, date, shout])
  94.         else:
  95.             THREEPARTSHOUTS.append(x)
  96.    
  97.    
  98.     TERMCOUNT = 0  
  99.     for shout in THREEPARTSHOUTS: #Takes this all on
  100.         if isinstance(shout, str):
  101.             WORDS.append(shout)
  102.         elif SEARCH in shout[2]:
  103.                 TERMCOUNT += 1
  104.                 WORDS.append(shout) #adds shout to WORDS.
  105.         else:
  106.             pass
  107.  
  108.            
  109.     #Returns words, with another one of those fancy "message" things.
  110.     print "You searched:", SEARCH
  111.     print "Pages searched:", PAGES
  112.     print "Words matching search term:", TERMCOUNT
  113.     print "Words:"
  114.     YESTERMS = False
  115.     for word in WORDS:
  116.         if isinstance(word, list):
  117.             print word[0], word[1], word[2]
  118.             YESTERMS = True
  119.         elif word != "" and WORDS[(WORDS.index(word) + 1)] != "":
  120.             print word
  121.         else:
  122.             pass
  123.     print "What would you like to do?\n1: try another search\n2: exit"
  124.     NEXT = int(raw_input("> ")) #converts to integer
  125.     if NEXT == 1:
  126.         wordsearch()
  127.     if NEXT == 2:
  128.         exit()
  129.     else:
  130.         print "INVALID. EXITING"
  131.         exit()
  132. wordsearch()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement