Advertisement
Guest User

Shout Box Search v0.2

a guest
Dec 7th, 2013
53
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
'''HTML search for the Funny Pro Elites shoutbox.

Author: Joe McBobski

This script must be in the same folder as Parser.py.
Run it from any shell that has Python available: "cd (foldername)", and then
"python wordsearch.py". Python must be installed on your computer to run it.

Attribution:
Zed Shaw (writer of Learn Python the Hard Way) taught me how to do it.
Also the user xperroni on Stack Overflow, for Parser.py.
'''
### IF YOU FIND ANY ERRORS OR BUGS, PLEASE REPORT THEM TO JOE ###
  10.  
  11. #Imports necessary modules
  12. import Parser
  13. from urllib import urlopen
  14. import sys
  15.  
  16. def clearscreen(numlines=100):
  17.     import os
  18.     if os.name == "posix":
  19.         # Unix/Linux/MacOS/BSD/etc
  20.         os.system('clear')
  21.     elif os.name in ("nt", "dos", "ce"):
  22.         # DOS/Windows
  23.         os.system('CLS')
  24.     else:
  25.             # Fallback for other operating systems.
  26.         print '\n' * numlines
  27.  
  28. #Search is now a function.
  29. def wordsearch():
  30.     '''Searches for the word.'''
  31.    
  32.     #sets up variable for words to go in. Also sets up term and page count.
  33.     WORDS = []
  34.     PAGENUM = 0
  35.    
  36.     #asks you what to look for, places them in search.
  37.     SEARCH = raw_input("What term to look for?\n> ")
  38.     START = raw_input("Start?\n> ")
  39.     PAGES = raw_input("How far back?\n> ") #Please do not set pages too high.
  40.  
  41.     #puts all the pages up in variables.
  42.     URLS = []  
  43.     for pagenum in range(int(START), (int(PAGES) + 1)):
  44.         #takes all the pages you want to search, and puts them in a list
  45.         EXTRACT_URL = "http://funnyproelites.com/index.php?action=full_shoutbox&page=%d" % pagenum #"index.php_%d.html" % pagenum #
  46.         URLS.append(EXTRACT_URL)
  47.    
  48.     #extracts shout data from pages
  49.         UNPARSED = []  
  50.     for url in URLS:
  51.         URL_OPENED = urlopen(url) #stores the page data in URL_OPENED
  52.         LINES = URL_OPENED.readlines()
  53.         URL_OPENED.close()
  54.         PAGES = URLS.index(url) + 1
  55.         clearscreen()
  56.         print "Loading page:", PAGES
  57.         UNPARSED.append( "Page %d" % (PAGES)) # adds a note for the page number
  58.         for word in LINES: #begins a loop and performs it on each line in URL_OPENED
  59.             #MAKE THIS A VARIABLE:
  60.             if "<tr id=\'" in word: #Makes sure you're getting just the shouts
  61.                 UNPARSED.append(word) #adds the data to UNPARSED
  62.             else:
  63.                 pass #AKA do nothing
  64.                
  65.         PAGES +=1  
  66.     #OPERABLE!
  67.    
  68.     SHOUTS = [] #create new empty set
  69.     for arg in UNPARSED:
  70.         if "<tr id=\'" in arg: #make sure it only tries this on actual shout data
  71.             unbroken = arg.split("</tr>") #breaks up every term
  72.             for arg in unbroken: #adds them together again to make the shouts.             
  73.                 shout = "".join([arg, "</tr>"])
  74.                 SHOUTS.append(shout)
  75.         elif "Page" in arg: #exception for page numbers
  76.             SHOUTS.append(arg)
  77.         else: #do nothing
  78.             pass
  79.                        
  80.     THREEPARTSHOUTS = []
  81.     SHOUTNUM = 0
  82.     WORDS_PARSED = []  
  83.     for x in SHOUTS:
  84.         WORDS_PARSED.append(Parser.dehtml(x))
  85.     for x in WORDS_PARSED:
  86.         if " - " in x:
  87.             startdate = x.index(" - ")
  88.             date = x[startdate:(startdate + 18)]
  89.             di = x.index(date)
  90.             username = x[0:di]
  91.             shout = x[(di + 18):-1] + x[-1]
  92.             THREEPARTSHOUTS.append([username, date, shout])
  93.         else:
  94.             THREEPARTSHOUTS.append(x)
  95.    
  96.    
  97.     TERMCOUNT = 0  
  98.     for shout in THREEPARTSHOUTS: #Takes this all on
  99.         if isinstance(shout, str):
  100.             WORDS.append(shout)
  101.         elif SEARCH in shout[2]:
  102.                 TERMCOUNT += 1
  103.                 WORDS.append(shout) #adds shout to WORDS.
  104.         else:
  105.             pass
  106.  
  107.            
  108.     #Returns words, with another one of those fancy "message" things.
  109.     print "You searched:", SEARCH
  110.     print "Pages searched:", PAGES
  111.     print "Words matching search term:", TERMCOUNT
  112.     print "Words:"
  113.     YESTERMS = False
  114.     for word in WORDS:
  115.         if isinstance(word, list):
  116.             print word[0], word[1], word[2]
  117.             YESTERMS = True
  118.         elif word != "" and WORDS[(WORDS.index(word) + 1)] != "":
  119.             print word
  120.         else:
  121.             pass
  122.     print "What would you like to do?\n1: try another search\n2: exit"
  123.     NEXT = int(raw_input("> ")) #converts to integer
  124.     if NEXT == 1:
  125.         wordsearch()
  126.     if NEXT == 2:
  127.         exit()
  128.     else:
  129.         print "INVALID. EXITING"
  130.         exit()
  131. wordsearch()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement