SHOW:
|
|
- or go back to the newest paste.
1 | '''HTML searching for Funny Pro Elites shoutbox | |
2 | Author: Joe McBobski | |
3 | Should be in same folder as Parser.py | |
4 | Call from any shell with support for python with "cd (foldername)", and then "python wordsearch.py". | |
5 | Python 2 must be installed on your computer to run (the script uses Python 2-only syntax such as print statements and raw_input). | |
6 | Attribution: | |
7 | Zed Shaw (Writer of Learn Python the Hard Way) taught me how to do it. | |
8 | Also the user xperroni on stackoverflow for Parser.py''' | |
9 | ### IF YOU FIND ANY ERRORS OR BUGS, PLEASE REPORT THEM TO JOE ### | |
10 | ||
11 | #Imports necessary modules | |
12 | import Parser | |
13 | from urllib import urlopen | |
14 | import sys | |
15 | ||
def clearscreen(numlines=100):
    '''Clear the terminal window.

    Runs the platform's clear command when os.name identifies a POSIX or a
    DOS/Windows system. On any other platform it falls back to printing
    blank lines so that earlier output scrolls out of view.

    numlines -- how many newline characters the fallback prints
                (default 100); ignored when a clear command is available.
    '''
    import os  # local import, as in the original paste
    if os.name == "posix":
        # Unix/Linux/MacOS/BSD/etc
        os.system('clear')
    elif os.name in ("nt", "dos", "ce"):
        # DOS/Windows
        os.system('CLS')
    else:
        # Fallback for other operating systems.
        # Parenthesised so the line parses under both Python 2 and Python 3.
        print('\n' * numlines)
# wordsearch(): interactive shoutbox search.
# Visible flow: prompt for a search term and a page count, download every URL
# in URLS, keep the lines containing "<tr id='", strip the HTML with
# Parser.dehtml, split each shout into [username, date, shout], print the
# collected entries, then ask whether to search again or exit.
# NOTE(review): this is a diff view. Rows marked "-" belong to the previous
# revision and rows marked "+" to the current one; indentation is not shown.
# Several "+" rows are empty where the syntax needs content, so parts of the
# current revision are not visible here.
28 | #Search is now a function. | |
29 | def wordsearch(): | |
30 | '''Searches for the word.''' | |
31 | ||
32 | #sets up variable for words to go in. Also sets up term and page count. | |
33 | WORDS = [] | |
# NOTE(review): PAGENUM is assigned here but not read on any visible row.
34 | PAGENUM = 0 | |
35 | ||
36 | #asks you what to look for, places them in search. | |
37 | print "What term to look for?" | |
38 | SEARCH = raw_input("> ") | |
39 | print "How many pages back?" #Might be removed later, or just do it yourself. | |
# PAGES holds the raw input string here; int(PAGES) below raises ValueError
# if the input is not a whole number.
40 | PAGES = raw_input("> ") #Please do not set pages too high. | |
41 | ||
42 | - | PAGES = 0 |
42 | + | 
43 | URLS = [] | |
44 | for pagenum in range(1, (int(PAGES) + 1)): | |
# NOTE(review): the body of this loop (rows 45-48 of the current revision) is
# empty in this view. Presumably it appended one page URL per pagenum to
# URLS; the URL format is not visible -- TODO recover from the original paste.
45 | - | print PAGES |
45 | + | 
46 | - | 
46 | + | 
47 | - | UNPARSED.append( "Page %d" % (URLS.index(url) + 1)) # adds a note for the page number |
47 | + | 
48 | - | for word in URL_OPENED.readlines(): #begins a loop and performs it on each line in URL_OPENED |
48 | + | 
49 | #extracts shout data from pages | |
50 | UNPARSED = [] | |
51 | for url in URLS: | |
52 | URL_OPENED = urlopen(url) #stores the page data in URL_OPENED | |
53 | LINES = URL_OPENED.readlines() | |
54 | URL_OPENED.close() | |
# PAGES is rebound from the user's input to the 1-based position of the page
# being loaded (URLS.index returns the position of the first matching entry).
55 | PAGES = URLS.index(url) + 1 | |
56 | clearscreen() | |
57 | print "Loading page:", PAGES | |
58 | UNPARSED.append( "Page %d" % (PAGES)) # adds a note for the page number | |
59 | for word in LINES: #begins a loop and performs it on each line in URL_OPENED | |
60 | #MAKE THIS A VARIABLE: | |
61 | if "<tr id=\'" in word: #Makes sure you're getting just the shouts | |
62 | UNPARSED.append(word) #adds the data to UNPARSED | |
63 | else: | |
64 | pass #AKA do nothing | |
65 | ||
# NOTE(review): the nesting level of this increment is not visible. It changes
# the value later printed as "Pages searched" -- confirm that the reported
# count matches the number of pages actually loaded.
66 | PAGES +=1 | |
67 | #OPERABLE! | |
68 | ||
69 | SHOUTS = [] #create new empty set | |
70 | for arg in UNPARSED: | |
71 | if "<tr id=\'" in arg: #make sure it only tries this on actual shout data | |
72 | unbroken = arg.split("</tr>") #breaks up every term | |
# The inner loop reuses the name arg from the enclosing loop. split() also
# yields the text after the last "</tr>", so that trailing piece gets a
# "</tr>" appended and is stored in SHOUTS as well.
73 | for arg in unbroken: #adds them together again to make the shouts. | |
74 | shout = "".join([arg, "</tr>"]) | |
75 | SHOUTS.append(shout) | |
76 | elif "Page" in arg: #exception for page numbers | |
77 | SHOUTS.append(arg) | |
78 | else: #do nothing | |
79 | pass | |
80 | ||
81 | THREEPARTSHOUTS = [] | |
# NOTE(review): SHOUTNUM is assigned here but not read on any visible row.
82 | SHOUTNUM = 0 | |
83 | WORDS_PARSED = [] | |
84 | for x in SHOUTS: | |
85 | WORDS_PARSED.append(Parser.dehtml(x)) | |
86 | for x in WORDS_PARSED: | |
# Entries containing " - " are split into [username, date, shout]; any other
# entry (for example a "Page N" marker) is stored unchanged as a string.
87 | if " - " in x: | |
88 | startdate = x.index(" - ") | |
# The date field is taken as the 18 characters starting at the first " - ".
# NOTE(review): 18 is presumably the fixed width of the shoutbox date stamp --
# confirm against the page format.
89 | date = x[startdate:(startdate + 18)] | |
90 | di = x.index(date) | |
91 | username = x[0:di] | |
92 | shout = x[(di + 18):-1] + x[-1] | |
93 | THREEPARTSHOUTS.append([username, date, shout]) | |
94 | else: | |
95 | THREEPARTSHOUTS.append(x) | |
96 | - | YESTERMS = FALSE |
96 | + | 
97 | ||
98 | - | if isinstance(word, str) and word != "\n" and (YESTERMS or word == "Page 1"): |
98 | + | TERMCOUNT = 0 |
99 | for shout in THREEPARTSHOUTS: #Takes this all on | |
# NOTE(review): the conditions on rows 100 and 102 of the current revision are
# empty in this view; the "else" on row 105 implies an if/elif chain.
# Presumably row 100 passes string entries (page markers) through and row 102
# tests SEARCH against the shout before counting it -- TODO recover both.
100 | - | NEWPAGE = True |
100 | + | 
101 | WORDS.append(shout) | |
102 | - | else: |
102 | + | 
103 | TERMCOUNT += 1 | |
104 | WORDS.append(shout) #adds shout to WORDS. | |
105 | else: | |
106 | pass | |
107 | ||
108 | ||
109 | #Returns words, with another one of those fancy "message" things. | |
110 | print "You searched:", SEARCH | |
111 | print "Pages searched:", PAGES | |
112 | print "Words matching search term:", TERMCOUNT | |
113 | print "Words:" | |
# YESTERMS is written below but only read on a removed ("-") row above.
114 | YESTERMS = False | |
115 | for word in WORDS: | |
# List entries are [username, date, shout] triples; everything else in WORDS
# is a plain string.
116 | if isinstance(word, list): | |
117 | print word[0], word[1], word[2] | |
118 | YESTERMS = True | |
# NOTE(review): WORDS[WORDS.index(word) + 1] raises IndexError when the first
# occurrence of a non-empty string entry is the last element of WORDS.
119 | elif word != "" and WORDS[(WORDS.index(word) + 1)] != "": | |
120 | print word | |
121 | else: | |
122 | pass | |
123 | print "What would you like to do?\n1: try another search\n2: exit" | |
# int() raises ValueError when the reply is not a whole number.
124 | NEXT = int(raw_input("> ")) #converts to integer | |
# A repeat search is a nested wordsearch() call (recursion, not a loop).
# NOTE(review): the check for 2 is a separate "if", not an "elif" -- confirm
# that the "INVALID" branch cannot run after a nested search returns.
125 | if NEXT == 1: | |
126 | wordsearch() | |
127 | if NEXT == 2: | |
128 | exit() | |
129 | else: | |
130 | print "INVALID. EXITING" | |
131 | exit() | |
if __name__ == "__main__":
    # Start the interactive search only when the file is run as a script
    # ("python wordsearch.py"), not when it is imported as a module.
    wordsearch()