View difference between Paste ID: WmMR6HcC and mWgxd2g1
SHOW: | | - or go back to the newest paste.
1
'''HTML searching for Funny Pro Elites shoutbox
2
Author: Joe McBobski
3
Should be in same folder as Parser.py
4
Call from any shell with support for python with "cd (foldername)", and then "python wordsearch.py".
5
Python 2 must be installed on your computer to run this script (it uses print statements and raw_input).
6
Attribution:
7
Zed Shaw (Writer of Learn Python the Hard Way) taught me how to do it.
8
Also the user xperroni on stackoverflow for Parser.py'''
9
### IF YOU FIND ANY ERRORS OR BUGS, PLEASE REPORT THEM TO JOE ###
10
11
#Imports necessary modules
12
import Parser
13
from urllib import urlopen
14
import sys
15
16
def clearscreen(numlines=100):
	'''Clear the terminal window.

	Runs the "clear" command when os.name is "posix" and "CLS" when it is
	"nt", "dos" or "ce". On any other platform it falls back to printing
	blank lines.

	numlines -- how many newline characters to print in the fallback case.
	'''
	import os
	if os.name == "posix":
		# Unix/Linux/MacOS/BSD/etc
		os.system('clear')
	elif os.name in ("nt", "dos", "ce"):
		# DOS/Windows
		os.system('CLS')
	else:
		# Fallback for other operating systems.
		# A parenthesised single-argument print gives the same output under
		# the Python 2 print statement and the Python 3 print function.
		print('\n' * numlines)
27-
	UNPARSED = []
27+
28
#Search is now a function.
29
def wordsearch():
30
	'''Interactively search downloaded shoutbox pages for a term.

	Visible steps, in order:
	  1. Prompt for a search term (SEARCH) and a page count (PAGES).
	  2. Download every url in URLS and keep the lines that contain the
	     <tr id= marker, preceded by a "Page N" note for each page.
	  3. Split those lines on the closing row tag into single shouts and
	     strip the HTML from each one with Parser.dehtml.
	  4. Break each stripped shout into [username, date, shout] around the
	     first " - " separator.
	  5. Print a summary and the collected WORDS, then ask whether to
	     search again (recursive call) or exit.

	Takes no arguments and returns nothing; every visible path ends in exit().

	NOTE(review): this text looks like a paste-diff rendering. Lines that hold
	only a number, or a number followed by "-" or "+", appear to be diff
	markers rather than program code, and some statements seem to be missing
	(see the notes further down) -- confirm against the real file.
	'''
31
	
32
	#sets up variable for words to go in. Also sets up term and page count.
33
	WORDS = []
34
	# PAGENUM is assigned here but not read anywhere in the visible code.
	PAGENUM = 0
35
	
36
	#asks you what to look for, places them in search.
37
	print "What term to look for?"
38
	SEARCH = raw_input("> ")
39
	print "How many pages back?" #Might be removed later, or just do it yourself.
40
	PAGES = raw_input("> ") #Please do not set pages too high.
41
	
42-
	# NOTE(review): the line after a marker ending in "-" (such as "42-") is
	# presumably from the removed side of the diff -- confirm before relying on it.
	PAGES = 0
42+
43
	URLS = []	
44
	# PAGES is still the raw_input string here, hence the int() conversion.
	# NOTE(review): the body of this loop (presumably the statement that adds
	# one page address per pagenum to URLS) is not present in this view.
	for pagenum in range(1, (int(PAGES) + 1)):
45-
		print PAGES
45+
46-
		print 
46+
47-
		UNPARSED.append( "Page %d" % (URLS.index(url) + 1)) # adds a note for the page number
47+
48-
		for word in URL_OPENED.readlines(): #begins a loop and performs it on each line in URL_OPENED
48+
49
	#extracts shout data from pages
50
		UNPARSED = []	
51
	for url in URLS:
52
		URL_OPENED = urlopen(url) #stores the page data in URL_OPENED
53
		LINES = URL_OPENED.readlines()
54
		URL_OPENED.close()
55
		# 1-based position of this url in URLS; this overwrites the page
		# count that was typed in at the prompt.
		PAGES = URLS.index(url) + 1
56
		clearscreen()
57
		print "Loading page:", PAGES
58
		UNPARSED.append( "Page %d" % (PAGES)) # adds a note for the page number
59
		for word in LINES: #begins a loop and performs it on each line in URL_OPENED
60
			#MAKE THIS A VARIABLE:
61
			if "<tr id=\'" in word: #Makes sure you're getting just the shouts
62
				UNPARSED.append(word) #adds the data to UNPARSED
63
			else:
64
				pass #AKA do nothing
65
				
66
		# NOTE(review): PAGES is reassigned at the top of every pass, so this
		# increment only survives the final one; the "Pages searched" figure
		# printed later may therefore be one too high -- confirm.
		PAGES +=1	
67
	#OPERABLE!
68
	
69
	SHOUTS = [] #create new empty set
70
	for arg in UNPARSED:
71
		if "<tr id=\'" in arg: #make sure it only tries this on actual shout data
72
			unbroken = arg.split("</tr>") #breaks up every term
73
			# The inner loop reuses the name arg, shadowing the outer loop variable.
			for arg in unbroken: #adds them together again to make the shouts.				
74
				shout = "".join([arg, "</tr>"]) 
75
				SHOUTS.append(shout)
76
		elif "Page" in arg: #exception for page numbers
77
			SHOUTS.append(arg)
78
		else: #do nothing
79
			pass
80
						
81
	THREEPARTSHOUTS = []
82
	# SHOUTNUM is assigned here but not read anywhere in the visible code.
	SHOUTNUM = 0
83
	WORDS_PARSED = []	
84
	for x in SHOUTS:
85
		# Parser.dehtml is defined outside this file; presumably it returns the
		# text of x with the HTML markup removed -- TODO confirm.
		WORDS_PARSED.append(Parser.dehtml(x))
86
	for x in WORDS_PARSED:
87
		if " - " in x:
88
			startdate = x.index(" - ")
89
			# The date field is taken as the 18 characters that start at the
			# first " - " separator.
			date = x[startdate:(startdate + 18)]
90
			di = x.index(date)
91
			username = x[0:di]
92
			# Remainder of x after the date slice, written as
			# "all but the last character" plus "the last character".
			shout = x[(di + 18):-1] + x[-1]
93
			THREEPARTSHOUTS.append([username, date, shout])
94
		else:
95
			# Entries without " - " (for example the "Page N" notes) are kept as plain strings.
			THREEPARTSHOUTS.append(x)
96-
	YESTERMS = FALSE
96+
97
	
98-
		if isinstance(word, str) and word != "\n" and (YESTERMS or word == "Page 1"):
98+
	TERMCOUNT = 0	
99
	# NOTE(review): the conditions that guard the statements in this loop are
	# not present in this view; presumably they test whether SEARCH occurs in
	# the shout -- confirm against the real file.
	for shout in THREEPARTSHOUTS: #Takes this all on
100-
			NEWPAGE = True
100+
101
			WORDS.append(shout)
102-
		else:			
102+
103
				TERMCOUNT += 1
104
				WORDS.append(shout) #adds shout to WORDS.
105
		else:
106
			pass
107
108
			
109
	#Returns words, with another one of those fancy "message" things.
110
	print "You searched:", SEARCH
111
	print "Pages searched:", PAGES
112
	print "Words matching search term:", TERMCOUNT
113
	print "Words:"
114
	# YESTERMS is set to True below once a list entry has been printed, but it
	# is not read again in the visible code.
	YESTERMS = False
115
	for word in WORDS:
116
		# List entries are the [username, date, shout] triples built above.
		if isinstance(word, list):
117
			print word[0], word[1], word[2]
118
			YESTERMS = True
119
		# NOTE(review): WORDS.index(word) + 1 is past the end of the list when
		# a non-empty string is the last entry, which would raise IndexError -- confirm.
		elif word != "" and WORDS[(WORDS.index(word) + 1)] != "":
120
			print word
121
		else:
122
			pass
123
	print "What would you like to do?\n1: try another search\n2: exit"
124
	NEXT = int(raw_input("> ")) #converts to integer
125
	# Choosing 1 starts a new search by calling this function recursively.
	if NEXT == 1:
126
		wordsearch()
127
	# NOTE(review): this is a separate "if", not an "elif", so a NEXT of 1
	# that ever returned from the call above would reach the else branch below.
	if NEXT == 2:
128
		exit()
129
	else:
130
		print "INVALID. EXITING"
131
		exit()
132
# Start the interactive search as soon as this module is executed. There is no
# __main__ guard, so importing the module runs the search as well.
wordsearch()