View difference between Paste ID: h0hL2qLz and WmMR6HcC
SHOW: | | - or go back to the newest paste.
1
'''HTML searching for Funny Pro Elites shoutbox
2
Author: Joe McBobski
3
Should be in same folder as Parser.py
4
Call from any shell with support for python with "cd (foldername)", and then "python wordsearch.py".
5
Python 2 must be installed on your computer to run this script (it uses print statements and raw_input).
6
Attribution:
7
Zed Shaw (Writer of Learn Python the Hard Way) taught me how to do it.
8
Also the user xperroni on stackoverflow for Parser.py'''
9
### IF YOU FIND ANY ERRORS OR BUGS, PLEASE REPORT THEM TO JOE ###
10
11
#Imports necessary modules
12
import Parser
13
from urllib import urlopen
14
import sys
15
16
def clearscreen(numlines=100):
    '''Clear the terminal window.

    Runs the platform's own clear command when os.name identifies a known
    platform family; otherwise scrolls the old output away by printing
    blank lines.

    Args:
        numlines: how many newline characters to print in the fallback
            branch (only used when os.name is neither "posix" nor one of
            "nt", "dos", "ce").
    '''
    import os  # local import, exactly as the original function did

    if os.name == "posix":
        # Unix/Linux/MacOS/BSD/etc
        os.system('clear')
    elif os.name in ("nt", "dos", "ce"):
        # DOS/Windows
        os.system('CLS')
    else:
        # Fallback for other operating systems. The single-argument,
        # parenthesised print behaves the same as the old print statement
        # on Python 2 and is also valid syntax on Python 3.
        print('\n' * numlines)
27
28
#Search is now a function.
29
# URL template for one page of the full shoutbox; %d is the page number.
_SHOUTBOX_URL = "http://funnyproelites.com/index.php?action=full_shoutbox&page=%d"
# Substring that marks a raw HTML line as containing shout rows.
_SHOUT_ROW_MARKER = "<tr id='"
# Text that separates the username from the date in a de-HTML'd shout.
_DATE_SEPARATOR = " - "
# Number of characters (starting at the separator) treated as the date field.
_DATE_WIDTH = 18


def _prompt_int(prompt):
    '''Ask with *prompt* until the user types something int() accepts.'''
    while True:
        try:
            return int(raw_input(prompt))
        except ValueError:
            print("Please enter a whole number.")


def _fetch_shout_rows(url):
    '''Download *url* and return only the raw lines that hold shout rows.'''
    handle = urlopen(url)
    try:
        lines = handle.readlines()
    finally:
        # Close the connection even when reading fails part-way through.
        handle.close()
    return [line for line in lines if _SHOUT_ROW_MARKER in line]


def _split_rows(line):
    '''Cut one raw line at every "</tr>" and re-terminate each fragment.'''
    return [fragment + "</tr>" for fragment in line.split("</tr>")]


def _parse_shout(text):
    '''Split de-HTML'd shout text into [username, date, message].

    Text without the " - " separator is returned unchanged (as a string),
    which is how callers tell parsed shouts from everything else.
    '''
    start = text.find(_DATE_SEPARATOR)
    if start == -1:
        return text
    end = start + _DATE_WIDTH
    # Plain slicing yields an empty message when the text ends inside the
    # date field, instead of leaking the last date character into it.
    return [text[:start], text[start:end], text[end:]]


def wordsearch():
    '''Interactively search the Funny Pro Elites shoutbox for a term.

    Prompts for a search term plus a first and last page number (both
    inclusive), downloads each page, and prints every shout whose message
    contains the term, grouped under "Page N" headers. Afterwards the user
    can start another search or quit.

    Returns:
        Never returns normally; the function finishes by calling
        sys.exit(), which raises SystemExit.

    Raises:
        SystemExit: when the user picks "exit" or enters an invalid choice.
        Any exception raised by urlopen or Parser.dehtml propagates as-is.
    '''
    # A loop (rather than calling wordsearch() again) keeps repeated
    # searches from growing the call stack.
    while True:
        # Asks what to look for and which pages to cover.
        search = raw_input("What term to look for?\n> ")
        first_page = _prompt_int("Start?\n> ")
        last_page = _prompt_int("How far back?\n> ")  # Please do not set this too high.

        # entries mixes plain strings (page headers, unparseable text) with
        # [username, date, message] lists for real shouts.
        entries = []
        pages_searched = 0
        for page in range(first_page, last_page + 1):
            clearscreen()
            print("Loading page: %d" % page)
            # Headers are generated here and contain no markup, so they
            # are stored directly instead of going through Parser.dehtml.
            entries.append("Page %d" % page)
            for line in _fetch_shout_rows(_SHOUTBOX_URL % page):
                for fragment in _split_rows(line):
                    entries.append(_parse_shout(Parser.dehtml(fragment)))
            pages_searched += 1

        # Keep every string entry, but only those shouts whose message
        # contains the search term.
        results = [
            entry for entry in entries
            if not isinstance(entry, list) or search in entry[2]
        ]
        term_count = sum(1 for entry in results if isinstance(entry, list))

        # Report the outcome.
        print("You searched: %s" % search)
        print("Pages searched: %d" % pages_searched)
        print("Words matching search term: %d" % term_count)
        print("Words:")
        for position, entry in enumerate(results):
            if isinstance(entry, list):
                print("%s %s %s" % (entry[0], entry[1], entry[2]))
            elif entry != "":
                # A string entry is shown unless the entry right after it
                # is empty. The last entry has no successor, so it is shown
                # rather than indexing past the end of the list.
                is_last = position + 1 >= len(results)
                if is_last or results[position + 1] != "":
                    print(entry)

        print("What would you like to do?\n1: try another search\n2: exit")
        try:
            choice = int(raw_input("> "))  # converts to integer
        except ValueError:
            choice = None  # non-numeric input is handled as an invalid choice
        if choice == 1:
            continue
        if choice != 2:
            print("INVALID. EXITING")
        sys.exit()
131
# Entry point: runs the interactive search whenever this module is executed or imported.
wordsearch()