import praw
import re
from urllib.parse import urlsplit, urljoin
import threading
import requests
import time
from bs4 import BeautifulSoup


posts = []              # raw text of every post that was scanned
foundLinks = []         # candidate links extracted from the posts
newLinks = []           # never populated; checked later but stays empty
workingLinks = []       # links confirmed to be working open directories
threads = 0             # number of checker threads currently running
maxThreads = 250        # maximum number of concurrent checker threads
postsToLoad = 10        # how many new Reddit posts to fetch
timeoutForTesting = 10  # request timeout in seconds when testing a directory

URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))"""
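
# Note: URL_REGEX wraps the whole pattern in a single capturing group, so
# re.findall(URL_REGEX, text) yields the matched URLs directly, for example
# (an illustrative, assumed input/output pair):
#   re.findall(URL_REGEX, "mirror at http://example.com/pub/") -> ["http://example.com/pub/"]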

# links containing any of these terms are skipped
blackList = ["ch0c",
             "pastebin.com",
             "anotherBlacklistedItem",
             "filepursuit",
             "github.com",
             "reddit.com",
             "shodan.io",
             "wikipedia",
             "the-eye",
             "twitter",
             "facebook",
             "youtube",
             "tumblr.com",
             "archive.org",
             "i.redd.it",
             "redditmedia.com",
             "rg.to",
             ]


# disabled
"""
"https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=10000",
"""

# the number after before= is the date in epoch format
# I found the values below by checking the last result at each link

pagesToCheckForLinks = [
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&after=1543663785",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1543663785",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1526916567",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1513931734",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1497490290",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1484223692",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1460743035",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1444658067",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1440908907",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1440996579",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1440991337",
]
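
# A minimal sketch (not part of the original script flow) of how such before=/after=
# epoch values could be derived from a calendar date; dateToEpoch is a hypothetical
# helper added here only for illustration and is never called below.
from datetime import datetime, timezone

def dateToEpoch(year, month, day):
    # convert a UTC calendar date to epoch seconds, usable as a pushshift before=/after= value
    return int(datetime(year, month, day, tzinfo=timezone.utc).timestamp())

# e.g. dateToEpoch(2018, 12, 1) -> 1543622400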

# Reddit settings (the credentials here are placeholders)
bot = praw.Reddit(client_id='---',
                  client_secret='---',
                  user_agent='MySimpleBot v0.1',
                  username='---',
                  password='---')

print('logged in to Reddit as: ' + str(bot.user.me()))


# read the existing results file into the global textInFile variable
def updateTextInFileVar():
    global textInFile
    with open("opendirectories.txt", "r") as file:
        textInFile = file.read()


# sort key: second-level domain followed by the subdomain (if any)
def domain(link):
    try:
        netlocParts = urlsplit(link).netloc.split(".")
        sortTerm = netlocParts[-2]
        try:
            sortTerm += netlocParts[-3]  # append the subdomain when present
        except IndexError:
            pass
        return sortTerm
    except Exception:
        return link  # fall back to sorting on the raw string
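
# For example (assuming a well-formed URL):
#   domain("http://files.example.com/pub/") -> "examplefiles"
# so hosts sort by registered domain first, then by subdomain.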


def checkDirectory(linkToCheck):
    # fetch a candidate link and decide whether it looks like a working open directory
    global workingLinks
    global threads
    netLoc = urlsplit(linkToCheck).netloc
    dirLinks = []

    try:
        page = requests.get(linkToCheck, timeout=timeoutForTesting).text

        # pages with javascript are usually not plain directory listings;
        # bitdl/bitdownload are known open directories that do use js
        if "</script>" in page and "bitdl" not in linkToCheck and 'bitdownload' not in linkToCheck and "Index of /" not in page:
            print(linkToCheck, "has javascript. Skipping")
            raise ZeroDivisionError  # deliberately jump to the except/cleanup below

        soup = BeautifulSoup(page, 'html.parser')

        for link in soup.find_all('a'):
            href = str(link.get('href'))

            if "?C" in href:  # skip column-sorting links
                continue

            fullLink = urljoin(linkToCheck, href)
            # keep only links on the same host that point deeper than the directory itself
            if "?C=" not in fullLink and len(fullLink) > len(linkToCheck):
                if netLoc in fullLink and fullLink not in dirLinks:
                    dirLinks.append(fullLink)

        # print(len(dirLinks), "found on", linkToCheck)

        # treat the page as a working directory if it lists at least two entries
        if len(dirLinks) >= 2:
            workingLinks.append(linkToCheck)
            if linkToCheck not in textInFile:
                print(linkToCheck, "is new")

                with open("opendirectories.txt", "a") as pFile:
                    pFile.write(linkToCheck + "\n")

    except Exception as e:
        print(linkToCheck, "failed:", e)

    # shared counter updated without a lock; relies on CPython's GIL
    threads -= 1


# if both 123.com/files/123 and 123.com/files are present, keep only the shortest link per host
def shortestPartialDuplicateLink(links):
    linksPerNetloc = {}

    # group the links by their network location (host)
    for link in links:
        nl = urlsplit(link).netloc

        if nl not in linksPerNetloc:
            linksPerNetloc[nl] = []

        linksPerNetloc[nl].append(link)

    # keep the shortest link of every host
    shortestLinks = []
    for key in linksPerNetloc:
        linksPerNetloc[key].sort(key=len)
        shortestLinks.append(linksPerNetloc[key][0])

    return shortestLinks
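
# For example (assuming two links on the same host):
#   shortestPartialDuplicateLink(["http://123.com/files/123/", "http://123.com/files/"])
#   -> ["http://123.com/files/"]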


# read existing file
updateTextInFileVar()

postsToScan = []

# get the newest posts from the subreddit
RedditPosts = bot.subreddit('opendirectories').new(limit=postsToLoad)


# add the raw pushshift responses to the list of posts to scan
for link in pagesToCheckForLinks:
    postsToScan.append(requests.get(link).text)
    print("loaded up to 1000 posts from pushshift")

# disabled for now: Reddit posts are not added to the scan list
for post in RedditPosts:
    # postsToScan.append(post)
    pass

# search for links in each post
for post in postsToScan:

    # a post is either a praw Submission (selftext/url) or a raw pushshift response string
    try:
        text = post.selftext
    except AttributeError:
        text = post

    try:
        text += " " + post.url
    except AttributeError:
        pass

    posts.append(text)

    urls = re.findall(URL_REGEX, text)

    print(len(posts), "posts read. Found", len(urls), "links in this one")
    # print(urls)

    for url in urls:
        # don't add links that contain blacklist terms
        blackListed = False
        for term in blackList:
            if term.lower() in url.lower():
                print(term, "found in", url)
                blackListed = True
                break

        if not url.endswith("/"):
            url += "/"

        nl = urlsplit(url).netloc

        # only add links that are new, not blacklisted and not already in the file
        if url not in foundLinks and not blackListed and nl not in textInFile:
            foundLinks.append(url)
            print("adding", url)
        else:
            # print("not adding", url)
            pass

print("foundlinks", foundLinks)

# start a checking thread for each url
for url in foundLinks:
    # throttle: wait until a thread slot is free
    while threads >= maxThreads:
        print(threads, "threads already running. Waiting...")
        time.sleep(1)

    if url not in textInFile and url not in newLinks:
        threads += 1
        threading.Thread(target=checkDirectory, args=(url,)).start()
        print("started new thread,", threads, "threads running,", len(workingLinks), "working directories found")
    else:
        print(url, "is already in the file")

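# Note: a thread pool would avoid the manual counter above. A minimal sketch of that
# alternative (not used by this script; concurrent.futures is in the standard library):
#
#   from concurrent.futures import ThreadPoolExecutor
#   with ThreadPoolExecutor(max_workers=maxThreads) as pool:
#       pool.map(checkDirectory, foundLinks)
#
# checkDirectory would then no longer need to maintain the global thread count itself.
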
# wait for threads to finish
while threads > 0:
    try:
        time.sleep(1)
        print("Waiting for {} threads to finish. ctrl+c to stop".format(threads))
    except KeyboardInterrupt:
        print("keyboard interrupt")
        break

# read the links that have been added to the file
with open("opendirectories.txt", "r") as file:
    urls = file.read().split("\n")

# keep only the shortest link of each web server
urls = shortestPartialDuplicateLink(urls)

# sort the list on (sub)domain
urls.sort(key=domain)

# write the sorted links back to the file
with open("opendirectories.txt", "w") as file:
    for url in urls:
        if len(url) > 5:  # skip empty and junk lines
            file.write(url + "\n")

print("Got {} working directories with links from {} posts with {} links".format(
    len(workingLinks), len(posts), len(foundLinks)))
input("\nCOMPLETED\n")