import praw
import re
from urllib.parse import urlsplit, urljoin
import threading
import requests
import time
from bs4 import BeautifulSoup


posts = []              # raw text of every post that was scanned
foundLinks = []         # candidate links extracted from the posts
newLinks = []           # never populated; checked later but stays empty
workingLinks = []       # links confirmed to be working open directories
threads = 0             # number of checker threads currently running
maxThreads = 250        # maximum number of concurrent checker threads
postsToLoad = 10        # how many new Reddit posts to fetch
timeoutForTesting = 10  # request timeout in seconds when testing a directory

URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))"""
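
# Note: URL_REGEX wraps the whole pattern in a single capturing group, so
# re.findall(URL_REGEX, text) yields the matched URLs directly, for example
# (an illustrative, assumed input/output pair):
#   re.findall(URL_REGEX, "mirror at http://example.com/pub/") -> ["http://example.com/pub/"]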

# links containing any of these terms are skipped
blackList = ["ch0c",
             "pastebin.com",
             "anotherBlacklistedItem",
             "filepursuit",
             "github.com",
             "reddit.com",
             "shodan.io",
             "wikipedia",
             "the-eye",
             "twitter",
             "facebook",
             "youtube",
             "tumblr.com",
             "archive.org",
             "i.redd.it",
             "redditmedia.com",
             "rg.to",
             ]


# disabled
"""
"https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=10000",
"""

# the number after before= is the date in epoch format
# I found the values below by checking the last result at each link

pagesToCheckForLinks = [
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&after=1543663785",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1543663785",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1526916567",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1513931734",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1497490290",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1484223692",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1460743035",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1444658067",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1440908907",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1440996579",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1440991337",
]
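
# A minimal sketch (not part of the original script flow) of how such before=/after=
# epoch values could be derived from a calendar date; dateToEpoch is a hypothetical
# helper added here only for illustration and is never called below.
from datetime import datetime, timezone

def dateToEpoch(year, month, day):
    # convert a UTC calendar date to epoch seconds, usable as a pushshift before=/after= value
    return int(datetime(year, month, day, tzinfo=timezone.utc).timestamp())

# e.g. dateToEpoch(2018, 12, 1) -> 1543622400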

# Reddit settings (the credentials here are placeholders)
bot = praw.Reddit(client_id='---',
                  client_secret='---',
                  user_agent='MySimpleBot v0.1',
                  username='---',
                  password='---')

print('logged in to Reddit as: ' + str(bot.user.me()))


# read the existing results file into the global textInFile variable
def updateTextInFileVar():
    global textInFile
    with open("opendirectories.txt", "r") as file:
        textInFile = file.read()


# sort key: second-level domain followed by the subdomain (if any)
def domain(link):
    try:
        netlocParts = urlsplit(link).netloc.split(".")
        sortTerm = netlocParts[-2]
        try:
            sortTerm += netlocParts[-3]  # append the subdomain when present
        except IndexError:
            pass
        return sortTerm
    except Exception:
        return link  # fall back to sorting on the raw string
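
# For example (assuming a well-formed URL):
#   domain("http://files.example.com/pub/") -> "examplefiles"
# so hosts sort by registered domain first, then by subdomain.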


def checkDirectory(linkToCheck):
    # fetch a candidate link and decide whether it looks like a working open directory
    global workingLinks
    global threads
    netLoc = urlsplit(linkToCheck).netloc
    dirLinks = []

    try:
        page = requests.get(linkToCheck, timeout=timeoutForTesting).text

        # pages with javascript are usually not plain directory listings;
        # bitdl/bitdownload are known open directories that do use js
        if "</script>" in page and "bitdl" not in linkToCheck and 'bitdownload' not in linkToCheck and "Index of /" not in page:
            print(linkToCheck, "has javascript. Skipping")
            raise ZeroDivisionError  # deliberately jump to the except/cleanup below

        soup = BeautifulSoup(page, 'html.parser')

        for link in soup.find_all('a'):
            href = str(link.get('href'))

            if "?C" in href:  # skip column-sorting links
                continue

            fullLink = urljoin(linkToCheck, href)
            # keep only links on the same host that point deeper than the directory itself
            if "?C=" not in fullLink and len(fullLink) > len(linkToCheck):
                if netLoc in fullLink and fullLink not in dirLinks:
                    dirLinks.append(fullLink)

        # print(len(dirLinks), "found on", linkToCheck)

        # treat the page as a working directory if it lists at least two entries
        if len(dirLinks) >= 2:
            workingLinks.append(linkToCheck)
            if linkToCheck not in textInFile:
                print(linkToCheck, "is new")

                with open("opendirectories.txt", "a") as pFile:
                    pFile.write(linkToCheck + "\n")

    except Exception as e:
        print(linkToCheck, "failed:", e)

    # shared counter updated without a lock; relies on CPython's GIL
    threads -= 1


# if both 123.com/files/123 and 123.com/files are present, keep only the shortest link per host
def shortestPartialDuplicateLink(links):
    linksPerNetloc = {}

    # group the links by their network location (host)
    for link in links:
        nl = urlsplit(link).netloc

        if nl not in linksPerNetloc:
            linksPerNetloc[nl] = []

        linksPerNetloc[nl].append(link)

    # keep the shortest link of every host
    shortestLinks = []
    for key in linksPerNetloc:
        linksPerNetloc[key].sort(key=len)
        shortestLinks.append(linksPerNetloc[key][0])

    return shortestLinks
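
# For example (assuming two links on the same host):
#   shortestPartialDuplicateLink(["http://123.com/files/123/", "http://123.com/files/"])
#   -> ["http://123.com/files/"]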


# read existing file
updateTextInFileVar()

postsToScan = []

# get the newest posts from the subreddit
RedditPosts = bot.subreddit('opendirectories').new(limit=postsToLoad)


# add the raw pushshift responses to the list of posts to scan
for link in pagesToCheckForLinks:
    postsToScan.append(requests.get(link).text)
    print("loaded up to 1000 posts from pushshift")

# disabled for now: Reddit posts are not added to the scan list
for post in RedditPosts:
    # postsToScan.append(post)
    pass

# search for links in each post
for post in postsToScan:

    # a post is either a praw Submission (selftext/url) or a raw pushshift response string
    try:
        text = post.selftext
    except AttributeError:
        text = post

    try:
        text += " " + post.url
    except AttributeError:
        pass

    posts.append(text)

    urls = re.findall(URL_REGEX, text)

    print(len(posts), "posts read. Found", len(urls), "links in this one")
    # print(urls)

    for url in urls:
        # don't add links that contain blacklist terms
        blackListed = False
        for term in blackList:
            if term.lower() in url.lower():
                print(term, "found in", url)
                blackListed = True
                break

        if not url.endswith("/"):
            url += "/"

        nl = urlsplit(url).netloc

        # only add links that are new, not blacklisted and not already in the file
        if url not in foundLinks and not blackListed and nl not in textInFile:
            foundLinks.append(url)
            print("adding", url)
        else:
            # print("not adding", url)
            pass

print("foundlinks", foundLinks)

# start a checking thread for each url
for url in foundLinks:
    # throttle: wait until a thread slot is free
    while threads >= maxThreads:
        print(threads, "threads already running. Waiting...")
        time.sleep(1)

    if url not in textInFile and url not in newLinks:
        threads += 1
        threading.Thread(target=checkDirectory, args=(url,)).start()
        print("started new thread,", threads, "threads running,", len(workingLinks), "working directories found")
    else:
        print(url, "is already in the file")

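# Note: a thread pool would avoid the manual counter above. A minimal sketch of that
# alternative (not used by this script; concurrent.futures is in the standard library):
#
#   from concurrent.futures import ThreadPoolExecutor
#   with ThreadPoolExecutor(max_workers=maxThreads) as pool:
#       pool.map(checkDirectory, foundLinks)
#
# checkDirectory would then no longer need to maintain the global thread count itself.
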
# wait for threads to finish
while threads > 0:
    try:
        time.sleep(1)
        print("Waiting for {} threads to finish. ctrl+c to stop".format(threads))
    except KeyboardInterrupt:
        print("keyboard interrupt")
        break

# read the links that have been added to the file
with open("opendirectories.txt", "r") as file:
    urls = file.read().split("\n")

# keep only the shortest link of each web server
urls = shortestPartialDuplicateLink(urls)

# sort the list on (sub)domain
urls.sort(key=domain)

# write the sorted links back to the file
with open("opendirectories.txt", "w") as file:
    for url in urls:
        if len(url) > 5:  # skip empty and junk lines
            file.write(url + "\n")

print("Got {} working directories with links from {} posts with {} links".format(
    len(workingLinks), len(posts), len(foundLinks)))
input("\nCOMPLETED\n")