import praw
import re
from urllib.parse import urlsplit, urljoin
import threading
import requests
import time
from bs4 import BeautifulSoup


posts = []
foundLinks = []
newLinks = []
workingLinks = []
threads = 0                # number of checker threads currently running
maxThreads = 250           # upper bound on concurrent checker threads
postsToLoad = 10           # how many new Reddit posts to fetch
timeoutForTesting = 10     # request timeout (seconds) when probing a directory

URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))"""
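# The pattern above appears to be the widely shared "liberal" URL-matching regex: it accepts
# explicit http(s):// links as well as bare domain/path forms ending in a known TLD.
# Illustrative use (hypothetical string and result):
#   re.findall(URL_REGEX, "mirror at http://example.com/pub/ and files.example.org/data")
#   should give ['http://example.com/pub/', 'files.example.org/data']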

blackList = ["ch0c",
             "pastebin.com",
             "anotherBlacklistedItem",
             "filepursuit",
             "github.com",
             "reddit.com",
             "shodan.io",
             "wikipedia",
             "the-eye",
             "twitter",
             "facebook",
             "youtube",
             "tumblr.com",
             "archive.org",
             "i.redd.it",
             "redditmedia.com",
             "rg.to",
             ]

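# A found URL is dropped when any of these terms appears anywhere in it (the check further
# below is a case-insensitive substring match), e.g. a hypothetical
# "https://www.reddit.com/r/foo" link would be skipped because of "reddit.com".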

# disabled
"""
"https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=10000",
"""

# the number after before= is the date in Unix epoch format (seconds since 1970-01-01 UTC)
# I found the values below by checking the last result at each link

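# For reference: 1543663785 falls on 2018-12-01 (UTC). A new cutoff could be produced with
# something like int(datetime(2018, 12, 1, tzinfo=timezone.utc).timestamp()) after
# "from datetime import datetime, timezone" (illustrative only; the values below were
# simply copied from the last result of each query).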
pagesToCheckForLinks = [
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&after=1543663785",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1543663785",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1526916567",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1513931734",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1497490290",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1484223692",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1460743035",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1444658067",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1440908907",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1440996579",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1440991337",
]

# reddit settings (the '---' values are placeholders)
bot = praw.Reddit(client_id='---',
                  client_secret='---',
                  user_agent='MySimpleBot v0.1',
                  username='---',
                  password='---')
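# NOTE: the credentials above need to be filled in before this script can run; with PRAW's
# username/password flow they normally come from a "script"-type app registered on the
# Reddit account, and bot.user.me() below will raise an error until they are valid.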

print('logged in to Reddit as: ' + str(bot.user.me()))


# read the existing results file into a module-level string (used to skip known links)
def updateTextInFileVar():
    global textInFile
    with open("opendirectories.txt", "r") as file:
        textInFile = file.read()


# sort key: registered domain followed by subdomain, so links group by server
def domain(link):
    try:
        netlocParts = urlsplit(link).netloc.split(".")
        sortTerm = netlocParts[-2]
        try:
            sortTerm += netlocParts[-3]
        except IndexError:
            pass
        return sortTerm
    except Exception:
        return link

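# Example (hypothetical URL): domain("http://files.example.com/pub/") returns "examplefiles",
# i.e. the registered domain followed by the subdomain; anything that cannot be split that
# way simply sorts on the raw link text.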


def checkDirectory(linkToCheck):
    """Probe one URL and record it if it looks like a working open directory."""
    global workingLinks
    global threads
    netLoc = urlsplit(linkToCheck).netloc
    dirLinks = []

    try:
        page = requests.get(linkToCheck, timeout=timeoutForTesting).text

        # skip script-heavy pages unless they look like a plain index
        # (bitdl / bitdownload are known open directories that use javascript)
        if "</script>" in page and "bitdl" not in linkToCheck and "bitdownload" not in linkToCheck \
                and "Index of /" not in page:
            print(linkToCheck, "has javascript. Skipping")
            raise RuntimeError("page contains javascript")

        soup = BeautifulSoup(page, 'html.parser')

        for link in soup.find_all('a'):
            href = str(link.get('href'))

            if "?C" in href:  # skip Apache column-sorting links
                continue

            fullLink = urljoin(linkToCheck, href)
            # keep only links that stay on the same host and point deeper than the directory itself
            if "?C=" not in fullLink and len(fullLink) > len(linkToCheck):
                if netLoc in fullLink and fullLink not in dirLinks:
                    dirLinks.append(fullLink)

        # print(len(dirLinks), "found on", linkToCheck)

        if len(dirLinks) >= 2:
            workingLinks.append(linkToCheck)
            if linkToCheck not in textInFile:
                print(linkToCheck, "is new")

                with open("opendirectories.txt", "a") as pFile:
                    pFile.write(linkToCheck + "\n")

    except Exception as e:
        print(linkToCheck, "failed:", e)

    threads -= 1


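# Heuristic used above: a URL counts as a working open directory when its page yields at
# least two same-host links. For instance, checkDirectory("http://example.com/pub/")
# (hypothetical URL) would append that link to workingLinks and, if it is not already in
# opendirectories.txt, write it to the file.
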
# if both 123.com/files/123 and 123.com/files are present, keep only the shortest link per host
def shortestPartialDuplicateLink(links):
    linksPerNetloc = {}

    for link in links:
        nl = urlsplit(link).netloc

        if nl not in linksPerNetloc:
            linksPerNetloc[nl] = []

        linksPerNetloc[nl].append(link)

    shortestLinks = []
    for key in linksPerNetloc:
        linksPerNetloc[key].sort(key=len)
        shortestLinks.append(linksPerNetloc[key][0])

    return shortestLinks

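# Example (hypothetical URLs):
#   shortestPartialDuplicateLink(["http://example.com/files/sub/", "http://example.com/files/"])
#   returns ["http://example.com/files/"] (one entry per host, the shortest link wins).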

# read existing file
updateTextInFileVar()

postsToScan = []

# get posts from the subreddit
RedditPosts = bot.subreddit('opendirectories').new(limit=postsToLoad)


# add the raw pushshift responses to the list of posts to scan
for link in pagesToCheckForLinks:
    postsToScan.append(requests.get(link).text)
    print("loaded up to 1000 posts from pushshift")

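# Note: pushshift answers with JSON, but the response body is never parsed here; it is kept
# as raw text and scanned with URL_REGEX below, which works as long as the submission URLs
# appear unescaped in the JSON text.
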
# currently disabled: posts fetched through PRAW are not added to the scan list
for post in RedditPosts:
    # postsToScan.append(post)
    pass

# search for links in each post
for post in postsToScan:

    # Reddit submissions expose .selftext / .url; pushshift responses are plain strings
    try:
        text = post.selftext
    except AttributeError:
        text = post

    try:
        text += " " + post.url
    except AttributeError:
        pass

    posts.append(text)

    urls = re.findall(URL_REGEX, text)

    print(len(posts), "posts read. Found", len(urls), "links in this one")
    # print(urls)

    for url in urls:
        # don't add links that contain blacklisted terms
        blackListed = False
        for term in blackList:
            if term.lower() in url.lower():
                print(term, "found in", url)
                blackListed = True
                break

        if not url.endswith("/"):
            url += "/"

        nl = urlsplit(url).netloc

        # keep the link only if it is new, not blacklisted and its host is not already in the file
        if url not in foundLinks and not blackListed and nl not in textInFile:
            foundLinks.append(url)
            print("adding", url)
        else:
            # print("not adding", url)
            pass

print("found links:", foundLinks)

# start a checker thread for each url, capped at maxThreads concurrent threads
for url in foundLinks:
    while threads >= maxThreads:
        print(threads, "threads already running. Waiting...")
        time.sleep(1)

    if url not in textInFile and url not in newLinks:
        threading.Thread(target=checkDirectory, args=(url,)).start()
        threads += 1
        print("started new thread,", threads, "threads running,", len(workingLinks), "working directories found")
    else:
        print(url, "is already in the file")

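# Note: the global `threads` counter is incremented here and decremented inside the workers
# without any locking. That is usually good enough for a rough cap like this, but a
# threading.Semaphore (or a lock around the counter) would be the stricter way to limit
# concurrency.
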
# wait for the remaining threads to finish
while threads > 0:
    try:
        time.sleep(1)
        print("Waiting for {} threads to finish. ctrl+c to stop".format(threads))
    except KeyboardInterrupt:
        print("keyboard interrupt")
        break

# read back the links that have been added to the file
with open("opendirectories.txt", "r") as file:
    urls = file.read().split("\n")

# keep only the shortest link of each web server
urls = shortestPartialDuplicateLink(urls)

# sort the list on (sub)domain
urls.sort(key=domain)

# write the sorted links back to the file
with open("opendirectories.txt", "w") as file:
    for url in urls:
        if len(url) > 5:
            file.write(url + "\n")

print("Got {} working directories with links from {} posts with {} links".format(len(workingLinks), len(posts),
                                                                                  len(foundLinks)))
input("\nCOMPLETED\n")