Wannabe99

Skim through

Apr 6th, 2019
import praw
import re
from urllib.parse import urlsplit, urljoin
import threading
import requests
import time
from bs4 import BeautifulSoup


posts = []
foundLinks = []
newLinks = []
workingLinks = []
threads = 0                # number of checker threads currently running
maxThreads = 250           # cap on concurrent checker threads
postsToLoad = 10           # number of new reddit posts to load
timeoutForTesting = 10     # request timeout (seconds) when testing a directory

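# liberal URL-matching pattern (this appears to be John Gruber's widely shared
# "liberal, accurate regex for matching URLs"); re.findall returns the matched
# URLs as strings, e.g. (illustrative):
#   re.findall(URL_REGEX, "mirror at http://example.com/files/") -> ['http://example.com/files/']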
URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))"""

blackList = ["ch0c",
             "pastebin.com",
             "anotherBlacklistedItem",
             "filepursuit",
             "github.com",
             "reddit.com",
             "shodan.io",
             "wikipedia",
             "the-eye",
             "twitter",
             "facebook",
             "youtube",
             "tumblr.com",
             "archive.org",
             "i.redd.it",
             "redditmedia.com",
             "rg.to",
             ]


# disabled
"""
"https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=10000",
"""

# the number after before= is the date in epoch format
# I found the values below by checking the last result at each link
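# e.g. (illustrative) to turn a date into an epoch value for before=/after=:
#   import calendar
#   calendar.timegm(time.strptime("2018-12-01", "%Y-%m-%d"))  # -> 1543622400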

pagesToCheckForLinks = [
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&after=1543663785",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1543663785",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1526916567",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1513931734",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1497490290",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1484223692",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1460743035",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1444658067",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1440908907",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1440996579",
    "https://api.pushshift.io/reddit/search/submission?subreddit=opendirectories&limit=1000&before=1440991337",
]
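# each response is plain JSON text, roughly of this shape (illustrative):
#   {"data": [{"title": "...", "selftext": "...", "url": "https://...", ...}, ...]}
# the script never parses the JSON; it just runs URL_REGEX over the raw text further down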

# reddit settings
bot = praw.Reddit(client_id='---',
                  client_secret='---',
                  user_agent='MySimpleBot v0.1',
                  username='---',
                  password='---')
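# the same credentials could instead live in a praw.ini site section, e.g. (illustrative,
# section name is made up):
#   [opendirbot]
#   client_id=---
#   client_secret=---
#   username=---
#   password=---
# and be loaded with praw.Reddit('opendirbot', user_agent='MySimpleBot v0.1')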

print('logged in to Reddit as: ' + str(bot.user.me()))


# read the existing results file into a global string so already-known links can be skipped
def updateTextInFileVar():
    global textInFile
    with open("opendirectories.txt", "r") as file:
        textInFile = file.read()


# sort key: domain first, then subdomain, so links from the same server group together
def domain(link):
    try:
        netlocParts = urlsplit(link).netloc.split(".")
        sortTerm = netlocParts[-2]
        try:
            sub = netlocParts[-3]
            sortTerm += sub
        except IndexError:
            pass
        return sortTerm
    except Exception:
        return link
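
# e.g. (illustrative) domain("http://files.example.com/pub/") -> "examplefiles",
# so results sort by domain first and subdomain second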


def checkDirectory(linkToCheck):
    global workingLinks
    global threads
    netLoc = urlsplit(linkToCheck).netloc
    dirLinks = []

    try:
        page = requests.get(linkToCheck, timeout=timeoutForTesting).text

        # skip javascript-heavy pages, except known open directories that use js (bitdl/bitdownload)
        if "</script>" in page and "bitdl" not in linkToCheck and 'bitdownload' not in linkToCheck and "Index of /" not in page:
            print(linkToCheck, "has javascript. Skipping")
            raise ZeroDivisionError  # bail out via the except block below

        soup = BeautifulSoup(page, 'html.parser')

        for link in soup.find_all('a'):
            href = str(link.get('href'))

            if "?C" in href:  # skip column-sorting links
                continue

            fullLink = urljoin(linkToCheck, href)
            # keep links that stay on the same host and point deeper than the directory itself
            if "?C=" not in fullLink and len(fullLink) > len(linkToCheck):
                if netLoc in fullLink and fullLink not in dirLinks:
                    dirLinks.append(fullLink)

        # print(len(dirLinks), "found on", linkToCheck)

        # a page with at least two qualifying links counts as a working directory
        if len(dirLinks) >= 2:
            workingLinks.append(linkToCheck)
            if linkToCheck not in textInFile:
                print(linkToCheck, "is new")

                with open("opendirectories.txt", "a") as pFile:
                    pFile.write(linkToCheck + "\n")

    except Exception as e:
        print(linkToCheck, "failed:", e)

    threads -= 1
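
# each call is meant to run in its own worker thread, e.g. (illustrative URL):
#   threading.Thread(target=checkDirectory, args=("http://example.com/files/",)).start()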

# if both 123.com/files/123 and 123.com/files are present, keep only the shortest link
def shortestPartialDuplicateLink(links):
    linksPerNetloc = {}

    for link in links:
        nl = urlsplit(link).netloc

        if nl not in linksPerNetloc:
            linksPerNetloc[nl] = []

        linksPerNetloc[nl].append(link)

    shortestLinks = []
    for key in linksPerNetloc:
        linksPerNetloc[key].sort(key=len)
        shortestLinks.append(linksPerNetloc[key][0])

    return shortestLinks
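
# e.g. (illustrative):
#   shortestPartialDuplicateLink(["http://123.com/files/123/", "http://123.com/files/"])
#   -> ["http://123.com/files/"]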


# read existing file
updateTextInFileVar()

postsToScan = []

# get the newest reddit posts
RedditPosts = bot.subreddit('opendirectories').new(limit=postsToLoad)


# add the raw pushshift responses to the list of posts to scan
for link in pagesToCheckForLinks:
    postsToScan.append(requests.get(link).text)
    print("loaded up to 1000 posts from pushshift")

# reddit posts are currently skipped; uncomment the append to include them
for post in RedditPosts:
    # postsToScan.append(post)
    pass

# search for links in each post
for post in postsToScan:

    # reddit submissions have .selftext and .url; pushshift responses are plain strings
    try:
        text = post.selftext
    except AttributeError:
        text = post

    try:
        text += " " + post.url
    except AttributeError:
        pass

    posts.append(text)

    urls = re.findall(URL_REGEX, text)

    print(len(posts), "posts read. Found", len(urls), "links in this one")
    # print(urls)

    for url in urls:
        # skip links that contain blacklisted terms
        blackListed = False
        for term in blackList:
            if term.lower() in url.lower():
                print(term, "found in", url)
                blackListed = True
                break

        if not url.endswith("/"):
            url += "/"

        nl = urlsplit(url).netloc

        if url not in foundLinks and not blackListed and nl not in textInFile:
            foundLinks.append(url)
            print("adding", url)
        else:
            # print("not adding", url)
            pass

print("found links:", foundLinks)

# start a checking thread for each url, capped at maxThreads concurrent threads
for url in foundLinks:
    while threads >= maxThreads:
        print(threads, "threads already running. Waiting...")
        time.sleep(1)

    # note: newLinks is never filled anywhere, so that check is currently a no-op
    if url not in textInFile and url not in newLinks:
        threading.Thread(target=checkDirectory, args=(url,)).start()
        threads += 1
        print("started new thread,", threads, "threads running.", len(workingLinks), "working directories found")
    else:
        print(url, "is already in the file")

# wait for threads to finish
while threads > 0:
    try:
        time.sleep(1)
        print("Waiting for {} threads to finish. ctrl+c to stop".format(threads))
    except KeyboardInterrupt:
        print("keyboard interrupt")
        break

# read links that have been added to the file
with open("opendirectories.txt", "r") as file:
    urls = file.read().split("\n")

# keep only the shortest link of each web server
urls = shortestPartialDuplicateLink(urls)

# sort list on (sub)domain
urls.sort(key=domain)

# write sorted links to file
with open("opendirectories.txt", "w+") as file:
    for url in urls:
        if len(url) > 5:
            file.write(url + "\n")

print("Got {} working directories with links from {} posts with {} links".format(len(workingLinks), len(posts), len(foundLinks)))
input("\nCOMPLETED\n")