Wannabe99

Opendirs in Python

Apr 6th, 2019
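The script below polls the newest posts on r/opendirectories, pulls every URL out of each post, drops links on blacklisted hosts, then probes each remaining link in a worker thread to see whether it serves a plain directory listing. Working links are appended to opendirectories.txt, which is then deduplicated (shortest link per host) and sorted by domain. It needs the praw, requests and beautifulsoup4 packages, plus Reddit API credentials.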
import praw
import re
from urllib.parse import urlsplit, urljoin
import threading
import requests
import time
from bs4 import BeautifulSoup


posts = []              # text of each post read
foundLinks = []         # candidate links pulled from posts
newLinks = []           # checked below before starting threads, but never populated in this paste
workingLinks = []       # links confirmed to be open directories
threads = 0             # number of checker threads currently running
maxThreads = 250        # cap on concurrent checker threads
postsToLoad = 10        # how many new posts to scan per run
timeoutForTesting = 10  # per-request timeout, in seconds

# URL matcher (a variant of John Gruber's "liberal, accurate" URL regex);
# matches scheme-prefixed URLs as well as bare domain/path strings
URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))"""

# links containing any of these terms are ignored
blackList = ["ch0c",
             "pastebin.com",
             "anotherBlacklistedItem",
             "filepursuit",
             "github.com",
             "reddit.com",
             "shodan.io",
             "wikipedia",
             "the-eye",
             "twitter",
             "facebook",
             "youtube",
             "tumblr.com",
             "archive.org",
             ]


# reddit settings: fill in the credentials of a "script"-type reddit app
bot = praw.Reddit(client_id='YOUR REDDIT APP ID',
                  client_secret='YOUR REDDIT APP SECRET',
                  user_agent='MySimpleBot v0.1',
                  username='YOUR REDDIT USERNAME',
                  password='YOUR REDDIT PASSWORD')

print('logged in to Reddit as: ' + str(bot.user.me()))


# read the saved links file into the global textInFile
def updateTextInFileVar():
    global textInFile
    try:
        with open("opendirectories.txt", "r") as file:
            textInFile = file.read()
    except FileNotFoundError:
        textInFile = ""  # first run: nothing saved yet


# sort key for links: second-level domain, then subdomain
def domain(link):
    try:
        netlocParts = urlsplit(link).netloc.split(".")
        sortTerm = netlocParts[-2]
        try:
            sortTerm += netlocParts[-3]
        except IndexError:
            pass  # no subdomain
        return sortTerm
    except Exception:
        return link  # unparseable: sort on the raw string

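# Worked example (hypothetical URL): domain("http://files.example.com/pub/")
# splits the netloc into ["files", "example", "com"] and returns
# "example" + "files" = "examplefiles", so links sort by domain first and
# subdomain second; anything unparseable falls back to sorting on the raw string.
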
def checkDirectory(linkToCheck):
    global workingLinks
    global threads
    netLoc = urlsplit(linkToCheck).netloc
    dirLinks = []

    try:
        page = requests.get(linkToCheck, timeout=timeoutForTesting).text

        # bitdl/bitdownload are open directories that legitimately use javascript
        if ("</script>" in page and "bitdl" not in linkToCheck
                and 'bitdownload' not in linkToCheck and "Index of /" not in page):
            print(linkToCheck, "has javascript. Skipping")
            raise ZeroDivisionError  # any exception jumps to the handler below

        soup = BeautifulSoup(page, 'html.parser')

        for link in soup.findAll('a'):
            href = str(link.get('href'))

            if "?C" in href:  # skip Apache column-sorting links
                continue

            fullLink = urljoin(linkToCheck, href)
            if "?C=" not in fullLink and len(fullLink) > len(linkToCheck):
                if netLoc in fullLink and fullLink not in dirLinks:
                    dirLinks.append(fullLink)

        print(len(dirLinks), "found on", linkToCheck)

        # a page linking at least two deeper items on the same host
        # counts as a working open directory
        if len(dirLinks) >= 2:
            workingLinks.append(linkToCheck)
            if linkToCheck not in textInFile:
                print(linkToCheck, "is new")

                pFile = open("opendirectories.txt", "a")
                pFile.write(linkToCheck + "\n")
                pFile.close()

    except Exception as e:
        print(linkToCheck, "failed:", e)

    threads -= 1

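# Note: the shared "threads" counter is updated from many threads without a
# lock; CPython's GIL makes this mostly safe in practice, but a
# threading.Semaphore would make the throttling explicit. A minimal sketch,
# not wired into this script:
#
#   slots = threading.Semaphore(maxThreads)
#
#   def throttledCheck(url):
#       with slots:  # blocks while maxThreads checks are already in flight
#           checkDirectory(url)
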
def shortestPartialDuplicateLink(links):
    linksPerNetloc = {}

    for link in links:
        nl = urlsplit(link).netloc

        if nl not in linksPerNetloc:
            linksPerNetloc[nl] = []

        linksPerNetloc[nl].append(link)

    shortestLinks = []
    for key in linksPerNetloc:
        linksPerNetloc[key].sort(key=len)
        shortestLinks.append(linksPerNetloc[key][0])

    return shortestLinks

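# Worked example (hypothetical links):
#   shortestPartialDuplicateLink(["http://a.example/files/music/",
#                                 "http://a.example/files/",
#                                 "http://b.example/pub/"])
# groups the links by host, sorts each group by length, and returns
# ["http://a.example/files/", "http://b.example/pub/"] -- one shortest
# (usually top-level) link per web server.
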
# read existing file
updateTextInFileVar()

# get posts (praw's .new() returns a lazy listing; the posts are actually
# fetched as the loop below iterates)
RedditPosts = bot.subreddit('opendirectories').new(limit=postsToLoad)


# search for links in each post
for post in RedditPosts:
    text = post.selftext

    try:
        text += " " + post.url
    except Exception:
        pass

    posts.append(text)

    urls = re.findall(URL_REGEX, text)

    print(len(posts), "posts read. Found", len(urls), "links in this one")

    for url in urls:
        # don't add links that contain blacklist terms
        # (reset per url; the original reset this once per post, so one
        # blacklisted link wrongly discarded the rest of that post's links)
        blackListed = False
        for term in blackList:
            if term.lower() in url.lower():
                blackListed = True
                break

        if not url.endswith("/"):
            url += "/"

        nl = urlsplit(url).netloc

        if url not in foundLinks and not blackListed and nl not in textInFile:
            foundLinks.append(url)


# start a checking thread for each url
for url in foundLinks:
    while threads >= maxThreads:
        print(threads, "threads already running. Waiting...")
        time.sleep(1)

    if url not in textInFile and url not in newLinks:
        threading.Thread(target=checkDirectory, args=(url, )).start()
        threads += 1
        print("started new thread")
    else:
        print(url, "is already in the file")


# wait for threads to finish
while threads > 0:
    try:
        time.sleep(1)
        print("Waiting for {} threads to finish. ctrl+c to stop".format(threads))
    except KeyboardInterrupt:
        print("keyboard interrupt")
        break


# read links that have been added to the file
file = open("opendirectories.txt", "r")
urls = file.read().split("\n")
file.close()

# keep only the shortest link for each web server
urls = shortestPartialDuplicateLink(urls)

# sort the list on (sub)domain
urls.sort(key=domain)

# write the sorted links back to the file
file = open("opendirectories.txt", "w+")
for url in urls:
    if len(url) > 5:  # skip empty and junk lines
        file.write(url + "\n")
file.close()

print("Got {} working directories with links from {} posts with {} links".format(
    len(workingLinks), len(posts), len(foundLinks)))
input("\nCOMPLETED\n")
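
To try the paste as-is, save it as e.g. opendirs.py (file name hypothetical), install the dependencies with pip install praw requests beautifulsoup4, fill in the five Reddit credential placeholders, and run it with python. Each run re-reads opendirectories.txt first, so servers found on earlier runs are skipped.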