XenoTheStrange

Clunky Proxy Scraper (Several Sites)

Jan 3rd, 2024
Python | 6.55 KB | Source Code
#!/usr/bin/python3

import requests
from bs4 import BeautifulSoup  # pip install beautifulsoup4
from json import loads
import os
from os.path import exists

# NOTE: socks4/socks5 proxy URLs require PySocks: pip install requests[socks]

debug_success = True
debug_error = False

Timeout = 5  # How long (seconds) to wait for a proxy to respond
keep_individual_lists = False  # Keep or delete the per-site lists after consolidating

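# Overall flow: each parse_* function scrapes one source into its own .list
# file, consolidate() merges them into proxies.list as protocol://ip:port
# lines, and check_all() tests each entry against http://ifconfig.me/,
# appending working proxies to good_proxies.list.
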
def get_soup(url):
    return BeautifulSoup(requests.get(url).text, "html.parser")

def appendToFile(filename, content):
    with open(filename, "a") as file:
        file.write(content)

def clearFile(filename):
    with open(filename, "w") as file:
        file.write("")

def parse_freeproxyupdate():
    print("[INFO] Parsing https://freeproxyupdate.com/")
    # Columns: IP address, Port, Country, Region/City, Protocol, Anonymity,
    # Speed, Latency, Response, Uptime, Last Checked
    out_array = []
    clearFile("fpu.list")
    pages = ["socks5-proxy", "socks4-proxy", "https-ssl-proxy", "http-proxy"]
    for page in pages:
        url = f"https://freeproxyupdate.com/{page}"
        html = get_soup(url)
        table = html.select(".list-proxy")[0]
        rows = table.find_all("tr")
        for row in rows:
            if "IP address" in row.text: continue  # skip the header row
            cells = list(row.children)
            data = [cell.text for cell in cells if "\n" not in cell]  # drop whitespace-only nodes
            out_array.append(",".join(data))
    # Write once, after all pages, so earlier pages are not duplicated
    appendToFile("fpu.list", "\n".join(out_array))

def parse_proxylist():
    print("[INFO] Parsing https://www.proxy-list.download/ using their API")
    url = "https://www.proxy-list.download/api/v2/get?l=en&t="
    pages = ["socks5", "socks4", "https", "http"]
    out_array = []
    clearFile("pl.list")
    for page in pages:
        # API fields: IP, PORT, ANON, COUNTRY, ISO, PING
        obj = loads(requests.get(f"{url}{page}").text)["LISTA"]
        for i in obj:
            out_array.append(f"{i['IP']},{i['PORT']},{page},{i['ANON']},{i['COUNTRY']},{i['ISO']},{i['PING']}")
    appendToFile("pl.list", "\n".join(out_array))

def parse_dupesites(site):
    # socks-proxy.net and sslproxies.org share the same page layout
    urls = [("https://www.socks-proxy.net/", "socks-proxy.net.list"),
            ("https://www.sslproxies.org/", "sslproxies.org.list")]
    url = filename = None
    for i in urls:
        if site in i[0]:
            url, filename = i
    if url is None:
        print(f"[ERROR] Unknown site: {site}")
        return
    clearFile(filename)
    print(f"[INFO] Parsing {url}")
    page = get_soup(url)
    table = page.select(".table.table-striped.table-bordered")[0]
    rows = table.select("tr")
    # socks-proxy.net columns: IP Address, Port, Code, Country, Version, Anonymity, Https, Last Checked
    # sslproxies.org columns:  IP Address, Port, Code, Country, Anonymity, Google, Https, Last Checked
    out_array = []
    for row in rows:
        cells = [i.text for i in row.children]
        out_array.append(",".join(cells))
    out_array.pop(0)  # drop the header row
    appendToFile(filename, "\n".join(out_array))

def parse_proxyscrape():
    print("[INFO] Parsing https://www.proxyscrape.com/free-proxy-list using their API")
    clearFile("proxyscrape.list")
    socks_url = "https://api.proxyscrape.com/v2/?request=getproxies&protocol=$SOCKSVERSION&timeout=10000&country=all"
    https_url = "https://api.proxyscrape.com/v2/?request=getproxies&protocol=http&timeout=10000&country=all&ssl=$SSL&anonymity=all&simplified=true"
    urls = [
        (socks_url.replace("$SOCKSVERSION", "socks5"), "socks5"),
        (socks_url.replace("$SOCKSVERSION", "socks4"), "socks4"),
        (https_url.replace("$SSL", "yes"), "https"),
        (https_url.replace("$SSL", "no"), "http"),
        ]
    for url in urls:
        req = requests.get(url[0]).text
        # The API returns one ip:port per line; prefix each with its protocol
        lst = [line for line in req.splitlines() if line.strip()]
        for i, line in enumerate(lst):
            lst[i] = f"{url[1]}://" + line
        appendToFile("proxyscrape.list", "\n".join(lst) + "\n")

def check_proxy(url):
    # requests selects a proxy by the target URL's scheme, so only the "http"
    # and "https" keys matter; the proxy URL carries its own scheme
    # (http/https/socks4/socks5)
    proxy = {"http": url, "https": url}
    try:
        test = requests.get("http://ifconfig.me/", proxies=proxy, timeout=Timeout)
        if debug_success: print(test.text)
        # A working proxy returns a short body containing only our apparent IP
        return test.ok and len(test.text) < 30
    except Exception as err:
        if debug_error: print(type(err))
        return False

def check_all():
    print("[INFO] Checking proxies")
    with open("proxies.list", "r") as file:
        urls = file.read().split("\n")
        for url in urls:
            if not url.strip(): continue  # skip blank lines
            print(f"[INFO] Checking url: {url}")
            if check_proxy(url):
                appendToFile("good_proxies.list", f"{url}\n")
                print("[GOOD]")
    os.remove("proxies.list")

def consolidate():
    print("[INFO] Consolidating proxy urls into a single file")
    with open("proxies.list", "w") as file:
        try:
            # freeproxyupdate.com: IP,Port,Country,Region/City,Protocol,...
            lst = open("fpu.list", "r").read().split("\n")
            for line in lst:
                if not line.strip(): continue
                data = line.split(",")
                file.write(f"{data[4].lower()}://{data[0]}:{data[1]}/\n")
        except Exception as err:
            print(err)
        # proxy-list.download: IP,PORT,protocol,ANON,COUNTRY,ISO,PING
        lst = open("pl.list", "r").read().split("\n")
        for line in lst:
            if not line.strip(): continue
            data = line.split(",")
            file.write(f"{data[2]}://{data[0]}:{data[1]}\n")
        # socks-proxy.net: IP,Port,Code,Country,Version,...
        lst = open("socks-proxy.net.list", "r").read().split("\n")
        for line in lst:
            if not line.strip(): continue
            data = line.split(",")
            file.write(f"{data[4].lower()}://{data[0]}:{data[1]}/\n")
        # sslproxies.org: IP,Port,... (always https)
        lst = open("sslproxies.org.list", "r").read().split("\n")
        for line in lst:
            if not line.strip(): continue
            data = line.split(",")
            file.write(f"https://{data[0]}:{data[1]}\n")
        # proxyscrape.com is already formatted as protocol://ip:port
        lst = open("proxyscrape.list", "r").read()
        file.write(lst)
    if not keep_individual_lists:
        os.remove("fpu.list")
        os.remove("pl.list")
        os.remove("socks-proxy.net.list")
        os.remove("sslproxies.org.list")
        os.remove("proxyscrape.list")

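# Usage sketch (not part of the original paste): one way the good_proxies.list
# produced by check_all() could be consumed afterwards. The function name and
# the "first responder wins" policy are assumptions for illustration; it is
# defined here but never called by main().
def demo_use_good_proxies():
    with open("good_proxies.list") as f:
        candidates = [line.strip() for line in f if line.strip()]
    for proxy_url in candidates:
        try:
            # Same mapping as check_proxy(): requests picks the proxy by the
            # target scheme, while the proxy URL carries its own scheme
            resp = requests.get("http://ifconfig.me/",
                                proxies={"http": proxy_url, "https": proxy_url},
                                timeout=Timeout)
            print(proxy_url, "->", resp.text.strip())
            return proxy_url  # first proxy that answers
        except requests.RequestException:
            continue  # try the next one
    return None
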
def main():
    if not exists("proxies.list"):
        parse_freeproxyupdate()
        parse_proxylist()
        parse_dupesites("sslproxies")
        parse_dupesites("socks-proxy")
        parse_proxyscrape()
        consolidate()
    else:
        print("[INFO] proxies.list already exists. Delete it to re-acquire proxies. Proceeding to checking.")
    check_all()

if __name__ == "__main__":
    main()