Advertisement
TheMostDopePizza

Python proxy scraper [source] by TheMostDopePizza

Sep 2nd, 2015
14,833
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 11.56 KB | None | 0 0
  # Start-up banner. Each call passes one parenthesised string, a form that
  # prints identically under the Python 2 print statement and the Python 3
  # print function.
  print('EasyProxy')
  print('Developed by TheMostDopePizza')
  print('version 1.0')
  print('[%]scraping started')
  import re
  import urllib
  from time import gmtime, strftime, sleep
  7.  
  # Helper functions
  def isnum(ch):
      """Return True when *ch* is exactly one ASCII decimal digit.

      Only the ten strings "0" through "9" qualify. The empty string,
      multi-character strings, non-ASCII digits and non-string values all
      give False. ``str.isdigit`` is deliberately not used because it also
      accepts non-ASCII digit characters.
      """
      # Tuple membership compares with ==, exactly like a chain of
      # `ch == "0"`, `ch == "1"`, ... checks.
      return ch in ("0", "1", "2", "3", "4", "5", "6", "7", "8", "9")
  31.  
  def alfabetcheck(line):
      """Return True when *line* contains no lowercase ASCII letter.

      Only the 26 letters "a" through "z" are looked for. Uppercase letters
      and every other character are ignored, so "ABC" and "" both give True.

      :param line: text (or any container supporting ``in``) to inspect.
      :returns: False if at least one of "a".."z" occurs in *line*,
          otherwise True.
      """
      return not any(letter in line for letter in "abcdefghijklmnopqrstuvwxyz")
  90.  
  def writetofile(lines, out=None):
      """Write every string in *lines* to a file, one per line.

      :param lines: iterable of strings; a newline is appended to each one.
      :param out: writable file-like object. When omitted, the module-level
          ``wfile`` is used, which must already be open for writing.
      """
      if out is None:
          # Looked up at call time, so the script may open ``wfile`` after
          # this function has been defined.
          out = wfile
      out.writelines(line + "\n" for line in lines)
  94.  
  # Dotted-quad IPv4 address, a colon, then a 1-5 digit port.
  # The lookbehind stops a match from starting in the middle of a longer
  # number or dotted sequence; the lookahead stops the port from being cut
  # out of a longer digit run. [0-9] keeps matching to ASCII digits only.
  _PROXY_RE = re.compile(
      r"(?<![0-9.])"
      r"([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})"
      r":([0-9]{1,5})(?![0-9])"
  )


  def process(source):
      """Extract ``ip:port`` proxy entries from downloaded page content.

      The previous fixed-width window around each ":" wrapped to a negative
      slice index near the start of the text. It also mixed neighbouring text
      into addresses shorter than 15 characters, and it dropped valid proxies
      whenever a lowercase letter (for example from an HTML tag) fell inside
      the window. Matching the address pattern directly avoids all three.

      :param source: page content as a string, as raw bytes, or as an
          iterable of string chunks that are concatenated before scanning.
      :returns: list of ``"a.b.c.d:port"`` strings in order of appearance.
          Duplicates are kept. Entries with an octet above 255 or a port
          outside 1-65535 are skipped.
      """
      if isinstance(source, str):
          text = source
      elif isinstance(source, (bytes, bytearray)):
          # latin-1 maps every byte to a character, so decoding cannot fail;
          # the pattern only cares about ASCII digits, "." and ":".
          text = bytes(source).decode("latin-1")
      else:
          text = "".join(source)

      proxys = []
      for match in _PROXY_RE.finditer(text):
          fields = match.groups()
          octets_ok = all(int(octet) <= 255 for octet in fields[:4])
          port_ok = 0 < int(fields[4]) <= 65535
          if octets_ok and port_ok:
              proxys.append(match.group(0))
      return proxys
  126.  
  127.  
  # Pages the proxies are scraped from. Nothing here checks whether a site is
  # still online; unreachable ones are expected to fail at download time.
  urls = [
      "http://50kproxies.com/10-february-10-02-new-fresh-daily-50000-proxy-list-50kproxies-com/",
      "http://50kproxies.com/11-january-11-01-new-fresh-daily-50000-proxy-list-50kproxies-com/",
      "http://50na50.net/",
      "http://50na50.net/proxy/httplist",
      "http://50na50.net/no_anonim_http.txt",
      "http://aliveproxy.com/anonymous-proxy-list",
      "http://aliveproxy.com/ca-proxy-list",
      "http://aliveproxy.com/fastest-proxies",
      "http://aliveproxy.com/fr-proxy-list",
      "http://aliveproxy.com/gb-proxy-list",
      "http://aliveproxy.com/high-anonymity-proxy-list",
      "http://aliveproxy.com/jp-proxy-list",
      "http://aliveproxy.com/proxy-list-port-3128",
      "http://aliveproxy.com/proxy-list-port-80",
      "http://aliveproxy.com/proxy-list-port-8000",
      "http://aliveproxy.com/proxy-list-port-8080",
      "http://aliveproxy.com/ru-proxy-list",
      "http://aliveproxy.com/us-proxy-list",
      "http://atomintersoft.com/anonymous_proxy_list",
      "http://atomintersoft.com/high_anonymity_elite_proxy_list",
      "http://atomintersoft.com/products/alive-proxy/proxy-list",
      "http://atomintersoft.com/products/alive-proxy/proxy-list?ap=9",
      "http://atomintersoft.com/products/alive-proxy/proxy-list/3128",
      "http://atomintersoft.com/products/alive-proxy/proxy-list/com",
      "http://atomintersoft.com/products/alive-proxy/proxy-list/high-anonymity/",
      "http://atomintersoft.com/products/alive-proxy/socks5-list",
      "http://atomintersoft.com/proxy_list_domain_com",
      "http://atomintersoft.com/proxy_list_domain_edu",
      "http://atomintersoft.com/proxy_list_domain_net",
      "http://atomintersoft.com/proxy_list_domain_org",
      "http://atomintersoft.com/proxy_list_port_3128",
      "http://atomintersoft.com/proxy_list_port_80",
      "http://atomintersoft.com/proxy_list_port_8000",
      "http://atomintersoft.com/proxy_list_port_81",
      "http://atomintersoft.com/transparent_proxy_list",
      "http://best-proxy.com/english/search.php?search=anonymous-and-elite&country=any&type=anonymous-and-elite&port=any&ssl=any",
      "http://best-proxy.com/english/search.php?search=anonymous-and-elite&country=any&type=anonymous-and-elite&port=any&ssl=any&p=2",
      "http://best-proxy.com/english/search.php?search=anonymous-and-elite&country=any&type=anonymous-and-elite&port=any&ssl=any&p=3",
      "http://bestproxy.narod.ru/proxy2.html",
      "http://checkerproxy.net/all_proxy",
      "http://ejohn.org/apps/anon/",
      "http://free-proxy-list.net/",
      "http://free-proxy-list.net/anonymous-proxy.html",
      "http://free-proxy-list.net/uk-proxy.html",
      "http://guncelproxy.com/Anasayfa/",
      "http://multiproxy.org/anon_proxy.htm",
      "http://multiproxy.org/txt_all/proxy.txt",
      "http://nntime.com/proxy-list-01.htm",
      "http://nntime.com/proxy-list-02.htm",
      "http://nntime.com/proxy-list-03.htm",
      "http://nntime.com/proxy-list-04.htm",
      "http://nntime.com/proxy-list-05.htm",
      "http://nntime.com/proxy-list-06.htm",
      "http://nntime.com/proxy-list-07.htm",
      "http://nntime.com/proxy-list-08.htm",
      "http://nntime.com/proxy-list-09.htm",
      "http://nntime.com/proxy-list-10.htm",
      "http://nntime.com/proxy-list-11.htm",
      "http://nntime.com/proxy-list-12.htm",
      "http://nntime.com/proxy-list-13.htm",
      "http://nntime.com/proxy-list-14.htm",
      "http://nntime.com/proxy-list-15.htm",
      "http://nntime.com/proxy-list-17.htm",
      "http://nntime.com/proxy-list-18.htm",
      "http://nntime.com/proxy-list-19.htm",
      "http://nntime.com/proxy-list-20.htm",
      "http://nntime.com/proxy-list-21.htm",
      "http://nntime.com/proxy-list-22.htm",
      "http://nntime.com/proxy-list-23.htm",
      "http://nntime.com/proxy-list-24.htm",
      "http://nntime.com/proxy-list-25.htm",
      "http://nntime.com/proxy-list-27.htm",
      "http://nntime.com/proxy-list-28.htm",
      "http://nntime.com/proxy-list-29.htm",
      "http://nntime.com/proxy-list-30.htm",
      "http://notan.h1.ru/hack/xwww/proxy1.html",
      "http://notan.h1.ru/hack/xwww/proxy10.html",
      "http://notan.h1.ru/hack/xwww/proxy2.html",
      "http://notan.h1.ru/hack/xwww/proxy3.html",
      "http://notan.h1.ru/hack/xwww/proxy4.html",
      "http://notan.h1.ru/hack/xwww/proxy5.html",
      "http://notan.h1.ru/hack/xwww/proxy6.html",
      "http://notan.h1.ru/hack/xwww/proxy7.html",
      "http://notan.h1.ru/hack/xwww/proxy8.html",
      "http://notan.h1.ru/hack/xwww/proxy9.html",
      "http://proxy.speedtest.at/proxybyActuality.php?offset=0",
      "http://proxy.speedtest.at/proxybyActuality.php?offset=100",
      "http://proxy.speedtest.at/proxybyActuality.php?offset=125",
      "http://proxy.speedtest.at/proxybyActuality.php?offset=150",
      "http://proxy.speedtest.at/proxybyActuality.php?offset=175",
      "http://proxy.speedtest.at/proxybyActuality.php?offset=200",
      "http://proxy.speedtest.at/proxybyActuality.php?offset=225",
      "http://proxy.speedtest.at/proxybyActuality.php?offset=25",
      "http://proxy.speedtest.at/proxybyActuality.php?offset=250",
      "http://proxy.speedtest.at/proxybyActuality.php?offset=275",
      "http://proxy.speedtest.at/proxybyActuality.php?offset=300",
      "http://proxy.speedtest.at/proxybyActuality.php?offset=325",
      "http://proxy.speedtest.at/proxybyActuality.php?offset=350",
      "http://proxy.speedtest.at/proxybyActuality.php?offset=375",
      "http://proxy.speedtest.at/proxybyActuality.php?offset=400",
      "http://proxy.speedtest.at/proxybyActuality.php?offset=425",
      "http://proxy.speedtest.at/proxybyActuality.php?offset=450",
      "http://proxy.speedtest.at/proxybyActuality.php?offset=475",
      "http://proxy.speedtest.at/proxybyActuality.php?offset=50",
      "http://proxy.speedtest.at/proxybyActuality.php?offset=500",
      "http://proxy.speedtest.at/proxybyActuality.php?offset=525",
      "http://proxy.speedtest.at/proxybyActuality.php?offset=550",
      "http://proxy.speedtest.at/proxybyActuality.php?offset=575",
      "http://proxy.speedtest.at/proxybyActuality.php?offset=600",
      "http://proxy.speedtest.at/proxybyActuality.php?offset=625",
      "http://proxy.speedtest.at/proxybyActuality.php?offset=650",
      "http://proxy.speedtest.at/proxybyActuality.php?offset=675",
      "http://proxy.speedtest.at/proxybyActuality.php?offset=700",
      "http://proxy.speedtest.at/proxybyActuality.php?offset=75",
      "http://proxylist.sakura.ne.jp/index.htm?pages=0",
      "http://proxylist.sakura.ne.jp/index.htm?pages=1",
      "http://proxylist.sakura.ne.jp/index.htm?pages=2",
      "http://proxylist.sakura.ne.jp/index.htm?pages=3",
      "http://proxylist.sakura.ne.jp/index.htm?pages=4",
      "http://proxylistchecker.org/proxylists.php",
      "http://proxylistchecker.org/proxylists.php?t=&p=10",
      "http://proxylistchecker.org/proxylists.php?t=&p=2",
      "http://proxylistchecker.org/proxylists.php?t=&p=3",
      "http://proxylistchecker.org/proxylists.php?t=&p=4",
      "http://proxylistchecker.org/proxylists.php?t=&p=5",
      "http://proxylistchecker.org/proxylists.php?t=&p=6",
      "http://proxylistchecker.org/proxylists.php?t=&p=7",
      "http://proxylistchecker.org/proxylists.php?t=&p=8",
      "http://proxylistchecker.org/proxylists.php?t=&p=9",
      "http://rootjazz.com/proxies/proxies.txt",
      "http://samair.ru/proxy/proxy-01.htm",
      "http://samair.ru/proxy/proxy-02.htm",
      "http://samair.ru/proxy/proxy-03.htm",
      "http://samair.ru/proxy/proxy-04.htm",
      "http://samair.ru/proxy/proxy-05.htm",
      "http://samair.ru/proxy/proxy-06.htm",
      "http://samair.ru/proxy/proxy-07.htm",
      "http://samair.ru/proxy/proxy-08.htm",
      "http://samair.ru/proxy/proxy-09.htm",
      "http://samair.ru/proxy/proxy-10.htm",
      "http://samair.ru/proxy/proxy-11.htm",
      "http://samair.ru/proxy/proxy-12.htm",
      "http://samair.ru/proxy/proxy-13.htm",
      "http://samair.ru/proxy/proxy-14.htm",
      "http://samair.ru/proxy/proxy-15.htm",
      "http://samair.ru/proxy/proxy-16.htm",
      "http://samair.ru/proxy/proxy-17.htm",
      "http://samair.ru/proxy/proxy-18.htm",
      "http://samair.ru/proxy/proxy-19.htm",
      "http://samair.ru/proxy/proxy-20.htm",
      "http://samair.ru/proxy/proxy-21.htm",
      "http://samair.ru/proxy/proxy-22.htm",
      "http://samair.ru/proxy/proxy-23.htm",
      "http://samair.ru/proxy/proxy-24.htm",
      "http://samair.ru/proxy/proxy-25.htm",
      "http://samair.ru/proxy/proxy-26.htm",
      "http://samair.ru/proxy/proxy-27.htm",
      "http://samair.ru/proxy/proxy-28.htm",
      "http://samair.ru/proxy/proxy-29.htm",
      "http://samair.ru/proxy/proxy-30.htm",
      "http://spys.ru/en/anonymous-proxy-list/",
      "http://spys.ru/en/free-proxy-list/",
      "http://tools.rosinstrument.com/proxy/?rule1",
      "http://txt.proxyspy.net/proxy.txt",
      "http://vmarte.com/proxy/proxy_all.txt",
      "http://www.getproxy.jp/en/default/1",
      "http://www.getproxy.jp/en/default/2",
      "http://www.getproxy.jp/en/default/3",
      "http://www.getproxy.jp/en/default/4",
      "http://www.getproxy.jp/en/default/5",
      "http://www.google-proxy.net/",
      "http://www.ip-adress.com/proxy_list/?k=time&d=desc",
      "http://www.my-proxy.com/free-proxy-list.html",
      "http://www.proxy4ever.com/",
      "http://www.proxyblind.org/anonymous-proxy.shtml",
      "http://www.proxyblind.org/free-proxy.shtml",
      "http://www.proxyblind.org/proxy-list.shtml",
      "http://www.proxyblind.org/ssl.shtml",
      "http://www.proxyforest.com/proxy.htm",
      "http://www.socks-proxy.net/",
      "http://www.ultrasurf.org/",
      "http://www.us-proxy.org/",
  ]

  # proxy-list.org result pages 2 through 10.
  urls.extend(
      "http://proxy-list.org/english/index.php?p=" + str(page)
      for page in range(2, 11)
  )
  # www.samair.ru pages 02 through 30; %02d supplies the leading zero for
  # single-digit page numbers.
  urls.extend(
      "http://www.samair.ru/proxy/proxy-%02d.htm" % page
      for page in range(2, 31)
  )
  319.  
  320.  
  # urlopen lives in urllib.request on Python 3 and directly in urllib on
  # Python 2; resolve it once so the loop below runs on either.
  try:
      from urllib.request import urlopen
  except ImportError:
      from urllib import urlopen

  # Output file name carries the UTC time at which the run started.
  timestamp = strftime("%d, %b, %Y, %H, %M, %S", gmtime())

  proxycount = 0
  # `wfile` stays a module-level name because writetofile() reads it; the
  # with-block guarantees the file is closed even if the run is interrupted.
  with open("proxies" + timestamp + ".txt", "w") as wfile:
      for url in urls:
          # Best effort: a site that cannot be reached or read is reported and
          # skipped so that one dead host does not end the whole run.
          try:
              response = urlopen(url)
          except Exception:
              print("An error occured at {}".format(url))
              continue
          print("[%]Grabbing " + url)
          try:
              html = response.read()
          except Exception:
              print("An error occured at {}".format(url))
              continue
          finally:
              response.close()
          # Python 3 returns bytes; latin-1 decodes any byte sequence without
          # raising. Python 2 already returns str and is left untouched.
          if not isinstance(html, str):
              html = html.decode("latin-1")
          proxies = process(html)
          writetofile(proxies)
          proxycount += len(proxies)
          # Pause between requests.
          sleep(1)

  print("Grabbing proxies finished, proxies scraped and saved!: " + str(proxycount))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement