Python romanticdevil-crawler.py

a guest
Nov 27th, 2011
#!/usr/bin/env python
# Hardcoded by Jimmyromanticdevil aka Rahmat Ramadhan Iryanto
# romanticdevil.jimmy@gmail.com
# You may distribute this and do anything you want with it.
# http://jimmyromanticdevil.wordpress.com
# webcrawler (romanticdevil-crawler.py)
import urllib2
import BeautifulSoup
import sys
import urllib
from BeautifulSoup import BeautifulSoup as ziachow
import urlparse
from urllib2 import urlopen
from urllib import urlretrieve
import os
import os.path
import re
import random

# folder where downloaded files are written ("" = current directory)
out_folder = ""

# pool of User-Agent strings; one is picked at random for each crawl
user_agent = ['Mozilla/4.0 (compatible; MSIE 5.0; SunOS 5.10 sun4u; X11)',
              'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.2pre) Gecko/20100207 Ubuntu/9.04 (jaunty) Namoroka/3.6.2pre',
              'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser;',
              'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0)',
              'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1)',
              'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.6)',
              'Microsoft Internet Explorer/4.0b1 (Windows 95)',
              'Opera/8.00 (Windows NT 5.1; U; en)',
              'amaya/9.51 libwww/5.4.0',
              'Mozilla/4.0 (compatible; MSIE 5.0; AOL 4.0; Windows 95; c_athome)',
              'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
              'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
              'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; ZoomSpider.net bot; .NET CLR 1.1.4322)',
              'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; QihooBot 1.0 qihoobot@qihoo.net)',
              'Mozilla/4.0 (compatible; MSIE 5.0; Windows ME) Opera 5.11 [en]'
              ]


# function to download a page and save it under its basename
def download(url):
    url_request = urllib.urlopen(url)
    try:
        localFile = open(url.split('/')[-1], 'w')
        localFile.write(url_request.read())
        url_request.close()
        localFile.close()
    except:
        pass


# function to download the images referenced by a page
def download_image(url):
    soup = ziachow(urlopen(url))
    parsed = list(urlparse.urlparse(url))

    for image in soup.findAll("img"):
        try:
            filename = image["src"].split("/")[-1]
            if os.path.isfile(filename) or os.path.isfile(parsed[2]):
                break
            else:
                print "Image Download: %(src)s" % image
                parsed[2] = image["src"]
                outpath = os.path.join(out_folder, filename)
                # absolute URLs are fetched directly; relative ones are
                # resolved against the parsed page URL
                if image["src"].lower().startswith("http"):
                    urlretrieve(image["src"], outpath)
                else:
                    urlretrieve(urlparse.urlunparse(parsed), outpath)
        except:
            break


# function to download the stylesheets referenced by a page
def download_css(url):
    try:
        soup = ziachow(urlopen(url))
        parsed = list(urlparse.urlparse(url))
        for image in soup.findAll("link"):
            try:
                filename = image["href"].split("/")[-1]
                parsed[2] = image["href"]
                if os.path.isfile(filename) or os.path.isfile(parsed[2]):
                    break
                else:
                    print "Css Download: %(href)s" % image
                    outpath = os.path.join(out_folder, filename)
                    if image["href"].lower().startswith("http"):
                        urlretrieve(image["href"], outpath)
                    else:
                        urlretrieve(urlparse.urlunparse(parsed), outpath)
            except:
                break
    except:
        pass


# function to download the scripts referenced by a page
def download_js(url):
    try:
        soup = ziachow(urlopen(url))
        parsed = list(urlparse.urlparse(url))
        for image in soup.findAll("script"):
            try:
                filename = image["src"].split("/")[-1]
                if os.path.isfile(filename) or os.path.isfile(parsed[2]):
                    break
                else:
                    print "Javascript/Jquery Download: %(src)s" % image
                    parsed[2] = image["src"]
                    outpath = os.path.join(out_folder, filename)
                    if image["src"].lower().startswith("http"):
                        urlretrieve(image["src"], outpath)
                    else:
                        urlretrieve(urlparse.urlunparse(parsed), outpath)
            except:
                break
    except:
        pass


# crawl the start page: fetch it with a random User-Agent, print its title,
# then follow every <a href> link and download the linked page plus its assets
def main(url):
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', random.choice(user_agent))]
    page = opener.open(url)
    htmlcode_page = page.read()
    dump_isihtml = BeautifulSoup.BeautifulSoup(htmlcode_page)
    print dump_isihtml.title.string
    Links = dump_isihtml.findAll("a", {"href": True})
    leng = len(Links)
    count = 0
    while count < leng:
        try:
            url_match = re.findall("((http\://|https\://|ftp\://)|(www.))+(([a-zA-Z0-9\.-]+\.[a-zA-Z]{2,4})|([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}))(/[a-zA-Z0-9%:/-_\?\.'~]*)?", Links[count]["href"])

            # skip empty/anchor links and files that already exist locally
            if Links[count]["href"] == "#" or Links[count]["href"] == "" or os.path.isfile(Links[count]["href"]):
                print 'page available or got filter'
                count += 1
            # absolute link: fetch it as-is
            elif url_match:
                print 'Fetch page %s' % Links[count]["href"]
                download(Links[count]["href"])
                download_css(Links[count]["href"])
                download_image(Links[count]["href"])
                download_js(Links[count]["href"])
                count += 1
            # relative link: prefix it with the start URL
            else:
                print 'Fetch page %s' % Links[count]["href"]
                download(url + '/' + Links[count]["href"])
                download_css(url + '/' + Links[count]["href"])
                download_image(url + '/' + Links[count]["href"])
                download_js(url + '/' + Links[count]["href"])
                count += 1
        except:
            count += 1


if __name__ == '__main__':
    if len(sys.argv) == 2:
        try:
            main(sys.argv[1])
        except Exception, err:
            print err
    else:
        print 'usage: %s http://server.com/ ' % os.path.basename(sys.argv[0])
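
A minimal usage sketch, assuming a Python 2 interpreter with BeautifulSoup 3 installed and a writable working directory (http://example.com/ is only a placeholder start URL):

    python romanticdevil-crawler.py http://example.com/

The crawler requests the start page with a randomly chosen User-Agent, prints the page title, then walks every <a href> it finds, saving each linked page together with its images, stylesheets, and scripts into the current directory (out_folder is empty by default).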