import os
import re
import time
import urllib.parse
from datetime import datetime
from urllib.error import HTTPError
from urllib.request import urlopen, urlretrieve

from bs4 import BeautifulSoup


def scraper(url, downloadDirectory):
    start = time.time()

    # Base URL that relative links in the table are joined against.
    baseUrl = r"{some_url}"

    # Fetch the page and pull every link out of the first table.
    html = urlopen(url)
    bsObj = BeautifulSoup(html.read(), "html.parser")
    table = bsObj.findAll("table")[0]
    links = table.findAll("a")

    count = 0
    broken_links = []
    for link in links:
        try:
            count += 1
            # Split the tag's HTML on the quote characters to isolate the href value.
            link = str(link).split('"')
            if len(link) > 1:
                print(link)
                link = link[1]
                linkBreak = link.split("_")
            else:
                if link[0] == "<a></a>":
                    print("Skipping")
                    continue
                else:
                    # No quoted href; fall back to the raw tag text.
                    link = link[0]
                    print(link)
                    linkBreak = link.split("_")

            # Derive a dataset title from the split link; skip past the "nyc" token if present.
            title = re.findall(r"[\w']+", str(linkBreak))[9].strip("'")
            if title == "nyc":
                title = re.findall(r"[\w']+", str(linkBreak))[10].strip("'")
            print("# " + str(count), "Title: " + str(title))
            dir_path = os.path.join(downloadDirectory, title)
            if os.path.isdir(dir_path) == False:
                print("Creating directory: " + str(os.path.join(downloadDirectory, title)))
                os.mkdir(dir_path)

            file_path = urllib.parse.urljoin(baseUrl, link)
            print("File Path: " + str(file_path), "\n" + "Directory Path: " + str(dir_path))
            print("Split array and length: ", linkBreak, len(linkBreak))
            if len(linkBreak) == 1:
                # Link has no underscore: name the file after the last path segment.
                if os.path.isfile(os.path.join(dir_path, str(linkBreak[0]).split("/")[7])):
                    print("Skipping")
                    continue
                else:
                    print("Result: " + str(os.path.join(dir_path, str(linkBreak[0]).split("/")[7])))
                    urlretrieve(file_path, os.path.join(dir_path, str(linkBreak[0]).split("/")[7]))
            elif len(linkBreak) == 2:
                if os.path.isfile(os.path.join(dir_path, title + "_" + linkBreak[1])):
                    print("Skipping")
                    continue
                elif str(os.path.join(dir_path, title + "_" + linkBreak[1])).endswith(".zip") == False:
                    # Also check for (and otherwise fetch) a ".zip" variant of the file name.
                    if os.path.isfile(os.path.join(dir_path, title + "_" + linkBreak[1] + ".zip")):
                        print("Skipping")
                        continue
                    else:
                        print("Result: " + str(os.path.join(dir_path, title + "_" + linkBreak[1] + ".zip")))
                        urlretrieve(file_path, os.path.join(dir_path, title + "_" + linkBreak[1] + ".zip"))
                else:
                    print("Result: " + str(os.path.join(dir_path, title + "_" + linkBreak[1])))
                    urlretrieve(file_path, os.path.join(dir_path, title + "_" + linkBreak[1]))
            elif len(linkBreak) == 3:
                if "?" in linkBreak[2]:
                    # Drop the query string before using the segment as a file name.
                    linkBreak[2] = linkBreak[2].split("?", 1)[0]
                    if os.path.isfile(os.path.join(dir_path, title + "_" + linkBreak[2])):
                        print("Skipping")
                        continue
                    else:
                        print("Result: " + str(os.path.join(dir_path, title + "_" + linkBreak[2])))
                        urlretrieve(file_path, os.path.join(dir_path, title + "_" + linkBreak[2]))
                if title == "sidewalkcafe":
                    # Special case: fold the middle segment into the file name.
                    linkBreak[2] = str(linkBreak[1]) + str(linkBreak[2])
                    if os.path.isfile(os.path.join(dir_path, title + linkBreak[2])):
                        print("Skipping")
                        continue
                    else:
                        print("Result: " + str(os.path.join(dir_path, title + linkBreak[2])))
                        urlretrieve(file_path, os.path.join(dir_path, title + linkBreak[2]))
                else:
                    if os.path.isfile(os.path.join(dir_path, title + "_" + linkBreak[2])):
                        print("Skipping")
                        continue
                    else:
                        print("Result: " + str(os.path.join(dir_path, title + "_" + linkBreak[2])))
                        urlretrieve(file_path, os.path.join(dir_path, title + "_" + linkBreak[2]))
            elif len(linkBreak) == 4:
                if "?" in linkBreak[3]:
                    # Strip the query string, then fold the last two segments into one name.
                    linkBreak[3] = linkBreak[3].split("?", 1)[0]
                    linkBreak[2] = str(linkBreak[2]) + "_" + str(linkBreak[3])
                    if os.path.isfile(os.path.join(dir_path, title + "_" + linkBreak[2])):
                        print("Skipping")
                        continue
                    else:
                        print("Result: " + str(os.path.join(dir_path, title + "_" + linkBreak[2])))
                        urlretrieve(file_path, os.path.join(dir_path, title + "_" + linkBreak[2]))
                else:
                    if os.path.isfile(os.path.join(dir_path, title + "_" + linkBreak[2])):
                        print("Skipping")
                        continue
                    else:
                        print("Result: " + str(os.path.join(dir_path, title + "_" + linkBreak[2])))
                        urlretrieve(file_path, os.path.join(dir_path, title + "_" + linkBreak[2]))
        except HTTPError as e:
            # Record 404s and keep going; re-raise anything else.
            if e.code == 404:
                print(e)
                print(count, "__________")
                broken_links.append([count, title, link])
                continue
            else:
                raise

    end = time.time()
    # Write the list of broken links (plus a timestamp) next to the downloads.
    fp = os.path.join(downloadDirectory, "BrokenLinks.txt")
    file = open(fp, "w+")
    for link in broken_links:
        file.write(str(link) + "\n")
    file.write(str(datetime.now()))
    file.close()

    return "Script completed in: " + str(end - start) + " seconds."


archURL = r"{some_url}"
archDownloadDirectory = r"{some_localpath}"

scraper(archURL, archDownloadDirectory)
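
Every branch above repeats the same check-then-download step. As a rough sketch only (nothing below exists in the original paste; the fetch_if_missing name and its return value are assumptions), that pattern could be pulled into one helper built on the same os.path.isfile and urlretrieve calls:

import os
from urllib.request import urlretrieve

def fetch_if_missing(file_path, dest_path):
    # Skip files that were already downloaded on a previous run.
    if os.path.isfile(dest_path):
        print("Skipping")
        return False
    print("Result: " + str(dest_path))
    urlretrieve(file_path, dest_path)
    return True

Each len(linkBreak) branch would then reduce to building dest_path and calling fetch_if_missing(file_path, dest_path), moving on to the next link whenever it returns False.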