Untitled - a guest - Feb 20th, 2019
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd

# Destination folder for the extracted .js files and path to the input CSV of URLs.
to_path = '/Users/pcruz/Desktop/Universidade/Estágio/WIT/Development/Dataset/Concatenated_Files/'
urls_path = '../Datasets/urls.csv'


def extract_js(url, index):
    """Download a page, concatenate the contents of its <script> tags and save them as <index>.js."""
    try:
        print('loading this url:')
        print(url)

        try:
            page = urlopen(url, timeout=2)
        except Exception:
            # Any request failure (timeout, HTTP error, unreachable host) is reported as a timeout.
            print("Timeout")
            return 0

        soup = BeautifulSoup(page, 'html.parser')
        file = ''
        # Collect the inline contents of every <script> tag on the page.
        for script in soup(["script"]):
            for content in script.contents:
                file += str(content)

        # Write the concatenated scripts to a numbered .js file.
        with open(to_path + str(index) + '.js', 'w') as file_js:
            file_js.write(file)
    except Exception:
        # Any other failure (parsing, encoding, I/O): report the URL and move on.
        print(url)

    return 0


def main():
    # Read the list of URLs and extract the scripts from each one.
    dataframe = pd.read_csv(urls_path)
    urls = dataframe['url']
    for index, url in enumerate(urls):
        extract_js(url, index)
    return 0


if __name__ == '__main__':
    main()
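
For reference, the script reads urls.csv and uses only its url column (any other columns are ignored). A minimal input file, with placeholder addresses rather than the original dataset, might look like this:

url
https://example.com/
https://example.org/page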