SHARE
TWEET

Untitled

a guest Feb 20th, 2019 65 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. from bs4 import BeautifulSoup
  2. from urllib.request import urlopen
  3. import os
  4. to_path = '/Users/pcruz/Desktop/Universidade/Estágio/WIT/Development/Dataset/Concatenated_Files/'
  5. urls_path = '../Datasets/urls.csv'
  6. import pandas as pd
  7.  
  8. def extract_js(url,index):
  9.  
  10.  
  11.     try:
  12.         print('loading this url:')
  13.         print(url)
  14.  
  15.         try:
  16.             page = urlopen(url, timeout = 2)
  17.         except:
  18.             print("Timeout")
  19.             return 0
  20.         soup = BeautifulSoup(page)
  21.         file = ''
  22.         for script in soup(["script"]):
  23.             contents  = script.contents
  24.             content_index = 0
  25.             for content in contents:
  26.                 file+=content
  27.                 content_index  = content_index +1
  28.         file_js = open(to_path +  str(index) + '.js', 'w')
  29.         file_js.write(str(file))
  30.         file_js.close()
  31.     except:
  32.         print(url)
  33.  
  34.     return 0
  35.  
  36.  
  37. def main():
  38.     index = 0
  39.     dataframe = pd.read_csv(urls_path)
  40.     urls = dataframe['url']
  41.     for url in urls:
  42.         extract_js(url,index)
  43.         index +=1
  44.     return 0
  45.  
  46. if __name__ == '__main__':
  47.     main()
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top