Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Standard-library imports first, third-party second (PEP 8 grouping);
# the original had `import pandas` stranded below the constants.
import os
from urllib.request import urlopen

import pandas as pd
from bs4 import BeautifulSoup

# Destination directory for the extracted JavaScript: one <row-index>.js per URL.
to_path = '/Users/pcruz/Desktop/Universidade/Estágio/WIT/Development/Dataset/Concatenated_Files/'
# Input CSV; must contain a 'url' column listing the pages to scrape.
urls_path = '../Datasets/urls.csv'
def extract_js(url, index):
    """Download *url*, concatenate the text of all its <script> tags, and
    write the result to ``to_path/<index>.js``.

    Best-effort: on any failure (timeout, fetch error, parse error) the
    offending URL is printed and 0 is returned; on success the return
    value is None, matching the original contract.
    """
    try:
        print('loading this url:')
        print(url)
        try:
            # Short timeout: dataset URLs are often dead — don't stall the crawl.
            page = urlopen(url, timeout=2)
        except Exception:
            # NOTE(review): any fetch failure is reported as "Timeout",
            # preserving the original message.
            print("Timeout")
            return 0
        try:
            # Explicit parser keeps results deterministic across environments
            # (BeautifulSoup warns when none is given).
            soup = BeautifulSoup(page, 'html.parser')
        finally:
            page.close()  # urlopen responses are not closed automatically
        # Flatten the contents of every <script> element into one string;
        # join() avoids the quadratic += concatenation of the original.
        script_text = ''.join(
            str(content)
            for script in soup(["script"])
            for content in script.contents
        )
        # One output file per input URL, named after its row index.
        with open(to_path + str(index) + '.js', 'w') as file_js:
            file_js.write(script_text)
    except Exception:
        # Best-effort crawl: log the URL that failed and move on.
        print(url)
        return 0
def main():
    """Read the URL list from *urls_path* and extract each page's scripts.

    Each URL's scripts are written to a file named after its row position
    in the CSV. Always returns 0.
    """
    dataframe = pd.read_csv(urls_path)
    # enumerate() replaces the hand-maintained index counter of the original.
    for index, url in enumerate(dataframe['url']):
        extract_js(url, index)
    return 0
# Script entry point: run the crawl only when executed directly,
# not when this module is imported.
if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement