Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
## from https://stackoverflow.com/q/74759054/6146136
## output at https://docs.google.com/spreadsheets/d/1TcoU-nhKJp0XcQqAF4SrOQAq32x2mNmTYd9JsTz73PI/edit?usp=sharing
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup
# User-Agent strings to test, one per entry. The list mixes desktop and
# mobile browser identifiers (Safari, Firefox, Chrome, Opera, UC Browser).
user_agent_list = [
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36',
    'Opera/9.80 (Windows NT 6.1; WOW64) Presto/2.12.388 Version/12.18',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36 OPR/84.0.4316.14',
    'Opera/9.80 (Linux armv7l) Presto/2.12.407 Version/12.51 , D50u-D1-UHD/V1.5.16-UHD (Vizio, D50u-D1, Wireless)',
    'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36 OPR/42.0.2393.94',
    'Mozilla/5.0 (Linux; U; Android 8.1.0; zh-CN; EML-AL00 Build/HUAWEIEML-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.108 baidu.sogo.uc.UCBrowser/11.9.4.974 UWS/2.13.1.48 Mobile Safari/537.36 AliApp(DingTalk/4.5.11) com.alibaba.android.rimet/10487439 Channel/227200 language/zh-CN',
    'Mozilla/5.0 (X11; U; Linux i686; en-US) U2/1.0.0 UCBrowser/9.3.1.344',
    'Mozilla/5.0 (Linux; U; Android 10; en-US; RMX1901 Build/QKQ1.190918.001) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/78.0.3904.108 UCBrowser/13.4.0.1306 Mobile Safari/537.36',
    'UCWEB/2.0 (Java; U; MIDP-2.0; Nokia203/20.37) U2/1.0.0 UCBrowser/8.7.0.218 U2/1.0.0 Mobile',
]
# Google Scholar "list_works" page requested once per User-Agent
# (cstart=0, pagesize=100).
url = 'https://scholar.google.com/citations?user=EnegzCwAAAAJ&hl=&view_op=list_works&cstart=0&pagesize=100'

# Output CSV; one row per User-Agent tried.
log_path = 'user_agents_testLog.csv'

testLog = []
for user_agent in user_agent_list:
    headers = {'User-Agent': user_agent}
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as err:
        # Log the failure and carry on so rows already gathered are not lost.
        testLog.append({
            'User-Agent': user_agent,
            'req_status': f'{type(err).__name__}: {err}',
            'titles_count': 0,
        })
        continue

    soup = BeautifulSoup(response.text, 'lxml')
    table = soup.find('table', {'id': 'gsc_a_t'})
    # No table with that id in the response means no titles for this agent.
    titles = [item.text for item in table.find_all(class_='gsc_a_at')] if table else []

    for_testLog = {
        'User-Agent': user_agent,
        'req_elapsed': response.elapsed,
        'req_status': f'{response.status_code} {response.reason}',
        'titles_count': len(titles),
    }
    # One column per title: title_1, title_2, ...
    for i, t in enumerate(titles):
        for_testLog[f'title_{i+1}'] = t

    if not titles and response.status_code == 200:
        # A 200 response with no titles: store the parsed HTML with blank
        # lines dropped and whitespace inside each line collapsed.
        # str.split() never yields empty strings, so no extra filter is needed.
        for_testLog['miniHtml'] = ''.join(
            ' '.join(line.split())
            for line in soup.prettify().splitlines()
            if line.strip()
        )
    testLog.append(for_testLog)

pd.DataFrame(testLog).to_csv(log_path, index=False)
print('log saved to', os.path.abspath(log_path))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement