Advertisement
Try95th

Testing User Agents [requests + bs4]

Dec 11th, 2022 (edited)
75
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.29 KB | None | 0 0
  1. ## from https://stackoverflow.com/q/74759054/6146136
  2. ## output at https://docs.google.com/spreadsheets/d/1TcoU-nhKJp0XcQqAF4SrOQAq32x2mNmTYd9JsTz73PI/edit?usp=sharing
  3.  
  4. import pandas as pd
  5. from bs4 import BeautifulSoup
  6. import requests
  7. import os
  8.  
  9. user_agent_list = [
  10.   'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
  11.   'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
  12.   'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
  13.   'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0',
  14.   'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
  15.   'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36',
  16.   'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36',
  17.   'Opera/9.80 (Windows NT 6.1; WOW64) Presto/2.12.388 Version/12.18',
  18.   'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36 OPR/84.0.4316.14',
  19.   'Opera/9.80 (Linux armv7l) Presto/2.12.407 Version/12.51 , D50u-D1-UHD/V1.5.16-UHD (Vizio, D50u-D1, Wireless)',
  20.   'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
  21.   'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36 OPR/42.0.2393.94',
  22.   'Mozilla/5.0 (Linux; U; Android 8.1.0; zh-CN; EML-AL00 Build/HUAWEIEML-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.108 baidu.sogo.uc.UCBrowser/11.9.4.974 UWS/2.13.1.48 Mobile Safari/537.36 AliApp(DingTalk/4.5.11) com.alibaba.android.rimet/10487439 Channel/227200 language/zh-CN',
  23.   'Mozilla/5.0 (X11; U; Linux i686; en-US) U2/1.0.0 UCBrowser/9.3.1.344',
  24.   'Mozilla/5.0 (Linux; U; Android 10; en-US; RMX1901 Build/QKQ1.190918.001) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/78.0.3904.108 UCBrowser/13.4.0.1306 Mobile Safari/537.36',
  25.   'UCWEB/2.0 (Java; U; MIDP-2.0; Nokia203/20.37) U2/1.0.0 UCBrowser/8.7.0.218 U2/1.0.0 Mobile'
  26. ]
  27.  
  28. url  = 'https://scholar.google.com/citations?user=EnegzCwAAAAJ&hl=&view_op=list_works&cstart=0&pagesize=100'
  29. testLog = []
  30. for user_agent in user_agent_list:  
  31.     headers = {'User-Agent': user_agent}
  32.     response = requests.get(url,headers=headers)
  33.     soup = BeautifulSoup(response.text, 'lxml')
  34.  
  35.     table = soup.find('table',{'id':'gsc_a_t'})
  36.     titles = [item.text for item in table.find_all(class_='gsc_a_at')] if table else []
  37.  
  38.     for_testLog = {
  39.         'User-Agent': user_agent, 'req_elapsed': response.elapsed,
  40.         'req_status': f'{response.status_code} {response.reason}', 'titles_count': len(titles)
  41.     }
  42.     for i, t in enumerate(titles): for_testLog[f'title_{i+1}'] = t
  43.     if not titles and response.status_code == 200:
  44.         for_testLog['miniHtml'] = ''.join([' '.join([
  45.             w for w in l.split() if w # minimize whitespace
  46.         ]) for l in soup.prettify().splitlines() if l.strip()])
  47.     testLog.append(for_testLog)
  48. pd.DataFrame(testLog).to_csv('user_agents_testLog.csv', index=False)
  49.  
  50. print('log saved to', os.path.abspath('user_agents_testLog.csv'))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement