Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Desktop user-agent string sent with the Google search requests below
# (referenced by the request headers and by config.browser_user_agent).
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"

# Mobile user-agent — defined but not referenced in the visible code; kept
# so any out-of-view caller that imports it keeps working.
MOBILE_USER_AGENT = "Mozilla/5.0 (Linux; Android 7.0; SM-G930V Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 Mobile Safari/537.36"

### browser
# newspaper Article download settings: masquerade as a desktop browser and
# abandon any single request after 15 seconds.
config = Config()
config.browser_user_agent = USER_AGENT
config.request_timeout = 15
- for x in range(len(keyword)):
- print(x)
- stop = 0
- run = 0
- while stop == 0:
- if run == 1:
- break
- query = keyword[x]
- query2 = query.replace(' ', '+')
- if run == 0:
- URL = f"https://google.com/search?q={query2}"
- headers = {'Accept-Language': 'en-US', 'Upgrade-Insecure-Requests': '1', "user-agent": USER_AGENT, "referer": "http://www.google.com/", 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Cache-Control': 'max-age=0', 'Connection': 'keep-alive'}
- while True:
- resp = requests.get(URL, headers=headers)
- if resp.status_code == 200:
- break
- else:
- print('google captcha')
- time.sleep(60)
- doc = html.fromstring(resp.text)
- print(resp.status_code)
- links = doc.xpath("//div[@class='yuRUbf']/a/@href")
- for link in links:
- site = link
- result = self.dblinks.get(Query()['link'] == site)
- if result is None:
- stop = 1
- break
- if stop == 1:
- break
- run = run+1
- if run == 1:
- time.sleep(5)
- query = keyword[x]
- query2 = query.replace(' ', '+')
- if run == 0:
- URL = f"https://google.com/search?q={query2}&num=100"
- headers = {'Accept-Language': 'en-US', 'Upgrade-Insecure-Requests': '1', "user-agent": USER_AGENT, "referer": "http://www.google.com/", 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Cache-Control': 'max-age=0', 'Connection': 'keep-alive'}
- while True:
- resp = requests.get(URL, headers=headers)
- if resp.status_code == 200:
- break
- else:
- print('google captcha')
- time.sleep(60)
- doc = html.fromstring(resp.text)
- print(resp.status_code)
- links = doc.xpath("//div[@class='yuRUbf']/a/@href")
- h2listtotal = []
- introduction = ""
- introductioncount = 0
- y = 0
- linklist = []
- donotcontinue = 0
- noresetlist = []
- print(len(links))
- for link in links:
- print(link)
- y=y+1
- linklist.append(link)
- h2list = []
- if len(h2listtotal) >= 4:
- print('brekaing')
- break
- stop = 0
- done = 0
- error = 0
- site = link
- article = Article(link, config=config)
- article.download()
- soup = BeautifulSoup(article.html, 'html.parser')
- i = 0
- firsth2 = ""
- for header in soup.find_all('h2'):
- i = i+1
- if i == 1:
- if len(soup.find_all('h2')) > 0:
- firsth2 = header.text
- if len(soup.find_all('h2')) > 0:
- nextNode = header
- if len(nextNode.text) > 7 and len(nextNode.text) < 60 and len(h2list) < 2 and nextNode.text not in noresetlist and "Subscribe" not in nextNode.text and "Conclusion" not in nextNode.text:
- h2list.append(nextNode.text)
- noresetlist.append(nextNode.text)
- try:
- article.parse()
- except newspaper.article.ArticleException as e:
- print(e)
- donotcontinue=1
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement