Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Desktop user-agent string sent with the Google search requests below
# (referenced by the request headers and by config.browser_user_agent).
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"

# Mobile user-agent — defined but not referenced in the visible code; kept
# so any out-of-view caller that imports it keeps working.
MOBILE_USER_AGENT = "Mozilla/5.0 (Linux; Android 7.0; SM-G930V Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 Mobile Safari/537.36"

### browser
# newspaper Article download settings: masquerade as a desktop browser and
# abandon any single request after 15 seconds.
config = Config()
config.browser_user_agent = USER_AGENT
config.request_timeout = 15
- for x in range(len(keyword)):
- print(x)
- stop = 0
- run = 0
- while stop == 0:
- if run == 1:
- break
- query = keyword[x]
- query2 = query.replace(' ', '+')
- if run == 0:
- URL = f"https://google.com/search?q={query2}"
- headers = {'Accept-Language': 'en-US', 'Upgrade-Insecure-Requests': '1', "user-agent": USER_AGENT, "referer": "http://www.google.com/", 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Cache-Control': 'max-age=0', 'Connection': 'keep-alive'}
- while True:
- resp = requests.get(URL, headers=headers)
- if resp.status_code == 200:
- break
- else:
- print('google captcha')
- time.sleep(60)
- doc = html.fromstring(resp.text)
- print(resp.status_code)
- links = doc.xpath("//div[@class='yuRUbf']/a/@href")
- for link in links:
- site = link
- result = self.dblinks.get(Query()['link'] == site)
- if result is None:
- stop = 1
- break
- if stop == 1:
- break
- run = run+1
- if run == 1:
- time.sleep(5)
- query = keyword[x]
- query2 = query.replace(' ', '+')
- if run == 0:
- URL = f"https://google.com/search?q={query2}&num=100"
- headers = {'Accept-Language': 'en-US', 'Upgrade-Insecure-Requests': '1', "user-agent": USER_AGENT, "referer": "http://www.google.com/", 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Cache-Control': 'max-age=0', 'Connection': 'keep-alive'}
- while True:
- resp = requests.get(URL, headers=headers)
- if resp.status_code == 200:
- break
- else:
- print('google captcha')
- time.sleep(60)
- doc = html.fromstring(resp.text)
- print(resp.status_code)
- links = doc.xpath("//div[@class='yuRUbf']/a/@href")
- h2listtotal = []
- introduction = ""
- introductioncount = 0
- y = 0
- linklist = []
- donotcontinue = 0
- noresetlist = []
- print(len(links))
- for link in links:
- print(link)
- y=y+1
- linklist.append(link)
- h2list = []
- if len(h2listtotal) >= 4:
- print('brekaing')
- break
- stop = 0
- done = 0
- error = 0
- site = link
- article = Article(link, config=config)
- article.download()
- soup = BeautifulSoup(article.html, 'html.parser')
- i = 0
- firsth2 = ""
- for header in soup.find_all('h2'):
- i = i+1
- if i == 1:
- if len(soup.find_all('h2')) > 0:
- firsth2 = header.text
- if len(soup.find_all('h2')) > 0:
- nextNode = header
- if len(nextNode.text) > 7 and len(nextNode.text) < 60 and len(h2list) < 2 and nextNode.text not in noresetlist and "Subscribe" not in nextNode.text and "Conclusion" not in nextNode.text:
- h2list.append(nextNode.text)
- noresetlist.append(nextNode.text)
- try:
- article.parse()
- except newspaper.article.ArticleException as e:
- print(e)
- donotcontinue=1
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement