Advertisement
Guest User

Untitled

a guest
Mar 4th, 2020
1,746
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.28 KB | None | 0 0
  1. import time
  2. import urllib.parse
  3.  
  4. import requests
  5. from bs4 import BeautifulSoup
  6. import re
  7.  
  8. from global_config import ConfigFile
  9. from program_logger import logger
  10.  
  11.  
  12. class ScholarScrape():
  13.     def __init__(self):
  14.         self.page = None
  15.         self.last_url = None
  16.         self.last_time = time.time()
  17.         self.min_time_between_scrape = int(ConfigFile.instance().config.get('scholar','bot_avoidance_time'))
  18.         self.header = {'User-Agent':ConfigFile.instance().config.get('scholar','user_agent')}
  19.         self.session = requests.Session()
  20.         pass
  21.  
  22.     def search(self, query=None, year_lo=None, year_hi=None, title_only=False, publication_string=None, author_string=None, include_citations=True, include_patents=True):
  23.         url = self.get_url(query, year_lo, year_hi, title_only, publication_string, author_string, include_citations, include_patents)
  24.         while True:
  25.             wait_time = self.min_time_between_scrape - (time.time() - self.last_time)
  26.             if wait_time > 0:
  27.                 logger.info("Delaying search by {} seconds to avoid bot detection.".format(wait_time))
  28.                 time.sleep(wait_time)
  29.             self.last_time = time.time()
  30.             logger.info("SCHOLARSCRAPE: " + url)
  31.             self.page = BeautifulSoup(self.session.get(url, headers=self.header).text, 'html.parser')
  32.             self.last_url = url
  33.  
  34.             if "Our systems have detected unusual traffic from your computer network" in str(self.page):
  35.                 raise BotDetectionException("Google has blocked this computer for a short time because it has detected this scraping script.")
  36.  
  37.             return
  38.  
  39.     def get_url(self, query=None, year_lo=None, year_hi=None, title_only=False, publication_string=None, author_string=None, include_citations=True, include_patents=True):
  40.         base_url = "https://scholar.google.com.au/scholar?"
  41.         url = base_url + "as_q=" + urllib.parse.quote(query)
  42.  
  43.         if year_lo is not None and bool(re.match(r'.*([1-3][0-9]{3})', str(year_lo))):
  44.             url += "&as_ylo=" + str(year_lo)
  45.  
  46.         if year_hi is not None and bool(re.match(r'.*([1-3][0-9]{3})', str(year_hi))):
  47.             url += "&as_yhi=" + str(year_hi)
  48.  
  49.         if title_only:
  50.             url += "&as_yhi=title"
  51.         else:
  52.             url += "&as_yhi=any"
  53.  
  54.         if publication_string is not None:
  55.             url += "&as_publication=" + urllib.parse.quote('"' + str(publication_string) + '"')
  56.  
  57.         if author_string is not None:
  58.             url += "&as_sauthors=" + urllib.parse.quote('"' + str(author_string) + '"')
  59.  
  60.         if include_citations:
  61.             url += "&as_vis=0"
  62.         else:
  63.             url += "&as_vis=1"
  64.  
  65.         if include_patents:
  66.             url += "&as_sdt=0"
  67.         else:
  68.             url += "&as_sdt=1"
  69.  
  70.         return url
  71.  
  72.     def get_results_count(self):
  73.         e = self.page.findAll("div", {"class": "gs_ab_mdw"})
  74.         try:
  75.             item = e[1].text.strip()
  76.         except IndexError as ex:
  77.             if "Our systems have detected unusual traffic from your computer network" in str(self.page):
  78.                 raise BotDetectionException("Google has blocked this computer for a short time because it has detected this scraping script.")
  79.             else:
  80.                 raise ex
  81.  
  82.         if self.has_numbers(item):
  83.             return self.get_results_count_from_soup_string(item)
  84.         for item in e:
  85.             item = item.text.strip()
  86.             if self.has_numbers(item):
  87.                 return self.get_results_count_from_soup_string(item)
  88.         return 0
  89.  
  90.     @staticmethod
  91.     def get_results_count_from_soup_string(element):
  92.         if "About" in element:
  93.             num = element.split(" ")[1].strip().replace(",","")
  94.         else:
  95.             num = element.split(" ")[0].strip().replace(",","")
  96.         return num
  97.  
  98.     @staticmethod
  99.     def has_numbers(input_string):
  100.         return any(char.isdigit() for char in input_string)
  101.  
  102.  
class BotDetectionException(Exception):
    """Raised when Google Scholar serves its bot-detection / block page
    instead of search results."""
    pass
  105.  
  106. if __name__ == "__main__":
  107.     s = ScholarScrape()
  108.     s.search(**{
  109.         "query":"\"policy shaping\"",
  110.         # "publication_string":"JMLR",
  111.         "author_string": "gilboa",
  112.         "year_lo": "1995",
  113.         "year_hi": "2005",
  114.  
  115.     })
  116.     x = s.get_results_count()
  117.     print(x)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement