jerryturcios08 | Untitled | Oct 22nd, 2019 | Python
import requests
from fake_useragent import UserAgent
from fake_useragent import FakeUserAgentError
from bs4 import BeautifulSoup
import urllib.parse
import re
import math
import PyPDF2
import json
import logging
from threading import Thread


# Data structure to hold google results
class googleUrls(object):
    def __init__(self, title=None, url=None, description=None):
        self.title = title
        self.url = url
        self.description = description


# Data structure
class Website(object):
    def __init__(self,
                 title=None,
                 url=None,
                 description=None,
                 originalRank=None,
                 reRankScore=None):
        self.title = title
        self.url = url
        self.description = description
        self.originalRank = originalRank
        self.reRankScore = reRankScore

def getUrlsFromGoogle(term, noOfResults):
    # Fall back to a generic user agent if fake_useragent cannot load its data
    try:
        ua = UserAgent()
        user_agent = ua.random
    except FakeUserAgentError:
        user_agent = "Mozilla/5.0"

    query = urllib.parse.quote_plus(term)  # Format into URL encoding

    google_url = "https://www.google.com/search?q=" + query + "&num=" + str(
        noOfResults)

    # The user agent belongs in the request headers, not the query parameters
    response = requests.get(google_url, headers={"User-Agent": user_agent})

    soup = BeautifulSoup(response.text, "html.parser")

    result_div = soup.find_all('div', attrs={'class': 'ZINbbc'})

    links = []
    titles = []
    descriptions = []

    # ontologyNameList = getTheListFromTree(ontologyTree, "title")

    for r in result_div:
        # Check that each element is present, otherwise skip this result
        try:
            link = r.find('a', href=True)
            title = r.find('div', attrs={'class': 'vvjwJb'}).get_text()
            description = r.find('div', attrs={'class': 's3v9rd'}).get_text()

            # Check to make sure everything is present before appending
            if link and title and description:
                links.append(link['href'])
                titles.append(title)
                descriptions.append(description)

        # Next loop if one element is not present
        except (AttributeError, TypeError):
            continue

    print(len(titles), len(descriptions), len(links))

    if len(titles) != len(descriptions) or len(descriptions) != len(links):
        print("ERROR: titles, descriptions and links are out of sync")
        exit()

    to_remove = []
    clean_links = []

    for i, l in enumerate(links):
        clean = re.search(r'/url\?q=(.*)&sa', l)

        # Anything that doesn't fit the above pattern will be removed
        if clean is None:
            to_remove.append(i)
            continue

        clean_links.append(clean.group(1))

    # Remove the corresponding titles & descriptions,
    # deleting from the end so the remaining indices stay valid
    for x in reversed(to_remove):
        del titles[x]
        del descriptions[x]

    result_list = []

    for count, m in enumerate(clean_links):
        result_list.append(googleUrls(titles[count], m, descriptions[count]))

    return result_list

# Search and re-rank script
def crawl_search(term, jsonTree):
    logging.basicConfig(format='%(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p')
    logging.warning('crawl_search started')

    termsList = getTheListFromTree(jsonTree, 'title')

    # Map each ontology term to its (1-based) position in the tree
    listDict = {}
    pos = 1
    for m in termsList:
        listDict[m.lower()] = pos
        pos = pos + 1
        print(m)

    isTermPresent = False
    if term.lower() in listDict:
        isTermPresent = True
        getIndexOfTerm = listDict[term.lower()]

    if not isTermPresent:
        googleResultsForTerm = getUrlsFromGoogle(term, 30)

        websiteList = []
        threads = []

        # Crawl every result in its own thread
        for ii in range(len(googleResultsForTerm)):
            process = Thread(target=crawl, args=[
                googleResultsForTerm[ii], googleResultsForTerm, listDict, ii, websiteList])
            process.start()
            threads.append(process)

        for process in threads:
            process.join()

        # Sorting based on re-rank score
        websiteList.sort(key=lambda x: x.reRankScore, reverse=True)

        json_string = json.dumps([ob.__dict__ for ob in websiteList])

        logging.warning('crawl_search finished')

        return json_string
    else:
        websiteList = []

        # Search for the term together with the root ontology term
        googleResultsForTerm = getUrlsFromGoogle(term + " " + termsList[0], 30)

        threads = []

        for ii in range(len(googleResultsForTerm)):
            process = Thread(target=crawl, args=[
                googleResultsForTerm[ii], googleResultsForTerm, listDict, ii, websiteList])
            process.start()
            threads.append(process)

        for process in threads:
            process.join()

        # Sorting based on re-rank score
        websiteList.sort(key=lambda x: x.reRankScore, reverse=True)

        termListSize = len(termsList)

        # getIndexOfTerm is 1-based, so termsList[getIndexOfTerm] is the term
        # that follows the matched one in the ontology
        term1 = ""
        term2 = ""
        term3 = ""

        if termListSize - getIndexOfTerm >= 3:
            term1 = termsList[getIndexOfTerm]
            term2 = termsList[getIndexOfTerm + 1]
            term3 = termsList[getIndexOfTerm + 2]
        elif termListSize - getIndexOfTerm >= 2:
            term1 = termsList[getIndexOfTerm]
            term2 = termsList[getIndexOfTerm + 1]
        elif termListSize - getIndexOfTerm >= 1:
            term1 = termsList[getIndexOfTerm]
        elif termListSize > 2:
            term1 = termsList[getIndexOfTerm - 2]

        print(term1 + " " + term2 + " " + term3)

        # Add extra Google results for the neighbouring ontology terms found above
        if term1 and term2 and term3:
            term1_google_results = getUrlsFromGoogle(term1, 7)
            term2_google_results = getUrlsFromGoogle(term2, 7)
            term3_google_results = getUrlsFromGoogle(term3, 7)

            json_string = json.dumps(
                [ob.__dict__ for ob in websiteList] +
                [ob.__dict__ for ob in term1_google_results] +
                [ob.__dict__ for ob in term2_google_results] +
                [ob.__dict__ for ob in term3_google_results]
            )

        elif term1 and term2 and not term3:
            term1_google_results = getUrlsFromGoogle(term1, 10)
            term2_google_results = getUrlsFromGoogle(term2, 10)

            json_string = json.dumps(
                [ob.__dict__ for ob in websiteList] +
                [ob.__dict__ for ob in term1_google_results] +
                [ob.__dict__ for ob in term2_google_results]
            )

        elif term1 and not term2 and not term3:
            term1_google_results = getUrlsFromGoogle(term1, 20)

            json_string = json.dumps(
                [ob.__dict__ for ob in websiteList] +
                [ob.__dict__ for ob in term1_google_results]
            )

        else:
            json_string = json.dumps(
                [ob.__dict__ for ob in websiteList]
            )

        logging.warning('crawl_search finished')

        return json_string

# Reading the pdf-urls script
def pdfReader(pdfUrl):
    # Download the PDF to a local file
    r = requests.get(pdfUrl, stream=True)

    with open("python.pdf", "wb") as pdf:
        # Write one chunk at a time to the pdf file
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                pdf.write(chunk)

    # Extract the text of every page
    ss = ""
    with open('python.pdf', 'rb') as pdfFileObject:
        reader = PyPDF2.PdfFileReader(pdfFileObject)
        count = reader.numPages

        for i in range(count):
            page = reader.getPage(i)
            ss += page.extractText()

    return ss

# Helper function for getTheListFromTree
def item_generator(json_input, lookup_key):
    if isinstance(json_input, dict):
        for k, v in json_input.items():
            if k == lookup_key:
                yield v
            else:
                yield from item_generator(v, lookup_key)
    elif isinstance(json_input, list):
        for item in json_input:
            yield from item_generator(item, lookup_key)


# Generating all terms from the ontology
def getTheListFromTree(jsonTree, lookup_key):
    data = json.loads(jsonTree)
    ontologyList = []

    for value in item_generator(data, lookup_key):
        ontologyList.append(value.lower())

    return ontologyList

# Re-rank script
def re_rank(url, ontologyTerms, totalResults, tempRank, doc):
    # Work on a copy so the shared term dictionary is not mutated across threads
    ontologyTerms = dict(ontologyTerms)

    if tempRank == 1:
        baseScore = 1
    else:
        baseScore = (totalResults + 2 * math.log10(tempRank + 1)) / (
            tempRank + totalResults)

    omegaScore = 0
    alpha = 0.85  # adjusting parameter

    # Score each ontology term found in the document text,
    # weighting terms that appear earlier in the ontology more heavily
    for eachWord in doc.split():
        word = eachWord.lower()
        if word in ontologyTerms:
            posOfTerm = ontologyTerms[word]
            omegaScore += 1 / (posOfTerm * posOfTerm)

            # Count each term only once per document
            del ontologyTerms[word]

    omegaScore = omegaScore / 2

    reRankScore = (alpha * baseScore) + ((1 - alpha) * omegaScore)

    return reRankScore

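# Worked example of the score above (illustrative values, not from the paste):
# with totalResults = 30 and tempRank = 5,
#   baseScore = (30 + 2 * log10(6)) / (5 + 30) ≈ 0.902
# if the page text contains the ontology terms at positions 1 and 2,
#   omegaScore = (1/1**2 + 1/2**2) / 2 = 0.625
# so reRankScore = 0.85 * 0.902 + 0.15 * 0.625 ≈ 0.86
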
# Crawl through a single url
def crawl(url, googleResultsForTerm, termDict, index, websiteList):
    score = -1
    try:
        r = requests.get(url.url, timeout=1)

        content_type = r.headers.get('content-type')

        if content_type is None or 'text/html' in content_type:
            soupToGetText = BeautifulSoup(r.text, 'html.parser')

            score = re_rank(url.url,
                            termDict,
                            len(googleResultsForTerm), index,
                            soupToGetText.get_text().lower())

        elif 'application/pdf' in content_type:
            # PDF scoring is disabled for now
            pass
            '''
            textFromPdfUrl = pdfReader(url.url)

            score = re_rank(url.url,
                            termDict,
                            len(googleResultsForTerm), index,
                            textFromPdfUrl.lower()) '''
        else:
            print("Unrecognized format")

    except requests.ReadTimeout:
        pass

    except requests.exceptions.RequestException as e:
        print(e)

    # Record the result even when the page could not be scored (score stays -1)
    websiteList.append(
        Website(googleResultsForTerm[index].title,
                googleResultsForTerm[index].url,
                googleResultsForTerm[index].description, index,
                score))
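
# Minimal usage sketch (not part of the original paste): the ontology JSON below
# is a made-up example; any tree whose nodes carry "title" keys should work.
# Note that crawl_search performs live Google and web-page requests.
if __name__ == "__main__":
    sample_tree = json.dumps({
        "title": "machine learning",
        "children": [
            {"title": "neural networks"},
            {"title": "support vector machines"},
        ],
    })

    # Returns a JSON string of re-ranked results for the query term
    print(crawl_search("neural networks", sample_tree))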