import requests
from fake_useragent import UserAgent
from fake_useragent import FakeUserAgentError
from bs4 import BeautifulSoup
import urllib.parse  # import the submodule explicitly; `import urllib` alone doesn't expose urllib.parse
import re
import math
import PyPDF2
import json
import logging
from threading import Thread
# Data structure to hold Google results
class googleUrls(object):
    def __init__(self, title=None, url=None, description=None):
        self.title = title
        self.url = url
        self.description = description
# Data structure for a crawled, re-ranked result
class Website(object):
    def __init__(self,
                 title=None,
                 url=None,
                 description=None,
                 originalRank=None,
                 reRankScore=None):
        self.title = title
        self.url = url
        self.description = description
        self.originalRank = originalRank
        self.reRankScore = reRankScore
def getUrlsFromGoogle(term, noOfResults):
    # Fall back to a static User-Agent if fake_useragent cannot fetch its
    # data; the original `pass` left `ua` unbound and raised a NameError below
    try:
        user_agent = UserAgent().random
    except FakeUserAgentError:
        user_agent = "Mozilla/5.0"
    query = urllib.parse.quote_plus(term)  # Format into URL encoding
    google_url = ("https://www.google.com/search?q=" + query +
                  "&num=" + str(noOfResults))
    # The second positional argument of requests.get is `params`;
    # the User-Agent must be sent as a header
    response = requests.get(google_url, headers={"User-Agent": user_agent})
    soup = BeautifulSoup(response.text, "html.parser")
    result_div = soup.find_all('div', attrs={'class': 'ZINbbc'})
    links = []
    titles = []
    descriptions = []
    for r in result_div:
        # Skip to the next result if any element is missing
        try:
            link = r.find('a', href=True)
            title = r.find('div', attrs={'class': 'vvjwJb'}).get_text()
            description = r.find('div', attrs={'class': 's3v9rd'}).get_text()
            # Only append when every element is present
            if link is not None and title and description:
                links.append(link['href'])
                titles.append(title)
                descriptions.append(description)
        except AttributeError:
            continue
    print(len(titles), len(descriptions), len(links))
    if len(titles) != len(descriptions) or len(descriptions) != len(links):
        print("ERROR: titles, descriptions and links are out of sync")
        exit()
    to_remove = []
    clean_links = []
    for i, l in enumerate(links):
        clean = re.search(r'\/url\?q\=(.*)\&sa', l)
        # Anything that doesn't fit the above pattern will be removed
        if clean is None:
            to_remove.append(i)
            continue
        clean_links.append(clean.group(1))
    # Remove the corresponding titles & descriptions; delete in reverse
    # order so earlier deletions don't shift the later indices
    for x in reversed(to_remove):
        del titles[x]
        del descriptions[x]
    result_list = []
    for title, url, description in zip(titles, clean_links, descriptions):
        result_list.append(googleUrls(title, url, description))
    return result_list
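
# Example (assumed, not in the original paste): fetch the top results
# for a query.
#   for result in getUrlsFromGoogle("ontology based search", 10):
#       print(result.title, result.url)
# Scraping Google this way is brittle: the 'ZINbbc'/'vvjwJb'/'s3v9rd'
# class names are tied to a particular results layout and change often,
# so an empty result list usually means the markup has moved on.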

def crawl_search(term, jsonTree):
    logging.basicConfig(format='%(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p')
    logging.warning('crawl_search started')
    termsList = getTheListFromTree(jsonTree, 'title')
    # Map every ontology term to its 1-based position in the tree
    listDict = {}
    pos = 1
    for m in termsList:
        listDict[m.lower()] = pos
        pos = pos + 1
    if term.lower() not in listDict:
        # The term is not in the ontology: crawl and re-rank the raw results
        googleResultsForTerm = getUrlsFromGoogle(term, 30)
        websiteList = []
        threads = []
        # Crawl each result URL in its own thread
        for ii in range(len(googleResultsForTerm)):
            process = Thread(target=crawl, args=[
                googleResultsForTerm[ii], googleResultsForTerm, listDict, ii,
                websiteList])
            process.start()
            threads.append(process)
        for process in threads:
            process.join()
        # Sort based on re-rank score
        websiteList.sort(key=lambda x: x.reRankScore, reverse=True)
        json_string = json.dumps([ob.__dict__ for ob in websiteList])
        logging.warning('crawl_search finished')
        return json_string
    else:
        getIndexOfTerm = listDict[term.lower()]
        websiteList = []
        # Combine the query with the ontology root term (note the space;
        # plain concatenation would produce a single garbled query word)
        googleResultsForTerm = getUrlsFromGoogle(term + " " + termsList[0], 30)
        threads = []
        for ii in range(len(googleResultsForTerm)):
            process = Thread(target=crawl, args=[
                googleResultsForTerm[ii], googleResultsForTerm, listDict, ii,
                websiteList])
            process.start()
            threads.append(process)
        for process in threads:
            process.join()
        # Sort based on re-rank score
        websiteList.sort(key=lambda x: x.reRankScore, reverse=True)
        termListSize = len(termsList)
        # Pick up to three terms that follow the query term in the ontology;
        # listDict positions are 1-based, so getIndexOfTerm already points
        # at the next entry of the 0-based termsList
        term1 = ""
        term2 = ""
        term3 = ""
        if termListSize - getIndexOfTerm >= 3:
            term1 = termsList[getIndexOfTerm]
            term2 = termsList[getIndexOfTerm + 1]
            term3 = termsList[getIndexOfTerm + 2]
        elif termListSize - getIndexOfTerm >= 2:
            term1 = termsList[getIndexOfTerm]
            term2 = termsList[getIndexOfTerm + 1]
        elif termListSize - getIndexOfTerm >= 1:
            term1 = termsList[getIndexOfTerm]
        elif termListSize > 2:
            term1 = termsList[getIndexOfTerm - 2]
        print(term1 + " " + term2 + " " + term3)
        # Augment the re-ranked results with fresh searches for the
        # neighbouring terms; the fewer terms found, the more results each
        if term1 and term2 and term3:
            term1_google_results = getUrlsFromGoogle(term1, 7)
            term2_google_results = getUrlsFromGoogle(term2, 7)
            term3_google_results = getUrlsFromGoogle(term3, 7)
            json_string = json.dumps(
                [ob.__dict__ for ob in websiteList] +
                [ob.__dict__ for ob in term1_google_results] +
                [ob.__dict__ for ob in term2_google_results] +
                [ob.__dict__ for ob in term3_google_results]
            )
        elif term1 and term2 and not term3:
            term1_google_results = getUrlsFromGoogle(term1, 10)
            term2_google_results = getUrlsFromGoogle(term2, 10)
            json_string = json.dumps(
                [ob.__dict__ for ob in websiteList] +
                [ob.__dict__ for ob in term1_google_results] +
                [ob.__dict__ for ob in term2_google_results]
            )
        elif term1 and not term2 and not term3:
            term1_google_results = getUrlsFromGoogle(term1, 20)
            json_string = json.dumps(
                [ob.__dict__ for ob in websiteList] +
                [ob.__dict__ for ob in term1_google_results]
            )
        else:
            json_string = json.dumps(
                [ob.__dict__ for ob in websiteList]
            )
        logging.warning('crawl_search finished')
        return json_string

# Download a PDF URL and extract its text
def pdfReader(pdfUrl):
    r = requests.get(pdfUrl, stream=True)
    with open("python.pdf", "wb") as pdf:
        for chunk in r.iter_content(chunk_size=1024):
            # Write one chunk at a time to the PDF file
            if chunk:
                pdf.write(chunk)
    ss = ""
    # PdfFileReader/numPages/getPage/extractText were removed in PyPDF2 3.x;
    # the current PdfReader API does the same job
    with open('python.pdf', 'rb') as pdfFileObject:
        reader = PyPDF2.PdfReader(pdfFileObject)
        for page in reader.pages:
            ss += page.extract_text() or ""
    return ss
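
# Example (assumed, not in the original paste): extract the text of a
# remote PDF.
#   text = pdfReader("https://example.com/sample.pdf")
# Note that the download is written to a fixed local file, python.pdf,
# so concurrent calls would overwrite each other's downloads.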

# Helper for getTheListFromTree: recursively yield every value stored
# under lookup_key anywhere in a nested dict/list structure
def item_generator(json_input, lookup_key):
    if isinstance(json_input, dict):
        for k, v in json_input.items():
            if k == lookup_key:
                yield v
            else:
                yield from item_generator(v, lookup_key)
    elif isinstance(json_input, list):
        for item in json_input:
            yield from item_generator(item, lookup_key)

# Generate all terms from the ontology tree
def getTheListFromTree(jsonTree, lookup_key):
    data = json.loads(jsonTree)
    ontologyList = []
    for value in item_generator(data, lookup_key):
        ontologyList.append(value.lower())
    return ontologyList
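
# A minimal usage sketch (not part of the original paste): the ontology is
# assumed to be a JSON string of nested nodes whose 'title' keys hold the
# terms, e.g.
#   sample_tree = json.dumps({
#       "title": "Machine Learning",
#       "children": [{"title": "Clustering"}, {"title": "Regression"}]
#   })
#   getTheListFromTree(sample_tree, "title")
#   # -> ['machine learning', 'clustering', 'regression']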

# Re-rank script: combine a rank-based base score with an
# ontology-overlap score
def re_rank(url, ontologyTerms, totalResults, tempRank, doc):
    if tempRank == 1:
        baseScore = 1
    else:
        baseScore = (totalResults + 2 * math.log10(tempRank + 1)) / (
            tempRank + totalResults)
    omegaScore = 0
    alpha = 0.85  # adjusting parameter
    # Work on a local copy so concurrent crawl threads don't mutate the
    # shared term dict from under each other
    terms = dict(ontologyTerms)
    # The document arrives as one string, so split it into words; iterating
    # the string directly would walk over single characters
    for eachWord in doc.split():
        word = eachWord.lower()
        if word in terms:
            posOfTerm = terms[word]
            omegaScore += 1 / (posOfTerm * posOfTerm)
            # Count each ontology term at most once per document
            del terms[word]
    omegaScore = omegaScore / 2
    reRankScore = (alpha * baseScore) + ((1 - alpha) * omegaScore)
    return reRankScore
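
# A worked sketch of the scoring (illustrative values, not from the source):
# with totalResults=30 and tempRank=5, baseScore =
# (30 + 2*log10(6)) / (5 + 30) ≈ 31.556 / 35 ≈ 0.902. If the document
# contains the ontology terms at positions 1 and 2, omegaScore =
# (1/1 + 1/4) / 2 = 0.625, and reRankScore = 0.85*0.902 + 0.15*0.625 ≈ 0.860,
# so the ontology overlap nudges the rank-based score rather than
# dominating it.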

# Crawl a single URL and append its re-ranked Website entry
def crawl(url, googleResultsForTerm, termDict, index, websiteList):
    score = -1
    try:
        r = requests.get(url.url, timeout=1)
        content_type = r.headers.get('content-type')
        if content_type is None or 'text/html' in content_type:
            soupToGetText = BeautifulSoup(r.text, 'html.parser')
            score = re_rank(url.url,
                            termDict,
                            len(googleResultsForTerm), index,
                            soupToGetText.get_text().lower())
        elif 'application/pdf' in content_type:
            # PDF scoring is disabled for now
            pass
            # textFromPdfUrl = pdfReader(url.url)
            # score = re_rank(url.url,
            #                 termDict,
            #                 len(googleResultsForTerm), index,
            #                 textFromPdfUrl.lower())
        else:
            print("Unrecognized format")
    except requests.ReadTimeout:
        pass
    except requests.exceptions.RequestException as e:
        print(e)
    websiteList.append(
        Website(googleResultsForTerm[index].title,
                googleResultsForTerm[index].url,
                googleResultsForTerm[index].description, index,
                score))
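
# A minimal end-to-end sketch (assumed entry point, not in the original
# paste): build a small ontology JSON and run a search. Live Google scraping
# depends on the result-page class names noted above, so this may return an
# empty list if the markup has changed.
if __name__ == "__main__":
    sample_tree = json.dumps({
        "title": "Machine Learning",
        "children": [{"title": "Clustering"}, {"title": "Regression"}]
    })
    print(crawl_search("clustering", sample_tree))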