import re
import os
import time
import datetime
import urllib.request
from urllib.parse import urlparse, urljoin

from url_normalize import url_normalize

import requests
import psycopg2
from bs4 import BeautifulSoup

import database

import scheduler

def getPictures(soup, url):
    # Absolute http:// image URLs are normalized; relative src values are
    # resolved against the page URL with urljoin.
    links = []
    for link in soup.findAll('img', attrs={'src': re.compile(r"^http://")}):
        links.append(url_normalize(link.get('src')))
    for link in soup.findAll('img', attrs={'src': re.compile(r"^(?!www\.|(?:http|ftp)s?://|[A-Za-z]:\\|//).*")}):
        links.append(urljoin(url, link.get('src')))
    return links

def getLinks(soup, url):
    # Same pattern as getPictures, applied to <a href> targets.
    links = []
    for link in soup.findAll('a', attrs={'href': re.compile(r"^http://")}):
        links.append(url_normalize(link.get('href')))
        # for link in soup.findAll('a', attrs={'href': re.compile("^(/|.*" + url + ")")}):
    for link in soup.findAll('a', attrs={'href': re.compile(r"^(?!www\.|(?:http|ftp)s?://|[A-Za-z]:\\|//).*")}):
        links.append(urljoin(url, link.get('href')))
    return links

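# Hedged usage sketch (added, not in the original paste): exercises getLinks and
# getPictures on a tiny in-memory page. The HTML snippet and the example.gov.si
# base URL are invented for illustration only.
def _demoExtract():
    demo_html = (
        '<a href="http://example.gov.si/abs.html">abs</a>'
        '<a href="page/rel.html">rel</a>'
        '<img src="http://example.gov.si/logo.png">'
        '<img src="img/banner.jpg">'
    )
    demo_soup = BeautifulSoup(demo_html, 'html.parser')
    print(getLinks(demo_soup, 'http://example.gov.si/'))     # absolute link + resolved relative link
    print(getPictures(demo_soup, 'http://example.gov.si/'))  # absolute image + resolved relative image
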
def initWorker(driver, url, active, lock, cache):
    db_conn = psycopg2.connect("host=localhost dbname=crawldb user=postgres password=admin")
    # db_conn = psycopg2.connect("host=localhost dbname=crawldb user=mkozmelj")
    r = requests.get(url)
    if r.status_code == 200:
        # print(r.status_code, url)
        nov_url = url
        driver.get(url)
        time.sleep(10)  # give client-side rendering time to finish
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # A single <base> tag overrides the base URL for resolving relative links.
        if len(soup.find_all('base')) == 1:
            nov_url = soup.find_all('base')[0]['href']
        links = getLinks(soup, nov_url)
        pictures = getPictures(soup, nov_url)
        files = []
        for i in links:
            if re.search(r"(\.gov\.si)+", i):
                try:
                    # If the site has no robots.txt, this lookup raises an error.
                    if cache.allowed(i, '*'):
                        if re.search(r"\.(?:doc|docx|pdf|ppt|pptx)$", i, re.IGNORECASE):
                            files.append(i)
                        else:
                            database.add_page(urlparse(i).netloc, i, "FRONTIER",
                                              None, None, datetime.datetime.now(),
                                              db_conn, lock)
                            database.add_link(url, i, db_conn, lock)
                except Exception as e:
                    print("Error while adding to the database: ")
                    print(e)
                    # TODO: before saving to the database, check whether the page is a duplicate.
                    #  If we end up computing similarity, it is best to compute the hash right here.
        database.add_page(urlparse(nov_url).netloc, url, "HTML", str(soup), r.status_code,
                          r.headers.get('Date', datetime.datetime.now()), db_conn, lock)
        # print("number of images:", len(pictures))
        for i in pictures:
            if re.search(r"(\.gov\.si)+", i):
                if re.search(r"\.(?:jpg|gif|png|bmp|tiff)$", i, re.IGNORECASE):
                    filename = os.path.basename(urlparse(i).path)
                    if database.check_image(filename, url, db_conn, lock) == -1:
                        try:
                            with urllib.request.urlopen(i) as response:
                                data = response.read(100000)  # cap at ~100 kB
                                extension = os.path.splitext(filename)[1][1:]
                                # content_type = response.info().get_content_type()
                                database.add_image(url, filename, extension, data, datetime.datetime.now(), db_conn,
                                                   lock)
                        except Exception as e:
                            print("Error while adding the image to the database:", e)
                    # else:
                    #     print("Image is already in the database!")
        for j in files:
            filename = os.path.basename(urlparse(j).path)
            if database.check_file(filename, url, db_conn, lock) == -1 and filename:
                with urllib.request.urlopen(j) as response:
                    data = response.read(100000)  # cap at ~100 kB
                    # content_type = response.info().get_content_type()
                    extension = os.path.splitext(filename)[1][1:]
                    # content_type = response.info().get_content_maintype()
                    database.add_page_data(url, extension.upper(), data, db_conn, lock)
    elif r.status_code == 404:
        print(r.status_code, url)
        driver.get(url)
        time.sleep(10)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        database.add_page(urlparse(url).netloc, url, "HTML", str(soup), r.status_code, datetime.datetime.now(),
                          db_conn, lock)
    db_conn.commit()
    db_conn.close()
    driver.quit()
    # print("finishing task", url)
    active.value -= 1
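

# Hedged launch sketch (added, not in the original paste): one plausible way to
# start the worker above. Headless Chrome and reppy's RobotsCache are
# assumptions here; in the real project the imported scheduler module
# presumably sets this up.
def _demoSpawnWorker(url):
    from multiprocessing import Lock, Value
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from reppy.cache import RobotsCache

    options = Options()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)  # quit inside initWorker
    active = Value('i', 1)  # shared count of live workers; initWorker decrements it
    lock = Lock()           # serializes writes through the database helpers
    cache = RobotsCache(capacity=100)  # cached robots.txt lookups backing cache.allowed()
    initWorker(driver, url, active, lock, cache)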