import re
import os
import time
import datetime
import urllib.request
from urllib.parse import urlparse, urljoin

from url_normalize import url_normalize

import requests
import psycopg2
from bs4 import BeautifulSoup

import database

import scheduler


def getPictures(soup, url):
    """Collect image URLs: absolute http(s) sources as-is, relative sources joined with the base URL."""
    links = []
    # Absolute image sources
    for link in soup.find_all('img', attrs={'src': re.compile(r"^https?://")}):
        links.append(url_normalize(link.get('src')))
    # Relative image sources (skip protocol-relative URLs and Windows-style paths)
    for link in soup.find_all('img', attrs={'src': re.compile(r"^(?!www\.|(?:http|ftp)s?://|[A-Za-z]:\\|//).*")}):
        links.append(urljoin(url, link.get('src')))
    return links


def getLinks(soup, url):
    """Collect anchor URLs: absolute http(s) links as-is, relative links joined with the base URL."""
    links = []
    # Absolute links
    for link in soup.find_all('a', attrs={'href': re.compile(r"^https?://")}):
        links.append(url_normalize(link.get('href')))
    # for link in soup.find_all('a', attrs={'href': re.compile("^(/|.*" + url + ")")}):
    # Relative links (skip protocol-relative URLs and Windows-style paths)
    for link in soup.find_all('a', attrs={'href': re.compile(r"^(?!www\.|(?:http|ftp)s?://|[A-Za-z]:\\|//).*")}):
        links.append(urljoin(url, link.get('href')))
    return links


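# A quick illustration of the two extraction passes above (kept as a comment so
# the module itself is unchanged). The HTML snippet and base URL are assumed
# example values, not taken from the crawler:
#
#   soup = BeautifulSoup('<a href="http://e-uprava.gov.si/x">a</a>'
#                        '<a href="podstran.html">b</a>', 'html.parser')
#   getLinks(soup, "http://www.gov.si/")
#   # -> ['http://e-uprava.gov.si/x', 'http://www.gov.si/podstran.html']

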
def initWorker(driver, url, active, lock, cache):
    """Fetch one URL, extract its links, images and document files, and store them via the database module."""
    db_conn = psycopg2.connect("host=localhost dbname=crawldb user=postgres password=admin")
    # db_conn = psycopg2.connect("host=localhost dbname=crawldb user=mkozmelj")
    r = requests.get(url)
    if r.status_code == 200:
        # print(r.status_code, url)
        nov_url = url
        driver.get(url)
        time.sleep(10)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # If the page declares a single <base> element, resolve relative links against it
        base_tags = soup.find_all('base')
        if len(base_tags) == 1:
            nov_url = base_tags[0]['href']
        links = getLinks(soup, nov_url)
        pictures = getPictures(soup, nov_url)
        files = []
        for i in links:
            if re.search(r"(\.gov\.si)+", i):
                try:
                    # If the site has no robots.txt, this raises an error
                    if cache.allowed(i, '*') is True:
                        if re.search(r"\.(?:doc|docx|pdf|ppt|pptx)$", i):
                            files.append(i)
                        else:
                            database.add_page(urlparse(i).netloc, i, "FRONTIER",
                                              None, None, datetime.datetime.now(),
                                              db_conn, lock)
                            database.add_link(url, i, db_conn, lock)
                except Exception as e:
                    print("Error while adding to the database: ")
                    print(e)
                    # TODO: when saving to the database, check whether the page is a duplicate.
                    #  If we end up computing similarity, it is best to compute the hash right here.
        database.add_page(urlparse(nov_url).netloc, url, "HTML", str(soup), r.status_code,
                          r.headers.get('Date', datetime.datetime.now()), db_conn, lock)
        # print("number of images:", len(pictures))
        for i in pictures:
            if re.search(r"(\.gov\.si)+", i):
                if re.search(r"\.(?:jpg|gif|png|bmp|tiff)$", i):
                    filename = os.path.basename(urlparse(i).path)
                    if database.check_image(filename, url, db_conn, lock) == -1:
                        try:
                            with urllib.request.urlopen(i) as response:
                                data = response.read(100000)  # first ~100 kB
                                extension = os.path.splitext(filename)[1][1:]
                                # content_type = response.info().get_content_type()
                                database.add_image(url, filename, extension, data, datetime.datetime.now(),
                                                   db_conn, lock)
                        except Exception as e:
                            print("Error while adding the image to the database:", e)
                    # else:
                    #     print("Image is already in the database!")
        for j in files:
            filename = os.path.basename(urlparse(j).path)
            if database.check_file(filename, url, db_conn, lock) == -1 and filename:
                with urllib.request.urlopen(j) as response:
                    data = response.read(100000)  # first ~100 kB
                    # content_type = response.info().get_content_type()
                    extension = os.path.splitext(filename)[1][1:]
                    # content_type = response.info().get_content_maintype()
                    database.add_page_data(url, extension.upper(), data, db_conn, lock)

    elif r.status_code == 404:
        print(r.status_code, url)
        driver.get(url)
        time.sleep(10)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        database.add_page(urlparse(url).netloc, url, "HTML", str(soup), r.status_code, datetime.datetime.now(),
                          db_conn, lock)
    db_conn.commit()
    db_conn.close()
    driver.quit()
    # print("finishing task", url)
    active.value -= 1
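

# --- Usage sketch (not part of the original paste) ---
# A minimal, hedged example of how a single worker might be started. The paste
# only shows that initWorker() expects a Selenium driver, a URL, a shared
# counter `active`, a multiprocessing lock and a robots.txt cache exposing
# allowed(url, agent). The stub cache, the seed URL and the Firefox driver
# below are illustrative assumptions, not part of the original crawler.
if __name__ == '__main__':
    import multiprocessing
    import urllib.robotparser

    from selenium import webdriver

    class RobotsCacheStub:
        """Tiny robots.txt helper exposing the allowed(url, agent) interface used above."""

        def __init__(self):
            self.parsers = {}

        def allowed(self, url, agent):
            host = urlparse(url).scheme + "://" + urlparse(url).netloc
            if host not in self.parsers:
                parser = urllib.robotparser.RobotFileParser(host + "/robots.txt")
                parser.read()  # download and parse <host>/robots.txt
                self.parsers[host] = parser
            return self.parsers[host].can_fetch(agent, url)

    lock = multiprocessing.Lock()            # serialises access to the shared database
    active = multiprocessing.Value('i', 0)   # number of workers still running
    active.value += 1
    # one browser instance per worker; initWorker() quits it when it finishes
    initWorker(webdriver.Firefox(), "http://www.gov.si/", active, lock, RobotsCacheStub())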