Advertisement
Typhoon

Bazos.py

Jul 12th, 2016
182
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.01 KB | None | 0 0
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3.  
  4. import re
  5. import urllib.request as urllib
  6. from datetime import datetime
  7.  
  8. import cat_subcat_extractor
  9. from bs4 import BeautifulSoup
  10. from elasticsearch import Elasticsearch
  11. from tendo import singleton
  12.  
  13. me = singleton.SingleInstance()
  14.  
  15. print("Gathering Categories and SubCategories. It may take a while...")
  16. categories = cat_subcat_extractor.category_parser()
  17. print("Gathered CAT/SUBCAT: " + str(categories))
  18.  
  19. es = Elasticsearch(host='localhost', port=9200)
  20.  
  21. es_response = es.search(
  22.     index="bazos",
  23.     body={
  24.         "query": {
  25.             "bool": {
  26.                 "must": [{"match": {"_type": "bazos_items"}}]
  27.             }
  28.         },
  29.         "aggs": {
  30.             "max_price": {"max": {"field": "id"}}},
  31.         "sort": {"id": {"order": "desc"}},
  32.         "size": 1
  33.     }
  34. )
  35.  
  36. for hit in es_response['hits']['hits']:
  37.     last_elastic_id = (hit['_source']['id'])
  38.     print("Last Elasticsearch ID is: " + str((last_elastic_id)))
  39.  
  40. checknumber = int(last_elastic_id)
  41. lastchecknumber = int(checknumber) + 500
  42. while checknumber < lastchecknumber:
  43.     checknumber = (checknumber + 1)
  44.     checkurl = 'https://ostatne.bazos.sk/inzerat/' + str(checknumber) + '/index.php'
  45.     checkpage = urllib.urlopen(checkurl)
  46.     checksoup = BeautifulSoup(checkpage.read(), "html.parser")
  47.     response = urllib.urlopen(checkurl)
  48.  
  49.     if response.url != checkurl:
  50.         print("Redirected Site " + checkurl)
  51.         pass
  52.  
  53.     else:
  54.         try:
  55.             second_category = checksoup.find('div', class_='drobky').find_all('a')[2].getText()
  56.             main_category = categories[second_category]
  57.             title = checksoup.find('h1', class_='nadpis').getText()
  58.             demand = checksoup.find('span', class_='velikost10').getText()
  59.             demand = 'True' if "Dopyt" in str(demand) else 'False'
  60.             user_name = checksoup.select(".listadvlevo table tr td a")[0].getText()
  61.             user_phone = checksoup.select(".listadvlevo table tr td a")[1].getText()
  62.             user_psc = checksoup.select(".listadvlevo table tr td a")[2].getText()
  63.             user_psc = str(re.findall(r'\d+', user_psc.replace(" ", ""))[0])
  64.             user_town = checksoup.select(".listadvlevo table tr td a")[2].getText()
  65.             user_town = str(re.findall(r'\b[^\W\d_]+\b', user_town)[0])
  66.             user_mailid = checksoup.select(".listadvlevo table tr td a")[0]
  67.             user_mailid = re.search('(mail=)(\d+)(\&amp)', str(user_mailid)).group(2)
  68.             desc = checksoup.find('div', class_='popis').getText().replace("\n\n", ".")
  69.             price = checksoup.select(".listadvlevo table tr td b")[1].getText().replace(" ", "").replace("€", "") \
  70.                 .replace("Dohodou", "-1").replace("Vtexte", "-2").replace("Ponúknite", "-3")
  71.             item_date = checksoup.select(".velikost10")[0].getText()
  72.             item_date = str(re.findall(r'(?<=\[)(.*?)(?=\])', item_date)[0]).replace(" ", "")
  73.             item_date = str(item_date + " " + str(datetime.now().time()))
  74.             timestamp = datetime.strptime(item_date, '%d.%m.%Y %H:%M:%S.%f')
  75.  
  76.             # print("### START ###")
  77.             print(checkurl + "\n")
  78.             # print('1 Category: ' + main_category)
  79.             # print('2 Category: ' + second_category)
  80.             # print('Title: ' + title)
  81.             # print('Demand: ' + demand)
  82.             # print('User Name: ' + user_name)
  83.             # print('User Phone: ' + user_phone)
  84.             # print('User Mail ID: ' + user_mailid)
  85.             # print('User PSC: ' + user_psc)
  86.             # print('User Town: ' + user_town)
  87.             # print('Description: ' + desc[:10] + '...')
  88.             # print('Price: ' + price)
  89.             # print("### END ###\n")
  90.  
  91.             doc = {
  92.                 'uid': checknumber,
  93.                 'id': checknumber,
  94.                 'checkurl': checkurl,
  95.                 'main_category': main_category,
  96.                 'second_category': second_category,
  97.                 'title': title,
  98.                 'demand': demand,
  99.                 'user_name': user_name,
  100.                 'user_phone': user_phone,
  101.                 'user_mailid': user_mailid,
  102.                 'user_psc': user_psc,
  103.                 'user_town': user_town,
  104.                 'desc': desc,
  105.                 'price': price,
  106.                 'timestamp': timestamp
  107.             }
  108.             try:
  109.                 res = es.index(index="bazos", doc_type='bazos_items', body=doc)
  110.             except:
  111.                 print("Error while indexing document: " + checkurl + "\n")
  112.  
  113.         except:
  114.             print("Scraping ERROR: " + checkurl + checkurl + "\n")
  115.  
  116. print("Finish. Exiting... " + str(datetime.now()))
  117. exit()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement