Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Scrape newly published ads from ostatne.bazos.sk into Elasticsearch.

The script looks up the highest ad ``id`` already stored in the ``bazos``
index, then probes the next ``SCAN_WINDOW`` ad ids on the site. Every ad
page that is not redirected is parsed and indexed as one document. Pages
that cannot be fetched, parsed or indexed are reported on stdout and
skipped, so one bad ad never aborts the run.
"""
import re
import urllib.request as urllib
from datetime import datetime

import cat_subcat_extractor
from bs4 import BeautifulSoup
from elasticsearch import Elasticsearch
from tendo import singleton

ES_INDEX = "bazos"
ES_DOC_TYPE = "bazos_items"
AD_URL_TEMPLATE = "https://ostatne.bazos.sk/inzerat/{}/index.php"
# How many ids past the last indexed one are probed in a single run.
SCAN_WINDOW = 500

CONTACT_LINKS_SELECTOR = ".listadvlevo table tr td a"
PRICE_SELECTOR = ".listadvlevo table tr td b"

# Compiled once; every pattern is reused for each scraped page.
DIGITS_RE = re.compile(r'\d+')
WORD_RE = re.compile(r'\b[^\W\d_]+\b')
MAIL_ID_RE = re.compile(r'mail=(\d+)&')
BRACKETED_RE = re.compile(r'(?<=\[)(.*?)(?=\])')

# Applied in order: whitespace and the currency sign are stripped first, then
# the textual price placeholders are mapped to negative codes.
PRICE_REPLACEMENTS = (
    (" ", ""),
    ("€", ""),
    ("Dohodou", "-1"),
    ("Vtexte", "-2"),
    ("Ponúknite", "-3"),
)


def _fetch_last_indexed_id(es):
    """Return the highest ad ``id`` stored in the index, or None if it is empty."""
    response = es.search(
        index=ES_INDEX,
        body={
            "query": {
                "bool": {
                    "must": [{"match": {"_type": "bazos_items"}}]
                }
            },
            # NOTE: this aggregation is requested but its result is never
            # read here; the id is taken from the single sorted hit instead.
            "aggs": {
                "max_price": {"max": {"field": "id"}}},
            "sort": {"id": {"order": "desc"}},
            "size": 1
        }
    )
    hits = response['hits']['hits']
    if not hits:
        return None
    return int(hits[0]['_source']['id'])


def _fetch_ad_page(url):
    """Download *url* once and return its parsed HTML, or None if redirected.

    The response is closed on exit from the ``with`` block.

    Raises:
        OSError: on network or HTTP failures (``URLError`` and ``HTTPError``
            are ``OSError`` subclasses).
    """
    with urllib.urlopen(url) as response:
        # A final URL different from the requested one means the site
        # redirected the request; such pages are not parsed.
        if response.url != url:
            return None
        return BeautifulSoup(response.read(), "html.parser")


def _normalize_price(raw_price):
    """Apply ``PRICE_REPLACEMENTS`` to *raw_price* in order and return the result."""
    price = raw_price
    for old, new in PRICE_REPLACEMENTS:
        price = price.replace(old, new)
    return price


def _parse_ad(soup, categories, ad_id, url):
    """Build the Elasticsearch document for one ad page.

    Args:
        soup: parsed ad page.
        categories: mapping from sub-category name to main-category name.
        ad_id: numeric id of the ad; stored as both ``uid`` and ``id``.
        url: address the page was fetched from.

    Returns:
        dict ready to be passed as the ``body`` of ``es.index``.

    Raises:
        Exception: whatever the lookups raise (``AttributeError``,
            ``IndexError``, ``KeyError``, ``ValueError``...) when the page
            does not have the expected layout.
    """
    second_category = soup.find('div', class_='drobky').find_all('a')[2].getText()
    main_category = categories[second_category]
    title = soup.find('h1', class_='nadpis').getText()

    demand_text = soup.find('span', class_='velikost10').getText()
    # Stored as the strings 'True'/'False' to match documents already indexed.
    demand = 'True' if "Dopyt" in demand_text else 'False'

    # One selector evaluation serves every contact field.
    contact_links = soup.select(CONTACT_LINKS_SELECTOR)
    user_name = contact_links[0].getText()
    user_phone = contact_links[1].getText()
    location_text = contact_links[2].getText()
    user_psc = DIGITS_RE.findall(location_text.replace(" ", ""))[0]
    user_town = WORD_RE.findall(location_text)[0]
    # The mail id is read from the markup of the first link (its "mail=" parameter).
    user_mailid = MAIL_ID_RE.search(str(contact_links[0])).group(1)

    desc = soup.find('div', class_='popis').getText().replace("\n\n", ".")
    price = _normalize_price(soup.select(PRICE_SELECTOR)[1].getText())

    # The page only carries a date (inside square brackets); the time of day
    # is taken from the moment of scraping. Combining date and time objects
    # avoids round-tripping through str(), which drops the ".%f" part when
    # the microsecond happens to be 0 and would make strptime fail.
    date_text = soup.select(".velikost10")[0].getText()
    published = BRACKETED_RE.findall(date_text)[0].replace(" ", "")
    timestamp = datetime.combine(
        datetime.strptime(published, '%d.%m.%Y').date(),
        datetime.now().time(),
    )

    return {
        'uid': ad_id,
        'id': ad_id,
        'checkurl': url,
        'main_category': main_category,
        'second_category': second_category,
        'title': title,
        'demand': demand,
        'user_name': user_name,
        'user_phone': user_phone,
        'user_mailid': user_mailid,
        'user_psc': user_psc,
        'user_town': user_town,
        'desc': desc,
        'price': price,
        'timestamp': timestamp,
    }


def main():
    """Scan the ids following the last indexed ad and index every ad found."""
    # Bound to a name so the single-instance guard object stays alive for
    # the whole run.
    me = singleton.SingleInstance()

    print("Gathering Categories and SubCategories. It may take a while...")
    categories = cat_subcat_extractor.category_parser()
    print("Gathered CAT/SUBCAT: " + str(categories))

    es = Elasticsearch(host='localhost', port=9200)
    last_elastic_id = _fetch_last_indexed_id(es)
    if last_elastic_id is None:
        # Without a stored document there is no id to resume from.
        raise SystemExit(
            "No documents found in index '" + ES_INDEX + "'; nothing to resume from."
        )
    print("Last Elasticsearch ID is: " + str(last_elastic_id))

    first_id = last_elastic_id + 1
    for checknumber in range(first_id, first_id + SCAN_WINDOW):
        checkurl = AD_URL_TEMPLATE.format(checknumber)

        try:
            checksoup = _fetch_ad_page(checkurl)
        except OSError as exc:
            # A single unreachable or missing ad must not stop the scan.
            print("Fetch ERROR: " + checkurl + " (" + str(exc) + ")\n")
            continue
        if checksoup is None:
            print("Redirected Site " + checkurl)
            continue

        try:
            doc = _parse_ad(checksoup, categories, checknumber, checkurl)
        except Exception as exc:
            # Best-effort scraping: report the page with an unexpected
            # layout and move on to the next id.
            print("Scraping ERROR: " + checkurl + " (" + repr(exc) + ")\n")
            continue

        print(checkurl + "\n")
        try:
            es.index(index=ES_INDEX, doc_type=ES_DOC_TYPE, body=doc)
        except Exception as exc:
            print("Error while indexing document: " + checkurl + " (" + repr(exc) + ")\n")

    print("Finish. Exiting... " + str(datetime.now()))


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement