Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from bs4 import BeautifulSoup
- from lxml import etree
- from lxml import objectify
- import urllib.request
- import ssl
class Property:
    """One scraped real-estate advert and its XML ``<offer>`` representation.

    Every field is a string that defaults to empty; callers fill in whatever
    they managed to extract and then call ``create_node`` to render it.
    """

    # Advert details
    ad_name = ""
    participating_interest = ""
    developer = ""
    building_name = ""
    building = ""
    facing = ""
    floor = ""
    floors_total = ""
    building_type = ""
    rooms = ""
    total_area = ""
    living_area = ""
    kitchen_are = ""  # (sic) spelling kept: the attribute is assigned under this name elsewhere
    handover = ""
    price = ""
    phone = ""
    contact_person = ""
    inf = ""  # free-text description, exported as <description>
    date = ""

    # Location
    country = ""
    region = ""
    district = ""
    locality_name = ""
    sub_locality_name = ""
    address = ""
    metro = ""

    def __init__(self):
        # Created per instance: a class-level ``[]`` would be one list shared
        # by every Property object.
        self.img_list = []

    @staticmethod
    def _add_measure(parent, tag, raw):
        """Append ``<tag><value/><unit/></tag>`` to *parent*, filled from *raw*.

        The first whitespace-separated token of *raw* goes to ``value`` and the
        remainder to ``unit``. Both stay empty when *raw* is empty.
        NOTE(review): assumes scraped areas look like "<number> <unit>" -
        confirm against real pages.
        """
        holder = etree.SubElement(parent, tag)
        parts = (raw or '').strip().split(None, 1)
        etree.SubElement(holder, 'value').text = parts[0] if parts else None
        etree.SubElement(holder, 'unit').text = parts[1] if len(parts) > 1 else None
        return holder

    def create_node(self):
        """Build and return the ``<offer>`` element describing this property.

        Returns:
            An ``etree`` element with the children ``location``,
            ``sales-agent``, ``deal-status``, ``price``, ``description``,
            ``building-type``, ``rooms``, ``floor``, ``floors-total``,
            ``area``, ``living-space``, ``kitchen-space`` and ``lot-type``,
            in that order. Elements with no matching attribute on this class
            (``email``, ``category``, ``organization``, ``deal-status``,
            ``lot-type``) are emitted empty.
        """
        node = etree.Element('offer')

        # ``value or None`` keeps an element text-less when the field is
        # blank, so unfilled fields serialize exactly as an untouched element.
        location = etree.SubElement(node, 'location')
        for tag, value in (
            ('country', self.country),
            ('region', self.region),
            ('district', self.district),
            ('locality-name', self.locality_name),
            ('sub-locality-name', self.sub_locality_name),
            ('address', self.address),
        ):
            etree.SubElement(location, tag).text = value or None
        metro = etree.SubElement(location, 'metro')
        etree.SubElement(metro, 'name').text = self.metro or None

        sales_agent = etree.SubElement(node, 'sales-agent')
        etree.SubElement(sales_agent, 'phone').text = self.phone or None
        etree.SubElement(sales_agent, 'name').text = self.contact_person or None
        etree.SubElement(sales_agent, 'email')
        etree.SubElement(sales_agent, 'category')
        etree.SubElement(sales_agent, 'organization')

        etree.SubElement(node, 'deal-status')

        price = etree.SubElement(node, 'price')
        etree.SubElement(price, 'value').text = self.price or None
        etree.SubElement(price, 'currency').text = 'RUR'

        etree.SubElement(node, 'description').text = self.inf
        etree.SubElement(node, 'building-type').text = self.building_type
        etree.SubElement(node, 'rooms').text = self.rooms
        etree.SubElement(node, 'floor').text = self.floor
        etree.SubElement(node, 'floors-total').text = self.floors_total

        self._add_measure(node, 'area', self.total_area)
        self._add_measure(node, 'living-space', self.living_area)
        self._add_measure(node, 'kitchen-space', self.kitchen_are)

        etree.SubElement(node, 'lot-type')
        # TODO: images (img_list) are not exported yet.
        return node
def html_to_soup(link, timeout=30):
    """Download *link* and return it parsed as a ``BeautifulSoup`` tree.

    Args:
        link: URL of the page to fetch.
        timeout: Seconds to wait on the network before giving up.

    Returns:
        The parsed document, or ``None`` when the download or the parsing
        failed. The error is printed instead of being raised.
    """
    try:
        # The default context negotiates the newest TLS version both sides
        # support and verifies the server certificate. The deprecated
        # ssl.PROTOCOL_TLSv1 pinned the connection to TLS 1.0 with
        # verification switched off.
        gcontext = ssl.create_default_context()
        with urllib.request.urlopen(link, context=gcontext, timeout=timeout) as response:
            html = response.read()
        return BeautifulSoup(html, 'html.parser')
    except Exception as err:
        # Deliberately broad: this is a best-effort fetch and callers rely on
        # getting None back instead of an exception.
        print('Error in html_to_soup')
        print(repr(err))
        return None
def soup_to_obj(soup):
    """Extract the advert fields from a parsed listing page into a ``Property``.

    Args:
        soup: Parsed page exposing ``select_one`` and ``find_all``.

    Returns:
        A ``Property`` with the title, locality, address, metro, description,
        contact person and listed parameters filled in. A page element that
        is missing leaves the corresponding field as an empty string.

    Raises:
        ValueError: If *soup* is ``None`` (e.g. the download failed).
    """
    if soup is None:
        raise ValueError('soup_to_obj needs a parsed page, got None')

    def text_of(selector):
        # select_one returns None when nothing matches; map that to ''.
        found = soup.select_one(selector)
        return found.get_text().strip() if found is not None else ''

    # Parameter label as shown on the page -> Property attribute storing it.
    attr_by_label = {
        'Тип участия': 'participating_interest',
        'Официальный застройщик': 'developer',
        'Название новостройки': 'building_name',
        'Корпус, строение': 'building',
        'Отделка': 'facing',
        'Этаж': 'floor',
        'Этажей в доме': 'floors_total',
        'Тип дома': 'building_type',
        'Количество комнат': 'rooms',
        'Общая площадь': 'total_area',
        'Жилая площадь': 'living_area',
        'Площадь кухни': 'kitchen_are',
    }

    prop = Property()
    prop.ad_name = text_of('.title-info-title-text')

    # "<locality>, <rest of the address>"; with no comma the whole string is
    # treated as the locality and the address stays empty.
    prop.locality_name, _, prop.address = text_of('.item-address__string').partition(', ')
    prop.metro = text_of('.item-address-georeferences-item__content')
    prop.inf = text_of('.item-description-html')
    prop.contact_person = text_of('.seller-info-value')
    # NOTE: the price is not extracted here, so Property.price stays empty.

    for entry in soup.find_all('li', class_="item-params-list-item"):
        label, sep, value = entry.get_text().strip().partition(': ')
        if not sep:
            # Row without a "label: value" shape - nothing to store.
            continue
        # Any label not in the table is stored in ``date`` (the last such row
        # wins), matching how unknown rows have always been handled.
        setattr(prop, attr_by_label.get(label, 'date'), value)
    return prop
def main():
    """Scrape one hard-coded advert and save it as ``test.xml``.

    Stops early when the page could not be downloaded, and reports a failed
    write instead of silently ignoring it.
    """
    soup = html_to_soup("https://www.avito.ru/perm/kvartiry/1-k_kvartira_48.6_m_1725_et._1788295067")
    if soup is None:
        # html_to_soup has already printed why the download failed.
        return
    obj_xml = etree.tostring(
        soup_to_obj(soup).create_node(),
        pretty_print=True,
        xml_declaration=True,
        encoding='utf8',
    )
    try:
        with open('test.xml', 'wb') as xml_writer:
            xml_writer.write(obj_xml)
    except OSError as err:
        print('Could not write test.xml:', err)


# Guarded so that importing this module does not trigger a network request.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement