Advertisement
Guest User

Untitled

a guest
Aug 26th, 2019
100
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.68 KB | None | 0 0
  1. from bs4 import BeautifulSoup
  2. from lxml import etree
  3. from lxml import objectify
  4. import urllib.request
  5. import ssl
  6.  
  7.  
  8. class Property:
  9.     ad_name = ""
  10.     participating_interest = ""
  11.     developer = ""
  12.     building_name = ""
  13.     building = ""
  14.     facing = ""
  15.     floor = ""
  16.     floors_total = ''
  17.     building_type = ""
  18.     rooms = ''
  19.     total_area = ""
  20.     living_area = ""
  21.     kitchen_are = ""
  22.     handover = ""
  23.     address = ""
  24.     price = ''
  25.     phone = ""
  26.     contact_person = ""
  27.     img_list = []
  28.     inf = ""
  29.     date = ""
  30.  
  31.     #location
  32.     country =""
  33.     region=""
  34.     district =""
  35.     locality_name=""
  36.     sub_locality_name=""
  37.     address = ""
  38.     metro = ""
  39.  
  40.     def create_node(self):
  41.         node = etree.Element('offer')
  42.  
  43.         #some inf from website?
  44.  
  45.         location = etree.SubElement(node, 'location')
  46.         country = etree.SubElement(location, 'country')
  47.         region = etree.SubElement(location,'region')
  48.         district = etree.SubElement(location, 'district')
  49.         locality_name = etree.SubElement(location, 'locality-name')
  50.         sub_locality_name = etree.SubElement(location, 'sub-locality-name')
  51.         address = etree.SubElement(location, 'address')
  52.         metro = etree.SubElement(location, 'metro')
  53.         metro_name = etree.SubElement(metro, 'name')
  54.         sales_agent = etree.SubElement(node, 'sales-agent')
  55.         phone = etree.SubElement(sales_agent, 'phone')
  56.         agent_name = etree.SubElement(sales_agent, 'name')
  57.         email = etree.SubElement(sales_agent, 'email')
  58.         category = etree.SubElement(sales_agent, 'category')
  59.         organization = etree.SubElement(sales_agent, 'organization')
  60.         deal_status = etree.SubElement(node, 'deal-status')
  61.         price = etree.SubElement(node, 'price')
  62.         price_value = etree.SubElement(price, 'value')
  63.         etree.SubElement(price, 'currency').text = 'RUR'
  64.         etree.SubElement(node, 'description').text = self.inf
  65.         etree.SubElement(node, 'building-type').text = self.building_type
  66.         etree.SubElement(node, 'rooms').text = self.rooms
  67.         etree.SubElement(node, 'floor').text = self.floor
  68.         etree.SubElement(node, 'floors-total').text = self.floors_total
  69.         area = etree.SubElement(node, 'area')
  70.         area_value = etree.SubElement(area, 'value')
  71.         unit = etree.SubElement(area, 'unit')
  72.         living_space = etree.SubElement(node, 'living-space')
  73.         living_space__value = etree.SubElement(living_space, 'value')
  74.         unit = etree.SubElement(living_space, 'unit')
  75.         kitchen_space = etree.SubElement(node, 'kitchen-space')
  76.         kitchen_space__value = etree.SubElement(kitchen_space, 'value')
  77.         unit = etree.SubElement(kitchen_space, 'unit')
  78.         etree.SubElement(node, 'lot-type')
  79.  
  80.         return node
  81.  
  82.         #some images
  83.  
  84.  
  85. def html_to_soup(link):
  86.     try:
  87.         # make ssl certificate (for launch on windows)
  88.         gcontext = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
  89.  
  90.         with urllib.request.urlopen(link, context=gcontext) as response:
  91.             # write html code to variable
  92.             html = response.read()
  93.  
  94.         return BeautifulSoup(html, 'html.parser')
  95.  
  96.     except Exception as err:
  97.         print('Error in html_to_soup')
  98.         print(err.args)
  99.  
  100.  
  101. def soup_to_obj(soup):
  102.     property = Property()
  103.  
  104.     #get title
  105.     property.ad_name = soup.select_one('.title-info-title-text').get_text().strip()
  106.  
  107.     #get location
  108.     property.locality_name, property.address = soup.select_one(".item-address__string").get_text().strip().split(', ',1)
  109.     property.metro = soup.select_one('.item-address-georeferences-item__content').get_text().strip()
  110.  
  111.     #get description
  112.     property.inf = soup.select_one('.item-description-html').get_text().strip()
  113.  
  114.     #get information about contact person
  115.     property.contact_person = soup.select_one('.seller-info-value').get_text().strip()
  116.  
  117.     #get price
  118.     #property.price = soup.select_one('.js-item-price').span['content']
  119.  
  120.     #get information
  121.     list = [item.get_text().strip().split(': ') for item in soup.findAll('li', class_= "item-params-list-item")]
  122.     for item in list:
  123.         if item[0] == 'Тип участия':
  124.             property.participating_interest = item[1]
  125.         elif item[0] == 'Официальный застройщик':
  126.             property.developer = item[1]
  127.         elif item[0] == 'Название новостройки':
  128.             property.building_name = item[1]
  129.         elif item[0] == 'Корпус, строение':
  130.             property.building = item[1]
  131.         elif item[0] == 'Отделка':
  132.             property.facing = item[1]
  133.         elif item[0] == 'Этаж':
  134.             property.floor = item[1]
  135.         elif item[0] == 'Этажей в доме':
  136.             property.floors_total = item[1]
  137.         elif item[0] == 'Тип дома':
  138.             property.building_type = item[1]
  139.         elif item[0] == 'Количество комнат':
  140.             property.rooms = item[1]
  141.         elif item[0] == 'Общая площадь':
  142.             property.total_area = item[1]
  143.         elif item[0] == 'Жилая площадь':
  144.             property.living_area = item[1]
  145.         elif item[0] == 'Площадь кухни':
  146.             property.kitchen_are = item[1]
  147.         else:
  148.             property.date = item[1]
  149.     return property
  150.  
  151.  
  152. soup = html_to_soup("https://www.avito.ru/perm/kvartiry/1-k_kvartira_48.6_m_1725_et._1788295067")
  153. obj_xml = etree.tostring(soup_to_obj(soup).create_node(), pretty_print=True, xml_declaration=True, encoding='utf8')
  154.  
  155. try:
  156.     with open('test.xml', 'wb') as xml_writer:
  157.         xml_writer.write(obj_xml)
  158. except IOError:
  159.     pass
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement