Advertisement
Guest User

Untitled

a guest
Apr 6th, 2017
131
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.75 KB | None | 0 0
  1. #main.py
  2. import time
  3.  
  4. from selenium import webdriver
  5. from selenium.common.exceptions import NoSuchElementException
  6. from selenium.webdriver.common.keys import Keys
  7.  
  8. from src.db import connect
  9.  
# Query text typed into the site's search box by start().
search_tags = 'Python Django'
# Page that main() passes to start() to be opened in the browser.
url = 'https://hh.ru/'
# Keys of the dict built by collect_information(). Entry i is filled from
# xpaths[i]. A nested list names several keys that are filled, in order, from
# the successive matches of a single XPath (see unpack_data()).
vacancy_fields = ['job_type', 'company_name',
                  'pub_date', 'vacancy_description',
                  'work_hours', 'address', 'key_skills',
                  ['salary', 'town', 'experience'],
                  ]
# XPath expressions, positionally paired with vacancy_fields. A plain string
# is looked up as a single element; a one-item tuple tells
# collect_information() to fetch every match instead.
xpaths = [
    '//span[@itemprop="employmentType"]',
    '//a[@itemprop]',
    '//time[@itemprop]',
    '//div[@itemprop="description"]',
    '//span[@itemprop="workHours"]',
    "//div[@data-qa='vacancy-address-with-map']",
    ("//span[@class='Bloko-TagList-Text']",),
    ("//td[contains(@class,'b-v-info-content')]",),
]

# Browser session shared by every function in this module. It is created at
# module level, so Chrome is launched as soon as this file is executed.
driver = webdriver.Chrome()
  29.  
  30.  
  31. def start(uri):
  32.     print('Worker is running')
  33.     driver.get(uri)
  34.     path = '//input[@data-qa="vacancy-serp__query"]'
  35.     search_box = driver.find_element_by_xpath(path)
  36.     search_box.send_keys(search_tags)
  37.     search_box.send_keys(Keys.RETURN)
  38.  
  39.  
  40. def collect_links():
  41.     print('Collecting links')
  42.     path = "//a[@data-qa='vacancy-serp__vacancy-title']"
  43.     vacancies = driver.find_elements_by_xpath(path)
  44.     return vacancies
  45.  
  46.  
  47. def visit_all_links(links):
  48.     queryset = list()
  49.     for link in links:
  50.         print('Collecting link ' + link.text)
  51.         link.send_keys(Keys.CONTROL, Keys.RETURN)
  52.         time.sleep(5)
  53.         driver.switch_to.window(driver.window_handles[1])
  54.         data = collect_information()
  55.         queryset.append(data)
  56.         driver.close()
  57.         driver.switch_to.window(driver.window_handles[0])
  58.         print('Success')
  59.     return queryset
  60.  
  61.  
  62. def collect_information():
  63.     result = {}
  64.     print('Getting data from page')
  65.     for i in range(len(vacancy_fields)):
  66.         try:
  67.             if type(xpaths[i]) == tuple and type(vacancy_fields[i]) == list:
  68.                 data = driver.find_elements_by_xpath(xpaths[i][0])
  69.                 result.update(unpack_data(data, vacancy_fields[i]))
  70.                 continue
  71.             if type(xpaths[i]) == tuple and type(vacancy_fields[i]) != list:
  72.                 data = driver.find_elements_by_xpath(xpaths[i][0])
  73.                 result[vacancy_fields[i]] = ', '.join([x.text for x in data])
  74.                 continue
  75.             data = driver.find_element_by_xpath(xpaths[i]).text
  76.             result[vacancy_fields[i]] = data
  77.         except NoSuchElementException:
  78.             pass
  79.     print('Collecting data finished')
  80.     return result
  81.  
  82.  
  83. def unpack_data(parsed_data, fields):
  84.     result = dict()
  85.     for x in range(len(fields)):
  86.         result[fields[x]] = parsed_data[x].text
  87.     return result
  88.  
  89.  
  90. def put_in_db(data):
  91.     print('Putting to db proccess')
  92.     meta, connection = connect()
  93.     connection.execute(meta.tables['vacancies'].insert(), data)
  94.  
  95.  
def interrupt():
    """Print a completion message and quit the shared browser session."""
    print('Job done')
    driver.quit()
  99.  
  100.  
  101. def main():
  102.     start(url)
  103.     links = collect_links()
  104.     data = visit_all_links(links)
  105.     put_in_db(data)
  106.     driver.quit()
  107. main()
  108.  
  109. #db.py
import os

import sqlalchemy
from sqlalchemy import Table, Column, Integer, String, Date
  112.  
  113. username = 'asmo'
  114. word = 'allahakbar'
  115. db_name = 'hh'
  116.  
  117.  
  118. def connect(user=username, password=word,
  119.             db=db_name, host='localhost', port=5432):
  120.     url = 'postgresql://{}:{}@{}:{}/{}'
  121.     url = url.format(user, password, host, port, db)
  122.     con = sqlalchemy.create_engine(url, client_encoding='utf8')
  123.     meta = sqlalchemy.MetaData(bind=con, reflect=True)
  124.     return meta, con
  125.  
  126.  
  127. def add_column(con, table_name, col):
  128.     column_name = col.compile(dialect=con.dialect)
  129.     column_type = col.type.compile(con.dialect)
  130.     con.execute('ALTER TABLE {} ADD COLUMN {} {}'.
  131.                 format(table_name, column_name, column_type))
# Runs at import time: opens the engine with the default credentials and
# reflects the existing schema. The Table definition further down attaches
# to this MetaData.
metadata, connection = connect()

# Field names of a scraped vacancy, in the same layout as the scraper's list
# of the same name. Flattened, they match the column names of the
# 'vacancies' table declared in this module.
vacancy_fields = ['job_type', 'company_name',
                  'pub_date', 'vacancy_description',
                  'work_hours', 'address', 'key_skills',
                  ['salary', 'town', 'experience'],
                  ]
  139.  
  140.  
def create_table(meta, con):
    """Call ``meta.create_all(con)`` for the tables registered on *meta*."""
    meta.create_all(con)
  143.  
  144.  
# Declares the 'vacancies' table on the module-level metadata, one column
# per scraped field.
# NOTE(review): pub_date is a Date column, but the scraper stores the raw
# element text for it; confirm the database accepts that text as a date.
# extend_existing=True is presumably needed because connect() reflects the
# database, so 'vacancies' may already be registered on this metadata.
hh_vacancies = Table('vacancies', metadata,
                     Column('job_type', String(64)),
                     Column('company_name', String(64)),
                     Column('pub_date', Date),
                     Column('vacancy_description', String),
                     Column('work_hours', String),
                     Column('address', String(100)),
                     Column('key_skills', String),
                     Column('salary', String(32)),
                     Column('town', String(64)),
                     Column('experience', String(64)),
                     extend_existing=True)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement