Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- import urllib2
- import re
- import csv
# Whole anchor tag that links to an employer page, e.g.
# <a href="/employer/123">Company</a>.
match_page = re.compile(r'<a href="/employer/\d+">.+?</a>', re.UNICODE)
# Visible text of such an anchor.  A capture group is used so that findall()
# returns only the link text (surrounding whitespace trimmed).  The previous
# pattern used character classes as if they were groups, which cut off or
# rejected names beginning/ending with characters such as 'a', 'e' or digits.
match_text = re.compile(r'<a href="/employer/\d+">\s*(.+?)\s*</a>', re.UNICODE)
# Relative employer URL ("/employer/<digits>") contained in the anchor.
match_hh_company_url = re.compile(r'/employer/\d+', re.UNICODE)
def listing_employers(pages=20):
    """Collect employer anchor tags from the hh.ru vacancy search results.

    Requests result pages ``0 .. pages - 1`` and gathers every substring that
    matches the module-level ``match_page`` pattern.

    :param pages: number of result pages to request.  Defaults to 20, the
        number of pages the original loop requested (its ``page_id != 19``
        guard compared a string with an integer and therefore never fired).
    :return: set of unique anchor strings found across all requested pages.
    :raises urllib2.URLError: if a page cannot be fetched.
    """
    base_url = ("http://hh.ru/applicant/searchvacancyresult.xml?orderBy=0"
                "&itemsOnPage=100&areaId=1&professionalAreaId=5"
                "&compensationCurrencyCode=RUR&searchPeriod=30&page=")
    headers = {'User-Agent' : 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)'}

    # One accumulator for the whole run; re-creating it per page would throw
    # away everything collected from the previous pages.
    found = set()
    for page_number in range(pages):
        # Send the User-Agent header with the request instead of leaving the
        # dictionary unused.
        request = urllib2.Request(base_url + str(page_number), headers=headers)
        page = urllib2.urlopen(request)
        try:
            found.update(match_page.findall(page.read()))
        finally:
            # Release the connection even if reading or matching fails.
            page.close()
    return found
def w_to_file(f_name, source_list):
    """Write one CSV row per employer anchor: link text, then employer URL.

    :param f_name: path of the CSV file to create (an existing file is
        overwritten).
    :param source_list: iterable of anchor strings to process.
    """
    # 'wb' is the mode the Python 2 csv module expects for its output file.
    with open(f_name, 'wb') as csvfile:
        bankwriter = csv.writer(csvfile, dialect='excel')
        for each in source_list:
            # A compiled pattern is not callable; .findall() is required to
            # extract the URL (calling the pattern object raised TypeError).
            bankwriter.writerow(
                match_text.findall(each) + match_hh_company_url.findall(each))
def utf_8_encoder(unicode_csv_data):
    """Lazily yield each item of *unicode_csv_data* encoded as UTF-8.

    :param unicode_csv_data: iterable of text items supporting ``.encode``.
    :return: generator producing the UTF-8 encoded form of every item,
        in input order.
    """
    for text_line in unicode_csv_data:
        encoded_line = text_line.encode('utf-8')
        yield encoded_line
# Script body: scrape the employer links and write them to the 'bank_base'
# CSV file.
# The former trailing call utf_8_encoder('bank_base') has been removed: it
# only created a generator over the characters of the file name and discarded
# it without iterating, so it had no effect on the file.
w_to_file('bank_base', listing_employers())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement