Advertisement
Guest User

Untitled

a guest
Sep 27th, 2013
130
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.25 KB | None | 0 0
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3.  
  4. import urllib2
  5. import re
  6. import csv
  7.  
  8. match_page = re.compile(r'<a href=\"\/employer\/\d+\">.+?</a>', re.UNICODE)
  9. match_text = re.compile(r'[^(<a href="/employer/\d+?\>)].+[^(\s</a>)]', re.UNICODE)
  10. match_hh_company_url = re.compile(r'/employer/\d+', re.UNICODE)
  11.  
  12. def listing_employers():
  13.  
  14.     for x in range(0, 20):
  15.         source_list = []
  16.         page_id = str(x)
  17.         headers = {'User-Agent' : 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)'}
  18.         if page_id != 19:
  19.             page = urllib2.urlopen("http://hh.ru/applicant/searchvacancyresult.xml?orderBy=0&itemsOnPage=100&areaId=1&professionalAreaId=5&compensationCurrencyCode=RUR&searchPeriod=30&page="+page_id)
  20.             htmled = page.read()
  21.             source_list += match_page.findall(htmled)
  22.         else:
  23.             break
  24.     source_list = set(source_list)
  25.     return source_list
  26.  
  27.  
  28.  
  29. def w_to_file(f_name, source_list):
  30.     with open(f_name, 'wb') as csvfile:
  31.         bankwriter = csv.writer(csvfile, dialect='excel')
  32.         for each in source_list:
  33.             bankwriter.writerow(match_text.findall(each) + match_hh_company_url(each))
  34.  
  35. def utf_8_encoder(unicode_csv_data):
  36.     for line in unicode_csv_data:
  37.         yield line.encode('utf-8')
  38.  
  39.  
  40.  
  41. w_to_file('bank_base', listing_employers())
  42. utf_8_encoder('bank_base')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement