Untitled

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import urllib2
import re
import csv

# A complete employer link as it appears in the search results page,
# e.g. <a href="/employer/123">Some Bank</a>.
match_page = re.compile(r'<a href="/employer/\d+">.+?</a>', re.UNICODE)
# The visible text of such a link. The capture group makes findall() return
# only the text between the tags, with surrounding whitespace dropped; names
# of any length and with any first/last character are kept intact.
match_text = re.compile(r'<a href="/employer/\d+">\s*(.+?)\s*</a>', re.UNICODE)
# The relative employer URL inside the link, e.g. /employer/123.
match_hh_company_url = re.compile(r'/employer/\d+', re.UNICODE)

def listing_employers():
    """Collect the distinct employer links from the hh.ru vacancy search.

    Requests result pages 0 through 19 (100 items per page) and gathers
    every fragment matched by ``match_page`` across all of them.

    Returns:
        set: the unique ``<a href="/employer/N">...</a>`` fragments found.

    Raises:
        urllib2.URLError: if a page cannot be fetched.
    """
    base_url = ("http://hh.ru/applicant/searchvacancyresult.xml?orderBy=0"
                "&itemsOnPage=100&areaId=1&professionalAreaId=5"
                "&compensationCurrencyCode=RUR&searchPeriod=30&page=")
    headers = {'User-Agent' : 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)'}

    # Accumulate across pages; a set removes duplicates as we go.
    employers = set()
    for page_number in range(20):
        # The headers must be attached to a Request object to be sent at all.
        request = urllib2.Request(base_url + str(page_number), headers=headers)
        page = urllib2.urlopen(request)
        try:
            employers.update(match_page.findall(page.read()))
        finally:
            # urllib2 responses are not context managers; close explicitly.
            page.close()
    return employers


def w_to_file(f_name, source_list):
    """Write one CSV row per employer link.

    Args:
        f_name: path of the CSV file to create; an existing file is
            overwritten.
        source_list: iterable of employer link fragments (strings).

    Each row contains the ``match_text`` matches for the fragment followed
    by its ``match_hh_company_url`` matches.
    """
    # Binary mode is what the Python 2 csv module expects for its file object.
    with open(f_name, 'wb') as csvfile:
        bankwriter = csv.writer(csvfile, dialect='excel')
        for each in source_list:
            # A compiled pattern is not callable; findall() yields the list
            # that is concatenated onto the text matches.
            bankwriter.writerow(
                match_text.findall(each) + match_hh_company_url.findall(each))

def utf_8_encoder(unicode_csv_data):
    """Lazily yield each item of *unicode_csv_data* encoded as UTF-8."""
    encoding = 'utf-8'
    for row in unicode_csv_data:
        encoded_row = row.encode(encoding)
        yield encoded_row


# Script body: scrape the employer links and dump them into a CSV file.
output_name = 'bank_base'
employers = listing_employers()
w_to_file(output_name, employers)
# NOTE(review): utf_8_encoder is a generator function, so this call only
# creates a generator that is never iterated and has no effect on the file.
utf_8_encoder(output_name)