Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from scrapy.spiders import CrawlSpider, Rule
- from scrapy.selector import Selector
- from scrapy.http import FormRequest
- from scrapy.http.request import Request
- from appext.items import AppextItem
- from scrapy.selector import HtmlXPathSelector
- from requests.utils import quote
- import itertools,urllib,re,urlparse,MySQLdb
class AppextSpider(CrawlSpider):
    """Spider for the NY Department of State corporation search
    (appext20.dos.ny.gov).

    run_mode "1": enumerate entities by brute-forcing 3-letter name
    prefixes through the public BEGINS search.
    run_mode "2": re-visit rows already stored in MySQL whose dos_id is
    still empty and scrape their detail pages.
    """
    name = "appext"

    def __init__(self, run_mode=None, *args, **kwargs):
        super(AppextSpider, self).__init__(*args, **kwargs)
        self.run_mode = run_mode
        # SECURITY NOTE(review): database credentials are hardcoded in
        # source; move them to scrapy settings or environment variables.
        self.conn = MySQLdb.connect(user='umair', passwd='gumtree2016', db='scraper', host='workyard-ops.cxk1mlakkfon.ap-southeast-2.rds.amazonaws.com', charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()
        self.conn.autocommit(True)
        global ALPHABET
        global URL
        # Search endpoint shared by every keyword request.
        URL = 'https://appext20.dos.ny.gov/corp_public/CORPSEARCH.SELECT_ENTITY?'
        # Fix: ALPHABET was previously assigned at class scope while
        # keyword_initial_set()/keyword_generator() read it as a module
        # global (NameError at runtime); bind the global here instead,
        # where `global ALPHABET` is in effect.
        ALPHABET = [chr(i) for i in range(65, 65 + 26)]  # 'A'..'Z'
        ALPHABET.extend([' '])  # trailing space enables multi-word prefixes

    # Detail-page endpoint plus captcha/session tokens captured from the
    # site; p_token is refreshed at runtime by get_p_token().
    detail_page_url = "https://appext20.dos.ny.gov/corp_public/CORPSEARCH.ENTITY_INFORMATION?"
    p_captcha = "11143"
    p_captcha_check = "8EFB73A4434235E8"
    p_token = "A5DD3488D2D9103F6448883FB53939B33AF5862E0A2304146509609FF60CF9146CFDB78783F5BC94F76D00EFED0B642E"
    payload = {'p_entity_name': '', 'p_name_type': 'A', 'p_search_type': 'BEGINS', 'p_captcha': p_captcha, 'p_captcha_check': p_captcha_check}
    allowed_domains = ["appext20.dos.ny.gov"]
    # Trade keywords (currently only consumed by the commented-out filter
    # in start_requests) marking construction-related company names.
    construction_keywords = ['contracting', 'contractors', 'carpenters', 'carpentry', 'plastering', 'roofers', 'roofing', 'plumbing', 'remodelling', 'remodeling', 'tiling', 'painting', 'rendering', 'electrical', 'plumber', 'contracting ', 'contractor', 'construction', 'waterproofing', 'landscaping', 'bricklaying', 'cabinet maker', 'flooring', 'carpenters', 'electricians', 'restoration', 'drywall', 'renovation', 'renovating ', 'remodels ', 'framing', 'masonry', 'builders', 'woodwork', 'cabinetry', 'millwork', 'electric', 'plastering', 'painters', 'painting', 'hvac', 'labouring', 'fencing', 'concreting', 'glass', ' ac ', 'heating', 'glazier ', 'air duct', 'tiles', 'deck', 'guttering', 'concrete', 'demolition', 'debris', 'dumpster', 'cabinet', 'junk', 'stucco', 'general contract', 'home improvement', 'home repair', 'home build', 'homes', 'building maintenance', 'masons', 'siding', 'kitchens', 'paving', 'landscapers', 'landscapes', 'design & build', 'design build', 'design and build']
    def start_requests(self):
        """Entry point: dispatch initial requests according to run_mode.

        Mode "1" brute-forces all 3-character name prefixes against the
        search form; mode "2" re-fetches detail pages for DB rows whose
        dos_id is still (effectively) empty.
        """
        if self.run_mode == "1":
            kwords = self.keyword_initial_set(3)
            for kw in kwords:
                payload = {'p_entity_name': kw,'p_name_type': 'A','p_search_type': 'BEGINS', 'p_srch_results_page':1, 'p_captcha':self.p_captcha, 'p_captcha_check':self.p_captcha_check}
                print('Searching for "' + kw + '"')
                yield Request(url= URL + urllib.urlencode(payload), callback=self.keyword_parse, meta = {'keyword':kw})
        elif self.run_mode == "2":
            # LENGTH(dos_id) < 2 selects rows not yet scraped.
            self.cursor.execute("""SELECT id, link ,name ,searched_keyword FROM appext_ads WHERE LENGTH(dos_id) < 2 LIMIT 10000""")
            all_records_in_db = self.cursor.fetchall()
            for ad in all_records_in_db:
                # Keyword filtering is disabled below; 'test' is a placeholder.
                a = 'test'
                # for a in self.construction_keywords:
                # if a in ad[2].lower():
                temp_dict = {}
                temp_dict['keyword'] = a
                temp_dict['id'] = ad[0]
                temp_dict['link'] = ad[1]
                temp_dict['name'] = ad[2]
                temp_dict['searched_keyword'] = ad[3]
                # Percent-encode the searched keyword for reuse in the query string.
                temp_dict['p_entity_name'] = urllib.quote(ad[3], safe='')
                # Pull p_corpid / p_nameid out of the stored result link.
                parsed = urlparse.urlparse(ad[1])
                p_corpid = urlparse.parse_qs(parsed.query)['p_corpid'][0]
                p_nameid = urlparse.parse_qs(parsed.query)['p_nameid'][0]
                # now create URL.
                temp_dict['link'] = self.detail_page_url + "p_token="+self.p_token+"&p_nameid="+p_nameid+"&p_corpid="+p_corpid+"&p_captcha="+self.p_captcha+"&p_captcha_check="+self.p_captcha_check+"&p_entity_name="+ temp_dict['p_entity_name'] +"&p_name_type=A&p_search_type=BEGINS&p_srch_results_page=1"
                # NOTE(review): if the `id` column is an INT, the string
                # concatenation below raises TypeError - confirm schema.
                print("\""+a + "\" was found so scraping ID: "+temp_dict['id'] + " Name: " + temp_dict['name'])
                yield Request(url=temp_dict['link'], callback=self.get_ad, method = "GET", meta = temp_dict)
                # else:
                # pass
- def keyword_initial_set(self,n=3):
- global ALPHABET
- cartesian = list(itertools.product(*[ALPHABET for i in range(n)]))
- return map((lambda x: ''.join(x)), cartesian)
- def keyword_generator(self,base):
- for c in ALPHABET:
- yield re.sub( '\s+', ' ', base + c ).lstrip()
    def keyword_parse(self,response):
        """Parse a search-results page.

        If the site reports more than 500 hits, recurse with one more
        character appended to the keyword; otherwise yield one item per
        result row and follow the "Next Page" link if present.
        """
        try:
            # Result-count banner (e.g. "More than 500 entities ...").
            n_res = Selector(response).xpath('//center/p/text()').extract()[0]
        except Exception:
            n_res = ''
        try:
            next_page = Selector(response).xpath("//a[text()='Next Page']/@href").extract()[0]
        except Exception:
            next_page = ''
        if "More than 500" in n_res and "p_srch_results_page=2" in next_page:
            # Too many hits: refine the prefix by one more character.
            print('More than 500 results found by keyword ' + response.meta['keyword'])
            for kw in self.keyword_generator(response.meta['keyword']):
                payload = {'p_entity_name': kw,'p_name_type': 'A','p_search_type': 'BEGINS', 'p_srch_results_page':1, 'p_captcha':self.p_captcha, 'p_captcha_check':self.p_captcha_check}
                print('In recursive search for "'+kw+'"')
                yield Request(url = URL+urllib.urlencode(payload), callback=self.keyword_parse, method = "GET", meta = {'keyword':kw})
        else:
            # One <td headers="c1"> cell per matching entity row.
            questions = Selector(response).xpath("//td[@headers='c1']")
            # print questions
            all_links = AppextItem()  # NOTE(review): constructed but never used
            for tr in questions:
                temp_dict = {}
                temp_dict['link'] = 'https://appext20.dos.ny.gov/corp_public/' + tr.xpath('a/@href').extract()[0]
                temp_dict['name'] = tr.xpath('a/text()').extract()[0]
                temp_dict['searched_keyword'] = response.meta['keyword']
                parsed = urlparse.urlparse(temp_dict['link'])
                # NOTE(review): stores the whole parse_qs list here, not
                # its first element as done elsewhere - confirm intended.
                temp_dict['id'] = urlparse.parse_qs(parsed.query)['p_nameid']
                yield temp_dict
            next_page = Selector(response).xpath("//a[text()='Next Page']/@href").extract()
            try:
                next_page = 'https://appext20.dos.ny.gov/corp_public/' + next_page[0]
                yield Request(url=next_page, callback=self.keyword_parse, method = "GET", meta = {'keyword':response.meta['keyword']})
            except Exception,e:
                # IndexError when there is no "Next Page" link: last page.
                pass
    def get_p_token(self,response):
        """Harvest a fresh p_token from a search-results page, then retry
        the detail request that previously hit the captcha (link carried
        in response.meta)."""
        print("trying to read p_token")
        # The first result link on the page carries a valid p_token.
        self.p_token = 'https://appext20.dos.ny.gov/corp_public/' + Selector(response).xpath("/html/body/center/table/tr[2]/td/a[1]/@href").extract()[0]
        parsed = urlparse.urlparse( self.p_token )
        self.p_token = urlparse.parse_qs(parsed.query)['p_token'][0]
        print("New p_token is found "+self.p_token)
        # Rebuild the original detail-page URL with the refreshed token.
        parsed = urlparse.urlparse( response.meta['link'] )
        p_corpid = urlparse.parse_qs(parsed.query)['p_corpid'][0]
        p_nameid = urlparse.parse_qs(parsed.query)['p_nameid'][0]
        refreshed_link = self.detail_page_url + "p_token="+self.p_token+"&p_nameid="+p_nameid+"&p_corpid="+p_corpid+"&p_captcha="+self.p_captcha+"&p_captcha_check="+self.p_captcha_check+"&p_entity_name="+ response.meta['p_entity_name'] +"&p_name_type=A&p_search_type=BEGINS&p_srch_results_page=1"
        print('refreshed_link = '+refreshed_link)
        yield Request(url=refreshed_link, callback=self.get_ad, method = "GET", meta = response.meta)
    def get_ad(self, response):
        """Scrape one entity detail page into an AppextItem.

        If the captcha interstitial is detected, clear p_token and queue
        a throwaway search whose callback (get_p_token) will refresh the
        token and re-request this page.
        """
        temp_dict = AppextItem()
        try:
            # div.messagebox only appears on the captcha page; an
            # IndexError here therefore means "no captcha" (EAFP).
            Selector(response).xpath('//div[@class="messagebox"]').extract()[0]
            print("Captcha found when scraping ID "+ response.meta['id'] + " LINK: "+response.meta['link'])
            self.p_token = ''
            # Throwaway "AAA" search issued purely to harvest a new token.
            payload = {'p_entity_name': "AAA",'p_name_type': 'A','p_search_type': 'BEGINS', 'p_srch_results_page':1, 'p_captcha':self.p_captcha, 'p_captcha_check':self.p_captcha_check}
            url_ = URL+urllib.urlencode(payload)
            return Request(url = url_, callback=self.get_p_token, method = "GET",priority=1, meta = response.meta)
        except Exception:
            print("Captcha was not found")
            temp_dict['run_mode'] = self.run_mode
            temp_dict['name'] = response.meta['name']
            temp_dict['link'] = response.meta['link']
            temp_dict['searched_keyword'] = response.meta['searched_keyword']
            temp_dict['keyword'] = response.meta['keyword']
            temp_dict['id'] = response.meta['id']
            # Fixed table rows: DOS id, initial filing date, county.
            temp_dict['dos_id'] = Selector(response).xpath("/html/body/center/table[1]/tr[2]/td/text()").extract()[0]
            temp_dict['date'] = Selector(response).xpath("/html/body/center/table[1]/tr[3]/td/text()").extract()[0]
            temp_dict['county'] = Selector(response).xpath("/html/body/center/table[1]/tr[4]/td/text()").extract()[0]
            rx = re.compile('\W+')
            # predfine address keys
            title_addresses = {'addresses':[]}
            prev_title = ""
            # In tblAddr, a <th> row names an address section and the
            # following <td>-only rows carry that section's lines.
            for add in Selector(response).xpath('//table[@id="tblAddr"]/tr'):
                try:
                    prev_title = add.xpath('./th/text()').extract()[0]
                    prev_title = rx.sub(' ', prev_title).strip()
                except Exception,e:
                    # Data row (no <th>): attach it to the current section.
                    if prev_title != "":
                        # Layout: [dos_id, section, line1, line2, suburb, state, zip]
                        address = [temp_dict['dos_id'] , prev_title, "", "", "", "", ""]
                        _value = add.xpath('./td/text()').extract()[0]
                        address[2] = _value
                        try:
                            _value = add.xpath('./td/text()').extract()[1]
                            address[3] = _value
                            _value = add.xpath('./td/text()').extract()[2]
                            # Last line looks like "SUBURB, STATE, ZIP".
                            _value = _value.split(",")
                            suburb = _value[0]
                            address[4] = suburb
                            state = _value[1]
                            address[5] = state
                            _zip = _value[2]
                            address[6] = _zip
                            title_addresses["addresses"].extend([address])
                        except Exception,e:
                            # NOTE(review): rows with fewer than 3 <td>
                            # lines are silently dropped - confirm intended.
                            pass
            temp_dict['addresses'] = title_addresses["addresses"]
            return temp_dict
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement