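"""Scrapy spider for the New York DOS Corporation & Business Entity Database
(appext20.dos.ny.gov).

run_mode 1 brute-forces alphabetic search prefixes to enumerate entity names;
run_mode 2 re-visits result links stored in a MySQL table and scrapes each
entity's detail page (DOS ID, date, county, addresses).
"""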
import itertools
import re
import urllib
import urlparse

import MySQLdb

from scrapy.http.request import Request
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider

from appext.items import AppextItem
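
# AppextItem itself is not shown in this paste. A minimal sketch of what
# appext/items.py presumably declares, based on the fields populated below
# (field names are taken from usage; the real definition may differ):
#
#   import scrapy
#
#   class AppextItem(scrapy.Item):
#       run_mode = scrapy.Field()
#       keyword = scrapy.Field()
#       searched_keyword = scrapy.Field()
#       id = scrapy.Field()
#       name = scrapy.Field()
#       link = scrapy.Field()
#       dos_id = scrapy.Field()
#       date = scrapy.Field()
#       county = scrapy.Field()
#       addresses = scrapy.Field()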


class AppextSpider(CrawlSpider):
    name = "appext"
    allowed_domains = ["appext20.dos.ny.gov"]

    # Search endpoint and entity detail endpoint of the NY DOS corporation
    # database.
    URL = 'https://appext20.dos.ny.gov/corp_public/CORPSEARCH.SELECT_ENTITY?'
    detail_page_url = "https://appext20.dos.ny.gov/corp_public/CORPSEARCH.ENTITY_INFORMATION?"

    # A previously solved captcha answer, its server-side check value, and a
    # session token; p_token is refreshed by get_p_token() whenever the site
    # answers with a captcha page.
    p_captcha = "11143"
    p_captcha_check = "8EFB73A4434235E8"
    p_token = "A5DD3488D2D9103F6448883FB53939B33AF5862E0A2304146509609FF60CF9146CFDB78783F5BC94F76D00EFED0B642E"

    # Search alphabet: 'A'-'Z' plus the space character.
    ALPHABET = [chr(i) for i in range(65, 65 + 26)]
    ALPHABET.append(' ')

    construction_keywords = [
        'contracting', 'contractors', 'carpenters', 'carpentry', 'plastering',
        'roofers', 'roofing', 'plumbing', 'remodelling', 'remodeling', 'tiling',
        'painting', 'rendering', 'electrical', 'plumber', 'contracting ',
        'contractor', 'construction', 'waterproofing', 'landscaping',
        'bricklaying', 'cabinet maker', 'flooring', 'carpenters', 'electricians',
        'restoration', 'drywall', 'renovation', 'renovating ', 'remodels ',
        'framing', 'masonry', 'builders', 'woodwork', 'cabinetry', 'millwork',
        'electric', 'plastering', 'painters', 'painting', 'hvac', 'labouring',
        'fencing', 'concreting', 'glass', ' ac ', 'heating', 'glazier ',
        'air duct', 'tiles', 'deck', 'guttering', 'concrete', 'demolition',
        'debris', 'dumpster', 'cabinet', 'junk', 'stucco', 'general contract',
        'home improvement', 'home repair', 'home build', 'homes',
        'building maintenance', 'masons', 'siding', 'kitchens', 'paving',
        'landscapers', 'landscapes', 'design & build', 'design build',
        'design and build']

    def __init__(self, run_mode=None, *args, **kwargs):
        super(AppextSpider, self).__init__(*args, **kwargs)
        self.run_mode = run_mode
        # Connection to the MySQL database that stores scraped records.
        self.conn = MySQLdb.connect(user='umair', passwd='gumtree2016', db='scraper',
                                    host='workyard-ops.cxk1mlakkfon.ap-southeast-2.rds.amazonaws.com',
                                    charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()
        self.conn.autocommit(True)

    def start_requests(self):
        if self.run_mode == "1":
            # Mode 1: query the search form with every 3-character prefix over
            # the alphabet.
            kwords = self.keyword_initial_set(3)
            for kw in kwords:
                payload = {'p_entity_name': kw, 'p_name_type': 'A',
                           'p_search_type': 'BEGINS', 'p_srch_results_page': 1,
                           'p_captcha': self.p_captcha,
                           'p_captcha_check': self.p_captcha_check}
                print('Searching for "' + kw + '"')
                yield Request(url=self.URL + urllib.urlencode(payload),
                              callback=self.keyword_parse, meta={'keyword': kw})

        elif self.run_mode == "2":
            # Mode 2: re-visit stored result links that have no DOS ID yet and
            # scrape their detail pages.
            self.cursor.execute("""SELECT id, link, name, searched_keyword
                                   FROM appext_ads WHERE LENGTH(dos_id) < 2 LIMIT 10000""")
            all_records_in_db = self.cursor.fetchall()
            for ad in all_records_in_db:
                a = 'test'  # placeholder keyword; the filter below is disabled
                # for a in self.construction_keywords:
                #     if a in ad[2].lower():
                temp_dict = {}
                temp_dict['keyword'] = a
                temp_dict['id'] = ad[0]
                temp_dict['link'] = ad[1]
                temp_dict['name'] = ad[2]
                temp_dict['searched_keyword'] = ad[3]
                temp_dict['p_entity_name'] = urllib.quote(ad[3], safe='')

                parsed = urlparse.urlparse(ad[1])
                p_corpid = urlparse.parse_qs(parsed.query)['p_corpid'][0]
                p_nameid = urlparse.parse_qs(parsed.query)['p_nameid'][0]

                # Rebuild the detail-page URL with the current token and
                # captcha parameters.
                temp_dict['link'] = (self.detail_page_url
                                     + "p_token=" + self.p_token
                                     + "&p_nameid=" + p_nameid
                                     + "&p_corpid=" + p_corpid
                                     + "&p_captcha=" + self.p_captcha
                                     + "&p_captcha_check=" + self.p_captcha_check
                                     + "&p_entity_name=" + temp_dict['p_entity_name']
                                     + "&p_name_type=A&p_search_type=BEGINS&p_srch_results_page=1")
                print("\"" + a + "\" was found so scraping ID: " + str(temp_dict['id'])
                      + " Name: " + temp_dict['name'])
                yield Request(url=temp_dict['link'], callback=self.get_ad,
                              method="GET", meta=temp_dict)

    def keyword_initial_set(self, n=3):
        # All n-character strings over the search alphabet.
        return [''.join(c) for c in itertools.product(self.ALPHABET, repeat=n)]
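
    # For the default n=3 this yields 27**3 = 19683 seed prefixes:
    # 'AAA', 'AAB', ..., 'ZZZ', plus combinations containing spaces.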

    def keyword_generator(self, base):
        # Extend a saturated prefix by one more character to split its
        # result set below the site's 500-result cap.
        for c in self.ALPHABET:
            yield re.sub(r'\s+', ' ', base + c).lstrip()

    def keyword_parse(self, response):
        try:
            n_res = Selector(response).xpath('//center/p/text()').extract()[0]
        except IndexError:
            n_res = ''

        try:
            next_page = Selector(response).xpath("//a[text()='Next Page']/@href").extract()[0]
        except IndexError:
            next_page = ''

        if "More than 500" in n_res and "p_srch_results_page=2" in next_page:
            # The site caps results at 500, so recurse with one more
            # character appended to the keyword.
            print('More than 500 results found by keyword ' + response.meta['keyword'])
            for kw in self.keyword_generator(response.meta['keyword']):
                payload = {'p_entity_name': kw, 'p_name_type': 'A',
                           'p_search_type': 'BEGINS', 'p_srch_results_page': 1,
                           'p_captcha': self.p_captcha,
                           'p_captcha_check': self.p_captcha_check}
                print('In recursive search for "' + kw + '"')
                yield Request(url=self.URL + urllib.urlencode(payload),
                              callback=self.keyword_parse, method="GET",
                              meta={'keyword': kw})

        else:
            # Yield one record per result row, then follow pagination.
            questions = Selector(response).xpath("//td[@headers='c1']")
            for tr in questions:
                temp_dict = {}
                temp_dict['link'] = 'https://appext20.dos.ny.gov/corp_public/' + tr.xpath('a/@href').extract()[0]
                temp_dict['name'] = tr.xpath('a/text()').extract()[0]
                temp_dict['searched_keyword'] = response.meta['keyword']
                parsed = urlparse.urlparse(temp_dict['link'])
                temp_dict['id'] = urlparse.parse_qs(parsed.query)['p_nameid'][0]
                yield temp_dict

            next_page = Selector(response).xpath("//a[text()='Next Page']/@href").extract()
            if next_page:
                yield Request(url='https://appext20.dos.ny.gov/corp_public/' + next_page[0],
                              callback=self.keyword_parse, method="GET",
                              meta={'keyword': response.meta['keyword']})

    def get_p_token(self, response):
        # A fresh search-results page embeds a valid p_token in its result
        # links; pull it out and retry the detail page that hit the captcha.
        print("trying to read p_token")
        self.p_token = 'https://appext20.dos.ny.gov/corp_public/' + Selector(response).xpath("/html/body/center/table/tr[2]/td/a[1]/@href").extract()[0]
        parsed = urlparse.urlparse(self.p_token)
        self.p_token = urlparse.parse_qs(parsed.query)['p_token'][0]
        print("New p_token is found " + self.p_token)

        parsed = urlparse.urlparse(response.meta['link'])
        p_corpid = urlparse.parse_qs(parsed.query)['p_corpid'][0]
        p_nameid = urlparse.parse_qs(parsed.query)['p_nameid'][0]

        refreshed_link = (self.detail_page_url
                          + "p_token=" + self.p_token
                          + "&p_nameid=" + p_nameid
                          + "&p_corpid=" + p_corpid
                          + "&p_captcha=" + self.p_captcha
                          + "&p_captcha_check=" + self.p_captcha_check
                          + "&p_entity_name=" + response.meta['p_entity_name']
                          + "&p_name_type=A&p_search_type=BEGINS&p_srch_results_page=1")
        print('refreshed_link = ' + refreshed_link)
        yield Request(url=refreshed_link, callback=self.get_ad, method="GET",
                      meta=response.meta)

    def get_ad(self, response):
        temp_dict = AppextItem()
        # A 'messagebox' div means the site served a captcha page instead of
        # the entity details: discard the token, run a throwaway search to
        # obtain a fresh one, then retry this link.
        if Selector(response).xpath('//div[@class="messagebox"]').extract():
            print("Captcha found when scraping ID " + str(response.meta['id'])
                  + " LINK: " + response.meta['link'])
            self.p_token = ''

            payload = {'p_entity_name': "AAA", 'p_name_type': 'A',
                       'p_search_type': 'BEGINS', 'p_srch_results_page': 1,
                       'p_captcha': self.p_captcha,
                       'p_captcha_check': self.p_captcha_check}
            url_ = self.URL + urllib.urlencode(payload)
            return Request(url=url_, callback=self.get_p_token, method="GET",
                           priority=1, meta=response.meta)

        print("Captcha was not found")
        temp_dict['run_mode'] = self.run_mode
        temp_dict['name'] = response.meta['name']
        temp_dict['link'] = response.meta['link']
        temp_dict['searched_keyword'] = response.meta['searched_keyword']
        temp_dict['keyword'] = response.meta['keyword']
        temp_dict['id'] = response.meta['id']
        temp_dict['dos_id'] = Selector(response).xpath("/html/body/center/table[1]/tr[2]/td/text()").extract()[0]
        temp_dict['date'] = Selector(response).xpath("/html/body/center/table[1]/tr[3]/td/text()").extract()[0]
        temp_dict['county'] = Selector(response).xpath("/html/body/center/table[1]/tr[4]/td/text()").extract()[0]

        rx = re.compile(r'\W+')
        title_addresses = {'addresses': []}
        prev_title = ""
        # The address table alternates <th> rows naming the address type with
        # <td> rows carrying the address lines.
        for add in Selector(response).xpath('//table[@id="tblAddr"]/tr'):
            header = add.xpath('./th/text()').extract()
            if header:
                prev_title = rx.sub(' ', header[0]).strip()
            elif prev_title != "":
                # address = [dos_id, type, line 1, line 2, suburb, state, zip]
                address = [temp_dict['dos_id'], prev_title, "", "", "", "", ""]
                cells = add.xpath('./td/text()').extract()
                address[2] = cells[0]

                try:
                    address[3] = cells[1]

                    # The final line reads "suburb,state,zip".
                    parts = cells[2].split(",")
                    address[4] = parts[0]
                    address[5] = parts[1]
                    address[6] = parts[2]
                    title_addresses["addresses"].append(address)
                except Exception:
                    # Incomplete address block; drop it.
                    pass

        temp_dict['addresses'] = title_addresses["addresses"]

        return temp_dict
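
# Usage note (assuming the usual Scrapy project layout around this spider;
# the project and settings files are not shown in this paste):
#
#   scrapy crawl appext -a run_mode=1    # brute-force the search index
#   scrapy crawl appext -a run_mode=2    # re-scrape detail pages from MySQL
#
# Scrapy's -a flag forwards run_mode into AppextSpider.__init__ as a keyword
# argument.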