Advertisement
Guest User

Untitled

a guest
Jan 30th, 2015
193
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.46 KB | None | 0 0
  1. from scrapy.contrib.spiders import CrawlSpider, Rule
  2. from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
  3. from scrapy import Selector
  4. from craigslist_sample.items import ContractorItem
  5.  
  class MySpider(CrawlSpider):
      """Crawl the CSLB contractor name-search pages and scrape each listing.

      The crawl starts at the ``NextName=a`` search page.  The single rule
      follows the ``action`` attribute of ``<form>`` tags whose value mentions
      ``NameSearch`` and hands every fetched page to :meth:`parse_items`.
      """

      name = "craigs"
      # allowed_domains expects bare host names.  A full URL (scheme + path)
      # never matches a request's host, so followed links would be treated as
      # offsite; list only the host here.
      allowed_domains = ["www2.cslb.ca.gov"]
      start_urls = ["https://www2.cslb.ca.gov/onlineservices/checklicenseII/NameSearch.aspx?NextName=a"]

      def process_urls(value):
          """Turn an extracted ``action`` value into an absolute URL, or drop it.

          This is deliberately a plain function (no ``self``): it is handed to
          the link extractor below while the class body is still executing.

          :param value: raw attribute value extracted from the page.
          :returns: the absolute URL to follow, or ``None`` to discard the
              link when the value does not mention ``NameSearch``.
          """
          if "NameSearch" not in value:
              return None
          # Single-argument print() emits the same output on Python 2 and 3.
          print(value)
          # Do not prepend the base path to a value that is already absolute.
          if value.startswith(("http://", "https://")):
              return value
          return 'https://www2.cslb.ca.gov/onlineservices/checklicenseII/' + value

      rules = (
          Rule(
              SgmlLinkExtractor(
                  process_value=process_urls,
                  # ("form") is just the string "form"; the trailing comma
                  # makes these real one-element tuples so membership tests
                  # are exact matches rather than substring checks.
                  tags=("form",),
                  attrs=("action",),
              ),
              callback="parse_items",
              follow=True,
          ),
      )

      def parse_items(self, response):
          """Extract one ``ContractorItem`` per nested result table.

          :param response: the page response passed in by the crawl rule.
          :returns: a list of ``ContractorItem`` objects; each field holds the
              list of text nodes returned by ``extract()`` for its row.
          """
          sel = Selector(response)
          companies = sel.xpath('//table[@id="ctl00_LeftColumnMiddle_Table1"]//table')
          # (item field, 1-based row index inside each company table); the
          # value is always read from the second cell of that row.
          field_rows = (
              ("name", 1),
              ("type", 2),
              ("license", 3),
              ("city", 4),
              ("status", 5),
          )
          items = []
          for c in companies:
              item = ContractorItem()
              for field, row in field_rows:
                  item[field] = c.xpath("./tr[%d]/td[2]/text()" % row).extract()
              items.append(item)
          return items
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement