Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from scrapy.contrib.spiders import CrawlSpider, Rule
- from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
- from scrapy import Selector
- from craigslist_sample.items import ContractorItem
class MySpider(CrawlSpider):
    """Crawl the CSLB contractor name-search pages and scrape contractor rows.

    Starting from the name search at ``NextName=a``, the spider follows every
    ``<form action=...>`` target that points back at ``NameSearch`` (presumably
    the "next page" form -- confirm against the live markup) and extracts one
    ``ContractorItem`` per result table on each page reached that way.
    """

    name = "craigs"

    # Scrapy compares these entries with request *host names*.  A full URL
    # never matches a host name, so every followed request would be dropped
    # as offsite; only the bare domain belongs here.
    allowed_domains = ["www2.cslb.ca.gov"]
    start_urls = [
        "https://www2.cslb.ca.gov/onlineservices/checklicenseII/NameSearch.aspx?NextName=a"
    ]

    def process_urls(value):
        """Turn a form ``action`` value into an absolute URL to follow.

        Used as the link extractor's ``process_value`` hook while the class
        body is being evaluated, so it is a plain function and deliberately
        takes no ``self``.

        Returns ``None`` (which makes the extractor skip the link) unless the
        value mentions ``NameSearch``; otherwise returns the value prefixed
        with the checklicenseII base URL.
        """
        if "NameSearch" not in value:
            return None
        # Debug trace of every followed form action.  The parenthesised form
        # prints identically on Python 2 and is valid syntax on Python 3.
        print(value)
        return 'https://www2.cslb.ca.gov/onlineservices/checklicenseII/' + value

    # ``tags`` / ``attrs`` must be real one-element tuples: ``("form")`` is
    # just the string "form", not a tuple containing it.
    rules = (
        Rule(
            SgmlLinkExtractor(
                process_value=process_urls,
                tags=("form",),
                attrs=("action",)
            ),
            callback="parse_items",
            follow=True
        ),
    )

    def parse_items(self, response):
        """Extract one ``ContractorItem`` per nested result table in *response*.

        Each item field holds the list returned by ``extract()`` for the
        second cell of the corresponding table row (row 1 = name, 2 = type,
        3 = license, 4 = city, 5 = status).

        Returns the list of populated items; it is empty when the page has
        no matching tables.
        """
        # (item field, 1-based row index inside a company table)
        field_rows = (
            ("name", 1),
            ("type", 2),
            ("license", 3),
            ("city", 4),
            ("status", 5),
        )

        sel = Selector(response)
        companies = sel.xpath('//table[@id="ctl00_LeftColumnMiddle_Table1"]//table')

        items = []
        for company in companies:
            item = ContractorItem()
            for field, row in field_rows:
                item[field] = company.xpath("./tr[%d]/td[2]/text()" % row).extract()
            items.append(item)
        return items
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement