SHOW:
|
|
- or go back to the newest paste.
1 | from scrapy.spider import Spider | |
2 | from scrapy.shell import inspect_response | |
3 | from scrapy.http import Request,FormRequest | |
4 | from scrapy.exceptions import CloseSpider | |
5 | from boroughScrper.items import idoxpaSpiderItem | |
6 | from scrapy import log | |
7 | import urllib, time, MySQLdb, sys | |
8 | ||
9 | today = time.strftime("%x %X") | |
10 | ||
class idoxpaSpider(Spider):
    """Spider that submits the monthly-list search form once per parish/month pair.

    The start page is fetched, every ``parish`` option value is combined with
    every ``month`` option label, and one form submission is issued for each
    combination.  The result pages are handed to ``parse_results``.
    """

    # Set of pipeline classes associated with this spider.
    # NOTE(review): ``pipeline`` is not imported anywhere in the visible part
    # of the file -- confirm which module provides ``Insert``.  Presumably
    # this set is consulted by ``check_spider_pipeline``; verify.
    pipeline = set([pipeline.Insert,])

    name = 'idoxpaSpider'
    domain = 'https://www.example.com'

    start_urls = ["http://www.example.com/online-applications/search.do?action=monthlyList"]

    def parse(self, response):
        """Yield one ``FormRequest`` per (parish, month) combination.

        The original paste had this loop directly in the class body, where
        ``response`` is undefined and ``yield`` is a syntax error; ``parse``
        is the callback Scrapy invokes for ``start_urls`` responses.
        """
        parishes = response.xpath("//*[@id='parish']/option/@value").extract()
        # The month list does not depend on the parish, so extract it once.
        months = response.xpath("//*[@id='month']/option/text()").extract()

        for parish in parishes:
            for month in months:
                yield FormRequest.from_response(
                    response,
                    formname='searchCriteriaForm',
                    formdata={
                        'searchCriteria.parish': parish,
                        'month': month,
                        'dateType': 'DC_Validated',
                        'searchType': 'Application',
                    },
                    callback=self.parse_results,
                )

    def parse_results(self, response):
        """Open the interactive Scrapy shell on a search-result response."""
        # Pass the spider explicitly: newer Scrapy releases require it, and
        # older ones accept it as an optional argument.
        inspect_response(response, self)
29 | ||
30 | ||
class Insert(Pipeline):
    """Item pipeline that holds a MySQL connection for the lifetime of a crawl.

    The connection and a cursor are created when the pipeline is instantiated
    and released in ``close_spider``.  ``process_item`` currently passes items
    through unchanged.
    """

    def __init__(self):
        """Open the MySQL connection and create a cursor.

        Raises:
            KeyError: if ``MYSQL_USER``, ``MYSQL_PASSWD`` or ``MYSQL_DB`` is
                not set in the environment.
        """
        # Local import keeps this block self-contained.
        import os

        # NOTE(review): the paste had redacted placeholders (``<>``) for the
        # credentials, which is not valid Python.  They are read from the
        # environment here so no secret lives in the source; the variable
        # names are a proposal -- confirm against the deployment.
        env = os.environ
        self.conn = MySQLdb.connect(
            user=env["MYSQL_USER"],
            passwd=env["MYSQL_PASSWD"],
            db=env["MYSQL_DB"],
            host=env.get("MYSQL_HOST", "localhost"),
            charset="utf8",
            use_unicode=True,
        )
        self.cursor = self.conn.cursor()

    def close_spider(self, spider):
        """Release the cursor and connection when the spider finishes."""
        # Without this hook the connection opened in __init__ was never closed.
        self.cursor.close()
        self.conn.close()

    @check_spider_pipeline
    def process_item(self, item, spider):
        """Return ``item`` unchanged."""
        return item