Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import scrapy
class SecSpider(scrapy.Spider):
    """Scrape the SEC litigation-releases index table.

    Yields one dict per table row with keys ``Link``, ``Number``,
    ``Date`` and ``Title`` (all scalar strings or None).
    """
    name = 'Sec'
    allowed_domains = ['www.sec.gov']
    start_urls = ['https://www.sec.gov/litigation/litreleases.shtml']

    def parse(self, response):
        # The first two <tr> of the #mainlist table are header rows — skip them.
        rows = response.xpath('//*[@id="mainlist"]//tr')[2:]
        for row in rows:
            link = row.xpath('.//@href').extract_first()
            number = row.xpath('.//a/text()').extract_first()
            date = row.xpath('.//td[2]/text()').extract_first()
            # BUGFIX: original used extract() here, which returns a *list* —
            # inconsistent with the other scalar fields and unusable as a
            # SQL parameter in the pipeline. extract_first() yields a scalar.
            title = row.xpath('.//td[3]/text()').extract_first()
            yield {
                # Hrefs in the table are relative; resolve against the page URL.
                "Link": response.urljoin(link) if link else None,
                "Number": number,
                "Date": date,
                "Title": title,
            }
- import pymssql
class ScrapingPipeline(object):
    """Item pipeline that inserts each scraped release into MS SQL Server.

    Writes one row per item into the ``updates`` table of the ``Sec``
    database and commits after every insert.
    """

    def __init__(self):
        # NOTE(review): credentials are hard-coded; move them to Scrapy
        # settings or environment variables before deploying.
        self.conn = pymssql.connect(host='localhost', user='sa', password='data1234', database='Sec')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Parameterized query — scraped text is untrusted, never string-build SQL.
        self.cursor.execute(
            "INSERT INTO updates(link, number, date, title) VALUES (%s, %s, %s, %s)",
            (item['Link'], item['Number'], item['Date'], item['Title']))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        # BUGFIX: the original never released the connection. Scrapy calls
        # this hook automatically when the spider finishes.
        self.conn.close()
- ITEM_PIPELINES = {'Scraping.pipelines.ScrapingPipeline': 300}
# BUGFIX: original imported the lowercase `item` *module* instead of the
# `Item` class, then never used either imported name (the class referenced
# scrapy.Item / scrapy.Field instead). Import and use the names consistently.
from scrapy import Item, Field


class ScrapingItem(Item):
    """Item schema for one SEC litigation release.

    NOTE(review): field names here are lowercase while the spider yields
    plain dicts with capitalized keys ("Link", "Number", ...) — align the
    two if this item class is intended to be used by the spider.
    """
    link = Field()
    number = Field()
    date = Field()
    title = Field()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement