Advertisement
Guest User

Untitled

a guest
Jul 5th, 2017
119
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.40 KB | None | 0 0
  1. import scrapy
  2.  
  3.  
  4. class SecSpider(scrapy.Spider):
  5. name = 'Sec'
  6. allowed_domains = ['www.sec.gov']
  7. start_urls = ['https://www.sec.gov/litigation/litreleases.shtml']
  8.  
  9. def parse(self, response):
  10. rows = response.xpath('//*[@id="mainlist"]//tr')[2:]
  11. for row in rows:
  12. link = row.xpath('.//@href').extract_first()
  13. number = row.xpath('.//a/text()').extract_first()
  14. date = row.xpath('.//td[2]/text()').extract_first()
  15. title = row.xpath('.//td[3]/text()').extract()
  16. yield {
  17. "Link": link,
  18. "Number": number,
  19. "Date": date,
  20. "Title": title
  21. }
  22.  
  23. import pymssql
  24.  
  25.  
  26. class ScrapingPipeline(object):
  27. def __init__(self):
  28. self.conn = pymssql.connect(host='localhost', user='sa', password='data1234', database='Sec')
  29. self.cursor = self.conn.cursor()
  30.  
  31. def process_item(self, item, spider):
  32. self.cursor.execute("INSERT INTO updates(link, number, date, title) VALUES (%s, %s, %s, %s)",
  33. (item['Link'], item['Number'], item['Date'], item['Title']))
  34. self.conn.commit()
  35.  
  36.  
  37. return item
  38.  
# Scrapy settings fragment: enables the MSSQL pipeline above.
# 300 is its ordering priority (0-1000; lower runs earlier).
ITEM_PIPELINES = {'Scraping.pipelines.ScrapingPipeline': 300}
  40.  
  41. from scrapy import item, Field
  42.  
  43. class ScrapingItem(scrapy.Item):
  44. link = scrapy.Field()
  45. number = scrapy.Field()
  46. date = scrapy.Field()
  47. title = scrapy.Field()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement