Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import argparse
import os
import sys
from datetime import datetime, timedelta

import psycopg2
import scrapy
from scrapy.crawler import CrawlerProcess
# PostgreSQL connection settings used by DBConnection.  Each value can be
# overridden through an environment variable so credentials do not have to be
# edited into the script; the fallbacks are the original local settings.
hostname = os.environ.get('XEPARSER_DB_HOST', 'localhost')
username = os.environ.get('XEPARSER_DB_USER', 'postgres')
password = os.environ.get('XEPARSER_DB_PASSWORD', 'postgres')
database = os.environ.get('XEPARSER_DB_NAME', 'xeparser')
def createParser():
    """Build the command-line parser for the optional crawl date range.

    The parser accepts up to two optional positional arguments, ``date_begin``
    and ``date_end``.  Both are kept as plain strings (``None`` when omitted);
    converting them to dates is left to the caller.

    Returns:
        argparse.ArgumentParser: the configured parser.
    """
    parser = argparse.ArgumentParser()
    # NOTE: argparse %-formats help text, so the date pattern is spelled out
    # as YYYY-MM-DD rather than with strptime directives.
    parser.add_argument(
        'date_begin', nargs='?',
        help='first day to fetch (YYYY-MM-DD); when it is the only date '
             'given, it is used as the last day of a 7-day look-back')
    parser.add_argument(
        'date_end', nargs='?',
        help='last day to fetch (YYYY-MM-DD)')
    return parser
def resolve_date_range(date_begin=None, date_end=None, now=None):
    """Turn the optional command-line date strings into a (begin, end) pair.

    Args:
        date_begin: ``'YYYY-MM-DD'`` string or ``None``.
        date_end: ``'YYYY-MM-DD'`` string or ``None``.
        now: datetime used as "today" when no dates are given; defaults to
            ``datetime.now()``.

    Returns:
        tuple[datetime, datetime]: ``(begin, end)`` with ``begin <= end``.
        * no dates      -> the 7 days ending at ``now``;
        * one date      -> the 7 days ending at that date;
        * two dates     -> exactly that range; if the end precedes the begin,
          the range collapses to the single end day.

    Raises:
        ValueError: if a supplied string does not match ``%Y-%m-%d``.
    """
    date_format = '%Y-%m-%d'
    if not date_begin:
        end = datetime.now() if now is None else now
        begin = end - timedelta(days=7)
    elif date_end:
        begin = datetime.strptime(date_begin, date_format)
        end = datetime.strptime(date_end, date_format)
    else:
        # A single date is interpreted as the END of the look-back window.
        end = datetime.strptime(date_begin, date_format)
        begin = end - timedelta(days=7)
    if end < begin:
        begin = end
    return begin, end


if __name__ == '__main__':
    parser = createParser()
    namespace = parser.parse_args()
    try:
        # dateBegin / dateEnd are module globals read later when the spider
        # builds its list of start URLs.
        dateBegin, dateEnd = resolve_date_range(namespace.date_begin,
                                                namespace.date_end)
    except ValueError as exc:
        # Report a malformed date as a usage error instead of a traceback.
        parser.error(str(exc))
class DBConnection:
    """Open a psycopg2 connection (autocommit on) together with one cursor.

    Attributes set on success:
        connection: the psycopg2 connection, with ``autocommit = True`` so
            every executed statement is committed immediately.
        cursor: a cursor created from that connection.
    """

    def __init__(self):
        """Connect using the module-level hostname/database/username/password.

        Raises:
            psycopg2.Error: if the connection cannot be established.  The
                error is reported and re-raised; swallowing it would leave an
                object without ``connection``/``cursor`` that only fails
                later with an unrelated AttributeError.
        """
        try:
            self.connection = psycopg2.connect(host=hostname, database=database,
                                               user=username, password=password)
        except psycopg2.Error:
            print("Cannot connect to database")
            raise
        self.connection.autocommit = True
        self.cursor = self.connection.cursor()

    def close(self):
        """Close the cursor and then the underlying connection."""
        self.cursor.close()
        self.connection.close()
class MySpider(scrapy.Spider):
    """Spider that fetches one xe.com currency-table page per day and stores
    every table row in the ``cur`` database table.

    The day range comes from the module-level ``dateBegin`` / ``dateEnd``
    datetimes, which must be defined before this class body is executed.
    """

    name = "xeparser"
    url0 = 'https://www.xe.com/currencytables/?from=USD&date='

    # One start URL per calendar day from dateBegin to dateEnd inclusive.
    # A plain loop is used on purpose: a comprehension in a class body cannot
    # see the class-level name url0.
    start_urls = []
    day = dateBegin
    while day <= dateEnd:
        start_urls.append(url0 + day.strftime('%Y-%m-%d'))
        day += timedelta(days=1)

    def parse(self, response):
        """Insert every row of the page's historical-rate table into ``cur``.

        For each ``<tr>`` in the body of the table with id
        ``historicalRateTbl``, the first text node under the 2nd cell is
        stored as ``currency`` and the first text node under the 3rd cell as
        ``units_per_usd``; the page's date paragraph text is stored as
        ``date``.  Missing nodes are passed through as ``None``.

        Args:
            response: the Scrapy response for one currency-table page.
        """
        # The date heading is the same for the whole page, so look it up once
        # instead of once per row.
        rate_date = response.xpath(
            '//p[@class="historicalRateTable-date"]/text()').extract_first()

        dbc = DBConnection()
        try:
            for row in response.xpath('//table[@id="historicalRateTbl"]//tbody/tr'):
                dbc.cursor.execute(
                    "insert into cur (date, currency, units_per_usd) values (%s, %s, %s)",
                    (rate_date,
                     row.xpath('td[2]//text()').extract_first(),
                     row.xpath('td[3]//text()').extract_first()))
        finally:
            # A connection is opened per response; release it even when an
            # insert fails.  getattr guards against a DBConnection whose
            # constructor did not manage to set these attributes.
            for resource in (getattr(dbc, 'cursor', None),
                             getattr(dbc, 'connection', None)):
                if resource is not None:
                    resource.close()
if __name__ == '__main__':
    # Run the spider in-process with a fixed browser-like User-Agent.
    # Guarded like the argument parsing above so that the crawl only starts
    # when the file is executed as a script.
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    process.crawl(MySpider)
    # start() runs the crawl and blocks until it has finished.
    process.start()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement