parser
Python, posted by a guest on Aug 28th, 2018

import argparse
from datetime import datetime, timedelta

import psycopg2
import scrapy
from scrapy.crawler import CrawlerProcess

# PostgreSQL connection settings for the xeparser database.
hostname = 'localhost'
username = 'postgres'
password = 'postgres'
database = 'xeparser'


def createParser():
    # Both positional dates are optional and expected in YYYY-MM-DD format.
    parser = argparse.ArgumentParser()
    parser.add_argument('date_begin', nargs='?')
    parser.add_argument('date_end', nargs='?')
    return parser


def resolveDates(namespace):
    # No arguments: the last seven days up to now.
    # One argument: the seven days ending on that date.
    # Two arguments: the explicit begin/end range.
    if not namespace.date_begin:
        dateEnd = datetime.now()
        dateBegin = dateEnd - timedelta(7)
    elif namespace.date_end:
        dateBegin = datetime.strptime(namespace.date_begin, '%Y-%m-%d')
        dateEnd = datetime.strptime(namespace.date_end, '%Y-%m-%d')
    else:
        dateEnd = datetime.strptime(namespace.date_begin, '%Y-%m-%d')
        dateBegin = dateEnd - timedelta(7)
    if dateEnd < dateBegin:
        dateBegin = dateEnd
    return dateBegin, dateEnd


class DBConnection:
    def __init__(self):
        try:
            self.connection = psycopg2.connect(
                host=hostname, database=database, user=username, password=password)
            self.connection.autocommit = True
            self.cursor = self.connection.cursor()
        except psycopg2.OperationalError as error:
            print("Cannot connect to database:", error)
            raise


class MySpider(scrapy.Spider):
    name = "xeparser"
    url0 = 'https://www.xe.com/currencytables/?from=USD&date='

    def __init__(self, dateBegin, dateEnd, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # One currency-table URL per day in the requested range.
        self.start_urls = []
        i = 0
        while dateBegin + timedelta(i) <= dateEnd:
            self.start_urls.append(self.url0 + (dateBegin + timedelta(i)).strftime('%Y-%m-%d'))
            i += 1

    def parse(self, response):
        dbc = DBConnection()
        # The date printed on the page applies to every row of the rate table.
        page_date = response.xpath('//p[@class="historicalRateTable-date"]/text()').extract_first()
        for row in response.xpath('//table[@id="historicalRateTbl"]//tbody/tr'):
            dbc.cursor.execute(
                "insert into cur (date, currency, units_per_usd) values (%s, %s, %s)",
                (page_date,
                 row.xpath('td[2]//text()').extract_first(),
                 row.xpath('td[3]//text()').extract_first()))


if __name__ == '__main__':
    namespace = createParser().parse_args()
    dateBegin, dateEnd = resolveDates(namespace)

    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    process.crawl(MySpider, dateBegin=dateBegin, dateEnd=dateEnd)
    process.start()
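
Note: the script inserts into a table named cur in an existing xeparser database, but the paste does not include the table definition. Below is a minimal one-off setup sketch under assumed column types (date and currency stored as text exactly as scraped, units_per_usd as numeric), plus example invocations assuming the script is saved as xeparser.py; none of these names or types are confirmed by the original paste.

# setup_sketch.py (assumed schema, not part of the original paste)
import psycopg2

conn = psycopg2.connect(host='localhost', database='xeparser',
                        user='postgres', password='postgres')
conn.autocommit = True
conn.cursor().execute("""
    create table if not exists cur (
        date          text,     -- page date string as scraped from xe.com
        currency      text,     -- ISO currency code from the rate table
        units_per_usd numeric   -- units of the currency per 1 USD
    )
""")
conn.close()

Example invocations (dates in YYYY-MM-DD):

python xeparser.py                          # last 7 days up to now
python xeparser.py 2018-08-28               # the 7 days ending on that date
python xeparser.py 2018-08-01 2018-08-28    # explicit begin/end range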