Advertisement
Guest User

Untitled

a guest
Dec 16th, 2017
107
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.46 KB | None | 0 0
  1. ## This program scrapes news articles from the website http://altaicholmon.ru/.
  2.  
  3. from lxml import html
  4. from datetime import date, timedelta
  5. from scrapers import ScraperAltay
  6. import http.client
  7. import string
  8. import requests
  9. import time
  10. import sys
  11.  
# Listing page that indexes all news articles on the site.
HOMEPAGE = "http://altaicholmon.ru/news/allnews/"

# Translation table that deletes newline and tab characters from scraped text.
translator = str.maketrans(dict.fromkeys("\n\t"))

# Accumulates (href, title, date) tuples filled in by populateArticleList().
articles = []
  18. def populateArticleList(pageNum): #adds article information to the article list
  19. page = requests.get(HOMEPAGE + "page/" + str(pageNum))
  20. tree = html.fromstring(page.content)
  21. hrefs = tree.xpath('//div[@class="col-md-8 border_right_main_news"]/a[@class="link link-news"]/@href')
  22. titles = tree.xpath('//h3[@class="h3-theme-cholmon"]/text()')
  23. dates = tree.xpath('//p[@class="news_avtor_date"]/text()')
  24.  
  25. for i in range(0, len(hrefs)):
  26. titles[i] = str(titles[i]).translate(translator).strip()
  27. dates[i] = str(dates[i]).translate(translator).strip()
  28. articles.append((hrefs[i], titles[i], dates[i]))
  29.  
  30. def getLastPage(): #calculates the number of pages on the news website
  31. tree = html.fromstring(requests.get(HOMEPAGE).content)
  32. lastPage = str(tree.xpath('//div[@class="wp-pagenavi"]/a[@class="last"]/@href')[0])
  33. lastPageNum = ""
  34.  
  35. for i in range(0, len(lastPage)):
  36. if lastPage[i].isdigit():
  37. lastPageNum = lastPageNum + lastPage[i]
  38.  
  39. return int(lastPageNum)
  40.  
  41. def main():
  42. conn = http.client.HTTPConnection("altaicholmon.ru")
  43. ids = None
  44. root = None
  45. w = Writer()
  46.  
  47. def term_handler(sigNum, frame):
  48. print("\nReceived a SIGTERM signal. Closing the program.")
  49. w.close()
  50. sys.exit(0)
  51.  
  52. for i in range(0, getLastPage + 2):
  53. populateArticleList(i + 1)
  54.  
  55. try:
  56. for (title, url, date) in articles:
  57. try:
  58. source = Source(url, title=title, date=date, scraper=ScraperAltay, conn=conn)
  59. source.makeRoot("./", ids=ids, root=root, lang="alt")
  60. source.add_to_archive()
  61. if ids is None:
  62. ids = source.ids
  63. if root is None:
  64. root = source.root
  65. except Exception as e:
  66. print(url + " " + str(e))
  67. except KeyboardInterrupt:
  68. print("\nReceived a keyboard interrupt. Closing the program.")
  69. w.close()
  70. conn.close()
  71.  
  72. main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement