Guest User

Untitled

a guest
May 6th, 2017
138
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import re
  2. import requests
  3. import time
  4. import pymongo
  5. from pprint import pprint
  6. from bs4 import BeautifulSoup
  7.  
  8. def mongo_connect():
  9.     client = pymongo.MongoClient('mongodb://172.17.0.2:27017/')
  10.     db = client['mydb']
  11.     collections = db['news']
  12.     collections.create_index('url', unique=True)
  13.     return collections
  14.  
  15. def get_article(url, title):
  16.     try:
  17.         r = requests.get(url)
  18.     except requests.exceptions.ConnectionError:
  19.         time.sleep(5)
  20.         try:
  21.             r = requests.get(url)
  22.         except requests.exceptions.ConnectionError:
  23.             return False
  24.  
  25.     if r.status_code == 200:
  26.         soup = BeautifulSoup(r.content, "html5lib")
  27.         result = {}
  28.         result['title'] = title
  29.         result['url'] = url
  30.         result['date'] = soup.find('time').text
  31.         result['content'] = ''
  32.         tmp_content = soup.find('div', {'class': 'side-article txt-article'}).findAll('p')
  33.         for i in tmp_content:
  34.             result['content'] = result['content']+" "+i.text
  35.  
  36.         return result
  37.     else:
  38.         return False
  39.  
  40. def get_index(url):
  41.     r = requests.get(url)
  42.     if r.status_code == 200:
  43.         soup = BeautifulSoup(r.content, "html5lib")
  44.         data = soup.find('div', {'class': 'lsi'}).findAll('h3')
  45.         for i in data:
  46.             result = {}
  47.             result['title'] = re.sub(r'\t|\n', '', i.text).strip()
  48.             result['link'] = i.find('a')['href']
  49.             yield result
  50.     else:
  51.         yield None
  52.  
  53. def main():
  54.     collections = mongo_connect()
  55.     for i in get_index('http://jabar.tribunnews.com/'):
  56.         data = get_article(i['link'], i['title'])
  57.         try:
  58.             collections.insert_one(data).inserted_id
  59.             pprint(data)
  60.             print
  61.         except pymongo.errors.DuplicateKeyError:
  62.             print('data sudah ada')
  63.  
  64.  
  65. if __name__ == '__main__':
  66.     main()
RAW Paste Data