Advertisement
Guest User

Untitled

a guest
May 30th, 2017
55
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.35 KB | None | 0 0
  1. import requests
  2. import os
  3. import xml.etree.ElementTree as ET
  4. from bs4 import BeautifulSoup
  5. import codecs
  6. import re
  7.  
  8. linksRss = [
  9.             'http://www.ansa.it/sito/notizie/politica/politica_rss.xml|politica',
  10.             'http://www.ansa.it/sito/notizie/mondo/mondo_rss.xml|mondo',
  11.             'http://www.ansa.it/sito/notizie/economia/economia_rss.xml|economia',
  12.             'http://www.ansa.it/sito/notizie/sport/calcio/calcio_rss.xml|calcio',
  13.             'http://www.ansa.it/sito/notizie/sport/sport_rss.xml|sport',
  14.             'http://www.ansa.it/sito/notizie/cultura/cinema/cinema_rss.xml|cinema',
  15.             'http://www.ansa.it/sito/notizie/cultura/cultura_rss.xml|cultura',
  16.             'http://www.ansa.it/sito/notizie/tecnologia/tecnologia_rss.xml|tecnologia',
  17.             'http://www.ansa.it/sito/notizie/cronaca/cronaca_rss.xml|cronaca',]
  18.  
  19.  
  20. for link_category in linksRss:
  21.     pageRss  = requests.get(link_category.split("|")[0])
  22.     category = link_category.split("|")[1]
  23.     xmlfile = ET.fromstring(pageRss.content)
  24.  
  25.     numberLinks = 0
  26.  
  27.     for links in xmlfile.iter('link'):
  28.         numberLinks += 1
  29.     trainigPercent = (numberLinks * 80) / 100
  30.     testPercent = numberLinks - trainigPercent
  31.  
  32.     count = 0
  33.  
  34.     print (numberLinks)
  35.     print (trainigPercent)
  36.    
  37.     for links in xmlfile.iter('link'):
  38.  
  39.         if count < trainigPercent:
  40.             path = "/Users/bigdata/Desktop/dataset/training/" + category
  41.             if not os.path.exists(path):
  42.                 os.makedirs(path)
  43.         else:
  44.             path = "/Users/bigdata/Desktop/dataset/test/" + category
  45.             if not os.path.exists(path):
  46.                 os.makedirs(path)
  47.  
  48.         if count > 0:
  49.             page = requests.get(links.text)
  50.             text = BeautifulSoup(page.content, 'html.parser').find_all(class_="news-txt")[0].find_all('p')
  51.             #textArticle = ''
  52.             name1 = re.sub("http(.*/)+", "", links.text)
  53.             name2 = re.sub(r'[^\w]+', "", name1)
  54.             if len(text) > 0 :
  55.                 textArticle = text[0].get_text()
  56.                 filePath = os.path.join(path, name2)
  57.                 if not os.path.isfile(filePath) :
  58.                     file = codecs.open(filePath, 'a', encoding='utf8')
  59.                     file.write(textArticle)
  60.                     file.close()
  61.         count+=1
  62.  
  63.     print (category)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement