Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import os
import time
from xml.etree.ElementTree import Element, SubElement, ElementTree

import praw
# Create the Reddit client.
# Credentials are read from the environment rather than written into the
# source: secrets embedded in a script are exposed to everyone who can read
# it. A missing variable raises KeyError here, before any request is made.
reddit = praw.Reddit(
    client_id=os.environ['REDDIT_CLIENT_ID'],
    client_secret=os.environ['REDDIT_CLIENT_SECRET'],
    username=os.environ['REDDIT_USERNAME'],
    password=os.environ['REDDIT_PASSWORD'],
    user_agent='script:testsscript:v0.1 by /u/cuakcuak8',
)
subreddit = reddit.subreddit('history')
# Data collection and XML creation.


def _add_documento(parent, titulo, contenido, created_utc, tipo_entrada):
    """Append one <documento> element to *parent* and return it.

    The children are always created in the order titulo, contenido, fecha,
    tipo_entrada.

    parent        -- element that receives the new <documento>
    titulo        -- text for <titulo> ("" for comments)
    contenido     -- text for <contenido>
    created_utc   -- creation time as seconds since the epoch, in UTC
    tipo_entrada  -- "post" or "comentario"
    """
    documento = SubElement(parent, "documento")
    SubElement(documento, "titulo").text = titulo
    SubElement(documento, "contenido").text = contenido
    # The timestamp is a UTC epoch and the format string labels it "GMT", so
    # it must be broken down with gmtime(); localtime() would shift it into
    # the machine's time zone while still claiming GMT.
    SubElement(documento, "fecha").text = time.strftime(
        '%Y-%m-%d %H:%M:%S GMT', time.gmtime(created_utc))
    SubElement(documento, "tipo_entrada").text = tipo_entrada
    return documento


top = Element('top')
numDocs = 0
for submission in subreddit.top(limit=50):  # Walk the subreddit's "top" posts
    _add_documento(top, submission.title, submission.selftext,
                   submission.created_utc, "post")
    numDocs += 1
    print("Numero de documentos = "+str(numDocs))
    # NOTE(review): per the PRAW documentation, limit=0 discards the
    # "load more comments" placeholders without fetching them, so only the
    # comments already loaded are visited. Use limit=None if every comment
    # is required.
    submission.comments.replace_more(limit=0)
    for comment in submission.comments.list():
        # Skip comments whose body was removed or deleted.
        if comment.body in ('[removed]', '[deleted]'):
            continue
        _add_documento(top, "", comment.body, comment.created_utc,
                       "comentario")
        numDocs += 1

# Dump the collected documents to an XML file.
path = '/home/masterbigdata/TGINE-P1/historyTOP.xml'
tree = ElementTree(top)
tree.write(path)
print("Se han obtenido "+str(numDocs)+" documentos y se han volcado en "+path)
# Load the documents back from the XML file.
import xml.etree.ElementTree as ET
tree = ET.parse(path)
root = tree.getroot()
# Build one text per document: the title for posts, the body for everything
# else. Fields are looked up by tag name rather than by position, and a
# missing or empty field becomes "" because an empty element parses with
# text None, which the text vectorizers cannot tokenize.
corpus = []
for documento in root:
    if documento.findtext("tipo_entrada") == "post":
        campo = "titulo"
    else:
        campo = "contenido"
    corpus.append(documento.findtext(campo) or "")
# Extract the 10 central terms of the collection.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# NOTE(review): per the scikit-learn documentation, max_features keeps the
# terms with the highest corpus-wide term frequency (not the highest tf-idf
# weight) and the vocabulary is returned in alphabetical order -- confirm
# this matches the intended meaning of "central".
vectorizerTfIdf = TfidfVectorizer(analyzer='word', min_df=10, max_features=10, stop_words='english')
X = vectorizerTfIdf.fit_transform(corpus)
# get_feature_names() no longer exists in recent scikit-learn releases;
# prefer its replacement and fall back for older installations.
try:
    terminos = vectorizerTfIdf.get_feature_names_out()
except AttributeError:
    terminos = vectorizerTfIdf.get_feature_names()
# The terms are already plain strings, so no per-word cleanup is needed.
palabras = " ".join(terminos)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(25*"-")
print("Los 10 terminos centrales de la coleccion son:")
print(palabras)
# Extract the 100 most frequent terms (max_features=100, matching the
# message printed below).
vectorizerCount = CountVectorizer(analyzer='word', min_df=10, max_features=100, stop_words='english')
Y = vectorizerCount.fit_transform(corpus)
# get_feature_names() no longer exists in recent scikit-learn releases;
# prefer its replacement and fall back for older installations.
try:
    terminos = vectorizerCount.get_feature_names_out()
except AttributeError:
    terminos = vectorizerCount.get_feature_names()
# The terms are already plain strings, so no per-word cleanup is needed.
palabras = " ".join(terminos)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(25*"-")
print("Los 100 terminos mas repetidos son: ")
print(palabras)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement