Advertisement
Guest User

Untitled

a guest
May 1st, 2016
50
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.51 KB | None | 0 0
  1. import xml.etree.ElementTree as ET
  2. from urllib.request import urlopen
  3.  
  4. file = urlopen("http://gss.uva.nl/binaries/content/assets/programmas/information-studies/ds/collection.txt")
  5.  
  6. file_contents = "<documents>" + file.read().decode("utf-8") + "</documents>"
  7.  
  8. xml_root = ET.ElementTree(ET.fromstring(file_contents)).getroot()
  9.  
  10. for doc_node in xml_root.findall("doc"):
  11. text = doc_node.find("text")
  12. print("found a document, length " + str(len(text)))
  13. for paragraph in text.findall("p"):
  14. print(paragraph.text)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement