# Untitled

import xml.etree.ElementTree as ET
from urllib.request import urlopen

# Download the document collection. The context manager closes the HTTP
# response as soon as the body has been read, even if reading or decoding
# raises.
with urlopen("http://gss.uva.nl/binaries/content/assets/programmas/information-studies/ds/collection.txt") as file:
    # Wrap the payload in a single <documents> root element before parsing.
    # NOTE(review): presumably the file is a bare sequence of <doc> elements
    # with no root of its own, which XML parsers reject -- confirm.
    file_contents = "<documents>" + file.read().decode("utf-8") + "</documents>"

# ET.fromstring already returns the root Element, so no ElementTree wrapper
# is needed to reach it.
xml_root = ET.fromstring(file_contents)

for doc_node in xml_root.findall("doc"):
    text = doc_node.find("text")
    if text is None:
        # This <doc> has no <text> child; skip it rather than crash on
        # len(None) below.
        continue
    # len() of an Element is its number of direct child elements, not the
    # number of characters it contains.
    print("found a document, length " + str(len(text)))
    for paragraph in text.findall("p"):
        # .text is the text before the paragraph's first child element; it is
        # None (printed as "None") when the paragraph has no leading text.
        print(paragraph.text)