Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Fetch a document collection over HTTP and print the text of every
# paragraph (<p>) found inside each document's <text> element.
import xml.etree.ElementTree as ET
from urllib.request import urlopen

# Use the response as a context manager so the HTTP connection is closed
# as soon as the payload has been read.
with urlopen("http://gss.uva.nl/binaries/content/assets/programmas/information-studies/ds/collection.txt") as file:
    # Wrap the payload in a synthetic <documents> element so it parses as a
    # single XML tree (presumably the remote file is a bare sequence of
    # <doc> elements with no common root -- confirm against the data).
    file_contents = "<documents>" + file.read().decode("utf-8") + "</documents>"

# ET.fromstring() already returns the root Element; no ElementTree wrapper
# is needed to get at it.
xml_root = ET.fromstring(file_contents)

for doc_node in xml_root.findall("doc"):
    text = doc_node.find("text")
    if text is None:
        # A <doc> without a <text> child has nothing to report; skip it
        # instead of failing on len(None).
        continue
    # len() of an Element is its number of direct child elements, not a
    # character count.
    print("found a document, length " + str(len(text)))
    for paragraph in text.findall("p"):
        # paragraph.text is the text before the first nested child element;
        # it is None (printed as "None") when the <p> has no such text.
        print(paragraph.text)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement