Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import os
- import xml.etree.ElementTree as ET
- import pandas as pd
# Parse the training XML file and gather every <entry> element under the root.
tree = ET.parse(
    '../data/webnlg2017/challenge_data_train_dev/train/2triples/2triples_Airport_train_challenge.xml'
)
root = tree.getroot()
all_elem = [*root.iter('entry')]
# entries: one row per <entry>, holding its category, eid and size attributes
# (values are kept as the raw attribute strings).
entries = [
    {field: entry.attrib[field] for field in ("category", "eid", "size")}
    for entry in all_elem
]
entries_df = pd.DataFrame(entries)
# original tripleset: one row per <otriple>, tagged with the eid of the
# <entry> that contains it.
otriples = [
    {'eid': elem.attrib['eid'],
     'text': e.text}
    # The entry loop must be the first `for` clause: comprehension clauses nest
    # left to right, so `elem` has to be bound before its children are read.
    for elem in all_elem
    for e in elem.find('originaltripleset').findall('otriple')
]
otriples_df = pd.DataFrame(otriples)
# modified tripleset: one row per <mtriple>, tagged with the eid of the
# <entry> that contains it.
mtriples = [
    {'eid': elem.attrib['eid'],
     'text': e.text}
    # The entry loop must be the first `for` clause: comprehension clauses nest
    # left to right, so `elem` has to be bound before its children are read.
    for elem in all_elem
    for e in elem.find('modifiedtripleset').findall('mtriple')
]
mtriples_df = pd.DataFrame(mtriples)
# lexes: one row per <lex> child of an <entry>, carrying the lexicalisation
# text plus its `comment` and `lid` attributes and the parent entry's eid.
lexes = [
    {'eid': elem.attrib['eid'],
     'text': e.text,
     'comment': e.attrib['comment'],
     'lid': e.attrib['lid']}
    # The entry loop must be the first `for` clause: comprehension clauses nest
    # left to right, so `elem` has to be bound before its children are read.
    for elem in all_elem
    for e in elem.findall('lex')
]
lexes_df = pd.DataFrame(lexes)
Add Comment
Please, Sign In to add comment