Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import os
- import xml.etree.ElementTree as ET
- import re
- import csv
def format_time(secs):
    """Format a duration in seconds as ``HH:MM:SS.ff``.

    The value is rounded to the nearest hundredth of a second *before* it
    is split into hours, minutes and seconds, so the rounding carries into
    the larger fields (59.999 becomes ``00:01:00.00`` rather than
    ``00:00:60.00``).

    Args:
        secs: Duration in seconds, as an int or a float.

    Returns:
        A string with zero-padded hours (at least two digits), minutes,
        seconds and hundredths. Negative durations are rendered as the
        absolute value with a single leading ``-``.
    """
    # Work in whole hundredths so every later step is exact integer math.
    total_centis = int(round(abs(secs) * 100))
    sign = "-" if secs < 0 and total_centis else ""

    whole_secs, centis = divmod(total_centis, 100)
    total_mins, whole_secs = divmod(whole_secs, 60)
    hrs, mins = divmod(total_mins, 60)

    return "{}{:02d}:{:02d}:{:02d}.{:02d}".format(
        sign, hrs, mins, whole_secs, centis
    )
# Column order of every generated CSV. The keys of each row dict written
# below must match these names exactly.
fieldnames = ["Start time", "End time", "Duration (sec.)", "Row name", "Instance number", "Num. of Labels", "Instance ID", "Instance note", "No Group"]

# Convert every .xml file in the current working directory into a sibling
# "<name>.xml.csv" file holding one row per <instance> element.
for f in os.listdir('.'):
    if not f.endswith('.xml'):
        continue

    # Single-argument call form prints the same text on Python 2 and 3.
    print("Processing {}".format(f))

    with open(f) as fp:
        # Drop NUL characters, turn carriage returns into newlines, then
        # discard the first two characters (presumably a UTF-16 byte-order
        # mark -- TODO confirm against a real export).
        content = fp.read().replace('\x00', '').replace('\r', '\n')[2:]

    # Lowercase the full text of every tag so the lookups below can use
    # lowercase element names whatever casing the file uses.
    content = re.sub("< */?.*?>", lambda x: x.group().lower(), content)
    e = ET.fromstring(content)

    # NOTE(review): the csv documentation asks for mode 'wb' on Python 2 and
    # newline='' on Python 3 to avoid doubled line endings on Windows. The
    # mode is left as in the original -- confirm the target interpreter.
    with open(f + '.csv', 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_NONNUMERIC)
        writer.writeheader()
        for instance in e.iter('instance'):
            idv = int(instance.find('id').text)
            start = float(instance.find('start').text)
            end = float(instance.find('end').text)
            code = instance.find('code').text
            # An empty <text/> element has .text of None; substitute '' so
            # the join below cannot raise TypeError. The label count is
            # unaffected.
            labels = [l.text or '' for l in instance.iter('text')]
            writer.writerow({
                "Start time": format_time(start),
                "End time": format_time(end),
                "Duration (sec.)": "{:.2f}".format(end - start),
                "Row name": code,
                "Instance number": 0,  # TODO: edit here
                "Num. of Labels": len(labels),
                "Instance ID": idv,
                "Instance note": "",
                "No Group": '\n'.join(labels),
            })
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement