Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import sys
- from xml.etree import ElementTree
- import os
def run(filepath):
    """Write a combined ``<name>.spliced.xml`` for every book file in *filepath*.

    Every regular file directly inside *filepath* is parsed as XML.  Files
    whose tree contains at least one ``bookinfo`` element are treated as
    books; for each of them a new file is written to the output directory
    containing, in order and wrapped in a single ``<book>`` root:

    1. every ``title`` element of the book,
    2. every ``bookinfo`` element of the book,
    3. the root element of each chapter file referenced by a
       ``chapterref`` element's ``href`` attribute (resolved relative to
       *filepath*), one per line,
    4. the ``chapterref`` elements themselves.

    Files without a ``bookinfo`` element (for example the chapter files
    themselves) produce no output.

    Args:
        filepath: Directory holding the book and chapter XML files.  The
            output directory name is derived from it by replacing every
            occurrence of ``"original"`` with ``"slice_n_splice"``; it is
            created when missing.

    Raises:
        xml.etree.ElementTree.ParseError: If a file in *filepath* or a
            referenced chapter file is not well-formed XML.
        KeyError: If a ``chapterref`` element has no ``href`` attribute.
        OSError: If a directory or file cannot be read or written.
    """
    # Prepare basic OS path stuff.
    slice_n_splice_path = filepath.replace("original", "slice_n_splice")
    os.makedirs(slice_n_splice_path, exist_ok=True)

    for entry in os.listdir(filepath):
        # os.path.join works whether or not the caller supplied a trailing
        # separator; plain concatenation silently built a wrong path.
        source_path = os.path.join(filepath, entry)
        if not os.path.isfile(source_path):
            # Sub-directories cannot be parsed as XML; skip them.
            continue

        # Start crunching XML.
        bookdata = ElementTree.parse(source_path).getroot()

        # Element.iter() returns an iterator (always truthy), so materialise
        # the matches to find out whether this file is a book at all.
        bookinfo_elements = list(bookdata.iter('bookinfo'))
        if not bookinfo_elements:
            # No "bookinfo" element: don't create a slice-n-splice file.
            continue

        target_path = os.path.join(slice_n_splice_path, entry + '.spliced.xml')
        # The context manager closes the file even when a chapter fails to
        # parse; UTF-8 matches the encoding declared in the header below.
        with open(target_path, 'w', encoding='utf-8') as bookxml:
            # Construct XML header and root element exactly once, so the
            # result stays well-formed regardless of how many titles exist.
            bookxml.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<book>\n")

            # Grab title and bookinfo elements from the book file.
            # encoding="unicode" makes tostring() return str rather than
            # bytes, which is what a text-mode file requires.
            for title_element in bookdata.iter('title'):
                bookxml.write(
                    ElementTree.tostring(title_element, encoding='unicode'))
            for bookinfo_element in bookinfo_elements:
                bookxml.write(
                    ElementTree.tostring(bookinfo_element, encoding='unicode'))

            # Iterate through chapterref elements and pull in the root of
            # each referenced chapter XML file, one per line.
            for chapterref_element in bookdata.iter('chapterref'):
                chapter_path = os.path.join(
                    filepath, chapterref_element.attrib['href'])
                chapter_root = ElementTree.parse(chapter_path).getroot()
                bookxml.write(
                    ElementTree.tostring(chapter_root, encoding='unicode'))
                bookxml.write('\n')

            # Add chapterrefs back in.
            for chapterref_element in bookdata.iter('chapterref'):
                bookxml.write(
                    ElementTree.tostring(chapterref_element, encoding='unicode'))

            # Close root tag.
            bookxml.write("</book>")
if __name__ == "__main__":
    # The directory to process is the first command-line argument; report a
    # missing argument clearly instead of failing with an IndexError.
    if len(sys.argv) < 2:
        sys.exit("error: expected the directory containing the book XML "
                 "files as the first argument")
    run(sys.argv[1])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement