#!/usr/bin/env python
"""Convert flat-file Metafilter comment archive to XML"""
import sys
from xml.sax import saxutils
from optparse import OptionParser
# Module metadata; __version__ is also handed to OptionParser in main().
__version__ = '1.0'
__author__ = 'Kadin2048 (http://kadin.sdf-us.org)'
__license__ = 'GPL 2.0 or later'
# Default input and output paths, used by main() when the corresponding
# positional argument is omitted.
INFILE = 'my-mefi-comments.txt'
OUTFILE = 'my-mefi-comments.xml'
# Usage text passed to OptionParser in main().  The file names quoted in it
# are written out literally, so keep them in sync with INFILE and OUTFILE.
USAGE = '''%prog [options] [infile [outfile]]
If infile is not specified, "my-mefi-comments.txt" will be used.
If outfile is not specified, "my-mefi-comments.xml" will be used.'''
def main():
    """Parse the command line and run the selected conversion.

    Options: ``-r/--root NAME`` sets the root XML element name,
    ``-c/--cdata`` wraps each comment's HTML in a CDATA section and
    ``-e/--escape`` XML-escapes it; the two are mutually exclusive.  With
    neither, the HTML is placed inside an XML comment.  Up to two positional
    arguments name the input and output files, defaulting to INFILE and
    OUTFILE.

    Returns:
        0 on success, 2 when an I/O error or a processing error
        (StopIteration raised by a converter) occurred.  Invalid command
        lines are reported through ``OptionParser.error``, which exits the
        process itself.
    """
    parser = OptionParser(USAGE, version=__version__)
    parser.add_option('-r', '--root', metavar='NAME', default='metafilter',
                      help='specify root XML node (default: %default)')
    parser.add_option('-c', '--cdata', default=False, action='store_true',
                      help='protect HTML using CDATA')
    parser.add_option('-e', '--escape', default=False, action='store_true',
                      help='protect html by escaping')
    opts, args = parser.parse_args()
    if opts.cdata and opts.escape:
        parser.error('CDATA and escape methods are incompatible')
    inputname = args.pop(0) if args else INFILE
    outputname = args.pop(0) if args else OUTFILE
    if args:
        parser.error('Unknown argument passed.')

    # Pick the converter up front so the try block only covers file work.
    if opts.cdata:
        convert = convertMefiCDATA
    elif opts.escape:
        convert = convertMefiMunge
    else:
        convert = convertMefi

    try:
        print("Reading from: " + inputname)
        # 'rU' is Python 2 universal-newline mode: every line ending is
        # presented as "\n", which is what the converters compare against.
        # The with-blocks guarantee both files are closed even when the
        # conversion fails part-way or the output file cannot be opened.
        with open(inputname, 'rU') as infile:
            print("Writing to: " + outputname)
            with open(outputname, 'w') as outfile:
                convert(infile, outfile, opts)
                print("Processing complete, closing files...")
    except IOError as error:
        print("I/O Error: %s" % (error))
        return 2
    except StopIteration as error:
        # The converters use StopIteration to signal a processing error.
        print("An error occurred while processing the file.")
        print("The error was: %s" % (error))
        return 2
    return 0
# This function converts the input to XML, encapsulating the HTML in an XML comment
def convertMefi(inputfile, outputfile, opts):
    """Convert the flat-file archive to XML, hiding each comment's HTML
    inside an XML comment.

    The input is a sequence of records separated by a line consisting of
    "-----".  Within a record the first line is the date, the second the
    URL, and every remaining line is the HTML body.  Each record becomes::

        <post>
        <date>...</date>
        <url>...</url>
        <htmldata><!--
        ...body lines, copied verbatim...
        --></htmldata>
        </post>

    Args:
        inputfile: object whose ``readline()`` returns one line at a time
            and '' at end of file.
        outputfile: object with a ``write()`` method.
        opts: object whose ``root`` attribute names the root XML element.

    Prints progress messages ("End of file reached.", the post count) to
    standard output.

    NOTE: the body is copied unchanged, and XML does not allow "--" inside
    a comment, so HTML containing "--" still yields ill-formed output.  Use
    convertMefiCDATA or convertMefiMunge for such input.
    """
    outputfile.write('''<?xml version="1.0" encoding="UTF-8" ?>\n''')
    outputfile.write("<" + opts.root + ">\n")  # open the root element

    # Position inside the current record: 0 = no post open, 1 = date
    # written, 2 = URL written, 3 = body (<htmldata> comment) is open.
    stage = 0
    posts = 0
    while True:
        line = inputfile.readline()
        at_eof = (line == '')
        if at_eof or line.rstrip('\n') == "-----":
            # Close only what is actually open, so an empty file, a
            # trailing separator or a truncated record cannot emit stray
            # closing tags or inflate the post count.
            posts += _closeCommentPost(outputfile, stage)
            stage = 0
            if at_eof:
                print("End of file reached.")
                print("Posts processed: %s" % (posts))
                break
        elif stage == 0:
            # First line of a record: the date.  Escaped because it is
            # element text, not part of the protected HTML.
            outputfile.write("<post>\n")
            outputfile.write("<date>" + saxutils.escape(line.rstrip('\n'))
                             + "</date>\n")
            stage = 1
        elif stage == 1:
            # Second line: the URL (query strings may contain '&').
            outputfile.write("<url>" + saxutils.escape(line.rstrip('\n'))
                             + "</url>\n")
            stage = 2
        elif stage == 2:
            # Third line: first line of the body, which opens the comment.
            outputfile.write("<htmldata><!--\n" + line)
            stage = 3
        else:
            # Any further body lines are copied through unchanged.
            outputfile.write(line)
    outputfile.write("</" + opts.root + ">\n")  # close the root element


def _closeCommentPost(outputfile, stage):
    """Write the closing tags a post at *stage* still needs.

    Returns 1 if a post was open (and has now been closed), otherwise 0.
    """
    if stage == 0:
        return 0
    if stage == 3:
        outputfile.write("--></htmldata>\n")
    outputfile.write("</post>\n")
    return 1
# This is similar to above, except we wrap in CDATA sections instead of XML comments
def convertMefiCDATA(inputfile, outputfile, opts):
    """Convert the flat-file archive to XML, wrapping each comment's HTML
    in a CDATA section.

    The input is a sequence of records separated by a line consisting of
    "-----".  Within a record the first line is the date, the second the
    URL, and every remaining line is the HTML body.  Each record becomes::

        <post>
        <date>...</date>
        <url>...</url>
        <htmldata><![CDATA[
        ...body lines...
        ]]>
        </htmldata>
        </post>

    Args:
        inputfile: object whose ``readline()`` returns one line at a time
            and '' at end of file.
        outputfile: object with a ``write()`` method.
        opts: object whose ``root`` attribute names the root XML element.

    Prints progress messages ("End of file reached.", the post count) to
    standard output.
    """
    outputfile.write('''<?xml version="1.0" encoding="UTF-8" ?>\n''')
    outputfile.write("<" + opts.root + ">\n")  # open the root element

    # Position inside the current record: 0 = no post open, 1 = date
    # written, 2 = URL written, 3 = body (CDATA section) is open.
    stage = 0
    posts = 0
    while True:
        line = inputfile.readline()
        at_eof = (line == '')
        if at_eof or line.rstrip('\n') == "-----":
            # Close only what is actually open, so an empty file, a
            # trailing separator or a truncated record cannot emit stray
            # closing tags or inflate the post count.
            posts += _closeCDATAPost(outputfile, stage)
            stage = 0
            if at_eof:
                print("End of file reached.")
                print("Posts processed: %s" % (posts))
                break
        elif stage == 0:
            # First line of a record: the date.  Escaped because it sits
            # outside the CDATA section.
            outputfile.write("<post>\n")
            outputfile.write("<date>" + saxutils.escape(line.rstrip('\n'))
                             + "</date>\n")
            stage = 1
        elif stage == 1:
            # Second line: the URL (query strings may contain '&').
            outputfile.write("<url>" + saxutils.escape(line.rstrip('\n'))
                             + "</url>\n")
            stage = 2
        elif stage == 2:
            # Third line: first line of the body, which opens the section.
            outputfile.write("<htmldata><![CDATA[\n" + _protectCDATAEnd(line))
            stage = 3
        else:
            outputfile.write(_protectCDATAEnd(line))
    outputfile.write("</" + opts.root + ">\n")  # close the root element


def _protectCDATAEnd(line):
    """Return *line* with every "]]>" split across two CDATA sections.

    A literal "]]>" would otherwise terminate the section early and leave
    the rest of the HTML as raw markup.  A parser reading the output joins
    the two sections back into the original text.
    """
    return line.replace("]]>", "]]]]><![CDATA[>")


def _closeCDATAPost(outputfile, stage):
    """Write the closing tags a post at *stage* still needs.

    Returns 1 if a post was open (and has now been closed), otherwise 0.
    """
    if stage == 0:
        return 0
    if stage == 3:
        outputfile.write("]]>\n</htmldata>\n")
    outputfile.write("</post>\n")
    return 1
# This is similar to above, except we actually go through the HTML and make it XML-safe using SAX
def convertMefiMunge(inputfile, outputfile, opts):
    """Convert the flat-file archive to XML, XML-escaping each comment's
    HTML so it becomes ordinary element text.

    The input is a sequence of records separated by a line consisting of
    "-----".  Within a record the first line is the date, the second the
    URL, and every remaining line is the HTML body.  Each record becomes::

        <post>
        <date>...</date>
        <url>...</url>
        <htmldata>
        ...escaped body lines...
        </htmldata>
        </post>

    Args:
        inputfile: object whose ``readline()`` returns one line at a time
            and '' at end of file.  Lines may be byte strings (what a
            Python 2 file yields) or text.
        outputfile: object with a ``write()`` method accepting the same
            string type that ``inputfile`` yields.
        opts: object whose ``root`` attribute names the root XML element.

    Raises:
        UnicodeDecodeError: if a byte-string line is not valid UTF-8.

    Prints progress messages ("End of file reached.", the post count) to
    standard output.
    """
    outputfile.write('''<?xml version="1.0" encoding="UTF-8" ?>\n''')
    outputfile.write("<" + opts.root + ">\n")  # open the root element

    # Position inside the current record: 0 = no post open, 1 = date
    # written, 2 = URL written, 3 = body (<htmldata>) is open.
    stage = 0
    posts = 0
    while True:
        line = inputfile.readline()
        at_eof = (line == '')
        if at_eof or line.rstrip('\n') == "-----":
            # Close only what is actually open, so an empty file, a
            # trailing separator or a truncated record cannot emit stray
            # closing tags or inflate the post count.
            posts += _closeMungePost(outputfile, stage, at_eof)
            stage = 0
            if at_eof:
                print("End of file reached.")
                print("Posts processed: %s" % (posts))
                break
        elif stage == 0:
            # First line of a record: the date.
            outputfile.write("<post>\n")
            outputfile.write("<date>" + _escapeMungeText(line.rstrip('\n'))
                             + "</date>\n")
            stage = 1
        elif stage == 1:
            # Second line: the URL (query strings may contain '&').
            outputfile.write("<url>" + _escapeMungeText(line.rstrip('\n'))
                             + "</url>\n")
            stage = 2
        elif stage == 2:
            # Third line: first line of the body.
            outputfile.write("<htmldata>\n" + _escapeMungeText(line))
            stage = 3
        else:
            outputfile.write(_escapeMungeText(line))
    outputfile.write("</" + opts.root + ">\n")  # close the root element


def _escapeMungeText(raw):
    """Return *raw* with &, < and > XML-escaped, in the string type it
    arrived in.

    Byte strings are decoded as UTF-8 first, so malformed input is rejected
    instead of being copied into a document that declares encoding="UTF-8",
    and are re-encoded afterwards.
    """
    if isinstance(raw, bytes):
        return saxutils.escape(raw.decode('UTF-8')).encode('UTF-8')
    return saxutils.escape(raw)


def _closeMungePost(outputfile, stage, at_eof):
    """Write the closing tags a post at *stage* still needs.

    At end of file the body may lack a final newline, so one is written
    before </htmldata> in that case.  Returns 1 if a post was open (and has
    now been closed), otherwise 0.
    """
    if stage == 0:
        return 0
    if stage == 3:
        outputfile.write("\n</htmldata>\n" if at_eof else "</htmldata>\n")
    outputfile.write("</post>\n")
    return 1
if __name__ == "__main__":
    # Run only when executed as a script; main()'s return value becomes the
    # process exit status.
    exit_status = main()
    sys.exit(exit_status)