#!/usr/local/bin/python
# Convert flat-file Metafilter comment archive to XML
# Filename: meficonverter.py
#
# Usage: "meficonverter.py [-r|--root rootname] [--cdata] [--escape] inputfilename [outputfilename]"
# If outputfilename is not specified, "my-mefi-comments.xml" will be used.
# The --root or -r option allows you to specify a different name for the root XML node; default <metafilter>
# The --cdata and --escape options enable different methods for escaping HTML comments to preserve XML.
#
# Version: 2009-04-14
# Author: Kadin2048 (http://kadin.sdf-us.org)
# License: GPL 2.0 or later
import sys
import getopt
from xml.sax import saxutils
def main(argv):
    """Parse command-line arguments and run the selected conversion.

    argv is the argument list without the program name.  Recognised options
    are -r/--root NAME, --cdata and --escape (the last two are mutually
    exclusive).  Up to two positional arguments give the input file name and
    the output file name; both have defaults.

    Side effect: sets the module-level globals rootelement, cdata and munge.
    rootelement is read by the convert* functions.

    Returns 2 on a usage, I/O, encoding or processing error, and None
    (which sys.exit() treats as success) otherwise.
    """
    global rootelement, cdata, munge

    # Defaults:
    inputname = "my-mefi-comments.txt"
    outputname = "my-mefi-comments.xml"
    rootelement = "metafilter"
    cdata = False
    munge = False

    try:
        opts, args = getopt.gnu_getopt(argv, "r:", ['root=', 'cdata', 'escape'])
    except getopt.GetoptError as err:
        print(str(err))
        return 2

    for o, a in opts:
        if o in ("--root", "-r"):
            print("Custom XML root specified, using  " + a)
            rootelement = a
        elif o == "--cdata":
            print("CDATA mode enabled.")
            cdata = True
        elif o == "--escape":
            print("Inline HTML escape mode enabled.")
            munge = True
        else:
            print("Unknown option passed, exiting.")
            return 2

    if cdata and munge:
        print("CDATA and HTML munging modes are mutually exclusive! Exiting.")
        return 2

    if len(args) > 2:
        print("Unknown argument passed, exiting.")
        return 2
    if args:
        inputname = args[0]
    if len(args) == 2:
        outputname = args[1]

    # 'rU' requests universal-newline handling on Python 2.  Python 3 text
    # mode already translates newlines, and newer releases reject the 'U'
    # flag, so only ask for it where it is needed.
    if sys.version_info[0] < 3:
        read_mode = 'rU'
    else:
        read_mode = 'r'

    try:
        print("Reading from: " + inputname)
        # The with-blocks close both files even when a converter raises or
        # when opening the output file fails after the input is open.
        with open(inputname, read_mode) as infile:
            print("Writing to: " + outputname)
            with open(outputname, 'w') as outfile:
                if cdata:
                    convertMefiCDATA(infile, outfile)
                elif munge:
                    convertMefiMunge(infile, outfile)
                else:
                    convertMefi(infile, outfile)
                print("Processing complete, closing files...")
    except IOError as error:
        print("I/O Error: %s" % (error))
        return 2
    except UnicodeError as error:
        # e.g. the escape converter decoding input that is not valid UTF-8
        print("Encoding error: %s" % (error))
        return 2
    except StopIteration as error:
        print("An error occurred while processing the file.")
        print("The error was: %s" % (error))
        return 2
# This function converts the input to XML, encapsulating the HTML in an XML comment
def convertMefi(inputfile, outputfile, root=None):
    """Convert a flat-file comment archive to XML, hiding each body in an XML comment.

    The input is treated as records separated by lines containing only
    "-----".  Within a record the first line is written as <date>, the second
    as <url>, and every remaining line is copied unmodified inside
    <htmldata><!-- ... --></htmldata>.

    inputfile  -- object whose readline() returns '' at end of file
    outputfile -- object with a write() method
    root       -- name of the root element; when omitted, the module-level
                  rootelement set by main() is used, or "metafilter" if
                  main() has not run

    Progress messages are printed to stdout.

    NOTE: body text is copied unmodified, so a body containing "--" produces
    a comment that strict XML parsers reject.
    """
    if root is None:
        root = globals().get("rootelement", "metafilter")

    # First we prepare the output file
    outputfile.write('''<?xml version="1.0" encoding="UTF-8" ?>\n''')
    outputfile.write("<" + root + ">\n")

    field = 0   # lines consumed in the current record; 0 means no record is open
    posts = 0
    while True:
        line = inputfile.readline()
        at_eof = (line == '')
        if at_eof or line.rstrip('\r\n') == "-----":
            # Close only what was actually opened, so a trailing separator,
            # an empty file or a record without a body still yields
            # balanced tags and an accurate count.
            if field > 0:
                if field > 2:
                    outputfile.write("--></htmldata>\n")
                outputfile.write("</post>\n")
                posts += 1
            field = 0
            if at_eof:
                print("End of file reached.")
                print("Posts processed: %s" % (posts))
                break
            continue

        field += 1
        if field == 1:
            # the date line opens a new record
            outputfile.write("<post>\n")
            outputfile.write("<date>" + saxutils.escape(line.rstrip('\r\n')) + "</date>\n")
        elif field == 2:
            # escape so that "&" in a query string keeps the XML well-formed
            outputfile.write("<url>" + saxutils.escape(line.rstrip('\r\n')) + "</url>\n")
        elif field == 3:
            # first body line opens the comment wrapper
            outputfile.write("<htmldata><!--\n" + line)
        else:
            # any further body lines
            outputfile.write(line)

    outputfile.write("</" + root + ">\n")  # close the root element
# This is similar to above, except we wrap in CDATA sections instead of XML comments
def convertMefiCDATA(inputfile, outputfile, root=None):
    """Convert a flat-file comment archive to XML, wrapping each body in CDATA.

    The input is treated as records separated by lines containing only
    "-----".  Within a record the first line is written as <date>, the second
    as <url>, and every remaining line is placed inside
    <htmldata><![CDATA[ ... ]]></htmldata>.

    inputfile  -- object whose readline() returns '' at end of file
    outputfile -- object with a write() method
    root       -- name of the root element; when omitted, the module-level
                  rootelement set by main() is used, or "metafilter" if
                  main() has not run

    Progress messages are printed to stdout.
    """
    if root is None:
        root = globals().get("rootelement", "metafilter")

    def cdata_safe(text):
        """Split any "]]>" in text across two CDATA sections."""
        # "]]>" would end the section early; closing after "]]" and
        # reopening before ">" keeps the parsed text identical.
        return text.replace("]]>", "]]]]><![CDATA[>")

    # First we prepare the output file
    outputfile.write('''<?xml version="1.0" encoding="UTF-8" ?>\n''')
    outputfile.write("<" + root + ">\n")

    field = 0   # lines consumed in the current record; 0 means no record is open
    posts = 0
    while True:
        line = inputfile.readline()
        at_eof = (line == '')
        if at_eof or line.rstrip('\r\n') == "-----":
            # Close only what was actually opened, so a trailing separator,
            # an empty file or a record without a body still yields
            # balanced tags and an accurate count.
            if field > 0:
                if field > 2:
                    outputfile.write("]]>\n</htmldata>\n")
                outputfile.write("</post>\n")
                posts += 1
            field = 0
            if at_eof:
                print("End of file reached.")
                print("Posts processed: %s" % (posts))
                break
            continue

        field += 1
        if field == 1:
            # the date line opens a new record
            outputfile.write("<post>\n")
            outputfile.write("<date>" + saxutils.escape(line.rstrip('\r\n')) + "</date>\n")
        elif field == 2:
            # escape so that "&" in a query string keeps the XML well-formed
            outputfile.write("<url>" + saxutils.escape(line.rstrip('\r\n')) + "</url>\n")
        elif field == 3:
            # first body line opens the CDATA wrapper
            outputfile.write("<htmldata><![CDATA[\n" + cdata_safe(line))
        else:
            # any further body lines
            outputfile.write(cdata_safe(line))

    outputfile.write("</" + root + ">\n")  # close the root element
# This is similar to above, except we actually go through the HTML and make it XML-safe using SAX
def convertMefiMunge(inputfile, outputfile, root=None):
    """Convert a flat-file comment archive to XML, escaping the HTML bodies.

    The input is treated as records separated by lines containing only
    "-----".  Within a record the first line is written as <date>, the second
    as <url>, and every remaining line goes inside <htmldata>.  All three are
    passed through saxutils.escape(), which replaces &, < and > with
    entities.

    inputfile  -- object whose readline() returns '' (or b'') at end of
                  file; byte lines are decoded as UTF-8
    outputfile -- object with a write() method accepting native str
    root       -- name of the root element; when omitted, the module-level
                  rootelement set by main() is used, or "metafilter" if
                  main() has not run

    Progress messages are printed to stdout.  Raises UnicodeDecodeError if a
    byte line is not valid UTF-8.
    """
    if root is None:
        root = globals().get("rootelement", "metafilter")

    def xml_text(text):
        """Escape text for XML and return it as a native str."""
        escaped = saxutils.escape(text)
        if not isinstance(escaped, str):
            # Python 2: escaped is a unicode object; encode it explicitly so
            # write() never falls back to the ASCII codec.
            escaped = escaped.encode('UTF-8')
        return escaped

    # First we prepare the output file
    outputfile.write('''<?xml version="1.0" encoding="UTF-8" ?>\n''')
    outputfile.write("<" + root + ">\n")

    field = 0   # lines consumed in the current record; 0 means no record is open
    posts = 0
    while True:
        line = inputfile.readline()
        if isinstance(line, bytes):
            line = line.decode("UTF-8")
        at_eof = (line == '')
        if at_eof or line.rstrip('\r\n') == "-----":
            # Close only what was actually opened, so a trailing separator,
            # an empty file or a record without a body still yields
            # balanced tags and an accurate count.
            if field > 0:
                if field > 2:
                    outputfile.write("\n</htmldata>\n")
                outputfile.write("</post>\n")
                posts += 1
            field = 0
            if at_eof:
                print("End of file reached.")
                print("Posts processed: %s" % (posts))
                break
            continue

        field += 1
        if field == 1:
            # the date line opens a new record
            outputfile.write("<post>\n")
            outputfile.write("<date>" + xml_text(line.rstrip('\r\n')) + "</date>\n")
        elif field == 2:
            outputfile.write("<url>" + xml_text(line.rstrip('\r\n')) + "</url>\n")
        elif field == 3:
            # first body line opens the element
            outputfile.write("<htmldata>\n" + xml_text(line))
        else:
            # any further body lines
            outputfile.write(xml_text(line))

    outputfile.write("</" + root + ">\n")  # close the root element
if __name__ == "__main__":
    # Run only when executed as a script: hand main() everything after the
    # program name and use its return value as the process exit status.
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)