Guest
Public paste!

Kadin2048

By: a guest | Apr 14th, 2009 | Syntax: Python | Size: 9.02 KB | Hits: 59 | Expires: Never
Copy text to clipboard
  1. #!/usr/local/bin/python
  2. #    Convert flat-file Metafilter comment archive to XML
  3. #    Filename: meficonverter.py
  4. #
  5. #    Usage: "meficonverter.py [-r|--root rootname] [--cdata] [--escape] inputfilename [outputfilename]"
  6. #      If outputfilename is not specified, "my-mefi-comments.xml" will be used.
  7. #      The --root or -r option allows you to specify a different name for the root XML node; default <metafilter>
  8. #      The --cdata and --escape options enable different methods for escaping HTML comments to preserve XML.
  9. #
  10. #    Version: 2009-04-14
  11. #    Author: Kadin2048 (http://kadin.sdf-us.org)
  12. #    License: GPL 2.0 or later
  13.  
  14. import sys
  15. import getopt
  16. from xml.sax import saxutils
  17.  
  18. def main(argv):
  19.     # Defaults:
  20.     inputname = "my-mefi-comments.txt"
  21.     outputname = "my-mefi-comments.xml"
  22.     global rootelement
  23.     rootelement = "metafilter"
  24.     global cdata
  25.     cdata = False
  26.     global munge
  27.     munge = False
  28.  
  29.     try:
  30.         #print "ARGV     : ", argv
  31.         opts, args = getopt.gnu_getopt(argv, "r:", ['root=', 'cdata', 'escape'])
  32.         #print "OPTIONS  : ", opts
  33.         #print "ARGS     : ", args
  34.     except getopt.GetoptError, err:
  35.         print str(err)
  36.         return 2
  37.     for o, a in opts:
  38.         if o in ("--root", "-r"):
  39.             print "Custom XML root specified, using ", a
  40.             rootelement = a
  41.         elif o == "--cdata":
  42.             print "CDATA mode enabled."
  43.             cdata = True
  44.         elif o == "--escape":
  45.             print "Inline HTML escape mode enabled."
  46.             munge = True
  47.         else:
  48.             print "Unknown option passed, exiting."
  49.             return 2
  50.     if (cdata == True and munge == True):
  51.         print "CDATA and HTML munging modes are mutually exclusive! Exiting."
  52.         return 2
  53.     if len(args) == 1:
  54.         inputname = args[0]
  55.     if len(args) == 2:
  56.         inputname = args[0]
  57.         outputname = args[1]
  58.     if len(args) > 2:
  59.         print "Unknown argument passed, exiting."
  60.         return 2
  61.  
  62.     try:
  63.         print "Reading from: " + inputname
  64.         infile = open(inputname, 'rU')
  65.         print "Writing to: " + outputname
  66.         outfile = open(outputname, 'w')
  67.         if cdata:
  68.             convertMefiCDATA(infile, outfile)
  69.         elif munge:
  70.             convertMefiMunge(infile, outfile)
  71.         else:
  72.             convertMefi(infile, outfile)  # see below
  73.         print "Processing complete, closing files..."
  74.         infile.close()
  75.         outfile.close()
  76.     except IOError, error:
  77.         print "I/O Error: %s" %(error)
  78.         return 2
  79.     except StopIteration, error:
  80.         print "An error occurred while processing the file."
  81.         print "The error was: %s" %(error)
  82.         return 2
  83.  
  84.  
  85. # This function converts the input to XML, encapsulating the HTML in an XML comment
  86. def convertMefi(inputfile, outputfile):
  87.     # First we prepare the output file
  88.     outputfile.write('''<?xml version="1.0" encoding="UTF-8" ?>\n''')
  89.     outputfile.write("<" + rootelement + ">\n")  # write the open tag for the root element    
  90.     i,n = 1,0
  91.     while i:
  92.         line = inputfile.readline()
  93.         #print "DEBUG: ", line
  94.         if line == '':
  95.             print "End of file reached."
  96.             # if we've hit the end of the file, stop processing and break out of the loop
  97.             outputfile.write("--></htmldata>\n</post>\n")
  98.             n = n+1
  99.             print "Posts processed: %s" %(n)
  100.             break
  101.         elif line == "-----\n":
  102.             #print "End of post reached."
  103.             # when we reach the end of a post, write closing tags, reset counter and continue
  104.             outputfile.write("--></htmldata>\n</post>\n")
  105.             i = 1
  106.             n = n+1
  107.             #print "Posts processed: %s" %(n)
  108.             continue
  109.         elif i == 1:
  110.             # enter the loop here, for the date line
  111.             outputfile.write("<post>\n")
  112.             outputfile.write("<date>" + line.rstrip('\n') + "</date>\n")
  113.             i = i+1
  114.             continue
  115.         elif i == 2:
  116.             # after the date line is the URL line
  117.             outputfile.write("<url>" + line.rstrip('\n') + "</url>\n")
  118.             i = i+1
  119.             continue
  120.         elif i == 3:
  121.             # after the URL is the first line of the comment
  122.             outputfile.write("<htmldata><!--\n" + line)
  123.             i = i+1
  124.             continue
  125.         elif i > 3:
  126.             # then we can have multiple comment lines, possibly many
  127.             outputfile.write(line)
  128.             i = i+1
  129.             continue
  130.         else:
  131.             # this means none of the conditions were matched, which should never happen
  132.             raise StopIteration, "Processing Error"
  133.             break
  134.     outputfile.write("</" + rootelement + ">\n")  # close the root element
  135.  
  136.  
  137.  
  138. # This is similar to above, except we wrap in CDATA sections instead of XML comments
  139. def convertMefiCDATA(inputfile, outputfile):
  140.     # First we prepare the output file
  141.     outputfile.write('''<?xml version="1.0" encoding="UTF-8" ?>\n''')
  142.     outputfile.write("<" + rootelement + ">\n")  # write the open tag for the root element    
  143.     i,n = 1,0
  144.     while i:
  145.         line = inputfile.readline()
  146.         if line == '':
  147.             print "End of file reached."
  148.             # if we've hit the end of the file, stop processing and break out of the loop
  149.             outputfile.write("]]>\n</htmldata>\n</post>\n")
  150.             n = n+1
  151.             print "Posts processed: %s" %(n)
  152.             break
  153.         elif line == "-----\n":
  154.             # when we reach the end of a post, write closing tags, reset counter and continue
  155.             outputfile.write("]]>\n</htmldata>\n</post>\n")
  156.             i = 1
  157.             n = n+1
  158.             continue
  159.         elif i == 1:
  160.             # enter the loop here, for the date line
  161.             outputfile.write("<post>\n")
  162.             outputfile.write("<date>" + line.rstrip('\n') + "</date>\n")
  163.             i = i+1
  164.             continue
  165.         elif i == 2:
  166.             # after the date line is the URL line
  167.             outputfile.write("<url>" + line.rstrip('\n') + "</url>\n")
  168.             i = i+1
  169.             continue
  170.         elif i == 3:
  171.             # after the URL is the first line of the comment
  172.             outputfile.write("<htmldata><![CDATA[\n" + line)
  173.             i = i+1
  174.             continue
  175.         elif i > 3:
  176.             # then we can have multiple comment lines, possibly many
  177.             outputfile.write(line)
  178.             i = i+1
  179.             continue
  180.         else:
  181.             # this means none of the conditions were matched, which should never happen
  182.             raise StopIteration, "Processing Error"
  183.             break
  184.     outputfile.write("</" + rootelement + ">\n")  # close the root element
  185.  
  186. # This is similar to above, except we actually go through the HTML and make it XML-safe using SAX
  187. def convertMefiMunge(inputfile, outputfile):
  188.     # First we prepare the output file
  189.     outputfile.write('''<?xml version="1.0" encoding="UTF-8" ?>\n''')
  190.     outputfile.write("<" + rootelement + ">\n")  # write the open tag for the root element    
  191.     i,n = 1,0
  192.     escapedline = ""
  193.     while i:
  194.         line = unicode(inputfile.readline(), "UTF-8")
  195.         if line == '':
  196.             print "End of file reached."
  197.             # if we've hit the end of the file, stop processing and break out of the loop
  198.             outputfile.write("\n</htmldata>\n</post>\n")
  199.             n = n+1
  200.             print "Posts processed: %s" %(n)
  201.             break
  202.         elif line == "-----\n":
  203.             # when we reach the end of a post, write closing tags, reset counter and continue
  204.             outputfile.write("\n</htmldata>\n</post>\n")
  205.             i = 1
  206.             n = n+1
  207.             continue
  208.         elif i == 1:
  209.             # enter the loop here, for the date line
  210.             outputfile.write("<post>\n")
  211.             outputfile.write("<date>" + line.rstrip('\n') + "</date>\n")
  212.             i = i+1
  213.             continue
  214.         elif i == 2:
  215.             # after the date line is the URL line
  216.             outputfile.write("<url>" + line.rstrip('\n').encode('UTF-8') + "</url>\n")
  217.             i = i+1
  218.             continue
  219.         elif i == 3:
  220.             # after the URL is the first line of the comment
  221.             escapedline = saxutils.escape(line).encode('UTF-8')  # This makes everything XML-safe
  222.             outputfile.write("<htmldata>\n" + escapedline)
  223.             i = i+1
  224.             continue
  225.         elif i > 3:
  226.             # then we can have multiple comment lines, possibly many
  227.             escapedline = saxutils.escape(line).encode('UTF-8')  # This makes everything XML-safe
  228.             outputfile.write(escapedline)
  229.             i = i+1
  230.             continue
  231.         else:
  232.             # this means none of the conditions were matched, which should never happen
  233.             raise StopIteration, "Processing Error"
  234.             break
  235.     outputfile.write("</" + rootelement + ">\n")  # close the root element
  236.  
  237.  
  238.  
  239. if __name__ == "__main__":
  240.     sys.exit( main(sys.argv[1:]) ) # get exit status from main()