Guest
Public paste!

Kadin2048

By: a guest | Apr 15th, 2009 | Syntax: Python | Size: 8.35 KB | Hits: 95 | Expires: Never
This paste has a previous version, view the difference. Copy text to clipboard
  1. #!/usr/bin/env python
  2.  
  3. """Convert flat-file Metafilter comment archive to XML"""
  4.  
  5. import sys
  6. from xml.sax import saxutils
  7. from optparse import OptionParser
  8.  
  9. __version__ = '1.0'
  10. __author__ = 'Kadin2048 (http://kadin.sdf-us.org)'
  11. __license__ = 'GPL 2.0 or later'
  12.  
# Default input path, used by main() when no infile argument is given.
INFILE = 'my-mefi-comments.txt'
# Default output path, used by main() when no outfile argument is given.
OUTFILE = 'my-mefi-comments.xml'
# Usage text handed to OptionParser; optparse expands %prog to the program name.
USAGE = '''%prog [options] [infile [outfile]]

If infile is not specified, "my-mefi-comments.txt" will be used.
If outfile is not specified, "my-mefi-comments.xml" will be used.'''
  19.  
  20. def main():    
  21.     optparse = OptionParser(USAGE, version=__version__)
  22.     optparse.add_option('-r', '--root', metavar='NAME', default='metafilter',
  23.                         help='specify root XML node (default: %default)')
  24.     optparse.add_option('-c', '--cdata', default=False, action='store_true',
  25.                         help='protect HTML using CDATA')
  26.     optparse.add_option('-e', '--escape', default=False, action='store_true',
  27.                         help='protect html by escaping')
  28.     opts, args = optparse.parse_args()
  29.     if opts.cdata and opts.escape:
  30.         optparse.error('CDATA and escape methods are incompatible')
  31.     inputname = args.pop(0) if args else INFILE
  32.     outputname = args.pop(0) if args else OUTFILE
  33.     if args:
  34.         optparse.error('Unknown argument passed.')
  35.  
  36.     try:
  37.         print "Reading from: " + inputname
  38.         infile = open(inputname, 'rU')
  39.         print "Writing to: " + outputname
  40.         outfile = open(outputname, 'w')
  41.         if opts.cdata:
  42.             convertMefiCDATA(infile, outfile, opts)
  43.         elif opts.escape:
  44.             convertMefiMunge(infile, outfile, opts)
  45.         else:
  46.             convertMefi(infile, outfile, opts)  # see below
  47.         print "Processing complete, closing files..."
  48.         infile.close()
  49.         outfile.close()
  50.     except IOError, error:
  51.         print "I/O Error: %s" %(error)
  52.         return 2
  53.     except StopIteration, error:
  54.         print "An error occurred while processing the file."
  55.         print "The error was: %s" %(error)
  56.         return 2
  57.  
  58. # This function converts the input to XML, encapsulating the HTML in an XML comment
  59. def convertMefi(inputfile, outputfile, opts):
  60.     # First we prepare the output file
  61.     outputfile.write('''<?xml version="1.0" encoding="UTF-8" ?>\n''')
  62.     outputfile.write("<" + opts.root + ">\n")  # write the open tag for the root element    
  63.     i,n = 1,0
  64.     while i:
  65.         line = inputfile.readline()
  66.         #print "DEBUG: ", line
  67.         if line == '':
  68.             print "End of file reached."
  69.             # if we've hit the end of the file, stop processing and break out of the loop
  70.             outputfile.write("--></htmldata>\n</post>\n")
  71.             n = n+1
  72.             print "Posts processed: %s" %(n)
  73.             break
  74.         elif line == "-----\n":
  75.             #print "End of post reached."
  76.             # when we reach the end of a post, write closing tags, reset counter and continue
  77.             outputfile.write("--></htmldata>\n</post>\n")
  78.             i = 1
  79.             n = n+1
  80.             #print "Posts processed: %s" %(n)
  81.             continue
  82.         elif i == 1:
  83.             # enter the loop here, for the date line
  84.             outputfile.write("<post>\n")
  85.             outputfile.write("<date>" + line.rstrip('\n') + "</date>\n")
  86.             i = i+1
  87.             continue
  88.         elif i == 2:
  89.             # after the date line is the URL line
  90.             outputfile.write("<url>" + line.rstrip('\n') + "</url>\n")
  91.             i = i+1
  92.             continue
  93.         elif i == 3:
  94.             # after the URL is the first line of the comment
  95.             outputfile.write("<htmldata><!--\n" + line)
  96.             i = i+1
  97.             continue
  98.         elif i > 3:
  99.             # then we can have multiple comment lines, possibly many
  100.             outputfile.write(line)
  101.             i = i+1
  102.             continue
  103.         else:
  104.             # this means none of the conditions were matched, which should never happen
  105.             raise StopIteration, "Processing Error"
  106.             break
  107.     outputfile.write("</" + opts.root + ">\n")  # close the root element
  108.  
  109.  
  110.  
  111. # This is similar to above, except we wrap in CDATA sections instead of XML comments
  112. def convertMefiCDATA(inputfile, outputfile, opts):
  113.     # First we prepare the output file
  114.     outputfile.write('''<?xml version="1.0" encoding="UTF-8" ?>\n''')
  115.     outputfile.write("<" + opts.root + ">\n")  # write the open tag for the root element    
  116.     i,n = 1,0
  117.     while i:
  118.         line = inputfile.readline()
  119.         if line == '':
  120.             print "End of file reached."
  121.             # if we've hit the end of the file, stop processing and break out of the loop
  122.             outputfile.write("]]>\n</htmldata>\n</post>\n")
  123.             n = n+1
  124.             print "Posts processed: %s" %(n)
  125.             break
  126.         elif line == "-----\n":
  127.             # when we reach the end of a post, write closing tags, reset counter and continue
  128.             outputfile.write("]]>\n</htmldata>\n</post>\n")
  129.             i = 1
  130.             n = n+1
  131.             continue
  132.         elif i == 1:
  133.             # enter the loop here, for the date line
  134.             outputfile.write("<post>\n")
  135.             outputfile.write("<date>" + line.rstrip('\n') + "</date>\n")
  136.             i = i+1
  137.             continue
  138.         elif i == 2:
  139.             # after the date line is the URL line
  140.             outputfile.write("<url>" + line.rstrip('\n') + "</url>\n")
  141.             i = i+1
  142.             continue
  143.         elif i == 3:
  144.             # after the URL is the first line of the comment
  145.             outputfile.write("<htmldata><![CDATA[\n" + line)
  146.             i = i+1
  147.             continue
  148.         elif i > 3:
  149.             # then we can have multiple comment lines, possibly many
  150.             outputfile.write(line)
  151.             i = i+1
  152.             continue
  153.         else:
  154.             # this means none of the conditions were matched, which should never happen
  155.             raise StopIteration, "Processing Error"
  156.             break
  157.     outputfile.write("</" + opts.root + ">\n")  # close the root element
  158.  
  159. # This is similar to above, except we actually go through the HTML and make it XML-safe using SAX
  160. def convertMefiMunge(inputfile, outputfile, opts):
  161.     # First we prepare the output file
  162.     outputfile.write('''<?xml version="1.0" encoding="UTF-8" ?>\n''')
  163.     outputfile.write("<" + opts.root + ">\n")  # write the open tag for the root element    
  164.     i,n = 1,0
  165.     escapedline = ""
  166.     while i:
  167.         line = unicode(inputfile.readline(), "UTF-8")
  168.         if line == '':
  169.             print "End of file reached."
  170.             # if we've hit the end of the file, stop processing and break out of the loop
  171.             outputfile.write("\n</htmldata>\n</post>\n")
  172.             n = n+1
  173.             print "Posts processed: %s" %(n)
  174.             break
  175.         elif line == "-----\n":
  176.             # when we reach the end of a post, write closing tags, reset counter and continue
  177.             outputfile.write("</htmldata>\n</post>\n")
  178.             i = 1
  179.             n = n+1
  180.             continue
  181.         elif i == 1:
  182.             # enter the loop here, for the date line
  183.             outputfile.write("<post>\n")
  184.             outputfile.write("<date>" + line.rstrip('\n') + "</date>\n")
  185.             i = i+1
  186.             continue
  187.         elif i == 2:
  188.             # after the date line is the URL line
  189.             outputfile.write("<url>" + line.rstrip('\n').encode('UTF-8') + "</url>\n")
  190.             i = i+1
  191.             continue
  192.         elif i == 3:
  193.             # after the URL is the first line of the comment
  194.             escapedline = saxutils.escape(line).encode('UTF-8')  # This makes everything XML-safe
  195.             outputfile.write("<htmldata>\n" + escapedline)
  196.             i = i+1
  197.             continue
  198.         elif i > 3:
  199.             # then we can have multiple comment lines, possibly many
  200.             escapedline = saxutils.escape(line).encode('UTF-8')  # This makes everything XML-safe
  201.             outputfile.write(escapedline)
  202.             i = i+1
  203.             continue
  204.         else:
  205.             # this means none of the conditions were matched, which should never happen
  206.             raise StopIteration, "Processing Error"
  207.             break
  208.     outputfile.write("</" + opts.root + ">\n")  # close the root element
  209.  
# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    # Use main()'s return value as the process exit status: sys.exit()
    # treats None or 0 as success, and main() returns 2 on error.
    sys.exit( main() ) # get exit status from main()