# lyx2googletranslate

#!/usr/bin/python
# coding=utf-8
# Copyright Johannes 2011
# [email protected]
# Licensed under the GPL v. 3

def lyx2tex(filename):
    """Export a LyX file to LaTeX with the lyx command and tidy the result.

    The last four characters of filename (expected to be an extension such
    as ".lyx") are dropped to build the name of the .tex file.  That name is
    returned.
    """
    stem = filename[:-4]
    tex_name = stem + '.tex'
    print("Converting " + filename + " -> " + tex_name + "...")
    runcommand('lyx -e luatex -f all '+filename)
    # Stripping of extra headers and of natbib bibliographies used to happen
    # at this point; both steps are currently disabled.
    print("Fixing latex...")
    # Each entry is the argument list handed to the external rpl tool:
    # unwrap "{[}" / "{]}" and drop the ragged column specifiers.
    # (A further replacement of "\lyxdot " by "." is disabled.)
    rpl_argument_sets = (
        '"{[}" "["',
        '"{]}" "]"',
        '-q ">{\\raggedright}" ""',
        '-q ">{\\raggedleft}" ""',
    )
    for rpl_arguments in rpl_argument_sets:
        runcommand('rpl ' + rpl_arguments + ' ' + tex_name)
    return tex_name

from sys import exit
from commands import getstatusoutput

def runcommand(command,exit_on_failure=True):
    """Run a shell command and return what getstatusoutput reports.

    command is the complete shell command line.  The return value is the
    (status, output) pair produced by getstatusoutput.

    When the status is non-zero and exit_on_failure is true, the command
    and its output are printed and the script terminates with exit
    status 1 instead of returning.
    """
    output = getstatusoutput(command)
    if output[0] != 0 and exit_on_failure:
        print(command)
        print(output[1])
        # A bare exit() would end the script with status 0 and so report
        # success to the caller; signal the failure explicitly.
        exit(1)
    return output

import re

def regex_replace(filename,from_string,to_string):
    """Apply a regular-expression substitution to a file in place.

    filename    -- path of the file to rewrite
    from_string -- pattern handed to re.sub
    to_string   -- replacement handed to re.sub (a string or a callable
                   taking the match object)

    Returns True.  Errors raised by re.sub or by the file operations
    propagate to the caller.
    """
    with open(filename) as original_file:
        orig_filecontents = original_file.read()
    # Compute the new text before reopening the file for writing: opening
    # with 'w' truncates it, so a failing substitution would otherwise
    # leave an empty file behind.
    new_filecontents = re.sub(from_string, to_string, orig_filecontents)
    with open(filename,'w') as replacement_file:
        replacement_file.write(new_filecontents)
    return True

def split_html(filename,words):
    """Split an HTML file into numbered part files and delete the original.

    filename -- path of the HTML file; its last five characters (expected
                to be ".html") are replaced by "_<n>.html" for part n
    words    -- word count of the file; one part is made per full 5000
                words, with a minimum of one part

    The text is cut at "<p>" markers.  Every part is wrapped in
    "<html><body><p>" ... "</body></html>".  Paragraphs that do not divide
    evenly go into the last part so no text is lost.  Returns True.
    """
    import os

    # At least one part, so a short file cannot cause a division by zero.
    splits = max(words // 5000, 1)
    with open(filename) as html_file:
        contents = html_file.read()
    # Drop the wrapper if present; it is added again around every part.
    contents = contents.replace('<html><body><p>','')
    contents = contents.replace('</body></html>','')
    paragraphs = contents.split('<p>')
    # Never make more parts than there are paragraphs to put in them.
    splits = min(splits, len(paragraphs))
    part_len = len(paragraphs) // splits
    for split in range(splits):
        start = part_len * split
        if split == splits - 1:
            # The last part also takes the remainder of the division.
            end = len(paragraphs)
        else:
            end = part_len * (split + 1)
        part_filecontents = '<html><body><p>' + "<p>".join(paragraphs[start:end]) + '</body></html>'
        with open(filename[:-5]+'_'+str(split)+'.html','w') as part_file:
            part_file.write(part_filecontents)
    os.remove(filename)
    return True

def spacerepl(matchobj):
    """Return the matched text with every space replaced by SPACEMARK.

    Intended as a replacement callback for re.sub.  The whole match is
    returned exactly once; emitting group(0) followed by the groups again
    would duplicate the matched text.
    """
    return matchobj.group(0).replace(' ','SPACEMARK')

def dotrepl(matchobj):
    """Return the matched text with every dot replaced by DOTMARK.

    Intended as a replacement callback for re.sub.  The whole match is
    returned exactly once; emitting group(0) followed by the groups again
    would duplicate the matched text.
    """
    return matchobj.group(0).replace('.','DOTMARK')

def commarepl(matchobj):
    """Return the matched text with every comma replaced by COMMAMARK.

    Intended as a replacement callback for re.sub.  The whole match is
    returned exactly once; emitting group(0) followed by the groups again
    would duplicate the matched text.
    """
    return matchobj.group(0).replace(',','COMMAMARK')


def fit_for_html(html_file):
    """Rewrite html_file in place so the LaTeX it contains is wrapped in HTML.

    The file is passed through a fixed sequence of regex_replace calls.
    LaTeX commands and special characters are wrapped in elements of class
    "notranslate", blank lines become paragraph breaks, and several
    upper-case placeholder words (SLASH, APOSMARK, ...) temporarily stand
    in for characters that later patterns must not see.  The order of the
    calls matters.  Returns True.
    """
    regex_replace(html_file,'{}`','`')# cleaning up
    # Hide escaped percent signs so the comment stripping below keeps them.
    regex_replace(html_file,r'\\%','PERCENTAGE')
    regex_replace(html_file,r'%([^\n]*)\n',r'\n')# cleaning up
    regex_replace(html_file,'PERCENTAGE',r'<span class="notranslate">\\%</span>')
    # HTML header at the very start of the text.
    regex_replace(html_file,r'^^','<html><body><div class="notranslate"><p>')
    # HTML footer at the very end of the text.  \Z is used because "$" also
    # matches just before a trailing newline, which inserted the footer
    # twice; the closing tag is "</html>", not a second "<html>".
    regex_replace(html_file,r'\Z','</p></body></html>')
    # Blank lines separate paragraphs.
    regex_replace(html_file,r'\n\n',r'\n</p><p>\n')
    regex_replace(html_file,r'<body>\n</p>',r'<body>\n')
    # Everything up to \begin{document} stays inside the notranslate div.
    regex_replace(html_file,r'\\begin{document}',r'\\begin{document}</p></div>\n<p>')
    # Placeholders for escaped characters; they are turned into
    # notranslate spans near the end of this function.
    regex_replace(html_file,r'\\\\','SLASH')
    regex_replace(html_file,r'\\\[','BEGINPARENTESIS')
    regex_replace(html_file,r'\\\]','ENDPARENTESIS')
    regex_replace(html_file,r'\\\{','BEGINBRACKET')
    regex_replace(html_file,r'\\\}','ENDBRACKET')
    regex_replace(html_file,r'\{\}','EMPTYBRACKET')
    regex_replace(html_file,r'\\protect','')
    # "endparentesis" below is a lower-case placeholder for a closing "}";
    # it is turned back into "}" further down.
    regex_replace(html_file,r'\\(hline|tabularnewline|ldots| \& |item)',r'<span class="notranslate">\\\1</span>') # without contents
    regex_replace(html_file,r'\\(index){([^}]*)}',r'<span class="notranslate">\\\1{</span>\2<span class="notranslate">endparentesis</span>') # translate contents
    regex_replace(html_file,r'\\(citep|citet|citetitle|citeauthor|begin|includegraphics|label|bibliography|input|include|ref|vref|pageref)({|\[)([^}]*)}',r'<span class="notranslate">\\\1\2\3endparentesis</span>') # don't translate contents
    regex_replace(html_file,r'\\(end)({|\[)([^}]*)}',r'</p><p class="notranslate">\\\1\2\3endparentesis</p><p>') # don't translate contents and add paragraph end
    regex_replace(html_file,r'\\(emph){([^}]*)}',r'<span class="notranslate">\\\1{</span>\2<span class="notranslate">endparentesis</span>') # translate contents
    regex_replace(html_file,r'\\(chapter|section|subsection|subsubsection|chapter\*|section\*|subsection\*|subsubsection\*|caption|footnote){([^}]*)}',r'<span class="notranslate">\\\1{</span>\2<span class="notranslate">}</span>') # translate contents
    regex_replace(html_file,r'\\(chapter|section|subsection|subsubsection|chapter\*|section\*|subsection\*|subsubsection\*|caption|footnote)\[([^\]]*)\]{([^}]*)}',r'<span class="notranslate">\\\1[</span>\2<span class="notranslate">]{</span>\3<span class="notranslate">}</span>') # translate contents
    regex_replace(html_file,r'\\(addcontentsline)({[^}]*})({[^}]*}){([^}]*)}',r'<span class="notranslate">\\\1\2\3{</span>\4<span class="notranslate">}</span>') # translate some contents
    regex_replace(html_file,r'\\(item</span>) \[{([^}]*)}\]',r'\\\1 <span class="notranslate">[</span>\2<span class="notranslate">]</span>') # don't translate and remove extra bracket
    # NOTE(review): as written, "([|{)" opens a character class instead of
    # matching "[" or "{", so this pattern has only two groups and rarely
    # matches; the opener was presumably meant to be "(\[|{)".  Left
    # unchanged here because correcting it changes what the masking
    # callbacks receive and what the later patterns see.
    regex_replace(html_file,r'([|{)([^]}]*)(]|})',spacerepl) # temporarily remove spaces inside arguments
    regex_replace(html_file,r'([|{)([^]}]*)(]|})',dotrepl) # temporarily remove dots inside arguments
    regex_replace(html_file,r'([|{)([^]}]*)(]|})',commarepl) # temporarily remove commas inside arguments
    regex_replace(html_file,r'\\(parencites)([^\s\\\<\.\,]*)(\r|\s|\\|\<|\.|\,|\r)',r'<span class="notranslate">\\\1\2</span>\3') # don't translate contents, goes on forever
    regex_replace(html_file,'endparentesis','}')
    regex_replace(html_file,r'</span>({|\[)([^\s\r]*)(\r|\s)',r'\1\2</span>\3') # some arguments fell outside of span
    regex_replace(html_file,'&','<span class=\"notranslate\">&amp;</span>')
    regex_replace(html_file,r' \\\$',r'<span class="notranslate">-\\$</span>') # with space in front
    regex_replace(html_file,r'\\\$',r'<span class="notranslate">\\$</span>')
    regex_replace(html_file,'~','<span class="notranslate">~</span>')
    regex_replace(html_file,'\`',r'<span class="notranslate">`</span>')
    # Protect apostrophes that belong to words before the remaining ones
    # are wrapped as quotation marks.
    regex_replace(html_file,'\'s',"APOSMARKs") # genetive s singular
    regex_replace(html_file,'s\'',"sAPOSMARK") # genetive s plural
    # NOTE(review): the range A-z also covers the ASCII characters between
    # "Z" and "a" (brackets, backslash, caret, underscore, backtick).
    regex_replace(html_file,r'([A-z])\'([A-z])',r"\1APOSMARK\2") # inside word, not likely quotation marks
    regex_replace(html_file,'\''," <span class=\"notranslate\">'</span>")
    # Turn the placeholders back into the characters they stand for.
    regex_replace(html_file,'APOSMARK','\'')
    regex_replace(html_file,'SPACEMARK',' ')
    regex_replace(html_file,'DOTMARK','.')
    # COMMAMARK stands for a comma (it used to be restored as a dot).
    regex_replace(html_file,'COMMAMARK',',')
    regex_replace(html_file,'BEGINPARENTESIS',r'<span class="notranslate">\\\[</span>')
    regex_replace(html_file,'ENDPARENTESIS',r'<span class="notranslate">\\\]</span>')
    regex_replace(html_file,'BEGINBRACKET',r'<span class="notranslate">\\\{</span>')
    regex_replace(html_file,'ENDBRACKET',r'<span class="notranslate">\\\}</span>')
    regex_replace(html_file,'SLASH',r'<span class="notranslate">\\\\</span>')
    regex_replace(html_file,'EMPTYBRACKET',r'<span class="notranslate">{}</span>')
    # Pull punctuation that directly follows a span into the span.
    regex_replace(html_file,r'</span>\.',r'.</span>')
    regex_replace(html_file,'</span>,',',</span>')
#   regex_replace(html_file,r'{','BEGINBRACKET')
#   regex_replace(html_file,r'}','ENDBRACKET')
    return True


def import_lyx():
    """Convert every file matched by the shell glob *lyx into HTML.

    Each file goes through lyx2tex; the resulting .tex file is copied to
    an .html file (dots in the .tex name become underscores) which is then
    processed by fit_for_html.  The copy is handed to split_html when its
    word count, as reported by wc -w, floor-divided by 5000 exceeds 1.
    """
    listing = runcommand('ls *lyx')[1]
    for source_name in listing.split():
        tex_name = lyx2tex(source_name)
        html_name = tex_name.replace('.','_') + '.html'
        runcommand('cp ' + tex_name + ' ' + html_name)
        fit_for_html(html_name)
        # wc prints the count first, followed by the file name.
        wc_report = runcommand('wc -w ' + html_name)[1]
        word_count = int(wc_report.split()[0])
        if word_count // 5000 > 1:
            split_html(html_name,word_count)

def exclamrepl(matchobj):
    """Return the matched text with the space after each "!" removed.

    Intended as a replacement callback for re.sub.
    """
    matched_text = matchobj.group(0)
    return matched_text.replace('! ','!')

def fix_latex():
    """Clean up every file matched by the shell glob *tex, in place.

    The substitutions below are applied to each file in the listed order
    through regex_replace.
    """
    cleanup_steps = (
        (r'Translated version of ([^\.]*)\.html',''), # remove header
        (r'\\index{([^}]*)}',exclamrepl), # replace exclamation marks
        # The remaining steps remove extra spaces.
        (r'({|\[) ',r'\1'),
        (r' (}|\])',r'\1'),
        (r' \\index',r'\\index'),
        (r'` ',r'`'),
        (' \'','\''),
        (' ~ ','~'),
        # NOTE(review): the next two patterns match a space (or a dash)
        # followed by two backslashes at the end of the text; confirm that
        # this is what was meant.
        (r' \\\\$',r'\\$'),
        (r'\-\\\\$',r'\\$'),
    )
    listing = runcommand('ls *tex')[1]
    for tex_file in listing.split():
        for pattern, replacement in cleanup_steps:
            regex_replace(tex_file,pattern,replacement)

import sys

if __name__ == "__main__":
    arguments = sys.argv
    # Importing is the default action when no option string is given.
    selected_options = "i"
    has_option_string = len(arguments) > 1 and arguments[1][0] == '-'
    if has_option_string:
        # Take the option argument out of the argument list and drop its
        # leading dash; every remaining letter selects an action.
        selected_options = arguments.pop(1)[1:]
    if 'h' in selected_options:
        usage_lines = (
            "",
            "Usage:",
            arguments[0]+" -[hif]",
            "",
            "h)elp",
            "i)import",
            "f)ix output",
            "",
        )
        for usage_line in usage_lines:
            print(usage_line)
    if 'i' in selected_options:
        import_lyx()
    if 'f' in selected_options:
        fix_latex()