SHARE
TWEET

lyx2googletranslate

johanneswilm Sep 6th, 2011 447 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/python
  2. # coding=utf-8
  3. # Copyright Johannes 2011
  4. # mail@johanneswilm.org
  5. # Licensed under the GPL v. 3
  6.  
  7. def lyx2tex(filename):
  8.         basefilename=filename[:-4]
  9.         print "Converting " + filename + " -> "+basefilename+".tex..."
  10.         runcommand('lyx -e luatex -f all '+filename)
  11.         #print "Removing extra headers..."
  12.         #regex_remove_shell(basefilename+'.tex',r'(?s)\\begin{document}(.*)\\end{document}')
  13.         #print "Remove extra bibliographies..." # remove natbib bibliographies
  14.         #regex_replace(basefilename+'.tex',r'\\bibliography{([^}]*)}','')      
  15.         print "Fixing latex..."
  16.         runcommand('rpl "{[}" "[" '+basefilename+'.tex')
  17.         runcommand('rpl "{]}" "]" '+basefilename+'.tex')
  18.         runcommand('rpl -q ">{\\raggedright}" "" '+basefilename+'.tex')
  19.         runcommand('rpl -q ">{\\raggedleft}" "" '+basefilename+'.tex')
  20.         #runcommand('rpl -q "\\lyxdot " "." '+basefilename+'.tex')
  21.         return filename[:-4]+'.tex'
  22.  
  23. from sys import exit
  24. from commands import getstatusoutput
  25.  
  26. def runcommand(command,exit_on_failure=True):
  27.         output = getstatusoutput(command)
  28.         if (output[0] != 0 and exit_on_failure==True):
  29.                 print command
  30.                 print output[1]
  31.                 exit()
  32.         return output
  33.  
  34. import re
  35.  
  36. def regex_replace(filename,from_string,to_string):
  37.         orig_filecontents = file(filename).read()
  38.         replacement_file = open(filename,'w')
  39.         replacement_file.write(re.sub(from_string, to_string, orig_filecontents))
  40.         replacement_file.close()
  41.         return True
  42.  
  43. def split_html(filename,words):
  44.         splits = words/5000
  45.         regex_replace(filename,'<html><body><p>','')
  46.         regex_replace(filename,'</body></html>','')
  47.         orig_filecontents = file(filename).read()
  48.         orig_filecontents_plist = orig_filecontents.split('<p>')
  49.         orig_filecontents_plist_len = len(orig_filecontents_plist)
  50.         orig_filecontents_plist_len_part = orig_filecontents_plist_len/splits
  51.         for split in range(splits):
  52.                 part_filecontents = '<html><body><p>' + "<p>".join(orig_filecontents_plist[(orig_filecontents_plist_len_part*split):(orig_filecontents_plist_len_part*(split+1))]) + '</body></html>'
  53.                 part_file = open(filename[:-5]+'_'+str(split)+'.html','w')
  54.                 part_file.write(part_filecontents)
  55.                 part_file.close()
  56.         runcommand('rm '+filename)
  57.         return True
  58.  
  59. def spacerepl(matchobj):
  60.         return matchobj.group(0)+matchobj.group(1).replace(' ','SPACEMARK')+matchobj.group(2)
  61.  
  62. def dotrepl(matchobj):
  63.         return matchobj.group(0)+matchobj.group(1).replace('.','DOTMARK')+matchobj.group(2)
  64.  
  65. def commarepl(matchobj):
  66.         return matchobj.group(0)+matchobj.group(1).replace(',','COMMAMARK')+matchobj.group(2)
  67.  
  68.  
  69.  
  70. def fit_for_html(html_file):
  71.         regex_replace(html_file,'{}`','`')# cleaning up
  72.         regex_replace(html_file,r'\\%','PERCENTAGE')
  73.         regex_replace(html_file,r'%([^\n]*)\n',r'\n')# cleaning up
  74.         regex_replace(html_file,'PERCENTAGE',r'<span class="notranslate">\\%</span>')  
  75.         regex_replace(html_file,r'^^','<html><body><div class="notranslate"><p>')
  76.         regex_replace(html_file,r'$$','</p></body><html>')
  77.         regex_replace(html_file,r'\n\n',r'\n</p><p>\n')
  78.         regex_replace(html_file,r'<body>\n</p>',r'<body>\n')
  79.         regex_replace(html_file,r'\\begin{document}',r'\\begin{document}</p></div>\n<p>')
  80.         regex_replace(html_file,r'\\\\','SLASH')
  81.         regex_replace(html_file,r'\\\[','BEGINPARENTESIS')
  82.         regex_replace(html_file,r'\\\]','ENDPARENTESIS')
  83.         regex_replace(html_file,r'\\\{','BEGINBRACKET')
  84.         regex_replace(html_file,r'\\\}','ENDBRACKET')
  85.         regex_replace(html_file,r'\{\}','EMPTYBRACKET')
  86.         regex_replace(html_file,r'\\protect','')       
  87.         regex_replace(html_file,r'\\(hline|tabularnewline|ldots| \& |item)',r'<span class="notranslate">\\\1</span>') # without contents
  88.         regex_replace(html_file,r'\\(index){([^}]*)}',r'<span class="notranslate">\\\1{</span>\2<span class="notranslate">endparentesis</span>') # translate contents
  89.         regex_replace(html_file,r'\\(citep|citet|citetitle|citeauthor|begin|includegraphics|label|bibliography|input|include|ref|vref|pageref)({|\[)([^}]*)}',r'<span class="notranslate">\\\1\2\3endparentesis</span>') # don't translate contents
  90.         regex_replace(html_file,r'\\(end)({|\[)([^}]*)}',r'</p><p class="notranslate">\\\1\2\3endparentesis</p><p>') # don't translate contents and add paragraph end  
  91.         regex_replace(html_file,r'\\(emph){([^}]*)}',r'<span class="notranslate">\\\1{</span>\2<span class="notranslate">endparentesis</span>') # translate contents
  92.         regex_replace(html_file,r'\\(chapter|section|subsection|subsubsection|chapter\*|section\*|subsection\*|subsubsection\*|caption|footnote){([^}]*)}',r'<span class="notranslate">\\\1{</span>\2<span class="notranslate">}</span>') # translate contents
  93.         regex_replace(html_file,r'\\(chapter|section|subsection|subsubsection|chapter\*|section\*|subsection\*|subsubsection\*|caption|footnote)\[([^\]]*)\]{([^}]*)}',r'<span class="notranslate">\\\1[</span>\2<span class="notranslate">]{</span>\3<span class="notranslate">}</span>') # translate contents
  94.         regex_replace(html_file,r'\\(addcontentsline)({[^}]*})({[^}]*}){([^}]*)}',r'<span class="notranslate">\\\1\2\3{</span>\4<span class="notranslate">}</span>') # translate some contents
  95.         regex_replace(html_file,r'\\(item</span>) \[{([^}]*)}\]',r'\\\1 <span class="notranslate">[</span>\2<span class="notranslate">]</span>') # don't translate and remove extra bracket
  96.         regex_replace(html_file,r'([|{)([^]}]*)(]|})',spacerepl) # temporarily remove spaces inside arguments
  97.         regex_replace(html_file,r'([|{)([^]}]*)(]|})',dotrepl) # temporarily remove dots inside arguments
  98.         regex_replace(html_file,r'([|{)([^]}]*)(]|})',commarepl) # temporarily remove commas inside arguments
  99.         regex_replace(html_file,r'\\(parencites)([^\s\\\<\.\,]*)(\r|\s|\\|\<|\.|\,|\r)',r'<span class="notranslate">\\\1\2</span>\3') # don't translate contents, goes on forever
  100.         regex_replace(html_file,'endparentesis','}')
  101.         regex_replace(html_file,r'</span>({|\[)([^\s\r]*)(\r|\s)',r'\1\2</span>\3') # some arguments fell outside of span
  102.         regex_replace(html_file,'&','<span class=\"notranslate\">&amp;</span>')
  103.         regex_replace(html_file,r' \\\$',r'<span class="notranslate">-\\$</span>') # with space in front
  104.         regex_replace(html_file,r'\\\$',r'<span class="notranslate">\\$</span>')
  105.         regex_replace(html_file,'~','<span class="notranslate">~</span>')
  106.         regex_replace(html_file,'\`',r'<span class="notranslate">`</span>')
  107.         regex_replace(html_file,'\'s',"APOSMARKs") # genetive s singular
  108.         regex_replace(html_file,'s\'',"sAPOSMARK") # genetive s plural
  109.         regex_replace(html_file,r'([A-z])\'([A-z])',r"\1APOSMARK\2") # inside word, not likely quotation marks
  110.         regex_replace(html_file,'\''," <span class=\"notranslate\">'</span>")
  111.         regex_replace(html_file,'APOSMARK','\'')
  112.         regex_replace(html_file,'SPACEMARK',' ')
  113.         regex_replace(html_file,'DOTMARK','.')
  114.         regex_replace(html_file,'COMMAMARK','.')
  115.         regex_replace(html_file,'BEGINPARENTESIS',r'<span class="notranslate">\\\[</span>')
  116.         regex_replace(html_file,'ENDPARENTESIS',r'<span class="notranslate">\\\]</span>')
  117.         regex_replace(html_file,'BEGINBRACKET',r'<span class="notranslate">\\\{</span>')
  118.         regex_replace(html_file,'ENDBRACKET',r'<span class="notranslate">\\\}</span>')
  119.         regex_replace(html_file,'SLASH',r'<span class="notranslate">\\\\</span>')
  120.         regex_replace(html_file,'EMPTYBRACKET',r'<span class="notranslate">{}</span>')
  121.         regex_replace(html_file,r'</span>\.',r'.</span>')
  122.         regex_replace(html_file,'</span>,',',</span>')
  123. #       regex_replace(html_file,r'{','BEGINBRACKET')
  124. #       regex_replace(html_file,r'}','ENDBRACKET')     
  125.         return True
  126.        
  127.  
  128.                
  129. def import_lyx():
  130.         lyx_files = runcommand('ls *lyx')[1].split()
  131.         for lyx_file in lyx_files:
  132.                 tex_file = lyx2tex(lyx_file)
  133.                 html_file = tex_file.replace('.','_') + '.html'
  134.                 runcommand('cp ' + tex_file + ' ' + html_file)
  135.                 fit_for_html(html_file)
  136.                 words = int(runcommand('wc -w ' + html_file)[1].split()[0])
  137.                 if words / 5000 > 1:
  138.                         split_html(html_file,words)
  139.  
  140. def exclamrepl(matchobj):
  141.         return matchobj.group(0).replace('! ','!')
  142.  
  143. def fix_latex():
  144.         tex_files = runcommand('ls *tex')[1].split()
  145.         for tex_file in tex_files:
  146.                 regex_replace(tex_file,r'Translated version of ([^\.]*)\.html','') # remove header
  147.                 regex_replace(tex_file,r'\\index{([^}]*)}',exclamrepl) # replace exclamation marks
  148.                 regex_replace(tex_file,r'({|\[) ',r'\1') # remove extra spaces
  149.                 regex_replace(tex_file,r' (}|\])',r'\1') # remove extra spaces
  150.                 regex_replace(tex_file,r' \\index',r'\\index') # remove extra spaces
  151.                 regex_replace(tex_file,r'` ',r'`') # remove extra spaces
  152.                 regex_replace(tex_file,' \'','\'') # remove extra spaces
  153.                 regex_replace(tex_file,' ~ ','~') # remove extra spaces
  154.                 regex_replace(tex_file,r' \\\\$',r'\\$') # remove extra spaces
  155.                 regex_replace(tex_file,r'\-\\\\$',r'\\$') # remove extra spaces
  156.  
  157. import sys
  158.  
  159. if __name__ == "__main__":
  160.         sys_argv=sys.argv
  161.         command_options="i"
  162.         if len(sys_argv) > 1 and sys_argv[1][0]== '-':
  163.                 command_options=sys_argv.pop(1)[1:]
  164.         if 'h' in command_options:
  165.                 print
  166.                 print "Usage:"
  167.                 print sys_argv[0]+" -[hif]"
  168.                 print
  169.                 print "h)elp"
  170.                 print "i)import"
  171.                 print "f)ix output"
  172.                 print
  173.         if 'i' in command_options:
  174.                 import_lyx()
  175.         if 'f' in command_options:
  176.                 fix_latex()
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top