Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python
- # coding=utf-8
- # Copyright Johannes 2011
- # Licensed under the GPL v. 3
def lyx2tex(filename):
    """Export a LyX document to LaTeX and clean up the exported file.

    Runs ``lyx -e luatex -f all`` on *filename*, then uses the external
    ``rpl`` tool to unwrap braced square brackets and to delete the
    raggedright / raggedleft column modifiers from the generated file.

    Args:
        filename: Name of the LyX file. The last four characters are cut
            off to form the base name, so a four-character extension such
            as ``.lyx`` is assumed.

    Returns:
        The name of the generated ``.tex`` file.
    """
    tex_name = filename[:-4] + '.tex'
    print("Converting " + filename + " -> " + tex_name + "...")
    runcommand('lyx -e luatex -f all ' + filename)
    #print "Removing extra headers..."
    #regex_remove_shell(basefilename+'.tex',r'(?s)\\begin{document}(.*)\\end{document}')
    #print "Remove extra bibliographies..." # remove natbib bibliographies
    #regex_replace(basefilename+'.tex',r'\\bibliography{([^}]*)}','')
    print("Fixing latex...")
    # Each entry is an rpl invocation that only lacks the target file name.
    for rpl_call in (
        'rpl "{[}" "[" ',
        'rpl "{]}" "]" ',
        'rpl -q ">{\\raggedright}" "" ',
        'rpl -q ">{\\raggedleft}" "" ',
    ):
        runcommand(rpl_call + tex_name)
    #runcommand('rpl -q "\\lyxdot " "." '+basefilename+'.tex')
    return tex_name
- from sys import exit
- from commands import getstatusoutput
def runcommand(command, exit_on_failure=True):
    """Run *command* through the shell and return its status and output.

    Args:
        command: Shell command line passed to ``getstatusoutput``.
        exit_on_failure: When true (the default), a non-zero status prints
            the command and its output and terminates the program.

    Returns:
        The ``(status, output)`` tuple produced by ``getstatusoutput``.

    Raises:
        SystemExit: With exit status 1 when the command fails and
            *exit_on_failure* is true.
    """
    output = getstatusoutput(command)
    if output[0] != 0 and exit_on_failure:
        print(command)
        print(output[1])
        # A bare exit() would end the process with status 0, which makes a
        # failed run look successful to whatever invoked the script.
        exit(1)
    return output
- import re
def regex_replace(filename, from_string, to_string):
    """Apply ``re.sub`` to the whole contents of a file, in place.

    Args:
        filename: Path of the file to rewrite.
        from_string: Regular expression pattern to search for.
        to_string: Replacement template, or a callable that receives the
            match object and returns the replacement text.

    Returns:
        Always ``True``.

    Raises:
        re.error: If the pattern or the replacement template is invalid.
            The file is left untouched in that case.
    """
    with open(filename) as source_file:
        orig_filecontents = source_file.read()
    # Do the substitution before reopening the file: mode 'w' truncates it
    # immediately, so a failing pattern would otherwise wipe the contents.
    new_filecontents = re.sub(from_string, to_string, orig_filecontents)
    with open(filename, 'w') as replacement_file:
        replacement_file.write(new_filecontents)
    return True
def split_html(filename, words):
    """Split an HTML file into roughly equal parts and delete the original.

    The literal wrappers ``<html><body><p>`` and ``</body></html>`` are
    stripped, the remainder is cut at every ``<p>`` and the paragraphs are
    spread over ``words // 5000`` part files. Each part is wrapped in the
    same two literals again and written next to the original as
    ``<name>_<index>.html``.

    Args:
        filename: Path of the HTML file. The last five characters are cut
            off to build the part names, so a ``.html`` suffix is assumed.
        words: Word count of the file; one part per full 5000 words, with
            a minimum of one part.

    Returns:
        Always ``True``.

    Raises:
        IOError / OSError: If the file cannot be read, written or removed.
    """
    import os  # function-scope import: the file does not import os itself

    # Floor division keeps the part count an int; the clamp avoids a
    # division by zero for files shorter than 5000 words.
    splits = max(1, words // 5000)
    with open(filename) as html_file:
        contents = html_file.read()
    contents = contents.replace('<html><body><p>', '').replace('</body></html>', '')
    paragraphs = contents.split('<p>')
    total = len(paragraphs)
    stem = filename[:-5]
    for part in range(splits):
        # Proportional bounds cover every paragraph exactly once. Equal
        # floor-sized chunks would leave the last total % splits paragraphs
        # out of all parts, and they would vanish with the original file.
        start = total * part // splits
        stop = total * (part + 1) // splits
        part_contents = ('<html><body><p>'
                         + '<p>'.join(paragraphs[start:stop])
                         + '</body></html>')
        with open(stem + '_' + str(part) + '.html', 'w') as part_file:
            part_file.write(part_contents)
    os.remove(filename)
    return True
def spacerepl(matchobj):
    """Substitution callback that swaps spaces in group 1 for SPACEMARK.

    Returns the whole match, then group 1 with every space replaced by
    ``SPACEMARK``, then group 2.
    """
    # NOTE(review): group(0) is the entire match, so the matched text is
    # emitted unchanged and then followed by the rewritten group 1 and by
    # group 2. Confirm this duplication is intended before relying on it.
    whole, first, second = matchobj.group(0, 1, 2)
    return ''.join((whole, first.replace(' ', 'SPACEMARK'), second))
def dotrepl(matchobj):
    """Substitution callback that swaps dots in group 1 for DOTMARK.

    Returns the whole match, then group 1 with every ``.`` replaced by
    ``DOTMARK``, then group 2.
    """
    # NOTE(review): group(0) is the entire match, so the matched text is
    # emitted unchanged and then followed by the rewritten group 1 and by
    # group 2. Confirm this duplication is intended before relying on it.
    whole, first, second = matchobj.group(0, 1, 2)
    return ''.join((whole, first.replace('.', 'DOTMARK'), second))
def commarepl(matchobj):
    """Substitution callback that swaps commas in group 1 for COMMAMARK.

    Returns the whole match, then group 1 with every ``,`` replaced by
    ``COMMAMARK``, then group 2.
    """
    # NOTE(review): group(0) is the entire match, so the matched text is
    # emitted unchanged and then followed by the rewritten group 1 and by
    # group 2. Confirm this duplication is intended before relying on it.
    whole, first, second = matchobj.group(0, 1, 2)
    return ''.join((whole, first.replace(',', 'COMMAMARK'), second))
def fit_for_html(html_file):
    """Turn a copied LaTeX file into HTML with LaTeX markup marked notranslate.

    The file is rewritten in place by a fixed sequence of regex_replace
    passes. The order matters: escaped characters are first swapped for
    placeholder words (PERCENTAGE, SLASH, BEGINBRACKET, ...), LaTeX commands
    are then wrapped in ``notranslate`` spans or paragraphs, and finally the
    placeholders are turned back into their LaTeX spelling.

    Args:
        html_file: Path of the file to rewrite.

    Returns:
        Always ``True``.
    """
    regex_replace(html_file, '{}`', '`')  # cleaning up
    regex_replace(html_file, r'\\%', 'PERCENTAGE')
    regex_replace(html_file, r'%([^\n]*)\n', r'\n')  # cleaning up
    regex_replace(html_file, 'PERCENTAGE', r'<span class="notranslate">\\%</span>')
    regex_replace(html_file, r'^^', '<html><body><div class="notranslate"><p>')
    # \Z matches only at the very end. '$' also matches just before a final
    # newline, which would append the footer twice to a newline-terminated
    # file. The footer closes the document with </html>.
    regex_replace(html_file, r'\Z', '</p></body></html>')
    regex_replace(html_file, r'\n\n', r'\n</p><p>\n')
    regex_replace(html_file, r'<body>\n</p>', r'<body>\n')
    regex_replace(html_file, r'\\begin{document}', r'\\begin{document}</p></div>\n<p>')
    regex_replace(html_file, r'\\\\', 'SLASH')
    regex_replace(html_file, r'\\\[', 'BEGINPARENTESIS')
    regex_replace(html_file, r'\\\]', 'ENDPARENTESIS')
    regex_replace(html_file, r'\\\{', 'BEGINBRACKET')
    regex_replace(html_file, r'\\\}', 'ENDBRACKET')
    regex_replace(html_file, r'\{\}', 'EMPTYBRACKET')
    regex_replace(html_file, r'\\protect', '')
    regex_replace(html_file, r'\\(hline|tabularnewline|ldots| \& |item)', r'<span class="notranslate">\\\1</span>')  # without contents
    regex_replace(html_file, r'\\(index){([^}]*)}', r'<span class="notranslate">\\\1{</span>\2<span class="notranslate">endparentesis</span>')  # translate contents
    regex_replace(html_file, r'\\(citep|citet|citetitle|citeauthor|begin|includegraphics|label|bibliography|input|include|ref|vref|pageref)({|\[)([^}]*)}', r'<span class="notranslate">\\\1\2\3endparentesis</span>')  # don't translate contents
    regex_replace(html_file, r'\\(end)({|\[)([^}]*)}', r'</p><p class="notranslate">\\\1\2\3endparentesis</p><p>')  # don't translate contents and add paragraph end
    regex_replace(html_file, r'\\(emph){([^}]*)}', r'<span class="notranslate">\\\1{</span>\2<span class="notranslate">endparentesis</span>')  # translate contents
    regex_replace(html_file, r'\\(chapter|section|subsection|subsubsection|chapter\*|section\*|subsection\*|subsubsection\*|caption|footnote){([^}]*)}', r'<span class="notranslate">\\\1{</span>\2<span class="notranslate">}</span>')  # translate contents
    regex_replace(html_file, r'\\(chapter|section|subsection|subsubsection|chapter\*|section\*|subsection\*|subsubsection\*|caption|footnote)\[([^\]]*)\]{([^}]*)}', r'<span class="notranslate">\\\1[</span>\2<span class="notranslate">]{</span>\3<span class="notranslate">}</span>')  # translate contents
    regex_replace(html_file, r'\\(addcontentsline)({[^}]*})({[^}]*}){([^}]*)}', r'<span class="notranslate">\\\1\2\3{</span>\4<span class="notranslate">}</span>')  # translate some contents
    regex_replace(html_file, r'\\(item</span>) \[{([^}]*)}\]', r'\\\1 <span class="notranslate">[</span>\2<span class="notranslate">]</span>')  # don't translate and remove extra bracket
    # NOTE(review): in the three patterns below the first '[' is unescaped,
    # so '[|{)([^]' parses as a single character class and the pattern has
    # two groups rather than an opener/contents/closer triple. They are left
    # unchanged because spacerepl/dotrepl/commarepl index groups 0-2; the
    # pattern and the callbacks need to be corrected together.
    regex_replace(html_file, r'([|{)([^]}]*)(]|})', spacerepl)  # temporarily remove spaces inside arguments
    regex_replace(html_file, r'([|{)([^]}]*)(]|})', dotrepl)  # temporarily remove dots inside arguments
    regex_replace(html_file, r'([|{)([^]}]*)(]|})', commarepl)  # temporarily remove commas inside arguments
    regex_replace(html_file, r'\\(parencites)([^\s\\\<\.\,]*)(\r|\s|\\|\<|\.|\,|\r)', r'<span class="notranslate">\\\1\2</span>\3')  # don't translate contents, goes on forever
    regex_replace(html_file, 'endparentesis', '}')
    regex_replace(html_file, r'</span>({|\[)([^\s\r]*)(\r|\s)', r'\1\2</span>\3')  # some arguments fell outside of span
    regex_replace(html_file, '&', '<span class="notranslate">&</span>')
    regex_replace(html_file, r' \\\$', r'<span class="notranslate">-\\$</span>')  # with space in front
    regex_replace(html_file, r'\\\$', r'<span class="notranslate">\\$</span>')
    regex_replace(html_file, '~', '<span class="notranslate">~</span>')
    regex_replace(html_file, r'\`', r'<span class="notranslate">`</span>')
    regex_replace(html_file, "'s", "APOSMARKs")  # genetive s singular
    regex_replace(html_file, "s'", "sAPOSMARK")  # genetive s plural
    # The A-z range also spans the characters [ \ ] ^ _ and backtick, so a
    # backslash directly before the apostrophe matches here as well.
    regex_replace(html_file, r'([A-z])\'([A-z])', r"\1APOSMARK\2")  # inside word, not likely quotation marks
    regex_replace(html_file, "'", " <span class=\"notranslate\">'</span>")
    regex_replace(html_file, 'APOSMARK', "'")
    regex_replace(html_file, 'SPACEMARK', ' ')
    regex_replace(html_file, 'DOTMARK', '.')
    regex_replace(html_file, 'COMMAMARK', ',')  # commas come back as commas
    # In a replacement template '\\' yields one backslash and the bracket is
    # written bare. An escaped bracket ('\[') is an unknown escape that re
    # leaves in place, which would put a second backslash into the output.
    regex_replace(html_file, 'BEGINPARENTESIS', r'<span class="notranslate">\\[</span>')
    regex_replace(html_file, 'ENDPARENTESIS', r'<span class="notranslate">\\]</span>')
    regex_replace(html_file, 'BEGINBRACKET', r'<span class="notranslate">\\{</span>')
    regex_replace(html_file, 'ENDBRACKET', r'<span class="notranslate">\\}</span>')
    regex_replace(html_file, 'SLASH', r'<span class="notranslate">\\\\</span>')
    regex_replace(html_file, 'EMPTYBRACKET', r'<span class="notranslate">{}</span>')
    regex_replace(html_file, r'</span>\.', r'.</span>')
    regex_replace(html_file, '</span>,', ',</span>')
    # regex_replace(html_file,r'{','BEGINBRACKET')
    # regex_replace(html_file,r'}','ENDBRACKET')
    return True
def import_lyx():
    """Convert every LyX file listed by ``ls *lyx`` into translation-ready HTML.

    For each file: export it to LaTeX with lyx2tex, copy the result to a
    name in which every dot is replaced by an underscore plus ``.html``,
    run fit_for_html on the copy, count its words with ``wc -w`` and hand
    it to split_html when the count holds more than one full 5000-word
    block twice over (``words // 5000 > 1``).
    """
    listing = runcommand('ls *lyx')[1]
    for lyx_file in listing.split():
        tex_file = lyx2tex(lyx_file)
        html_file = '_'.join(tex_file.split('.')) + '.html'
        runcommand('cp ' + tex_file + ' ' + html_file)
        fit_for_html(html_file)
        # wc prints "<count> <file>"; the first field is the word count.
        wc_output = runcommand('wc -w ' + html_file)[1]
        words = int(wc_output.split()[0])
        if words // 5000 > 1:
            split_html(html_file, words)
def exclamrepl(matchobj):
    """Substitution callback that deletes the space after each ``!``.

    Returns the full matched text with every ``"! "`` shortened to ``"!"``.
    """
    matched_text = matchobj.group(0)
    return '!'.join(matched_text.split('! '))
def fix_latex():
    """Clean up every file listed by ``ls *tex`` in place.

    Removes the "Translated version of ....html" header, the spaces after
    ``!`` inside index entries, and stray spaces next to brackets, braces,
    index commands, backticks, apostrophes, tildes and escaped dollar signs.
    """
    tex_files = runcommand('ls *tex')[1].split()
    for tex_file in tex_files:
        regex_replace(tex_file, r'Translated version of ([^\.]*)\.html', '')  # remove header
        regex_replace(tex_file, r'\\index{([^}]*)}', exclamrepl)  # replace exclamation marks
        regex_replace(tex_file, r'({|\[) ', r'\1')  # remove extra spaces
        regex_replace(tex_file, r' (}|\])', r'\1')  # remove extra spaces
        regex_replace(tex_file, r' \\index', r'\\index')  # remove extra spaces
        regex_replace(tex_file, r'` ', r'`')  # remove extra spaces
        regex_replace(tex_file, ' \'', '\'')  # remove extra spaces
        regex_replace(tex_file, ' ~ ', '~')  # remove extra spaces
        # The dollar sign must be escaped to match a literal "\$". Left bare
        # it is an end-of-string anchor and the pattern asks for two
        # backslashes at the very end of the file instead.
        regex_replace(tex_file, r' \\\$', r'\\$')  # remove extra spaces
        # NOTE(review): fit_for_html writes "-\$" where the source had a
        # space before "\$"; this drops the marker without restoring the
        # space. Confirm whether the replacement should be " \$".
        regex_replace(tex_file, r'\-\\\$', r'\\$')  # remove extra spaces
- import sys
# Command-line entry point. An optional first argument of the form -[hif]
# selects the actions; without it only the import step runs.
if __name__ == "__main__":
    sys_argv = sys.argv
    command_options = "i"
    # startswith() also copes with an empty-string argument, for which
    # indexing [0] would raise IndexError.
    if len(sys_argv) > 1 and sys_argv[1].startswith('-'):
        command_options = sys_argv.pop(1)[1:]
    if 'h' in command_options:
        print("")
        print("Usage:")
        print(sys_argv[0] + " -[hif]")
        print("")
        print("h)elp")
        print("i)import")
        print("f)ix output")
        print("")
    # The flags are independent: "-if" imports first and then fixes.
    if 'i' in command_options:
        import_lyx()
    if 'f' in command_options:
        fix_latex()
Add Comment
Please, Sign In to add comment