Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import os
- import subprocess
- import sys
def run_vecalign(name, volume):
    """Align one Japanese/Korean volume pair using vecalign and LASER.

    Runs, in order:

    1. ``./vecalign/overlap.py`` on the Japanese and Korean text files,
    2. ``./LASER/tasks/embed/embed.sh`` on each overlap file (language
       codes ``ja`` and ``ko``),
    3. ``./vecalign/vecalign.py``, whose standard output is written to
       ``./alignments/{name}_{volume}.txt``.

    The intermediate overlap and embedding files are removed afterwards,
    whether or not a step failed.

    Args:
        name: Series identifier used to build the ``./all_volumes`` paths.
        volume: Volume identifier used to build the same paths.

    Raises:
        subprocess.CalledProcessError: If any external tool exits non-zero.
        OSError: If a tool cannot be started or the alignment file cannot
            be opened for writing.
    """
    j_file = f"./all_volumes/{name}_jp_{volume}.txt"
    k_file = f"./all_volumes/{name}_ko_{volume}.txt"
    j_overlap = f"./all_volumes/{name}_jp_{volume}.overlap.txt"
    k_overlap = f"./all_volumes/{name}_ko_{volume}.overlap.txt"
    j_embed = f"./all_volumes/{name}_jp_{volume}.embed"
    k_embed = f"./all_volumes/{name}_ko_{volume}.embed"
    alignment = f"./alignments/{name}_{volume}.txt"
    # NOTE: a verbose export to ./alignments/{name}_{volume}.verbose.txt
    # (aligned sentence text plus score) existed here only as disabled,
    # commented-out code and has been dropped.

    # Argument lists (no shell) so that names containing spaces or shell
    # metacharacters are passed through verbatim instead of being re-parsed.
    overlap_and_embed_steps = [
        ["./vecalign/overlap.py", "-i", j_file, "-o", j_overlap, "-n", "3"],
        ["./vecalign/overlap.py", "-i", k_file, "-o", k_overlap, "-n", "3"],
        ["./LASER/tasks/embed/embed.sh", j_overlap, "ja", j_embed],
        ["./LASER/tasks/embed/embed.sh", k_overlap, "ko", k_embed],
    ]
    align_step = [
        "./vecalign/vecalign.py",
        "--alignment_max_size", "3",
        "--src", j_file,
        "--tgt", k_file,
        "--src_embed", j_overlap, j_embed,
        "--tgt_embed", k_overlap, k_embed,
    ]

    try:
        for step in overlap_and_embed_steps:
            # check=True: stop at the first failing tool rather than feeding
            # missing or partial files to the next one.
            subprocess.run(step, check=True)
        # Replaces the shell's "> alignment" redirection.
        with open(alignment, "wb") as alignment_out:
            subprocess.run(align_step, stdout=alignment_out, check=True)
    finally:
        for leftover in (j_overlap, k_overlap, j_embed, k_embed):
            try:
                os.remove(leftover)
            except FileNotFoundError:
                # The step that creates this file never ran or failed;
                # keep removing the remaining intermediates.
                pass
def parse_infofile(infofile):
    """Parse a volume-list file into ``{series_name: [volume_token, ...]}``.

    A line whose stripped text is purely alphabetic and longer than one
    character starts a new series (re-using a name starts it over with an
    empty list). Every other line is split on whitespace and its tokens are
    appended, as strings, to the most recent series.

    Args:
        infofile: Path of the text file to read.

    Returns:
        dict mapping each series name to its list of volume tokens, in file
        order.

    Raises:
        ValueError: If volume tokens appear before any series name.
        OSError: If the file cannot be opened.
    """
    ret = {}
    current_key = None
    # NOTE(review): read with the platform default encoding, as before —
    # confirm whether the file should be read as UTF-8 explicitly.
    with open(infofile, 'r') as file:
        for line_no, content in enumerate(file, start=1):
            stripped = content.strip()
            # Judge the length on the stripped text so stray leading or
            # trailing whitespace cannot turn a one-letter token into a
            # series header.
            if stripped.isalpha() and len(stripped) > 1:
                current_key = stripped
                ret[current_key] = []
                continue
            tokens = stripped.split()
            if not tokens:
                continue
            if current_key is None:
                raise ValueError(
                    f"{infofile}: line {line_no}: volume entries appear "
                    "before any series name"
                )
            ret[current_key].extend(tokens)
    return ret
if __name__ == "__main__":
    # Script entry point: run the alignment for every (series, volume) pair
    # listed in the info file. A failure on one volume is reported and the
    # loop moves on to the next.
    # NOTE(review): the file name literally begins with "# " — confirm that
    # this is the intended path.
    info = parse_infofile("# volumes.txt")
    print("start.")
    for name, volumes in info.items():
        for vol in volumes:
            try:
                run_vecalign(name, vol)
            except Exception as exc:
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate with their original traceback.
                print(f"error with {name} {vol}: {exc!r}")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement