Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import codecs
- '''
- Snippets from xWN, a wordnet toy-tool developed with Open Multilingual WordNet http://www.casta-net.jp/~kuribayashi/multi/
- '''
def readWNfile(wnfile, option="ss"):
    """Read a tab-separated WordNet file and return its noun entries as a dict.

    Every data line is expected to hold at least three tab-separated fields:
    the first is the synset id and the third is the word. Lines starting with
    "#" are skipped, and so are lines whose synset id does not end in "n".

    Args:
        wnfile: Path of the UTF-8 encoded .tab file.
        option: "ss" (the default) keys the result by synset id with words as
            values; any other value keys it by word with synset ids as values.

    Returns:
        A dict mapping each key to its values joined by ";", in file order.
    """
    wn = {}
    # The context manager closes the file even if parsing raises.
    with codecs.open(wnfile, "r", "utf8") as reader:
        for line in reader:
            if line.startswith("#"):
                continue
            # Strip only the line terminator, so a final line without a
            # newline (or a CRLF file) does not lose or gain a character.
            fields = line.rstrip("\r\n").split("\t")
            synset = fields[0]
            # Ignore incomplete rows and synsets whose id does not end in "n".
            if len(fields) < 3 or not synset.endswith("n"):
                continue
            word = fields[2]
            if option == "ss":
                key, value = synset, word
            else:
                key, value = word, synset
            if key in wn:
                wn[key] += ";" + value
            else:
                wn[key] = value
    return wn
def createDicfrom2WNs(wnfile1, wnfile2, outfile=None, delimiter=None):
    """Build a word-to-word dictionary from two WordNet .tab files.

    Both files are loaded with readWNfile(..., "ss"). For every synset id
    present in both, each word of the first file is mapped to the words the
    second file lists for that synset. The pairs are written to `outfile`,
    one "word<delimiter>translation" line each, sorted by word and then by
    translation.

    A word that occurs in several shared synsets keeps the translations of
    all of them (duplicates removed) instead of only those of whichever
    synset happened to be processed last.

    Args:
        wnfile1: Path of the .tab file whose words become the keys.
        wnfile2: Path of the .tab file whose words become the values.
        outfile: Output path. Defaults to "<a>-<b>.dic", where <a> and <b>
            are characters [-7:-4] of each input path (e.g. "eng" for
            "wn-data-eng.tab").
        delimiter: Separator written between the two words; defaults to a tab.

    Returns:
        A dict mapping each word from `wnfile1` to a ";"-joined string of
        words from `wnfile2`.
    """
    wn1 = readWNfile(wnfile1, "ss")
    wn2 = readWNfile(wnfile2, "ss")

    # word -> list of unique translations, in first-seen order.
    collected = {}
    for synset, words1 in wn1.items():
        words2 = wn2.get(synset)
        if words2 is None:
            continue
        # NOTE(review): this keeps the effective behavior of the original
        # test `len(a) or len(b) is 1`, which only rejects a synset when the
        # first entry is empty and the second is not exactly one character
        # long. The intent was possibly "exactly one word on either side";
        # confirm before tightening the filter.
        if not (words1 or len(words2) == 1):
            continue
        for word in words1.split(";"):
            # Progress trace on stdout; this single-argument form prints the
            # same text under Python 2 and Python 3.
            print("%s %s" % (word, words2))
            bucket = collected.setdefault(word, [])
            for translation in words2.split(";"):
                if translation not in bucket:
                    bucket.append(translation)
    newdic = {word: ";".join(items) for word, items in collected.items()}

    # Configure output file.
    if outfile is None:
        outfile = wnfile1[-7:-4] + "-" + wnfile2[-7:-4] + ".dic"
    if delimiter is None:
        delimiter = "\t"

    # The context manager guarantees the output is flushed and closed.
    with codecs.open(outfile, "w", "utf8") as out:
        for word in sorted(newdic):
            for translation in sorted(newdic[word].split(";")):
                out.write(word + delimiter + translation + "\n")
    return newdic
# Input WordNet .tab files. NOTE: despite its name, `spa_wnfile` points at
# the "ind" data file.
eng_wnfile = '/media/E418A6B618A686E0/xling/wordnet/wn-data-eng.tab'
spa_wnfile = '/media/E418A6B618A686E0/xling/wordnet/wn-data-ind.tab'

# Runs at import time. No `outfile` is given, so the output name is derived
# from the two input paths.
createDicfrom2WNs(wnfile1=eng_wnfile, wnfile2=spa_wnfile, delimiter=" @ ")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement