Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Expectation-maximisation (EM) training of a word-translation probability
# table over a tiny English/Finnish parallel corpus.  The loop has the shape
# of IBM Model 1 training without a NULL word: t[srcword, trgword] is
# re-estimated until no entry moves by `convergent_threshold` or more.
from collections import defaultdict, Counter
from itertools import chain

# Parallel corpus: line k of `src` is paired with line k of `trg`.
src = """dogs are in house .
dogs are in house .
dogs are in house .
dogs are in house .
cat is in house .
cat is in house .
is house in town ?
is house in town ?
is house in town ?
is house in town ?"""

trg = """koirat on talossa .
koira t on talo ssa .
koirat on talo ssa .
koira t on talossa .
kissa on talossa .
kissa on talo ssa .
onks talo kaupungissa ?
onks talo kaupunki ssa ?
on ko talo kaupungissa ?
on ko talo kaupunki ssa ?"""

# Smaller alternative corpus, kept for reference (disabled):
#
# src = """dogs are in house .
# cat is in house .
# is house in town ?"""
# trg = """koirat on talossa .
# kissa on talossa .
# onks talo kaupungissa ?
# """

# Convert into docstream: one list of whitespace-separated tokens per line.
src = [line.split() for line in src.split('\n')]
trg = [line.split() for line in trg.split('\n')]

# Token frequencies; only the sets of distinct words are used below.
src_vocab = Counter(chain.from_iterable(src))
trg_vocab = Counter(chain.from_iterable(trg))

# One table entry per (source word, target word) pair.
num_probs = len(src_vocab) * len(trg_vocab)

# Uniform start: every source word is equally likely for any target word.
default_prob = 1.0 / len(src_vocab)
t = defaultdict(lambda: default_prob)  # probability table.

convergent_threshold = 1e-2
globally_converged = False
probabilities = []  # Never read or written by this script; kept as-is.
iteration_count = 0

while not globally_converged:
    count = defaultdict(float)  # count(e|f)
    total = defaultdict(float)  # total(f)

    # E-step: collect fractional co-occurrence counts from each sentence pair.
    for srcline, trgline in zip(src, trg):
        s_total = {}  # Sum of probabilities for this sentence pair.
        for srcword in srcline:
            s_total[srcword] = 0.0
            for trgword in trgline:
                s_total[srcword] += t[srcword, trgword]

        for srcword in srcline:
            for trgword in trgline:
                # Normalize probabilities.
                cnt = t[srcword, trgword] / s_total[srcword]
                # Summing the prob of srcword given trgword.
                count[srcword, trgword] += cnt
                total[trgword] += cnt

    # M-step: re-estimate every table entry and count how many barely moved.
    num_converged = 0
    for trgword in trg_vocab:
        trg_total = total[trgword]
        if not trg_total:
            # This target word took part in no zipped sentence pair (e.g. the
            # two corpora have different line counts).  Dividing would raise
            # ZeroDivisionError, so leave its entries at their prior value
            # and treat them as converged so the loop can still terminate.
            num_converged += len(src_vocab)
            continue
        for srcword in src_vocab:
            new_prob = count[srcword, trgword] / trg_total
            delta = abs(t[srcword, trgword] - new_prob)
            if delta < convergent_threshold:
                num_converged += 1
            t[srcword, trgword] = new_prob

    iteration_count += 1
    # Stop only once every single entry changed by less than the threshold.
    if num_converged == num_probs:
        globally_converged = True

# Dump the final table, sorted by (source word, target word).
# A single pre-formatted argument keeps the output identical whether `print`
# is the Python 2 statement or the Python 3 function.
for i, j in sorted(t):
    print("%s %s %s" % (i, j, t[i, j]))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement