Advertisement
alvations

IBM 1

Sep 9th, 2014
581
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.46 KB | None | 0 0
  1. from collections import defaultdict, Counter
  2. from itertools import chain
  3.  
  4. src = """dogs are in house .
  5. dogs are in house .
  6. dogs are in house .
  7. dogs are in house .
  8. cat is in house .
  9. cat is in house .
  10. is house in town ?
  11. is house in town ?
  12. is house in town ?
  13. is house in town ?"""
  14.  
  15. trg = """koirat on talossa .
  16. koira t on talo ssa .
  17. koirat on talo ssa .
  18. koira t on talossa .
  19. kissa  on talossa .
  20. kissa  on talo ssa .
  21. onks talo kaupungissa ?
  22. onks talo kaupunki ssa ?
  23. on ko talo kaupungissa ?
  24. on ko talo kaupunki ssa ?"""
  25.  
  26. '''
  27. src = """dogs are in house .
  28. cat is in house .
  29. is house in town ?"""
  30.  
  31. trg = """koirat on talossa .
  32. kissa  on talossa .
  33. onks talo kaupungissa ?
  34. """
  35. '''
  36.  
  37. # Convert into docstream.
  38. src = [i.split() for i in src.split('\n')]
  39. trg = [i.split() for i in trg.split('\n')]
  40.  
  41. src_vocab = Counter(chain(*src))
  42. trg_vocab = Counter(chain(*trg))
  43.  
  44.  
  45. num_probs = len(src_vocab) * len(trg_vocab)
  46.  
  47. default_prob = 1.0 / len(src_vocab)
  48. t = defaultdict(lambda: default_prob) # probability table.
  49.  
  50. convergent_threshold=1e-2
  51. globally_converged = False
  52. probabilities = []
  53. iteration_count = 0
  54.  
  55. while not globally_converged:
  56.     count = defaultdict(float) # count(e|f)
  57.     total = defaultdict(float) # total(f)
  58.    
  59.     for srcline, trgline in zip(src, trg):
  60.         s_total = {} # Sum of probabilities for this sentence pair.
  61.         for srcword in srcline:
  62.             s_total[srcword] = 0.0
  63.             for trgword in trgline:
  64.                 s_total[srcword] += t[srcword, trgword]
  65.    
  66.  
  67.         for srcword in srcline:
  68.             for trgword in trgline:
  69.                 # Normalize probabilities.
  70.                 cnt = t[srcword, trgword] / s_total[srcword]
  71.                 # Summing the prob of srcword given trgword.
  72.                 count[srcword, trgword] += cnt
  73.                 total[trgword] += cnt
  74.        
  75.     num_converged = 0
  76.     for trgword in trg_vocab:
  77.         for srcword in src_vocab:
  78.             ##print engdeu_corpus.dictionary[trgword], engdeu_corpus.dictionary[srcword], count[srcword, trgword], total[trgword]
  79.             new_prob = count[srcword, trgword] / total[trgword]
  80.             delta = abs(t[srcword, trgword] - new_prob)
  81.             if delta < convergent_threshold:
  82.                 num_converged += 1
  83.             t[srcword, trgword] = new_prob
  84.  
  85.     iteration_count += 1
  86.     if num_converged == num_probs:
  87.         globally_converged = True
  88.  
  89. for i,j in sorted(t):
  90.     print i,j, t[i,j]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement