Advertisement
Guest User

Untitled

a guest
Jan 18th, 2019
72
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.57 KB | None | 0 0
  1. #!/usr/bin/env python
  2. import argparse
  3. from collections import defaultdict
  4.  
  5.  
  6. parser = argparse.ArgumentParser()
  7. parser.add_argument("-d", "--data",
  8. default="data/hansards",
  9. help="Data filename prefix (default=data/hansards).")
  10. parser.add_argument("-e", "--english",
  11. default="e",
  12. help="Suffix of English filename (default=e).")
  13. parser.add_argument("-f", "--french",
  14. default="f",
  15. help="Suffix of French filename (default=f).")
  16. parser.add_argument("-o", "--out",
  17. default="dice.a",
  18. help="Output path (default=dice.a).")
  19. parser.add_argument("-t", "--threshold",
  20. default=0.5,
  21. type=float,
  22. help="Threshold for aligning with Dice's coefficient "
  23. "(default=0.5).")
  24. parser.add_argument("-n", "--num_sentences",
  25. default=999999,
  26. type=int,
  27. help="Number of sentences to use for training and "
  28. "alignment.")
  29. args = parser.parse_args()
  30. f_data = "%s.%s" % (args.data, args.french)
  31. e_data = "%s.%s" % (args.data, args.english)
  32.  
  33.  
  34. print("Training with Dice's coefficient...")
  35. print("\tCounting...")
  36. bitext = [[sentence.strip().split() for sentence in pair] for
  37. pair in zip(open(f_data), open(e_data))][:args.num_sentences]
  38. f_count = defaultdict(int)
  39. e_count = defaultdict(int)
  40. fe_count = defaultdict(int)
  41. for n, (f, e) in enumerate(bitext):
  42. for f_i in set(f):
  43. f_count[f_i] += 1
  44. for e_j in set(e):
  45. fe_count[(f_i, e_j)] += 1
  46. for e_j in set(e):
  47. e_count[e_j] += 1
  48. if n % 500 == 0:
  49. print("Went through {} sentence pairs...".format(n))
  50.  
  51.  
  52. print("\tComputing coefficients...")
  53. dice = defaultdict(int)
  54. for (k, (f_i, e_j)) in enumerate(fe_count.keys()):
  55. dice[(f_i, e_j)] = 2 * fe_count[(f_i, e_j)] / (f_count[f_i] +
  56. e_count[e_j])
  57. if k % 5000 == 0:
  58. print("Went through {} word pairs...".format(k))
  59. print()
  60.  
  61. print("Computing alignments...")
  62. with open(args.out, mode="w") as out_file:
  63. for n, (f, e) in enumerate(bitext):
  64. for (i, f_i) in enumerate(f):
  65. for (j, e_j) in enumerate(e):
  66. if dice[(f_i, e_j)] >= args.threshold:
  67. out_file.write("%i-%i " % (i, j))
  68. if n % 500 == 0:
  69. print("Went through {} sentence pairs...".format(n))
  70. out_file.write("\n")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement