Guest User

Untitled

a guest
Jul 30th, 2023
144
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.78 KB | None | 0 0
  1. # process ljspeech format datasets
  2.  
  3. import glob
  4. import os
  5. import re
  6. import shutil
  7. import string
  8.  
  9. replace_dict = {
  10. 'Mrs.': 'Misses',
  11. 'Mr.': 'Mister',
  12. 'Ms.': 'Miss',
  13. 'Jr.': 'Junior',
  14. 'Sr.': 'Senior',
  15. 'Dr.': 'Doctor',
  16. 'St.': 'Saint',
  17. 'Rev.': 'Reverend',
  18. 'email': 'e-mail'
  19. }
  20.  
  21. training_file = "input-ljspeech-format.txt"
  22. out_file_name = "output-file.txt"
  23. separator = "|"
  24. all_text =""
  25. write_digits = 0
  26. do_replace_words = 1
  27.  
  28. out_file = open(out_file_name,"w")
  29. with open(training_file, 'r') as tf:
  30. for line in tf:
  31. line_sep = separator
  32. print(line)
  33. split_line = line.split("|")
  34. print(split_line)
  35. file_path = split_line[0]
  36. training_audio_files.append(file_path)
  37. trans_text = split_line[1]
  38. match = re.search("[^A-Z0-9a-zŽžÀ-ÿ!¿¡'(),-.:;? ]", trans_text)
  39. if not match:
  40. print(trans_text)
  41. trans_text = ""
  42. file_path = ""
  43. line_sep = ""
  44. domain = re.search("([a-z0-9]+(-[a-z0-9]+)*\.)+[a-z]{2,}", trans_text)
  45. if domain:
  46. trans_text = "*****" + trans_text
  47. abbrev = re.search(r"(?:[A-Z]{2}[:alpha:]*)|(?:[A-Z][a-z][A-Z][:alpha:]*)", trans_text)
  48. if abbrev:
  49. trans_text = "*****" + trans_text
  50.  
  51. digits_found = re.search(r"^(?!.*[0-9]).*$", trans_text)
  52. if not digits_found:
  53.  
  54. if write_digits == 1:
  55. trans_text = "*****" + trans_text
  56.  
  57. elif write_digits == 0:
  58. trans_text = ""
  59. line_sep = ""
  60. file_path = ""
  61.  
  62. if do_replace_words==1:
  63. for key, value in replace_dict.items():
  64. trans_text = re.sub(rf'\b{key}\b', value, trans_text)
  65.  
  66. abbrev_periods = re.search(r"\b(?:[a-zA-Z]\.){2,}", trans_text)
  67. if abbrev_periods:
  68. print(trans_text)
  69. trans_text = "*****" + trans_text
  70.  
  71. print(file_path)
  72. print(line_sep)
  73. print(trans_text)
  74. out_file.write(file_path + line_sep + trans_text)
  75.  
Advertisement
Add Comment
Please, Sign In to add comment