Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
"""Process LJSpeech-format datasets.

Reads ``training_file`` one record per line, where a record is
``<audio path><separator><transcript>`` (only the first two fields are used;
any further fields are ignored).  For every record the transcript is

* dropped when it contains a character outside the allowed set,
* dropped or flagged when it contains digits (see ``write_digits``),
* flagged when it looks like it contains a domain name, an upper-case
  abbreviation, or a dotted abbreviation such as ``U.S.``,
* optionally rewritten so that the abbreviations in ``replace_dict`` are
  spelled out (see ``do_replace_words``).

"Flagged" means the transcript is prefixed with ``*****`` so it can be found
and reviewed by hand; one marker is added per triggered check.  Surviving
records are written to ``out_file_name``, one per line.
"""
import glob
import os
import re
import shutil
import string

# Written abbreviation -> spoken form.  Keys are matched literally.
replace_dict = {
    'Mrs.': 'Misses',
    'Mr.': 'Mister',
    'Ms.': 'Miss',
    'Jr.': 'Junior',
    'Sr.': 'Senior',
    'Dr.': 'Doctor',
    'St.': 'Saint',
    'Rev.': 'Reverend',
    'email': 'e-mail',
}

training_file = "input-ljspeech-format.txt"
out_file_name = "output-file.txt"
separator = "|"
# Not used by this script; kept so existing references keep working.
all_text = ""
# 1: keep transcripts containing digits but flag them; 0: drop them.
write_digits = 0
# 1: apply replace_dict to every kept transcript.
do_replace_words = 1
# Audio paths of every well-formed input record, in input order.
training_audio_files = []

# Prefix added to transcripts that need manual review.
REVIEW_MARK = "*****"

# Any character that is NOT in the allowed transcript alphabet.
_INVALID_CHAR_RE = re.compile(r"[^A-Z0-9a-zŽžÀ-ÿ!¿¡'(),\-.:;? ]")
# Something shaped like a domain name, e.g. "example.com".
_DOMAIN_RE = re.compile(r"([a-z0-9]+(-[a-z0-9]+)*\.)+[a-z]{2,}")
# Two consecutive capitals ("USA") or mixed case like "PhD".  The original
# pattern ended in "[:alpha:]*", which Python's re treats as an optional run
# of the literal characters ':alph'; it never affected whether a match was
# found, so it is omitted here.
_ABBREV_RE = re.compile(r"[A-Z]{2}|[A-Z][a-z][A-Z]")
_DIGIT_RE = re.compile(r"[0-9]")
# Dotted abbreviations such as "U.S." or "e.g.".
_DOTTED_ABBREV_RE = re.compile(r"\b(?:[a-zA-Z]\.){2,}")


def expand_words(text, replacements=None):
    """Return *text* with every key of *replacements* replaced by its value.

    Keys are matched literally (regex metacharacters such as ``.`` are
    escaped) and only as whole tokens: a key is not replaced when it is
    directly preceded or followed by a word character.  ``\\b`` is not used
    for this because it never matches between a trailing ``.`` and a space,
    which would prevent ``"Mr. Smith"`` from ever being expanded.

    Args:
        text: Transcript to rewrite.
        replacements: Mapping of written form to replacement; defaults to
            the module-level ``replace_dict``.
    """
    if replacements is None:
        replacements = replace_dict
    for word, expansion in replacements.items():
        pattern = rf"(?<!\w){re.escape(word)}(?!\w)"
        text = re.sub(pattern, expansion, text)
    return text


def clean_transcript(text, digits_mode=None, expand=None):
    """Clean one transcript and decide whether its record is kept.

    Args:
        text: Transcript without its line terminator.
        digits_mode: ``1`` flags transcripts containing digits, ``0`` drops
            them, any other value leaves them alone.  Defaults to the
            module-level ``write_digits``.
        expand: ``1`` applies ``replace_dict``.  Defaults to the module-level
            ``do_replace_words``.

    Returns:
        The cleaned (possibly flagged) transcript, or ``None`` when the
        record must be dropped.
    """
    if digits_mode is None:
        digits_mode = write_digits
    if expand is None:
        expand = do_replace_words

    # Drop transcripts that contain characters outside the allowed alphabet.
    # (The original tested the opposite condition, which either never fired
    # or discarded every clean transcript.)
    if _INVALID_CHAR_RE.search(text):
        return None

    if _DOMAIN_RE.search(text):
        text = REVIEW_MARK + text
    if _ABBREV_RE.search(text):
        text = REVIEW_MARK + text

    if _DIGIT_RE.search(text):
        if digits_mode == 1:
            text = REVIEW_MARK + text
        elif digits_mode == 0:
            return None

    if expand == 1:
        text = expand_words(text)

    # Checked after expansion, so only abbreviations that were not spelled
    # out above are flagged.
    if _DOTTED_ABBREV_RE.search(text):
        text = REVIEW_MARK + text
    return text


def process_file(in_path, out_path, sep=None, verbose=True):
    """Filter the records of *in_path* into *out_path*.

    Lines with fewer than two *sep*-separated fields (for example blank
    lines) are skipped.  The audio path of every other line is appended to
    ``training_audio_files``, whether or not the record is kept.

    Args:
        in_path: Input file, read as UTF-8.
        out_path: Output file, written as UTF-8 (overwritten if it exists).
        sep: Field separator; defaults to the module-level ``separator``.
        verbose: When true, print one line per kept or dropped record.

    Returns:
        The number of records written.
    """
    if sep is None:
        sep = separator
    written = 0
    with open(in_path, "r", encoding="utf-8") as src, \
            open(out_path, "w", encoding="utf-8") as dst:
        for line in src:
            # Strip the terminator so it neither counts as an invalid
            # character nor decides whether the output gets a newline.
            fields = line.rstrip("\r\n").split(sep)
            if len(fields) < 2:
                continue
            file_path, raw_text = fields[0], fields[1]
            training_audio_files.append(file_path)

            cleaned = clean_transcript(raw_text)
            if cleaned is None:
                if verbose:
                    print("dropped: " + file_path + sep + raw_text)
                continue

            record = file_path + sep + cleaned
            if verbose:
                print("kept: " + record)
            dst.write(record + "\n")
            written += 1
    return written


def main():
    """Run the filter on the configured input and output files."""
    process_file(training_file, out_file_name)


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment