# Untitled

# Process LJSpeech-format datasets.

import glob
import os
import re
import shutil
import string

# Abbreviation -> replacement word. The processing loop substitutes each key
# found in a transcription with its value, walking the entries in the order
# listed here.
replace_dict = dict(
    (
        ('Mrs.', 'Misses'),
        ('Mr.', 'Mister'),
        ('Ms.', 'Miss'),
        ('Jr.', 'Junior'),
        ('Sr.', 'Senior'),
        ('Dr.', 'Doctor'),
        ('St.', 'Saint'),
        ('Rev.', 'Reverend'),
        ('email', 'e-mail'),
    )
)

# --- behaviour switches ---
# What to do with a transcription that contains digits:
# 1 keeps it with a "*****" prefix, 0 drops the whole line.
write_digits = 0
# 1 enables substitution of the replace_dict entries.
do_replace_words = 1

# --- input / output ---
# List file that is read line by line.
training_file = "input-ljspeech-format.txt"
# File the processed lines are written to.
out_file_name = "output-file.txt"
# Placed between the file path and the text of every written line.
separator = "|"

# Not referenced by the visible part of the script.
all_text = ""

# Transcription clean-up.
#
# Reads `training_file` (one "path|text" record per line), filters and
# normalises every transcription, and writes the surviving records to
# `out_file_name`.

# Input fields are always split on a literal pipe; `separator` only controls
# what is written between the two fields of an output record.
INPUT_SEPARATOR = "|"

# Prefix added (once) to a transcription that needs a manual look.
REVIEW_MARKER = "*****"

# Any character outside the accepted symbol set.
_DISALLOWED_CHAR_RE = re.compile(r"[^A-Z0-9a-zŽžÀ-ÿ!¿¡'(),\-.:;? ]")
# Something shaped like a domain name, e.g. "example.com".
_DOMAIN_RE = re.compile(r"([a-z0-9]+(-[a-z0-9]+)*\.)+[a-z]{2,}")
# Two consecutive capitals ("NASA") or capital-lowercase-capital ("McD").
# Python's `re` has no POSIX classes, so no `[:alpha:]` tail is used; it
# never influenced whether a match was found anyway.
_ABBREV_RE = re.compile(r"[A-Z]{2}|[A-Z][a-z][A-Z]")
_DIGIT_RE = re.compile(r"[0-9]")
# Dotted abbreviations such as "U.S." or "e.g.".
_DOTTED_ABBREV_RE = re.compile(r"\b(?:[a-zA-Z]\.){2,}")


def expand_words(text, replacements):
    """Replace every key of *replacements* found in *text* with its value.

    Keys are matched literally (regex metacharacters such as the period in
    ``"Mr."`` are escaped) and case-sensitively, in the mapping's iteration
    order.  A word boundary is only required on a side of the key that is
    itself a word character, so ``"Mr. Smith"`` is expanded while ``"Dry"``
    is left alone.

    Args:
        text: The transcription to rewrite.
        replacements: Mapping of literal search string to replacement.

    Returns:
        The rewritten text.
    """
    for word, spoken in replacements.items():
        if not word:
            # An empty pattern would match between every character.
            continue
        lead = r"\b" if re.match(r"\w", word) else ""
        trail = r"\b" if re.search(r"\w\Z", word) else ""
        pattern = lead + re.escape(word) + trail
        # A callable replacement keeps backslashes in `spoken` literal.
        text = re.sub(pattern, lambda _match, spoken=spoken: spoken, text)
    return text


def clean_transcription(text, replacements=None, write_digits=0,
                        do_replace=True):
    """Filter and normalise one transcription.

    Args:
        text: Transcription without its trailing line break.
        replacements: Mapping handed to :func:`expand_words`; ``None`` or an
            empty mapping disables expansion.
        write_digits: ``0`` drops text containing digits, ``1`` keeps it and
            marks it for review, any other value keeps it unmarked.
        do_replace: Whether to apply *replacements*.

    Returns:
        The cleaned text, prefixed once with ``REVIEW_MARKER`` when it looks
        like it contains a domain name, an abbreviation or (kept) digits, or
        ``None`` when the transcription must be dropped.
    """
    # Drop text that contains a character outside the accepted symbol set.
    if _DISALLOWED_CHAR_RE.search(text):
        return None

    needs_review = bool(_DOMAIN_RE.search(text) or _ABBREV_RE.search(text))

    if _DIGIT_RE.search(text):
        if write_digits == 0:
            return None
        if write_digits == 1:
            needs_review = True

    if do_replace and replacements:
        text = expand_words(text, replacements)

    # Checked after expansion so the replacement words are taken into account.
    if _DOTTED_ABBREV_RE.search(text):
        needs_review = True

    return REVIEW_MARKER + text if needs_review else text


def process_line(line, out_sep="|", replacements=None, write_digits=0,
                 do_replace=True):
    """Turn one raw input line into the record to write.

    Only the first two pipe-separated fields (file path and transcription)
    are used; any further fields are ignored.

    Args:
        line: Raw line as read from the input file (line break allowed).
        out_sep: String written between file path and transcription.
        replacements, write_digits, do_replace: See
            :func:`clean_transcription`.

    Returns:
        ``"<path><out_sep><text>\\n"``, or ``""`` when the line has no
        separator or its transcription is dropped.
    """
    file_path, found_sep, rest = line.rstrip("\r\n").partition(INPUT_SEPARATOR)
    if not found_sep:
        return ""
    raw_text = rest.split(INPUT_SEPARATOR, 1)[0]
    cleaned = clean_transcription(raw_text, replacements, write_digits,
                                  do_replace)
    if cleaned is None:
        return ""
    return file_path + out_sep + cleaned + "\n"


# File path of every well-formed input record, in input order (including
# records whose transcription ends up being dropped).
training_audio_files = []

# Guarded so that importing this module does not touch the file system.
if __name__ == "__main__":
    # UTF-8 is set explicitly: the accepted symbol set contains non-ASCII
    # letters and the platform default encoding is not reliable for them.
    with open(training_file, "r", encoding="utf-8") as tf, \
            open(out_file_name, "w", encoding="utf-8") as out_file:
        for line in tf:
            if INPUT_SEPARATOR in line:
                training_audio_files.append(
                    line.split(INPUT_SEPARATOR, 1)[0])
            record = process_line(line, separator, replace_dict,
                                  write_digits, do_replace_words == 1)
            if record:
                print(record, end="")
            else:
                print("dropped: " + line.rstrip("\r\n"))
            out_file.write(record)