#!/usr/bin/env python
# -*- coding: utf-8 -*-
from pydub import AudioSegment
from tqdm import tqdm
import os
from random import shuffle
import utils.audio_converter as converter
# ARPAbet block
dictionary_path = r"/media/cookie/Samsung PM961/TwiBot/tacotron2/filelists/merged.dict_.txt"
print("Running, Please wait...")
thisdict = {}  # word (upper-case) -> ARPAbet phoneme string
with open(dictionary_path, "r") as dict_file:
    for line in reversed(dict_file.read().splitlines()):
        word, phonemes = line.split(" ", 1)
        thisdict[word] = phonemes.strip()
print("Dictionary Ready.")
# Functions
def concat_text(filenames, outpath):
    """Concatenate several text files into one, with a newline between each file."""
    with open(outpath, 'w') as outfile:
        first_file = True
        for fname in filenames:
            if not first_file:
                outfile.write("\n")  # add a newline between each file
            first_file = False
            with open(fname) as infile:
                for line in infile:
                    outfile.write(line)
def arpabet(input_path, output_path, encoding="utf-8"):
    """Rewrite a 'path|text|speaker' filelist, replacing every word that has a
    dictionary entry with its {ARPAbet} transcription."""
    sym = list("☺☻♥♦♣♠•◘○◙♂♀♪♫☼►◄↕‼¶§▬↨↑↓→←∟↔▲")  #  = new line
    output_string = ""
    for line in open(input_path, "r").read().splitlines():
        # Pull any embedding symbols out of the line so they can be re-inserted
        # at the front of the text field.
        phoneme_embed = ""
        for i in sym:
            if i in line:
                phoneme_embed = phoneme_embed + i
                line = line.replace(i, "")
        out = ''
        for word_ in line.split("|")[1].split(" "):
            word = word_
            end_chars = ''
            # Strip trailing punctuation so the dictionary lookup sees the bare word.
            while any(elem in word for elem in "!?,.;") and len(word) > 1:
                if word[-1] in ['!', '?', ',', '.', ';', ':', "'", "", "-", "_"]:
                    end_chars = word[-1] + end_chars
                    word = word[:-1]
                else:
                    break
            try:
                word_arpa = thisdict[word.upper()]
            except KeyError:
                word_arpa = ''
            if len(word_arpa) != 0:
                word = "{" + str(word_arpa) + "}"
            out = (out + " " + word + end_chars).strip()
        output_string = output_string + line.split("|")[0] + "|" + phoneme_embed + out + "|" + line.split("|")[2] + "\n"
    with open(output_path, "w", encoding=encoding) as text_file:
        text_file.write(output_string)
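# Illustrative example only (the actual phoneme strings depend on the dictionary):
# given the filelist line
#   /path/clip.wav|Hello world.|0
# and entries for HELLO and WORLD, arpabet() would write
#   /path/clip.wav|{HH AH0 L OW1} {W ER1 L D}.|0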
metadata = {}
# metadata["celestia"] = [{"file_path": "", "timestamp": "00_00_05", "emotions": ["neutral"], "noise_level": "", "quote": "Once upon a time."}, ...]
def build_metadata(ignore_dirs=["Songs", "Noise samples"]):
    """Recursively search the global `directory`; for every .wav file found, read the
    accompanying .txt transcript and add a clip entry to `metadata`."""
    skip = 0
    for dir_ in [x[0] for x in os.walk(directory)]:  # recursive directory search
        if len(os.listdir(dir_)) < 1:
            continue
        for directory_filter in ignore_dirs:
            if directory_filter in dir_:
                skip = 1
        if skip:
            skip = 0
            continue
        for filename in os.listdir(dir_):
            if filename.endswith(".wav"):
                file_path = os.path.join(dir_, filename)
                splitted = filename.split("_")
                try:
                    timestamp = "_".join(splitted[0:3])        # 00_00_05
                    voice = splitted[3].lower()                # celestia
                    emotions = splitted[4].lower().split(" ")  # neutral
                    noise_level = splitted[5].lower()          # "" = clean, "noisy" = Noisy, "very noisy" = Very Noisy
                    filename_quote = splitted[6]               # missing question marks
                except IndexError:
                    print("'" + os.path.join(dir_, filename) + "' is not a valid file")
                    continue
                try:
                    with open(os.path.join(dir_, filename.replace(".wav", ".txt")), 'r', encoding="latin-1") as file:
                        txt_quote = file.read().replace('\n', '')  # Once upon a time.
                except:
                    print("txt for '" + str(os.path.join(dir_, filename)) + "' is missing")
                    continue
                if voice.lower() in list(metadata.keys()):
                    metadata[str(voice).lower()].append({"file_path": file_path, "timestamp": timestamp, "emotions": emotions, "noise_level": noise_level, "quote": txt_quote})
                else:
                    metadata[str(voice).lower()] = [{"file_path": file_path, "timestamp": timestamp, "emotions": emotions, "noise_level": noise_level, "quote": txt_quote}]
            else:
                continue
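# Illustrative example only (inferred from the parsing above, not a confirmed
# naming spec): a clip named
#   00_00_05_Celestia_Neutral__Once upon a time.wav
# yields timestamp "00_00_05", voice "celestia", emotions ["neutral"] and
# noise_level "" (clean); the quote itself is read from the matching .txt file.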
def write_datasets(speaker_id=0, permitted_noise_levels=[""], minimum_clips=3):
    """Build filelist lines from `metadata` and write the train/validation filelists."""
    multi_speaker_lines = []
    for voice in list(metadata.keys()):
        meta = metadata[voice]  # list of clip dicts, see build_metadata()
        if len(meta) < minimum_clips:
            continue  # ignore voices with fewer than `minimum_clips` clips of audio
        single_speaker_lines = []
        for clip in meta:
            if clip["noise_level"] in permitted_noise_levels:
                single_speaker_lines.append(clip["file_path"] + "|" + clip["quote"] + "")
                multi_speaker_lines.append(clip["file_path"] + "|" + clip["quote"] + "|" + str(speaker_id))
        speaker_id += 1  # next speaker_id for the next voice
    # shuffle and split into train/validation
    shuffled_multi_speaker_lines = list(multi_speaker_lines)  # copy so the original order is preserved
    shuffle(shuffled_multi_speaker_lines)
    num_clips = len(shuffled_multi_speaker_lines)
    train_end = int(num_clips * percentage_training_data)
    train_arr = shuffled_multi_speaker_lines[:train_end]
    validation_arr = shuffled_multi_speaker_lines[train_end:]
    # also make an unshuffled list (sorted by speaker_id)
    unshuffled_multi_speaker_lines = []
    for i in range(len(list(metadata.keys()))):
        for line in multi_speaker_lines:
            if line.split("|")[2] == str(i):
                unshuffled_multi_speaker_lines.append(line)
    write_files(unshuffled_multi_speaker_lines, train_arr, validation_arr, output_directory_=directory)
def write_files(multi_speaker_lines, train_arr, val_arr, output_directory_):
    """Write the unshuffled, train and validation filelists (plain, ARPAbet,
    merged and mel/.npy variants) into <output_directory_>/filelists."""
    output_directory = os.path.join(output_directory_, "filelists")
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    # unshuffled dataset metadata
    text_file = open(os.path.join(output_directory, "unshuffled_taca2.txt"), "w", encoding="utf-8")
    text_file.write("\n".join(multi_speaker_lines)); text_file.close()
    # training text and arpabet dataset metadata
    text_file = open(os.path.join(output_directory, "train_taca2.txt"), "w", encoding="utf-8")
    text_file.write("\n".join(train_arr)); text_file.close()
    arpabet(os.path.join(output_directory, "train_taca2.txt"), os.path.join(output_directory, "train_taca2_arpa.txt"))
    # validation text and arpabet dataset metadata
    text_file = open(os.path.join(output_directory, "validation_taca2.txt"), "w", encoding="utf-8")
    text_file.write("\n".join(val_arr)); text_file.close()
    arpabet(os.path.join(output_directory, "validation_taca2.txt"), os.path.join(output_directory, "validation_taca2_arpa.txt"))
    # merged (text + arpabet) dataset metadata
    concat_text([os.path.join(output_directory, "train_taca2.txt"), os.path.join(output_directory, "train_taca2_arpa.txt")], os.path.join(output_directory, "train_taca2_merged.txt"))
    concat_text([os.path.join(output_directory, "validation_taca2.txt"), os.path.join(output_directory, "validation_taca2_arpa.txt")], os.path.join(output_directory, "validation_taca2_merged.txt"))
    # mel (.npy) training text and arpabet dataset metadata
    text_file = open(os.path.join(output_directory, "mel_train_taca2.txt"), "w", encoding="utf-8")
    text_file.write("\n".join(train_arr).replace(".wav|", ".npy|")); text_file.close()
    arpabet(os.path.join(output_directory, "mel_train_taca2.txt"), os.path.join(output_directory, "mel_train_taca2_arpa.txt"))
    # mel (.npy) validation text and arpabet dataset metadata
    text_file = open(os.path.join(output_directory, "mel_validation_taca2.txt"), "w", encoding="utf-8")
    text_file.write("\n".join(val_arr).replace(".wav|", ".npy|")); text_file.close()
    arpabet(os.path.join(output_directory, "mel_validation_taca2.txt"), os.path.join(output_directory, "mel_validation_taca2_arpa.txt"))
    # mel merged dataset metadata
    concat_text([os.path.join(output_directory, "mel_train_taca2.txt"), os.path.join(output_directory, "mel_train_taca2_arpa.txt")], os.path.join(output_directory, "mel_train_taca2_merged.txt"))
    concat_text([os.path.join(output_directory, "mel_validation_taca2.txt"), os.path.join(output_directory, "mel_validation_taca2_arpa.txt")], os.path.join(output_directory, "mel_validation_taca2_merged.txt"))
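# For reference, write_files() as defined above leaves the following files in
# <directory>/filelists/: unshuffled_taca2.txt, train_taca2.txt,
# train_taca2_arpa.txt, train_taca2_merged.txt, validation_taca2.txt,
# validation_taca2_arpa.txt, validation_taca2_merged.txt, plus mel_* copies of
# the train/validation sets with .wav paths rewritten to .npy.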
def convert_dir_to_wav(directory, SAMPLE_RATE=48000, ignore_dirs=["Songs", "Noise samples"]):
    """Convert every .flac file under `directory` (recursively) to a 16-bit .wav."""
    skip = 0
    for dir_ in tqdm([x[0] for x in os.walk(directory)]):  # recursive directory search
        for directory_filter in ignore_dirs:
            if directory_filter in dir_:
                skip = 1
        if skip:
            skip = 0
            continue
        for filename in os.listdir(dir_):
            if filename.endswith(".flac"):
                file_path = os.path.join(dir_, filename)
                tqdm.write(file_path + " --> " + file_path.replace(".flac", ".wav"))
                converter.flac2wav(file_path, file_path.replace(".flac", ".wav"), "flac", frame_rate=SAMPLE_RATE, sample_width=2)  # sample_width is bit depth in bytes, e.g. 2 = 16-bit audio
def set_wavs_to_mono(directory):
    """Downmix every .wav file under `directory` (recursively) to a single channel, in place."""
    for dir_ in tqdm([x[0] for x in os.walk(directory)]):  # recursive directory search
        for filename in os.listdir(dir_):
            if filename.endswith(".wav"):
                file_path = os.path.join(dir_, filename)
                tqdm.write(file_path)
                sound = AudioSegment.from_wav(file_path)
                sound = sound.set_channels(1)
                sound.export(file_path, format="wav")
# Main block
directory = r"/media/cookie/StableHDD/ClipperDatasetV2/SlicedDialogueTrimmed"
percentage_training_data = 0.95
build_metadata(ignore_dirs=["Songs", "Noise samples"])
write_datasets(permitted_noise_levels=[""], minimum_clips=3)
convert_dir_to_wav(directory)
set_wavs_to_mono(directory)