CookiePPP

Untitled

Jan 5th, 2020
225
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 9.98 KB | None | 0 0
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. from pydub import AudioSegment
  4. from tqdm import tqdm
  5. import os
  6. from random import shuffle
  7. import utils.audio_converter as converter
  8.  
  9. # ARPAbet block
  10. dictionary_path = r"/media/cookie/Samsung PM961/TwiBot/tacotron2/filelists/merged.dict_.txt"
  11. print("Running, Please wait...")
  12. thisdict = {}
  13. for line in reversed((open(dictionary_path, "r").read()).splitlines()):
  14.     thisdict[(line.split(" ",1))[0]] = (line.split(" ",1))[1].strip()
  15. print("Dictionary Ready.")
  16.  
  17. # Functions
  18.  
  19. def concat_text(filenames, outpath):
  20.     with open(outpath, 'w') as outfile:
  21.         nan = 0
  22.         for fname in filenames:
  23.             if nan == 1: outfile.write("\n") # add newlines (\n) between each file
  24.             else: nan = 1
  25.             with open(fname) as infile:
  26.                 for line in infile:
  27.                     outfile.write(line)
  28.  
  29. def arpabet(input_path, output_path, encoding="utf-8"):
  30.     errored_words = ""
  31.     sym = list("☺☻♥♦♣♠•◘○◙♂♀♪♫☼►◄↕‼¶§▬↨↑↓→←∟↔▲") # ␤ = new line
  32.     output_string = ""
  33.     for line in ((open(input_path, "r").read()).splitlines()):
  34.         phoneme_embed = ""
  35.         for i in sym:
  36.             if i in line:
  37.                 phoneme_embed = phoneme_embed + i
  38.                 line = line.replace(i,"")
  39.         out = ''
  40.         for word_ in (line.split("|")[1]).split(" "):
  41.             word=word_; end_chars = ''
  42.             while any(elem in word for elem in r"!?,.;") and len(word) > 1:
  43.                 if word[-1] in ['!','?',',','.',';',';',':',"'","␤","-","_"]: end_chars = word[-1] + end_chars; word = word[:-1]
  44.                 else: break
  45.             try: word_arpa = thisdict[word.upper()]
  46.             except: word_arpa = ''
  47.             if len(word_arpa)!=0: word = "{" + str(word_arpa) + "}"
  48.             out = (out + " " + word + end_chars).strip()
  49.         output_string =  output_string + line.split("|")[0] + "|" + phoneme_embed + out + "␤|" + line.split("|")[2] + "\n"
  50.     output_string = output_string.replace("␤␤","␤")
  51.     text_file = open(output_path, "w", encoding=encoding)
  52.     text_file.write(output_string)
  53.     text_file.close()
  54.  
  55.  
# Global metadata store mapping lowercase voice name -> list of clip records.
metadata = {}
# Example shape of one entry:
# metadata["celestia"] =  [{file_path: "", timestamp: "00_00_05", emotions: ["neutral"], noise_level: "", quote = "Once upon a time."}, .... , ....]
  58. def build_metadata(ignore_dirs=["Songs","Noise samples"]): # uses Global "directory" for recursive directory search, for every .flac file it will find the accompanying label and add to metadata.
  59.     skip = 0
  60.     for dir_ in [x[0] for x in os.walk(directory)]: # recursive directory search
  61.         if len(os.listdir(dir_)) < 1: continue
  62.         for directory_filter in ignore_dirs:
  63.             if directory_filter in dir_: skip = 1
  64.         if skip: skip = 0; continue
  65.         for filename in os.listdir(dir_):
  66.             if filename.endswith(".wav"):
  67.                 file_path = os.path.join(dir_,filename)
  68.                 splitted = filename.split("_")
  69.                 try:
  70.                     timestamp = "_".join(splitted[0:3])         # 00_00_05
  71.                     voice = splitted[3].lower()                     # celestia
  72.                     emotions = splitted[4].lower().split(" ")       # neutral
  73.                     noise_level = splitted[5].lower()           # "" = clean, "noisy" = Noisy, "very noisy" = Very Noisy
  74.                     filename_quote = splitted[6] # missing question marks
  75.                 except:
  76.                     print("'"+os.path.join(dir_,filename)+"' is not a valid file")
  77.                 try:
  78.                     with open(os.path.join(dir_,filename.replace(".wav",".txt")), 'r', encoding="latin-1") as file:
  79.                         txt_quote = file.read().replace('\n', '') # Once upon a time.
  80.                 except:
  81.                     print("txt for '"+str(os.path.join(dir_,filename))+"' is missing")
  82.                     continue
  83.                 if voice.lower() in list(metadata.keys()):
  84.                     metadata[str(voice).lower()].append({"file_path": file_path, "timestamp": timestamp, "emotions": emotions, "noise_level": noise_level, "quote": txt_quote})
  85.                 else:
  86.                     metadata[str(voice).lower()] = [{"file_path": file_path, "timestamp": timestamp, "emotions": emotions, "noise_level": noise_level, "quote": txt_quote}]
  87.             else:
  88.                 continue
  89.  
  90.  
  91. def write_datasets(speaker_id = 0, permitted_noise_levels = [""], minimum_clips=3):
  92.     multi_speaker_lines = []
  93.     for voice in list(metadata.keys()):
  94.         meta = metadata[voice] # meta == [{file_path: "", timestamp: "00_00_05", emotions: ["neutral"], noise_level: "", quote = "Once upon a time."}, .... , ....]
  95.         if len(meta) < minimum_clips: continue # ignore voices with less than 3 clips of audio
  96.         single_speaker_lines = []
  97.         for clip in meta:
  98.             if (clip["noise_level"] in permitted_noise_levels):
  99.                 single_speaker_lines.append(clip["file_path"]+"|"+clip["quote"]+"␤")
  100.                 multi_speaker_lines.append (clip["file_path"]+"|"+clip["quote"]+"␤|"+str(speaker_id))
  101.         speaker_id+=1 # next speaker_id for next voice
  102.     # shuffle stuff
  103.     shuffled_multi_speaker_lines = multi_speaker_lines
  104.     shuffle(shuffled_multi_speaker_lines)
  105.     num_clips = len(shuffled_multi_speaker_lines)
  106.     train_end = int(num_clips * percentage_training_data)
  107.     train_arr = shuffled_multi_speaker_lines[:train_end]; validation_arr = shuffled_multi_speaker_lines[train_end:]
  108.    
  109.     # also make unshuffled stuff (sorted by speaker_id)
  110.     unshuffled_multi_speaker_lines = []
  111.     for i in range(len(list(metadata.keys()))):
  112.         for line in multi_speaker_lines:
  113.             if line.split("|")[2] == str(i): unshuffled_multi_speaker_lines.append(line)
  114.     write_files(unshuffled_multi_speaker_lines, train_arr, validation_arr, output_directory_=directory)
  115.  
  116.  
  117. def write_files(multi_speaker_lines, train_arr, val_arr, output_directory_):
  118.     output_directory = os.path.join(output_directory_,"filelists")
  119.     if not os.path.exists(output_directory): os.makedirs(output_directory)
  120.  
  121.     # generate text dataset metadata
  122.     text_file = open(os.path.join(output_directory,"unshuffled_taca2.txt"), "w", encoding="utf-8")
  123.     text_file.write("\n".join(multi_speaker_lines)); text_file.close()
  124.    
  125.     # generate text dataset metadata
  126.     text_file = open(os.path.join(output_directory,"train_taca2.txt"), "w", encoding="utf-8")
  127.     text_file.write("\n".join(train_arr)); text_file.close()
  128.     arpabet(os.path.join(output_directory,"train_taca2.txt"),os.path.join(output_directory,"train_taca2_arpa.txt"))
  129.    
  130.     # generate arpabet dataset metadata
  131.     text_file = open(os.path.join(output_directory,"validation_taca2.txt"), "w", encoding="utf-8")
  132.     text_file.write("\n".join(val_arr)); text_file.close()
  133.     arpabet(os.path.join(output_directory,"validation_taca2.txt"),os.path.join(output_directory,"validation_taca2_arpa.txt"))
  134.    
  135.     # generate merged dataset metadata
  136.     concat_text([os.path.join(output_directory,"train_taca2.txt"), os.path.join(output_directory,"train_taca2_arpa.txt")], os.path.join(output_directory,"train_taca2_merged.txt"))
  137.     concat_text([os.path.join(output_directory,"validation_taca2.txt"), os.path.join(output_directory,"validation_taca2_arpa.txt")], os.path.join(output_directory,"validation_taca2_merged.txt"))
  138.    
  139.     # generate mel text dataset metadata
  140.     text_file = open(os.path.join(output_directory,"mel_train_taca2.txt"), "w", encoding="utf-8")
  141.     text_file.write("\n".join(train_arr).replace(".wav|",".npy|")); text_file.close()
  142.     arpabet(os.path.join(output_directory,"mel_train_taca2.txt"),os.path.join(output_directory,"mel_train_taca2_arpa.txt"))
  143.  
  144.     # generate mel arpabet dataset metadata
  145.     text_file = open(os.path.join(output_directory,"mel_validation_taca2.txt"), "w", encoding="utf-8")
  146.     text_file.write("\n".join(val_arr).replace(".wav|",".npy|")); text_file.close()
  147.     arpabet(os.path.join(output_directory,"mel_validation_taca2.txt"),os.path.join(output_directory,"mel_validation_taca2_arpa.txt"))
  148.  
  149.     # generate mel merged dataset metadata
  150.     concat_text([os.path.join(output_directory,"mel_train_taca2.txt"), os.path.join(output_directory,"mel_train_taca2_arpa.txt")], os.path.join(output_directory,"mel_train_taca2_merged.txt"))
  151.     concat_text([os.path.join(output_directory,"mel_validation_taca2.txt"), os.path.join(output_directory,"mel_validation_taca2_arpa.txt")], os.path.join(output_directory,"mel_validation_taca2_merged.txt"))
  152.  
  153. def convert_dir_to_wav(directory, SAMPLE_RATE=48000, ignore_dirs=["Songs","Noise samples"]):
  154.     skip = 0
  155.     for dir_ in tqdm([x[0] for x in os.walk(directory)]): # recursive directory search
  156.         for directory_filter in ignore_dirs:
  157.             if directory_filter in dir_: skip = 1
  158.         if skip: skip = 0; continue
  159.         for filename in os.listdir(dir_):
  160.             if filename.endswith(".flac"):
  161.                 file_path = os.path.join(dir_,filename)
  162.                 tqdm.write(file_path+" --> "+file_path.replace(".flac",".wav"))
  163.                 converter.flac2wav(file_path, file_path.replace(".flac",".wav"), "flac", frame_rate=SAMPLE_RATE, sample_width=2) # sample_width is bit_depth in bytes eg: 2 = 16 bit audio.
  164.  
  165. def set_wavs_to_mono(directory):
  166.     for dir_ in tqdm([x[0] for x in os.walk(directory)]): # recursive directory search
  167.         for filename in os.listdir(dir_):
  168.             if filename.endswith(".wav"):
  169.                 file_path = os.path.join(dir_,filename)
  170.                 tqdm.write(file_path)
  171.                 sound = AudioSegment.from_wav(file_path)
  172.                 sound = sound.set_channels(1)
  173.                 sound.export(file_path, format="wav")
  174.  
# Main block
# NOTE(review): build_metadata()/write_datasets() scan for EXISTING .wav files,
# yet convert_dir_to_wav()/set_wavs_to_mono() — which create/normalize the
# .wav files — run AFTER them. Presumably the wavs already exist from a prior
# run; confirm the intended ordering.
directory = r"/media/cookie/StableHDD/ClipperDatasetV2/SlicedDialogueTrimmed"
percentage_training_data = 0.95  # fraction of shuffled clips assigned to the training split
build_metadata(ignore_dirs=["Songs","Noise samples"])
write_datasets(permitted_noise_levels = [""], minimum_clips=3)
convert_dir_to_wav(directory)
set_wavs_to_mono(directory)
Add Comment
Please, Sign In to add comment