sskadam

Untitled

Feb 21st, 2022
212
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.49 KB | None | 0 0
  1. import os
  2.  
  3. from TTS.config.shared_configs import BaseAudioConfig
  4. from TTS.trainer import Trainer, TrainingArgs
  5. from TTS.tts.configs.shared_configs import BaseDatasetConfig
  6. from TTS.tts.configs.tacotron2_config import Tacotron2Config
  7. from TTS.tts.datasets import load_tts_samples
  8. from TTS.tts.models.tacotron2 import Tacotron2
  9. from TTS.utils.audio import AudioProcessor
  10.  
  11. output_path = "/home/big-boy/Models/T2-only/"
  12.  
  13. # Using LJSpeech like dataset processing for the blizzard dataset
  14. dataset_config = BaseDatasetConfig(
  15.     name="ljspeech",
  16.     meta_file_train="metadata.csv",
  17.     path="/home/big-boy/Data/blizzard2013/segmented/",
  18. )
  19.  
  20. audio_config = BaseAudioConfig(
  21.     sample_rate=24000,
  22.     do_trim_silence=True,
  23.     trim_db=60.0,
  24.     signal_norm=True,
  25.     mel_fmin=80.0,
  26.     mel_fmax=12000,
  27.     spec_gain=20.0,
  28.     log_func="np.log",
  29.     ref_level_db=20,
  30.     preemphasis=0.0,
  31.     min_level_db=-100,
  32. )
  33.  
  34. # Using the standard Capacitron config
  35. # capacitron_config = CapacitronVAEConfig(capacitron_VAE_loss_alpha=1.0)
  36.  
  37. config = Tacotron2Config(
  38.     run_name="T2-DCA-only",
  39.     audio=audio_config,
  40.     batch_size=100,
  41.     eval_batch_size=16,
  42.     num_loader_workers=12,
  43.     num_eval_loader_workers=8,
  44.     run_eval=True,
  45.     test_delay_epochs=15,
  46.     ga_alpha=0.0,
  47.     r=2,
  48.     attention_type="dynamic_convolution",
  49.     double_decoder_consistency=False,
  50.     epochs=1000,
  51.     text_cleaner="phoneme_cleaners",
  52.     use_phonemes=True,
  53.     phoneme_language="en-us",
  54.     use_espeak_phonemes=True,
  55.     phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
  56.     stopnet_pos_weight=15,
  57.     print_step=500,
  58.     print_eval=True,
  59.     mixed_precision=False,
  60.     output_path=output_path,
  61.     datasets=[dataset_config],
  62.     min_seq_len=1,
  63.     max_seq_len=100,
  64.     lr=1e-3,
  65.     # Need to experiment with these below for capacitron
  66.     dashboard_logger="wandb",
  67.     loss_masking=False,
  68.     # decoder_loss_alpha=1.0,
  69.     # postnet_loss_alpha=1.0,
  70.     # postnet_diff_spec_alpha=0.0,
  71.     # decoder_diff_spec_alpha=0.0,
  72.     # decoder_ssim_alpha=0.0,
  73.     # postnet_ssim_alpha=0.0,
  74. )
  75.  
  76. ap = AudioProcessor(**config.audio.to_dict())
  77.  
  78. train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
  79.  
  80. model = Tacotron2(config, speaker_manager=None)
  81.  
  82. trainer = Trainer(
  83.     TrainingArgs(),
  84.     config,
  85.     output_path,
  86.     model=model,
  87.     train_samples=train_samples,
  88.     eval_samples=eval_samples,
  89.     training_assets={"audio_processor": ap},
  90. )
  91.  
  92. # 🚀
  93. trainer.fit()
  94.  
Advertisement
Add Comment
Please, Sign In to add comment