Guest User

Untitled

a guest
Feb 19th, 2023
32
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
YAML 4.96 KB | None | 0 0
  1. name: Test1
  2. model: extensibletrainer
  3. scale: 1
  4. gpu_ids: [0]  # <-- use this unless you have multiple GPUs
  5. start_step: -1
  6. checkpointing_enabled: true  # <-- Gradient checkpointing. Enable for huge GPU memory savings. Disable for distributed training.
  7. fp16: false  # NOTE: might be worth experimenting with enabling this
  8. wandb: false  # <-- enable to log to wandb. TensorBoard logging is always enabled.
  9. use_tb_logger: true
  10.  
  11. datasets:  # training and validation data sources
  12.   train:
  13.     name: TestDataset
  14.     n_workers: 8  # presumably the number of data-loading worker processes — TODO confirm
  15.     batch_size: 13  # This leads to ~16GB of VRAM usage on my 3090.
  16.     mode: paired_voice_audio
  17.     path: /content/gdrive/MyDrive/dataset/train.txt
  18.     fetcher_mode: ['lj']  # CHANGEME if your dataset isn't in LJSpeech format
  19.     phase: train
  20.     max_wav_length: 255995
  21.     max_text_length: 200
  22.     sample_rate: 22050
  23.     load_conditioning: true
  24.     num_conditioning_candidates: 2
  25.     conditioning_length: 44000
  26.     use_bpe_tokenizer: true
  27.     load_aligned_codes: false
  28.   val:
  29.     name: TestValidation
  30.     n_workers: 1
  31.     batch_size: 13  # this could probably be higher
  32.     mode: paired_voice_audio
  33.     path: /content/gdrive/MyDrive/dataset/val.txt
  34.     fetcher_mode: ['lj']
  35.     phase: val  # NOTE(review): the author was unsure whether the val phase works — verify
  36.     max_wav_length: 255995
  37.     max_text_length: 200
  38.     sample_rate: 22050
  39.     load_conditioning: true
  40.     num_conditioning_candidates: 2
  41.     conditioning_length: 44000
  42.     use_bpe_tokenizer: true
  43.     load_aligned_codes: false
  44.  
  45. steps:
  46.   gpt_train:
  47.     training: gpt
  48.     loss_log_buffer: 500  # presumably the number of recent loss values kept for logging — TODO confirm
  49.
  50.     # Generally follows the recipe from the DALLE paper.
  51.     optimizer: adamw  # this should be adamw_zero if you're using distributed training
  52.     optimizer_params:
  53.       lr: !!float 1e-5  # CHANGEME: this was originally 1e-4; I reduced it to 1e-5 because it's fine-tuning, but **you should experiment with this value**
  54.       weight_decay: !!float 1e-2
  55.       beta1: 0.9
  56.       beta2: 0.96
  57.     clip_grad_eps: 4
  58.
  59.     injectors:  # TODO: replace this entire sequence with the GptVoiceLatentInjector
  60.       paired_to_mel:
  61.         type: torch_mel_spectrogram
  62.         mel_norm_file: ../experiments/clips_mel_norms.pth
  63.         in: wav
  64.         out: paired_mel
  65.       paired_cond_to_mel:
  66.         type: for_each
  67.         subtype: torch_mel_spectrogram
  68.         mel_norm_file: ../experiments/clips_mel_norms.pth
  69.         in: conditioning
  70.         out: paired_conditioning_mel
  71.       to_codes:
  72.         type: discrete_token
  73.         in: paired_mel
  74.         out: paired_mel_codes
  75.         dvae_config: '../experiments/train_diffusion_vocoder_22k_level.yml'  # EXTREMELY IMPORTANT
  76.       paired_fwd_text:
  77.         type: generator
  78.         generator: gpt
  79.         in: [paired_conditioning_mel, padded_text, text_lengths, paired_mel_codes, wav_lengths]
  80.         out: [loss_text_ce, loss_mel_ce, logits]
  81.     losses:
  82.       text_ce:
  83.         type: direct
  84.         weight: 0.01
  85.         key: loss_text_ce
  86.       mel_ce:
  87.         type: direct
  88.         weight: 1
  89.         key: loss_mel_ce
  90.  
  91. networks:
  92.   gpt:
  93.     type: generator
  94.     which_model_G: unified_voice2  # NOTE: none of the unified_voice*.py files exactly matches the tortoise inference code: versions 3 and 4 have an "alignment_head" (purpose unclear), and version 2 lacks the types=1 parameter.
  95.     kwargs:
  96.       layers: 30  # WAS 8
  97.       model_dim: 1024  # WAS 512
  98.       heads: 16  # WAS 8
  99.       max_text_tokens: 402  # WAS 120
  100.       max_mel_tokens: 604  # WAS 250
  101.       max_conditioning_inputs: 2  # WAS 1
  102.       mel_length_compression: 1024
  103.       number_text_tokens: 256  # supposed to be 255 for newer unified_voice files
  104.       number_mel_codes: 8194
  105.       start_mel_token: 8192
  106.       stop_mel_token: 8193
  107.       start_text_token: 255
  108.       train_solo_embeddings: false  # missing in uv3/4
  109.       use_mel_codes_as_input: true  # also missing in uv3/4
  110.       checkpointing: true
  111.       # types: 1  # this is MISSING, but in my analysis 1 is equivalent to not having it.
  112.       # only_alignment_head: false  # uv3/4
  113.  
  114. path:
  115.   pretrain_model_gpt: '../experiments/autoregressive.pth'  # CHANGEME: copy this file from the tortoise cache
  116.   strict_load: true
  117.   # resume_state: ../experiments/train_imgnet_vqvae_stage1/training_state/0.state   # <-- Uncomment and set this to resume from a previous training state.
  118.  
  119. # As far as I know, all units here are measured in **steps** (i.e. one batch of batch_size is 1 unit).
  120. train:  # CHANGEME: ALL OF THESE PARAMETERS SHOULD BE EXPERIMENTED WITH
  121.   niter: 50000
  122.   warmup_iter: -1
  123.   mega_batch_factor: 4    # <-- Gradient accumulation factor. If you are running OOM, increase this to [2,4,8].
  124.   val_freq: 60
  125.  
  126.   default_lr_scheme: MultiStepLR
  127.   gen_lr_steps: [20, 40, 56, 72]  # originally [50000, 100000, 140000, 180000]. NOTE(review): all four milestones fall within the first 72 of niter=50000 steps; with lr_gamma 0.5 that presumably cuts the LR to 1/16 almost immediately — confirm this is intended.
  128.   lr_gamma: 0.5
  129.  
  130. eval:
  131.   output_state: gen
  132.   injectors:
  133.     gen_inj_eval:
  134.       type: generator
  135.       generator: generator  # NOTE(review): the networks section of this file only defines `gpt`, not `generator` — possibly a leftover from another config; verify
  136.       in: hq
  137.       out: [gen, codebook_commitment_loss]
  138.  
  139. logger:
  140.   print_freq: 5
  141.   save_checkpoint_freq: 50  # CHANGEME: you should especially increase this — it's really slow
  142.   visuals: [gen, mel]
  143.   visual_debug_rate: 500
  144.   is_mel_spectrogram: true
  145.  
Add Comment
Please, Sign In to add comment