# faceswap train.ini file

[global]
# options that apply to all models
# nb: unless specifically stated, values changed here will only take effect when creating a new model.

# how much of the extracted image to train on. a lower coverage will limit the model's scope to a zoomed-in central area while higher amounts can include the entire face. a trade-off exists between lower amounts giving more detail versus higher amounts avoiding noticeable swap transitions. sensible values to use are:
#   62.5%% spans from eyebrow to eyebrow.
#   75.0%% spans from temple to temple.
#   87.5%% spans from ear to ear.
#   100.0%% is a mugshot.
#
# select a decimal number between 62.5 and 100.0
# [default: 68.75]
coverage = 68.75
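# illustrative arithmetic (not faceswap's exact cropping code): with coverage = 68.75 and a 256px
# extracted face, training uses roughly the central 256 * 0.6875 = 176px of the image, while 100.0
# would use the full extract.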

# use icnr to tile the default initializer in a repeating pattern. this strategy is designed for pairing with sub-pixel / pixel shuffler to reduce the 'checkerboard effect' in image reconstruction.
#    https://arxiv.org/ftp/arxiv/papers/1707/1707.02937.pdf
#
# choose from: true, false
# [default: false]
icnr_init = False

# use convolution aware initialization for convolutional layers. this can help eradicate the vanishing and exploding gradient problem as well as lead to higher accuracy, lower loss and faster convergence.
# nb:
#    this can use more vram when creating a new model so you may want to lower the batch size for the first run. the batch size can be raised again when reloading the model.
#    multi-gpu is not supported for this option, so you should start the model on a single gpu. once training has started, you can stop training, enable multi-gpu and resume.
#    building the model will likely take several minutes as the calculations for this initialization technique are expensive. this will only impact starting a new model.
#
# choose from: true, false
# [default: false]
conv_aware_init = False

# the optimizer to use.
#    adam - adaptive moment optimization. a stochastic gradient descent method that is based on adaptive estimation of first-order and second-order moments.
#    nadam - adaptive moment optimization with nesterov momentum. much like adam but uses a different formula for calculating momentum.
#    rms-prop - root mean square propagation. maintains a moving (discounted) average of the square of the gradients. divides the gradient by the root of this average.
#
# choose from: ['adam', 'nadam', 'rms-prop']
# [default: adam]
optimizer = adam

# learning rate - how fast your network will learn (how large are the modifications to the model weights after one batch of training). values that are too large might result in model crashes and the inability of the model to find the best solution. values that are too small might be unable to escape from dead-ends and find the best global minimum.
#
# this option can be updated for existing models.
# select a decimal number between 1e-06 and 0.0001
# [default: 5e-05]
learning_rate = 5e-05
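# a minimal sketch of how the optimizer and learning_rate settings map to keras (tensorflow 2.x api; not faceswap's exact wiring):
#   from tensorflow.keras import optimizers
#   OPTIMIZERS = {"adam": optimizers.Adam, "nadam": optimizers.Nadam, "rms-prop": optimizers.RMSprop}
#   opt = OPTIMIZERS["adam"](learning_rate=5e-05)   # values taken from this config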

# use reflection padding rather than zero padding with convolutions. each convolution must pad the image boundaries to maintain the proper sizing. more complex padding schemes can reduce artifacts at the border of the image.
#    http://www-cs.engr.ccny.cuny.edu/~wolberg/cs470/hw/hw2_pad.txt
#
# choose from: true, false
# [default: false]
reflect_padding = False
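# the difference in one toy example (tensorflow shown for illustration; faceswap builds this into its conv blocks):
#   import tensorflow as tf
#   x = tf.reshape(tf.range(16, dtype=tf.float32), (1, 4, 4, 1))
#   zero    = tf.pad(x, [[0, 0], [1, 1], [1, 1], [0, 0]], mode="CONSTANT")  # border filled with zeros
#   reflect = tf.pad(x, [[0, 0], [1, 1], [1, 1], [0, 0]], mode="REFLECT")   # border mirrors the image content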

# [nvidia only]. enable the tensorflow gpu 'allow_growth' configuration option. this option prevents tensorflow from allocating all of the gpu vram at launch but can lead to higher vram fragmentation and slower performance. should only be enabled if you are receiving errors regarding 'cudnn fails to initialize' when commencing training.
#
# this option can be updated for existing models.
# choose from: true, false
# [default: false]
allow_growth = False
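# roughly what this toggles under the hood (tensorflow 2.x api; faceswap sets the equivalent option for you):
#   import tensorflow as tf
#   for gpu in tf.config.list_physical_devices("GPU"):
#       tf.config.experimental.set_memory_growth(gpu, True)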

# [nvidia only]. nvidia gpus can run operations in float16 faster than in float32. mixed precision allows you to use a mix of float16 with float32, to get the performance benefits from float16 and the numeric stability benefits from float32.
#
# while mixed precision will run on most nvidia models, it will only speed up training on more recent gpus. those with compute capability 7.0 or higher will see the greatest performance benefit from mixed precision because they have tensor cores. older gpus offer no math performance benefit for using mixed precision, however memory and bandwidth savings can enable some speedups. generally rtx gpus and later will offer the most benefit.
#
# choose from: true, false
# [default: false]
mixed_precision = False
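# roughly equivalent keras call (tensorflow 2.4+ api shown; older releases use the experimental mixed_precision module):
#   import tensorflow as tf
#   tf.keras.mixed_precision.set_global_policy("mixed_float16")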

# [gpu only]. the number of faces to feed through the model at once when running the convert process.
#
# nb: increasing this figure is unlikely to improve convert speed, however, if you are getting out of memory errors, then you may want to reduce the batch size.
#
# this option can be updated for existing models.
# select an integer between 1 and 32
# [default: 16]
convert_batchsize = 16

[global.loss]
# loss configuration options
# loss is the mechanism by which a neural network judges how well it thinks that it is recreating a face.
# nb: unless specifically stated, values changed here will only take effect when creating a new model.

# the loss function to use.
#    mae - mean absolute error will guide reconstructions of each pixel towards its median value in the training dataset. robust to outliers but as a median, it can potentially ignore some infrequent image types in the dataset.
#    mse - mean squared error will guide reconstructions of each pixel towards its average value in the training dataset. as an average, it will be susceptible to outliers and typically produces slightly blurrier results.
#    logcosh - log(cosh(x)) acts similar to mse for small errors and to mae for large errors. like mse, it is very stable and prevents overshoots when errors are near zero. like mae, it is robust to outliers. nb: due to a bug in plaidml, this loss does not work on amd cards.
#    smooth_l1 - modification of the mae loss to correct two of its disadvantages. this loss has improved stability and guidance for small errors.
#    l_inf_norm - the l_inf norm will reduce the largest individual pixel error in an image. as each largest error is minimized sequentially, the overall error is improved. this loss will be extremely focused on outliers.
#    ssim - structural similarity index metric is a perception-based loss that considers changes in texture, luminance, contrast, and local spatial statistics of an image. potentially delivers more realistic looking images.
#    gmsd - gradient magnitude similarity deviation seeks to match the global standard deviation of the pixel to pixel differences between two images. similar in approach to ssim. nb: this loss does not currently work on amd cards.
#    pixel_gradient_difference - instead of minimizing the difference between the absolute value of each pixel in two reference images, compute the pixel to pixel spatial difference in each image and then minimize that difference between two images. allows for large color shifts, but maintains the structure of the image.
#
# choose from: ['mae', 'mse', 'logcosh', 'smooth_loss', 'l_inf_norm', 'ssim', 'gmsd', 'pixel_gradient_diff']
# [default: ssim]
loss_function = ssim
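# for reference, the three simplest losses above written out (illustrative tensorflow-style definitions, not faceswap's implementations):
#   import tensorflow as tf
#   def mae(y_true, y_pred):     return tf.reduce_mean(tf.abs(y_true - y_pred))
#   def mse(y_true, y_pred):     return tf.reduce_mean(tf.square(y_true - y_pred))
#   def logcosh(y_true, y_pred): return tf.reduce_mean(tf.math.log(tf.cosh(y_pred - y_true)))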

# the loss function to use when learning a mask.
#    mae - mean absolute error will guide reconstructions of each pixel towards its median value in the training dataset. robust to outliers but as a median, it can potentially ignore some infrequent image types in the dataset.
#    mse - mean squared error will guide reconstructions of each pixel towards its average value in the training dataset. as an average, it will be susceptible to outliers and typically produces slightly blurrier results.
#
# choose from: ['mae', 'mse']
# [default: mse]
mask_loss_function = mse

# the amount of l2 regularization to apply as a penalty to structural similarity loss functions.
#
# nb: you should only adjust this if you know what you are doing!
#
# l2 regularization applies a penalty term to the given loss function. this penalty will only be applied if ssim or gmsd is selected for the main loss function, otherwise it is ignored.
#
# the value given here is as a percentage weight of the main loss function. for example:
#    100 - will give equal weighting to the main loss and the penalty function.
#    25 - will give the penalty function 1/4 of the weight of the main loss function.
#    400 - will give the penalty function 4x as much importance as the main loss function.
#    0 - disables l2 regularization altogether.
#
# select an integer between 0 and 400
# [default: 100]
l2_reg_term = 100
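# worked example (assuming the penalty term is a plain squared-error loss): with loss_function = ssim
# and l2_reg_term = 100, the total is effectively ssim_loss + (100 / 100) * l2_penalty; at 25 it would
# be ssim_loss + 0.25 * l2_penalty, and at 0 the penalty is dropped entirely.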

# the amount of priority to give to the eyes.
#
# the value given here is as a multiplier of the main loss score. for example:
#    1 - the eyes will receive the same priority as the rest of the face.
#    10 - the eyes will be given a score 10 times higher than the rest of the face.
#
# nb: penalized mask loss must be enabled to use this option.
#
# this option can be updated for existing models.
# select an integer between 1 and 40
# [default: 3]
eye_multiplier = 3

# the amount of priority to give to the mouth.
#
# the value given here is as a multiplier of the main loss score. for example:
#    1 - the mouth will receive the same priority as the rest of the face.
#    10 - the mouth will be given a score 10 times higher than the rest of the face.
#
# nb: penalized mask loss must be enabled to use this option.
#
# this option can be updated for existing models.
# select an integer between 1 and 40
# [default: 2]
mouth_multiplier = 2
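# worked example: with penalized mask loss enabled, eye_multiplier = 3 and mouth_multiplier = 2 mean a
# pixel in the eye region contributes 3x its normal loss, a mouth pixel 2x, and any other face pixel 1x,
# skewing training attention towards those areas.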

# image loss function is weighted by mask presence. for areas of the image without the facial mask, reconstruction errors will be ignored while the masked face area is prioritized. may increase overall quality by focusing attention on the core face area.
#
# choose from: true, false
# [default: true]
penalized_mask_loss = True

# the mask to be used for training. if you have selected 'learn mask' or 'penalized mask loss' you must select a value other than 'none'. the required mask should have been selected as part of the extract process. if it does not exist in the alignments file then it will be generated prior to training commencing.
#   none: don't use a mask.
#   components: mask designed to provide facial segmentation based on the positioning of landmark locations. a convex hull is constructed around the exterior of the landmarks to create a mask.
#   extended: mask designed to provide facial segmentation based on the positioning of landmark locations. a convex hull is constructed around the exterior of the landmarks and the mask is extended upwards onto the forehead.
#   vgg-clear: mask designed to provide smart segmentation of mostly frontal faces clear of obstructions. profile faces and obstructions may result in sub-par performance.
#   vgg-obstructed: mask designed to provide smart segmentation of mostly frontal faces. the mask model has been specifically trained to recognize some facial obstructions (hands and eyeglasses). profile faces may result in sub-par performance.
#   unet-dfl: mask designed to provide smart segmentation of mostly frontal faces. the mask model has been trained by community members and will need testing for further description. profile faces may result in sub-par performance.
#
# choose from: ['none', 'components', 'extended', 'unet-dfl', 'vgg-clear', 'vgg-obstructed']
# [default: extended]
mask_type = extended

# apply gaussian blur to the mask input. this has the effect of smoothing the edges of the mask, which can help with poorly calculated masks and give less of a hard edge to the predicted mask. the size is in pixels (calculated from a 128px mask). set to 0 to not apply gaussian blur. this value should be odd; if an even number is passed in then it will be rounded to the next odd number.
#
# select an integer between 0 and 9
# [default: 3]
mask_blur_kernel = 3
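# a minimal sketch of the blur step (opencv shown for illustration; 'mask' stands for a 128x128 single-channel array; not faceswap's exact code):
#   import cv2
#   kernel = 3                       # mask_blur_kernel from this config
#   if kernel > 0:
#       if kernel % 2 == 0:
#           kernel += 1              # even values become the next odd number
#       mask = cv2.GaussianBlur(mask, (kernel, kernel), 0)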

# sets pixels that are near white to white and near black to black. set to 0 for off.
#
# select an integer between 0 and 50
# [default: 4]
mask_threshold = 4
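# a rough sketch of the snapping behaviour (assumes the value acts as a percentage of the 0-255 mask range; illustrative only, not faceswap's exact code):
#   import numpy as np
#   threshold = 4 / 100 * 255                  # mask_threshold = 4
#   mask[mask < threshold] = 0                 # near-black pixels snapped to black
#   mask[mask > 255 - threshold] = 255         # near-white pixels snapped to white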

# dedicate a portion of the model to learning how to duplicate the input mask. increases vram usage in exchange for learning a quick ability to try to replicate more complex mask models.
#
# choose from: true, false
# [default: false]
learn_mask = False

[model.dfl_h128]
# dfl h128 model (adapted from https://github.com/iperov/deepfacelab)
# nb: unless specifically stated, values changed here will only take effect when creating a new model.

# lower memory mode. set to 'true' if having issues with vram usage.
# nb: models with a changed lowmem mode are not compatible with each other.
#
# choose from: true, false
# [default: false]
lowmem = False

[model.dfl_sae]
# dfl sae model (adapted from https://github.com/iperov/deepfacelab)
# nb: unless specifically stated, values changed here will only take effect when creating a new model.

# resolution (in pixels) of the input image to train on.
# be aware larger resolution will dramatically increase vram requirements.
#
# must be divisible by 16.
#
# select an integer between 64 and 256
# [default: 128]
input_size = 128

# controls gradient clipping of the optimizer. can prevent model corruption at the expense of vram.
#
# this option can be updated for existing models.
# choose from: true, false
# [default: true]
clipnorm = True
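# what gradient clipping looks like in keras terms (illustrative only; the clip value shown is arbitrary, not faceswap's):
#   from tensorflow.keras import optimizers
#   opt = optimizers.Adam(learning_rate=5e-05, clipnorm=1.0)  # gradients are rescaled if their l2 norm exceeds 1.0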

# model architecture:
#   'df': keeps the faces more natural.
#   'liae': can help fix overly different face shapes.
#
# choose from: ['df', 'liae']
# [default: df]
architecture = df

# face information is stored in autoencoder dimensions. if there are not enough dimensions then certain facial features may not be recognized.
# a higher number of dimensions is better, but requires more vram.
# set to 0 to use the architecture defaults (256 for liae, 512 for df).
#
# select an integer between 0 and 1024
# [default: 0]
autoencoder_dims = 0

# encoder dimensions per channel. a higher number of encoder dimensions will help the model to recognize more facial features, but will require more vram.
#
# select an integer between 21 and 85
# [default: 42]
encoder_dims = 42

# decoder dimensions per channel. a higher number of decoder dimensions will help the model to improve details, but will require more vram.
#
# select an integer between 10 and 85
# [default: 21]
decoder_dims = 21

# multiscale decoder can help to obtain better details.
#
# choose from: true, false
# [default: false]
multiscale_decoder = False

[model.dlight]
# a lightweight, high resolution dfaker variant (adapted from https://github.com/dfaker/df)
# nb: unless specifically stated, values changed here will only take effect when creating a new model.

# higher settings will allow learning more features such as tattoos, piercings,
# and wrinkles.
# strongly affects vram usage.
#
# choose from: ['lowmem', 'fair', 'best']
# [default: best]
features = best

# defines detail fidelity. a lower setting can appear 'rugged' while 'good' might take a longer time to train.
# affects vram usage.
#
# choose from: ['fast', 'good']
# [default: good]
details = good

# output image resolution (in pixels).
# be aware that larger resolution will increase vram requirements.
# nb: must be either 128, 256, or 384.
#
# select an integer between 128 and 384
# [default: 256]
output_size = 256

[model.original]
# original faceswap model.
# nb: unless specifically stated, values changed here will only take effect when creating a new model.

# lower memory mode. set to 'true' if having issues with vram usage.
# nb: models with a changed lowmem mode are not compatible with each other.
#
# choose from: true, false
# [default: false]
lowmem = False

[model.realface]
# an extra detailed variant of the original model.
# incorporates ideas from bryanlyon and inspiration from the villain model.
# requires about 6gb-8gb of vram (batchsize 8-16).
#
# nb: unless specifically stated, values changed here will only take effect when creating a new model.

# resolution (in pixels) of the input image to train on.
# be aware larger resolution will dramatically increase vram requirements.
# higher resolutions may increase prediction accuracy, but do not affect the resulting output size.
# must be between 64 and 128 and be divisible by 16.
#
# select an integer between 64 and 128
# [default: 64]
input_size = 64

# output image resolution (in pixels).
# be aware that larger resolution will increase vram requirements.
# nb: must be between 64 and 256 and be divisible by 16.
#
# select an integer between 64 and 256
# [default: 128]
output_size = 128

# number of nodes for decoder. might affect your model's ability to learn in general.
# nb: lower values will affect the ability to predict details.
#
# select an integer between 768 and 2048
# [default: 1536]
dense_nodes = 1536

# encoder convolution layer complexity. sensible ranges: 128 to 150.
#
# select an integer between 96 and 160
# [default: 128]
complexity_encoder = 128

# decoder complexity.
#
# select an integer between 512 and 544
# [default: 512]
complexity_decoder = 512

[model.unbalanced]
# an unbalanced model with adjustable input size options.
# this is an unbalanced model so b>a swaps may not work well.
#
# nb: unless specifically stated, values changed here will only take effect when creating a new model.

# resolution (in pixels) of the image to train on.
# be aware larger resolution will dramatically increase vram requirements.
# make sure your resolution is divisible by 64 (e.g. 64, 128, 256 etc.).
# nb: your faceset must be at least 1.6x larger than your required input size.
# (e.g. 160 is the maximum input size for a 256x256 faceset).
#
# select an integer between 64 and 512
# [default: 128]
input_size = 128
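# worked example of the 1.6x rule: a 256px faceset supports an input size of at most 256 / 1.6 = 160,
# while an input size of 256 would need extracted faces of at least 256 * 1.6 = 410px (rounding up).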

# lower memory mode. set to 'true' if having issues with vram usage.
# nb: models with a changed lowmem mode are not compatible with each other.
# nb: lowmem will override custom nodes and complexity settings.
#
# choose from: true, false
# [default: false]
lowmem = False

# controls gradient clipping of the optimizer. can prevent model corruption at the expense of vram.
#
# choose from: true, false
# [default: true]
clipnorm = True

# number of nodes for decoder. don't change this unless you know what you are doing!
#
# select an integer between 512 and 4096
# [default: 1024]
nodes = 1024

# encoder convolution layer complexity. sensible ranges: 128 to 160.
#
# select an integer between 64 and 1024
# [default: 128]
complexity_encoder = 128

# decoder a complexity.
#
# select an integer between 64 and 1024
# [default: 384]
complexity_decoder_a = 384

# decoder b complexity.
#
# select an integer between 64 and 1024
# [default: 512]
complexity_decoder_b = 512

[model.villain]
# a higher resolution version of the original model by villainguy.
# extremely vram heavy. don't try to run this if you have a small gpu.
#
# nb: unless specifically stated, values changed here will only take effect when creating a new model.

# lower memory mode. set to 'true' if having issues with vram usage.
# nb: models with a changed lowmem mode are not compatible with each other.
#
# choose from: true, false
# [default: false]
lowmem = False

[trainer.original]
# original trainer options.
# warning: the defaults for augmentation will be fine for 99.9% of use cases. only change them if you absolutely know what you are doing!

# number of sample faces to display for each side in the preview when training.
#
# select an integer between 2 and 16
# [default: 14]
preview_images = 14

# percentage amount to randomly zoom each training image in and out.
#
# select an integer between 0 and 25
# [default: 5]
zoom_amount = 5

# percentage amount to randomly rotate each training image.
#
# select an integer between 0 and 25
# [default: 10]
rotation_range = 10

# percentage amount to randomly shift each training image horizontally and vertically.
#
# select an integer between 0 and 25
# [default: 5]
shift_range = 5
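# a toy sketch of what the zoom / rotation / shift augmentations amount to (opencv affine warp; rotation treated as degrees here; 'image' stands for one training face as a numpy array; not faceswap's augmentation code):
#   import cv2
#   import numpy as np
#   h, w = image.shape[:2]
#   scale = 1 + np.random.uniform(-0.05, 0.05)          # zoom_amount = 5 -> up to +/-5%
#   shift = np.random.uniform(-0.05, 0.05, 2) * (w, h)  # shift_range = 5 -> up to +/-5% in x and y
#   matrix = cv2.getRotationMatrix2D((w / 2, h / 2), np.random.uniform(-10, 10), scale)  # rotation_range = 10
#   matrix[:, 2] += shift
#   augmented = cv2.warpAffine(image, matrix, (w, h))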

# percentage chance to randomly flip each training image horizontally.
# nb: this is ignored if the 'no-flip' option is enabled
#
# select an integer between 0 and 75
# [default: 50]
flip_chance = 50

# disable warp augmentation. warping is integral to the neural network training. if you decide to disable warping, you should only do so towards the end of a model's training session.
#
# this option can be updated for existing models.
# choose from: true, false
# [default: false]
disable_warp = False

# percentage amount to randomly alter the lightness of each training image.
# nb: this is ignored if the 'no-augment-color' option is enabled
#
# select an integer between 0 and 75
# [default: 30]
color_lightness = 30

# percentage amount to randomly alter the 'a' and 'b' colors of the l*a*b* color space of each training image.
# nb: this is ignored if the 'no-augment-color' option is enabled
#
# select an integer between 0 and 50
# [default: 8]
color_ab = 8

# percentage chance to perform contrast limited adaptive histogram equalization on each training image.
# nb: this is ignored if the 'no-augment-color' option is enabled
#
# this option can be updated for existing models.
# select an integer between 0 and 75
# [default: 50]
color_clahe_chance = 50

# the grid size dictates how much contrast limited adaptive histogram equalization is performed on any training image selected for clahe. contrast will be applied randomly with a gridsize of 0 up to the maximum. this value is a multiplier calculated from the training image size.
# nb: this is ignored if the 'no-augment-color' option is enabled
#
# select an integer between 1 and 8
# [default: 4]
color_clahe_max_size = 4
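# a bare-bones illustration of clahe itself (opencv; the grid size shown is arbitrary, faceswap derives it from the image size and the max_size value above; 'gray' stands for a single-channel uint8 image):
#   import cv2
#   clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(4, 4))
#   equalized = clahe.apply(gray)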