# VoiceAPI/training/configs/hindi_female.yaml
# Harshil748: Add training scripts and comprehensive documentation (d722140)
# Hindi Female VITS Training Configuration
# Dataset: OpenSLR Hindi + IndicTTS Hindi Female subset
# Model architecture hyperparameters. Child keys are nested under `model`
# so they do not collide with same-named keys in other sections (e.g. `name`).
model:
  name: vits
  hidden_channels: 192
  filter_channels: 768
  n_heads: 2
  n_layers: 6
  kernel_size: 3
  p_dropout: 0.1
  # Quoted on purpose: kept as the string "1", not the integer 1.
  resblock: "1"
  resblock_kernel_sizes: [3, 7, 11]
  resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
  # Product of the rates (8 * 8 * 2 * 2 = 256) equals audio.hop_length in
  # this file; presumably they must stay in sync -- confirm with the trainer.
  upsample_rates: [8, 8, 2, 2]
  upsample_initial_channel: 512
  upsample_kernel_sizes: [16, 16, 4, 4]
  n_speakers: 1
  # NOTE(review): gin_channels is set although n_speakers is 1 -- confirm
  # how the trainer treats speaker conditioning for a single speaker.
  gin_channels: 256
# Audio / spectrogram settings, nested under `audio`.
audio:
  sample_rate: 22050
  filter_length: 1024
  hop_length: 256
  win_length: 1024
  n_mel_channels: 80
  mel_fmin: 0.0
  # Explicit null: no upper mel frequency is configured here. How the
  # consumer interprets null is not visible in this file -- confirm.
  mel_fmax: null
  max_wav_value: 32768.0
# Dataset locations and text/segment preprocessing options, nested under
# `data`.
data:
  # Paths are relative; presumably resolved against the working directory
  # of the training script -- confirm.
  training_files: data/hindi_female/metadata_train.csv
  validation_files: data/hindi_female/metadata_val.csv
  text_cleaners: [hindi_cleaners]
  segment_size: 8192
  add_blank: true
# Optimizer, schedule and loss-weight settings, nested under `training`.
training:
  # Exponent-form floats are written with an explicit decimal point:
  # YAML 1.1 loaders such as PyYAML resolve a bare `2e-4` / `1e-9` as a
  # string, not a float.
  learning_rate: 2.0e-4
  betas: [0.8, 0.99]
  eps: 1.0e-9
  batch_size: 32
  fp16: true
  epochs: 1000
  warmup_epochs: 50
  # NOTE(review): the two intervals below are presumably counted in
  # training steps rather than epochs -- confirm against the trainer.
  checkpoint_interval: 10000
  eval_interval: 1000
  seed: 42
  # Loss weights
  c_mel: 45
  c_kl: 1.0
# Language metadata for this voice. `name` is nested here so it does not
# duplicate the model's `name` key.
language:
  code: hi
  name: Hindi
# Speaker metadata for this voice, nested under `speaker`.
speaker:
  id: hindi_female_001
  gender: female