Commit
Merge branch 'main' into huiyingl/nemo2sftpeft_notebook
Showing 32 changed files with 8,180 additions and 100 deletions.
160 changes: 160 additions & 0 deletions
examples/tts/speechllm/conf/megatron_t5_speechllm_inference.yaml
@@ -0,0 +1,160 @@
name: megatron_t5_speechllm_tts_inference
checkpoint_path: ???

trainer:
  devices: 1
  accelerator: gpu
  num_nodes: 1
  precision: 32
  logger: False
  enable_checkpointing: False
  use_distributed_sampler: False
  max_epochs: 10000
  max_steps: -1
  log_every_n_steps: 10
  val_check_interval: null
  check_val_every_n_epoch: 3
  gradient_clip_val: 1.0

exp_manager:
  exp_dir: null
  name: ${name}
  create_wandb_logger: False
  resume_if_exists: False
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 2
    mode: min
    save_nemo_on_train_end: False # Should be false, correct prompt learning model file is saved at model.nemo_path set below
    filename: "megatron_t5_speechllm_tts--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}"
    model_parallel_size: ${model.tensor_model_parallel_size}
    save_best_model: True
  create_early_stopping_callback: False
  early_stopping_callback_params:
    monitor: "val_loss"
    mode: "min"
    min_delta: 0.001
    patience: 10
    verbose: True

model:
  seed: 1234
  nemo_path: ${name}.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved
  virtual_prompt_style: "p-tuning" # one of 'prompt-tuning', 'p-tuning', or 'inference'
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  global_batch_size: 16
  micro_batch_size: 16 # micro batch size should equal global batch size when pipeline parallel = 1
  validation_global_batch_size: ${model.global_batch_size}
  validation_micro_batch_size: ${model.micro_batch_size}
  validation_drop_last: False
  report_validation_metric: False
  validation_metric: accuracy
  num_speech_tokens: 10112 # Vocabulary size pertaining to speech
  seq_pattern: "parallel" # parallel, delay_parallel, flatten
  temperature: 0.85 # Temperature to be used for inference
  top_k: 80 # Top k to be used for inference
  max_inference_timesteps: 1000 # Maximum number of timesteps to run inference for

  restore_path: null # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with
  language_model_path: ??? # Path to the pretrained T5 language model .nemo file, always required
  save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training.
  existing_tasks: []
  new_tasks: ["squad"]
  codecmodel_type: nemo_codec
  codecmodel_path: ???
  english_only_model: true
  context_conditioning: decoder
  use_flash_attention: false
  lm_vocab_size: 30000

  task_templates:
  - taskname: "squad"
    prompt_template: "<|VIRTUAL_PROMPT_0|> {context} {question} {answer}"
    total_virtual_tokens: 3
    virtual_token_splits: [3]
    truncate_field: context
    answer_field: answer

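  # Illustration only (not part of the committed file): with total_virtual_tokens: 3 and
  # virtual_token_splits: [3], the <|VIRTUAL_PROMPT_0|> placeholder above is replaced by three
  # learned virtual-token embeddings, and the {context}/{question}/{answer} fields are filled
  # from each record, e.g. for a hypothetical SQuAD-style example:
  #   <|VIRTUAL_PROMPT_0|> The Eiffel Tower is in Paris. Where is the Eiffel Tower? Paris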
  p_tuning: # P-tuning specific params
    encoder_type: "mlp" # Either "mlp" or "lstm", mlp is default
    num_layers: 2 # 2 recommended for MLP, 1 recommended for LSTM, must be at least 2 for mlp
    dropout: 0.0

  prompt_tuning: # Prompt tuning specific params
    new_prompt_init_methods: ['text'] # List of 'text' or 'random', should correspond to tasks listed in new_tasks
    new_prompt_init_text: ['some init text goes here'] # some init text if init method is text, or None if init method is random

  data:
    grapheme_prefix: null
    train_ds: null
    validation_ds: null
    test_ds: ???
    max_seq_length: 1536
    sample_rate: 24000
    add_eos: true
    add_bos: false
    decoder_starts_with_pad: False
    add_eos_to_decoder_output: True
    add_sentinel_to_input: True
    ul2_prompt_token: null # <extra_id_s>, <extra_id_r>, <extra_id_x>
    shuffle: true
    num_workers: 4
    pin_memory: true
    speech_offset: 30000
    train_task: all
    sup_data_path: None
    num_speech_codebooks: 8
    codebook_fps: 86
    context_duration_min: 2.9
    context_duration_max: 2.9
    context_slice_method: "fixed"
    phoneme_probability: 1.0
    g2p:
      english:
        _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p
        phoneme_dict: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt"
        heteronyms: "scripts/tts_dataset_files/heteronyms-052722"
        phoneme_probability: 0.8
        ignore_ambiguous_words: False
        use_chars: True
        use_stresses: True
        grapheme_prefix: ${model.data.grapheme_prefix}
      spanish:
        _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p
        phoneme_dict: "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict"
        phoneme_probability: 0.8
        use_chars: True
        use_stresses: True
        ignore_ambiguous_words: False
        grapheme_prefix: ${model.data.grapheme_prefix}
        locale: "es-ES"
      mandarin:
        _target_: nemo.collections.tts.g2p.models.zh_cn_pinyin.ChineseG2p
        phoneme_dict: "scripts/tts_dataset_files/zh/36finals/ipa_dict_nv23.05.txt"
        word_segmenter: "jieba"
        phoneme_prefix: ""
        phoneme_case: "lower"
        tone_prefix: "#"
        ascii_letter_prefix: ${model.data.grapheme_prefix}
        ascii_letter_case: "upper"
      german:
        _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p
        phoneme_dict: "scripts/tts_dataset_files/de/de_nv230119.dict"
        heteronyms: "scripts/tts_dataset_files/de/de_nv230119.heteronym"
        phoneme_probability: 0.8
        ignore_ambiguous_words: False
        use_chars: True
        use_stresses: True
        grapheme_case: mixed
        grapheme_prefix: ${model.data.grapheme_prefix}
        locale: "de-DE"

  optim:
    name: fused_adam
    lr: 5e-5
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.98
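This config leans on OmegaConf-style interpolation (${name}, ${model.global_batch_size}) and uses ??? to mark values that must be supplied at run time. Below is a minimal sketch, assuming the file is read straight from the path added in this commit, of how those pieces behave when loaded with OmegaConf; it is not part of the commit itself.

from omegaconf import OmegaConf

# Load the config added in this commit (path as committed).
cfg = OmegaConf.load("examples/tts/speechllm/conf/megatron_t5_speechllm_inference.yaml")

# ${name} propagates into exp_manager.name, and ${model.global_batch_size}
# into model.validation_global_batch_size.
print(OmegaConf.select(cfg, "exp_manager.name"))                    # megatron_t5_speechllm_tts_inference
print(OmegaConf.select(cfg, "model.validation_global_batch_size"))  # 16

# "???" marks mandatory values (checkpoint_path, language_model_path,
# codecmodel_path, test_ds) that must be filled in before running.
print(OmegaConf.is_missing(cfg, "checkpoint_path"))                 # True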
213 changes: 213 additions & 0 deletions
examples/tts/speechllm/conf/megatron_t5_speechllm_inference_model.yaml
@@ -0,0 +1,213 @@
name: megatron_t5_speechllm_tts_inference
checkpoint_path: ???

trainer:
  devices: 1
  accelerator: gpu
  num_nodes: 1
  precision: 32
  logger: False
  enable_checkpointing: False
  use_distributed_sampler: False
  max_epochs: 10000
  max_steps: -1
  log_every_n_steps: 10
  val_check_interval: null
  check_val_every_n_epoch: 3
  gradient_clip_val: 1.0

exp_manager:
  exp_dir: null
  name: ${name}
  create_wandb_logger: False
  resume_if_exists: False
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 2
    mode: min
    save_nemo_on_train_end: False # Should be false, correct prompt learning model file is saved at model.nemo_path set below
    filename: "megatron_t5_speechllm_tts--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}"
    model_parallel_size: ${model.tensor_model_parallel_size}
    save_best_model: True
  create_early_stopping_callback: False
  early_stopping_callback_params:
    monitor: "val_loss"
    mode: "min"
    min_delta: 0.001
    patience: 10
    verbose: True

model:
  seed: 1234
  nemo_path: ${name}.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved
  virtual_prompt_style: "p-tuning" # one of 'prompt-tuning', 'p-tuning', or 'inference'
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  global_batch_size: 16
  micro_batch_size: 16 # micro batch size should equal global batch size when pipeline parallel = 1
  validation_global_batch_size: ${model.global_batch_size}
  validation_micro_batch_size: ${model.micro_batch_size}
  validation_drop_last: False
  report_validation_metric: False
  validation_metric: accuracy
  num_speech_tokens: 10112 # Vocabulary size pertaining to speech
  seq_pattern: "parallel" # parallel, delay_parallel, flatten
  temperature: 0.85 # Temperature to be used for inference
  top_k: 80 # Top k to be used for inference
  max_inference_timesteps: 1000 # Maximum number of timesteps to run inference for

  restore_path: null # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with
  save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training.
  existing_tasks: []
  new_tasks: ["squad"]
  codecmodel_type: nemo_codec
  codecmodel_path: ???
  english_only_model: true
  context_conditioning: decoder
  train_from_scratch: true
  override_tokenizer_vocab_file: ???
  use_flash_attention: false
  lm_vocab_size: 30000

  frozen_model:
    tensor_model_parallel_size: 1
    pipeline_model_parallel_size: 1
    pipeline_model_parallel_split_rank: 0
    make_vocab_size_divisible_by: 128
    pre_process: true
    post_process: true
    gradient_as_bucket_view: true
    native_amp_init_scale: 4294967296
    native_amp_growth_interval: 1000
    fp16_lm_cross_entropy: false
    seed: 1234
    use_cpu_initialization: false
    apex_transformer_log_level: 30
    tokenizer:
      library: megatron
      type: BertWordPieceCase
      model: null
      vocab_file: null
      merge_file: null
    optim:
      name: null
    data:
      dataset_type: t5
    encoder:
      arch: transformer
      bias_activation_fusion: false
      use_flash_attention: ${model.use_flash_attention}
      num_layers: 12
      hidden_size: 768
      ffn_hidden_size: 2048
      num_attention_heads: 12
      init_method_std: 0.015
      hidden_dropout: 0.1
      attention_dropout: 0.1
      kv_channels: 64
      activation: geglu
    decoder:
      arch: transformer
      bias_activation_fusion: false
      use_flash_attention: ${model.use_flash_attention}
      num_layers: 12
      hidden_size: 768
      ffn_hidden_size: 2048
      num_attention_heads: 12
      init_method_std: 0.015
      hidden_dropout: 0.1
      attention_dropout: 0.1
      kv_channels: 64
      activation: geglu

  task_templates:
  - taskname: "squad"
    prompt_template: "<|VIRTUAL_PROMPT_0|> {context} {question} {answer}"
    total_virtual_tokens: 3
    virtual_token_splits: [3]
    truncate_field: context
    answer_field: answer

  p_tuning: # P-tuning specific params
    encoder_type: "mlp" # Either "mlp" or "lstm", mlp is default
    num_layers: 2 # 2 recommended for MLP, 1 recommended for LSTM, must be at least 2 for mlp
    dropout: 0.0

  prompt_tuning: # Prompt tuning specific params
    new_prompt_init_methods: ['text'] # List of 'text' or 'random', should correspond to tasks listed in new_tasks
    new_prompt_init_text: ['some init text goes here'] # some init text if init method is text, or None if init method is random

  data:
    grapheme_prefix: null
    train_ds: null
    validation_ds: null
    test_ds: ???
    max_seq_length: 1536
    sample_rate: 24000
    add_eos: true
    add_bos: false
    decoder_starts_with_pad: False
    add_eos_to_decoder_output: True
    add_sentinel_to_input: True
    ul2_prompt_token: null # <extra_id_s>, <extra_id_r>, <extra_id_x>
    shuffle: true
    num_workers: 4
    pin_memory: true
    speech_offset: 30000
    train_task: all
    sup_data_path: None
    num_speech_codebooks: 8
    codebook_fps: 86
    context_duration_min: 2.9
    context_duration_max: 2.9
    context_slice_method: "fixed"
    phoneme_probability: 1.0
    g2p:
      english:
        _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p
        phoneme_dict: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt"
        heteronyms: "scripts/tts_dataset_files/heteronyms-052722"
        phoneme_probability: 0.8
        ignore_ambiguous_words: False
        use_chars: True
        use_stresses: True
        grapheme_prefix: ${model.data.grapheme_prefix}
      spanish:
        _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p
        phoneme_dict: "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict"
        phoneme_probability: 0.8
        use_chars: True
        use_stresses: True
        ignore_ambiguous_words: False
        grapheme_prefix: ${model.data.grapheme_prefix}
        locale: "es-ES"
      mandarin:
        _target_: nemo.collections.tts.g2p.models.zh_cn_pinyin.ChineseG2p
        phoneme_dict: "scripts/tts_dataset_files/zh/36finals/ipa_dict_nv23.05.txt"
        word_segmenter: "jieba"
        phoneme_prefix: ""
        phoneme_case: "lower"
        tone_prefix: "#"
        ascii_letter_prefix: ${model.data.grapheme_prefix}
        ascii_letter_case: "upper"
      german:
        _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p
        phoneme_dict: "scripts/tts_dataset_files/de/de_nv230119.dict"
        heteronyms: "scripts/tts_dataset_files/de/de_nv230119.heteronym"
        phoneme_probability: 0.8
        ignore_ambiguous_words: False
        use_chars: True
        use_stresses: True
        grapheme_case: mixed
        grapheme_prefix: ${model.data.grapheme_prefix}
        locale: "de-DE"

  optim:
    name: fused_adam
    lr: 5e-5
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.98
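This second config additionally inlines the frozen T5 architecture under model.frozen_model and adds train_from_scratch and override_tokenizer_vocab_file; its mandatory ??? fields are checkpoint_path, codecmodel_path, override_tokenizer_vocab_file, and test_ds. A minimal sketch of filling them before launching inference follows; every path below is a hypothetical placeholder, not something shipped with this commit.

from omegaconf import OmegaConf

cfg = OmegaConf.load("examples/tts/speechllm/conf/megatron_t5_speechllm_inference_model.yaml")

# Supply the "???" (mandatory) values; all paths are hypothetical.
cfg.checkpoint_path = "/results/speechllm/checkpoints/last.ckpt"
cfg.model.codecmodel_path = "/models/audio_codec.nemo"
cfg.model.override_tokenizer_vocab_file = "/models/t5_vocab.txt"
cfg.model.data.test_ds = "/data/test_manifest.json"

# Confirm no mandatory value is left unresolved before handing the config to a run script.
for node, key in [(cfg, "checkpoint_path"),
                  (cfg.model, "codecmodel_path"),
                  (cfg.model, "override_tokenizer_vocab_file"),
                  (cfg.model.data, "test_ds")]:
    assert not OmegaConf.is_missing(node, key), f"missing mandatory value: {key}"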