Merge branch 'main' into huiyingl/nemo2sftpeft_notebook
HuiyingLi authored Nov 15, 2024
2 parents eb0eafe + 5db91e7 commit 0c6b6ac
Showing 32 changed files with 8,180 additions and 100 deletions.
160 changes: 160 additions & 0 deletions examples/tts/speechllm/conf/megatron_t5_speechllm_inference.yaml
@@ -0,0 +1,160 @@
name: megatron_t5_speechllm_tts_inference
checkpoint_path: ???

trainer:
  devices: 1
  accelerator: gpu
  num_nodes: 1
  precision: 32
  logger: False
  enable_checkpointing: False
  use_distributed_sampler: False
  max_epochs: 10000
  max_steps: -1
  log_every_n_steps: 10
  val_check_interval: null
  check_val_every_n_epoch: 3
  gradient_clip_val: 1.0

exp_manager:
  exp_dir: null
  name: ${name}
  create_wandb_logger: False
  resume_if_exists: False
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 2
    mode: min
    save_nemo_on_train_end: False # Should be False; the correct prompt learning model file is saved at model.nemo_path, set below
    filename: "megatron_t5_speechllm_tts--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}"
    model_parallel_size: ${model.tensor_model_parallel_size}
    save_best_model: True
  create_early_stopping_callback: False
  early_stopping_callback_params:
    monitor: "val_loss"
    mode: "min"
    min_delta: 0.001
    patience: 10
    verbose: True

model:
  seed: 1234
  nemo_path: ${name}.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved
  virtual_prompt_style: "p-tuning" # one of 'prompt-tuning', 'p-tuning', or 'inference'
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  global_batch_size: 16
  micro_batch_size: 16 # micro batch size should equal global batch size when pipeline parallel = 1
  validation_global_batch_size: ${model.global_batch_size}
  validation_micro_batch_size: ${model.micro_batch_size}
  validation_drop_last: False
  report_validation_metric: False
  validation_metric: accuracy
  num_speech_tokens: 10112 # Vocabulary size pertaining to speech
  seq_pattern: "parallel" # parallel, delay_parallel, flatten
  temperature: 0.85 # Temperature to be used for inference
  top_k: 80 # Top k to be used for inference
  max_inference_timesteps: 1000 # Maximum number of timesteps to run inference for

  restore_path: null # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with
  language_model_path: ??? # Path to the pretrained T5 language model .nemo file, always required
  save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training.
  existing_tasks: []
  new_tasks: ["squad"]
  codecmodel_type: nemo_codec
  codecmodel_path: ???
  english_only_model: true
  context_conditioning: decoder
  use_flash_attention: false
  lm_vocab_size: 30000
  task_templates:
  - taskname: "squad"
    prompt_template: "<|VIRTUAL_PROMPT_0|> {context} {question} {answer}"
    total_virtual_tokens: 3
    virtual_token_splits: [3]
    truncate_field: context
    answer_field: answer

  p_tuning: # P-tuning specific params
    encoder_type: "mlp" # Either "mlp" or "lstm"; mlp is the default
    num_layers: 2 # 2 recommended for MLP, 1 recommended for LSTM; must be at least 2 for mlp
    dropout: 0.0

  prompt_tuning: # Prompt tuning specific params
    new_prompt_init_methods: ['text'] # List of 'text' or 'random'; should correspond to the tasks listed in new_tasks
    new_prompt_init_text: ['some init text goes here'] # some init text if init method is text, or None if init method is random

  data:
    grapheme_prefix: null
    train_ds: null
    validation_ds: null
    test_ds: ???
    max_seq_length: 1536
    sample_rate: 24000
    add_eos: true
    add_bos: false
    decoder_starts_with_pad: False
    add_eos_to_decoder_output: True
    add_sentinel_to_input: True
    ul2_prompt_token: null # <extra_id_s>, <extra_id_r>, <extra_id_x>
    shuffle: true
    num_workers: 4
    pin_memory: true
    speech_offset: 30000
    train_task: all
    sup_data_path: None
    num_speech_codebooks: 8
    codebook_fps: 86
    context_duration_min: 2.9
    context_duration_max: 2.9
    context_slice_method: "fixed"
    phoneme_probability: 1.0
    g2p:
      english:
        _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p
        phoneme_dict: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt"
        heteronyms: "scripts/tts_dataset_files/heteronyms-052722"
        phoneme_probability: 0.8
        ignore_ambiguous_words: False
        use_chars: True
        use_stresses: True
        grapheme_prefix: ${model.data.grapheme_prefix}
      spanish:
        _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p
        phoneme_dict: "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict"
        phoneme_probability: 0.8
        use_chars: True
        use_stresses: True
        ignore_ambiguous_words: False
        grapheme_prefix: ${model.data.grapheme_prefix}
        locale: "es-ES"
      mandarin:
        _target_: nemo.collections.tts.g2p.models.zh_cn_pinyin.ChineseG2p
        phoneme_dict: "scripts/tts_dataset_files/zh/36finals/ipa_dict_nv23.05.txt"
        word_segmenter: "jieba"
        phoneme_prefix: ""
        phoneme_case: "lower"
        tone_prefix: "#"
        ascii_letter_prefix: ${model.data.grapheme_prefix}
        ascii_letter_case: "upper"
      german:
        _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p
        phoneme_dict: "scripts/tts_dataset_files/de/de_nv230119.dict"
        heteronyms: "scripts/tts_dataset_files/de/de_nv230119.heteronym"
        phoneme_probability: 0.8
        ignore_ambiguous_words: False
        use_chars: True
        use_stresses: True
        grapheme_case: mixed
        grapheme_prefix: ${model.data.grapheme_prefix}
        locale: "de-DE"

  optim:
    name: fused_adam
    lr: 5e-5
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.98
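
Note (not part of the committed file): in this config, keys set to ??? (checkpoint_path, model.language_model_path, model.codecmodel_path, model.data.test_ds) are OmegaConf mandatory values that must be supplied at launch time, while ${...} entries such as ${name} and ${model.global_batch_size} are interpolations resolved against other keys. A minimal sketch of that behaviour, assuming only the omegaconf package and using placeholder paths; the launch script that consumes this file is not part of this excerpt:

from omegaconf import OmegaConf
from omegaconf.errors import MissingMandatoryValue

# Load the config exactly as committed above.
cfg = OmegaConf.load("examples/tts/speechllm/conf/megatron_t5_speechllm_inference.yaml")

# ${...} interpolations resolve against other keys on access.
print(OmegaConf.to_container(cfg, resolve=False)["exp_manager"]["name"])  # "${name}" before resolution
print(cfg.exp_manager.name)                                               # "megatron_t5_speechllm_tts_inference"

# Keys left as ??? raise if read before being overridden.
try:
    _ = cfg.model.language_model_path
except MissingMandatoryValue:
    print("model.language_model_path must be provided at launch time")

# The mandatory fields are typically filled via overrides (placeholder paths shown).
cfg = OmegaConf.merge(cfg, OmegaConf.from_dotlist([
    "checkpoint_path=/path/to/speechllm.ckpt",
    "model.language_model_path=/path/to/t5_pretrained.nemo",
    "model.codecmodel_path=/path/to/audio_codec.nemo",
    "model.data.test_ds=/path/to/test_manifest.json",
]))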
213 changes: 213 additions & 0 deletions examples/tts/speechllm/conf/megatron_t5_speechllm_inference_model.yaml
@@ -0,0 +1,213 @@
name: megatron_t5_speechllm_tts_inference
checkpoint_path: ???

trainer:
  devices: 1
  accelerator: gpu
  num_nodes: 1
  precision: 32
  logger: False
  enable_checkpointing: False
  use_distributed_sampler: False
  max_epochs: 10000
  max_steps: -1
  log_every_n_steps: 10
  val_check_interval: null
  check_val_every_n_epoch: 3
  gradient_clip_val: 1.0

exp_manager:
  exp_dir: null
  name: ${name}
  create_wandb_logger: False
  resume_if_exists: False
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 2
    mode: min
    save_nemo_on_train_end: False # Should be False; the correct prompt learning model file is saved at model.nemo_path, set below
    filename: "megatron_t5_speechllm_tts--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}"
    model_parallel_size: ${model.tensor_model_parallel_size}
    save_best_model: True
  create_early_stopping_callback: False
  early_stopping_callback_params:
    monitor: "val_loss"
    mode: "min"
    min_delta: 0.001
    patience: 10
    verbose: True

model:
  seed: 1234
  nemo_path: ${name}.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved
  virtual_prompt_style: "p-tuning" # one of 'prompt-tuning', 'p-tuning', or 'inference'
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  global_batch_size: 16
  micro_batch_size: 16 # micro batch size should equal global batch size when pipeline parallel = 1
  validation_global_batch_size: ${model.global_batch_size}
  validation_micro_batch_size: ${model.micro_batch_size}
  validation_drop_last: False
  report_validation_metric: False
  validation_metric: accuracy
  num_speech_tokens: 10112 # Vocabulary size pertaining to speech
  seq_pattern: "parallel" # parallel, delay_parallel, flatten
  temperature: 0.85 # Temperature to be used for inference
  top_k: 80 # Top k to be used for inference
  max_inference_timesteps: 1000 # Maximum number of timesteps to run inference for

  restore_path: null # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with
  save_nemo_on_validation_end: True # Saves an inference ready .nemo file every time a checkpoint is saved during training.
  existing_tasks: []
  new_tasks: ["squad"]
  codecmodel_type: nemo_codec
  codecmodel_path: ???
  english_only_model: true
  context_conditioning: decoder
  train_from_scratch: true
  override_tokenizer_vocab_file: ???
  use_flash_attention: false
  lm_vocab_size: 30000

  frozen_model:
    tensor_model_parallel_size: 1
    pipeline_model_parallel_size: 1
    pipeline_model_parallel_split_rank: 0
    make_vocab_size_divisible_by: 128
    pre_process: true
    post_process: true
    gradient_as_bucket_view: true
    native_amp_init_scale: 4294967296
    native_amp_growth_interval: 1000
    fp16_lm_cross_entropy: false
    seed: 1234
    use_cpu_initialization: false
    apex_transformer_log_level: 30
    tokenizer:
      library: megatron
      type: BertWordPieceCase
      model: null
      vocab_file: null
      merge_file: null
    optim:
      name: null
    data:
      dataset_type: t5
    encoder:
      arch: transformer
      bias_activation_fusion: false
      use_flash_attention: ${model.use_flash_attention}
      num_layers: 12
      hidden_size: 768
      ffn_hidden_size: 2048
      num_attention_heads: 12
      init_method_std: 0.015
      hidden_dropout: 0.1
      attention_dropout: 0.1
      kv_channels: 64
      activation: geglu
    decoder:
      arch: transformer
      bias_activation_fusion: false
      use_flash_attention: ${model.use_flash_attention}
      num_layers: 12
      hidden_size: 768
      ffn_hidden_size: 2048
      num_attention_heads: 12
      init_method_std: 0.015
      hidden_dropout: 0.1
      attention_dropout: 0.1
      kv_channels: 64
      activation: geglu

  task_templates:
  - taskname: "squad"
    prompt_template: "<|VIRTUAL_PROMPT_0|> {context} {question} {answer}"
    total_virtual_tokens: 3
    virtual_token_splits: [3]
    truncate_field: context
    answer_field: answer

  p_tuning: # P-tuning specific params
    encoder_type: "mlp" # Either "mlp" or "lstm"; mlp is the default
    num_layers: 2 # 2 recommended for MLP, 1 recommended for LSTM; must be at least 2 for mlp
    dropout: 0.0

  prompt_tuning: # Prompt tuning specific params
    new_prompt_init_methods: ['text'] # List of 'text' or 'random'; should correspond to the tasks listed in new_tasks
    new_prompt_init_text: ['some init text goes here'] # some init text if init method is text, or None if init method is random

  data:
    grapheme_prefix: null
    train_ds: null
    validation_ds: null
    test_ds: ???
    max_seq_length: 1536
    sample_rate: 24000
    add_eos: true
    add_bos: false
    decoder_starts_with_pad: False
    add_eos_to_decoder_output: True
    add_sentinel_to_input: True
    ul2_prompt_token: null # <extra_id_s>, <extra_id_r>, <extra_id_x>
    shuffle: true
    num_workers: 4
    pin_memory: true
    speech_offset: 30000
    train_task: all
    sup_data_path: None
    num_speech_codebooks: 8
    codebook_fps: 86
    context_duration_min: 2.9
    context_duration_max: 2.9
    context_slice_method: "fixed"
    phoneme_probability: 1.0
    g2p:
      english:
        _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p
        phoneme_dict: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv23.01.txt"
        heteronyms: "scripts/tts_dataset_files/heteronyms-052722"
        phoneme_probability: 0.8
        ignore_ambiguous_words: False
        use_chars: True
        use_stresses: True
        grapheme_prefix: ${model.data.grapheme_prefix}
      spanish:
        _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p
        phoneme_dict: "scripts/tts_dataset_files/es_ES/es_ES_nv230301.dict"
        phoneme_probability: 0.8
        use_chars: True
        use_stresses: True
        ignore_ambiguous_words: False
        grapheme_prefix: ${model.data.grapheme_prefix}
        locale: "es-ES"
      mandarin:
        _target_: nemo.collections.tts.g2p.models.zh_cn_pinyin.ChineseG2p
        phoneme_dict: "scripts/tts_dataset_files/zh/36finals/ipa_dict_nv23.05.txt"
        word_segmenter: "jieba"
        phoneme_prefix: ""
        phoneme_case: "lower"
        tone_prefix: "#"
        ascii_letter_prefix: ${model.data.grapheme_prefix}
        ascii_letter_case: "upper"
      german:
        _target_: nemo.collections.tts.g2p.models.i18n_ipa.IpaG2p
        phoneme_dict: "scripts/tts_dataset_files/de/de_nv230119.dict"
        heteronyms: "scripts/tts_dataset_files/de/de_nv230119.heteronym"
        phoneme_probability: 0.8
        ignore_ambiguous_words: False
        use_chars: True
        use_stresses: True
        grapheme_case: mixed
        grapheme_prefix: ${model.data.grapheme_prefix}
        locale: "de-DE"

  optim:
    name: fused_adam
    lr: 5e-5
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.98
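
Note (not part of the committed file): unlike the first config, this variant declares the frozen T5 backbone inline under model.frozen_model (encoder/decoder architecture, tokenizer, train_from_scratch: true) and requires model.override_tokenizer_vocab_file instead of a pretrained model.language_model_path. A hedged sketch of composing it with Hydra's compose API; the script that actually consumes the composed config is not shown in this diff, and all paths below are placeholders:

from hydra import compose, initialize_config_dir

# Point Hydra at the conf directory added by this commit (an absolute path is required).
with initialize_config_dir(config_dir="/abs/path/to/NeMo/examples/tts/speechllm/conf", version_base=None):
    cfg = compose(
        config_name="megatron_t5_speechllm_inference_model",
        overrides=[
            "checkpoint_path=/path/to/speechllm.ckpt",                  # placeholder
            "model.codecmodel_path=/path/to/audio_codec.nemo",          # placeholder
            "model.override_tokenizer_vocab_file=/path/to/vocab.txt",   # placeholder
            "model.data.test_ds=/path/to/test_manifest.json",           # placeholder
        ],
    )

# The frozen backbone is fully described by the YAML itself.
print(cfg.model.frozen_model.encoder.num_layers)  # 12
print(cfg.model.frozen_model.tokenizer.type)      # BertWordPieceCase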