# fine-tuning.yaml

max_seq_len: 2048
global_seed: 1337

# Run Name
run_name: fine-tuning
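
# Note: ${...} references below (e.g. ${max_seq_len}, ${global_seed}) are
# resolved at load time; llm-foundry's train script parses these YAMLs with
# OmegaConf, whose interpolation substitutes the top-level values.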

# Model
model:
  name: hf_causal_lm
  pretrained: true
  pretrained_model_name_or_path: vinai/RecGPT-7B
  config_overrides:
    max_seq_len: ${max_seq_len}
    attn_config:
      attn_impl: triton
      alibi: true
      prefix_lm: false
      attn_uses_sequence_id: false
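
# Note: attn_impl: triton selects the triton-based flash-attention kernels
# (the `triton` package must be installed); alibi: true applies ALiBi
# position biases instead of learned position embeddings. These overrides
# assume RecGPT-7B exposes an MPT-style attn_config, which this config's
# llm-foundry lineage suggests.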

# Tokenizer
tokenizer:
  name: vinai/RecGPT-7B
  kwargs:
    model_max_length: ${max_seq_len}

# Dataloaders
train_loader:
  name: finetuning
  dataset:
    hf_name: path/to/sample_instruction_following_dataset
    split: train
    shuffle: true
    max_seq_len: ${max_seq_len}
    shuffle_seed: ${global_seed}
    decoder_only_format: true
    allow_pad_trimming: false
  drop_last: true
  num_workers: 8
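
# Note: llm-foundry's `finetuning` loader expects prompt/response examples;
# hf_name may be a Hugging Face Hub dataset ID or a local path. With
# decoder_only_format: true, prompt and response are packed into a single
# decoder sequence, with the loss computed on the response tokens (assumed
# behavior, matching llm-foundry's finetuning collator).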

# Optimization
scheduler:
  name: cosine_with_warmup  # or: linear_decay_with_warmup
  t_warmup: 200ba  # to be tuned, e.g. 1/20 of the total number of training steps
  alpha_f: 0.1  # 0.0 for linear_decay_with_warmup
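
# Note: Composer time strings use 'ba' for batches and 'ep' for epochs, so
# t_warmup: 200ba warms the LR up over 200 batches; alpha_f is the final LR
# as a fraction of lr once the cosine decay finishes.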

optimizer:
  name: decoupled_lionw  # or: decoupled_adamw
  lr: 1e-5  # to be tuned
  betas:
  - 0.9
  - 0.98
  weight_decay: 1e-7
  eps: 1e-07
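
# Note: decoupled_lionw and decoupled_adamw are Composer optimizers that
# apply weight decay decoupled from the learning-rate update, so the decay
# strength does not scale with lr; eps is meaningful for the AdamW variant
# (the Lion update rule is sign-based and has no epsilon term).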

algorithms:
  gradient_clipping:
    clipping_type: norm
    clipping_threshold: 1.0

max_duration: 2ep
eval_interval: 1
eval_first: false
eval_subset_num_batches: -1
global_train_batch_size: 128  # to be tuned, e.g. 16 * the number of GPUs
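
# Note: Composer splits global_train_batch_size evenly across ranks and, with
# device_train_microbatch_size: 1 below, gradient-accumulates one sample at a
# time per GPU; 128 must be divisible by the number of GPUs.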

# System
seed: ${global_seed}
device_eval_batch_size: 1
device_train_microbatch_size: 1
precision: amp_bf16
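
# Note: amp_bf16 runs automatic mixed precision in bfloat16, which requires
# hardware bf16 support (e.g. NVIDIA Ampere GPUs or newer).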

# FSDP
fsdp_config:
  sharding_strategy: FULL_SHARD
  mixed_precision: PURE
  activation_checkpointing: false
  activation_checkpointing_reentrant: false
  activation_cpu_offload: false
  limit_all_gathers: true
  verbose: false
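
# Note: FULL_SHARD shards parameters, gradients, and optimizer state across
# all ranks (comparable to ZeRO Stage 3); mixed_precision: PURE keeps
# parameters, gradient communication, and buffers in bf16. If memory is
# tight, activation_checkpointing: true trades compute for memory.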

# Logging
progress_bar: false
log_to_console: true
console_log_interval: 10ba

callbacks:
  speed_monitor:
    window_size: 10
  lr_monitor: {}
  memory_monitor: {}
  runtime_estimator: {}
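
# Note: these are Composer callbacks: speed_monitor averages throughput over
# a 10-batch window, lr_monitor and memory_monitor log the learning rate and
# GPU memory each batch, and runtime_estimator logs the projected time to
# completion.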

# Checkpoint to local filesystem or remote object store
save_interval: 1ep
save_num_checkpoints_to_keep: 2  # important: this prunes checkpoints saved to disk
save_folder: path/to/your/saving/folder
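
# A typical launch, assuming an llm-foundry checkout (the script path is an
# assumption; adjust it and the dataset/save paths above first):
#   composer scripts/train/train.py fine-tuning.yaml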