# config.ini

[preprocess]

# Input directory for the preprocess step. Only one in_tensor_dir line is
# active at a time; the commented-out lines are alternative paths kept for
# other environments (raptor / comet / expanse appear to be machine names).

# For raptor
# in_tensor_dir : ./data/train-ready/pred-full/

# For comet
# in_tensor_dir : /scratch/mtari008/37154933/pred-full-deepnovo/

# For expanse
in_tensor_dir : /lclhome/mtari008/job_2436627/nist_massiv_80k_ch_graymass/

############ INPUT PARAMETERS ############
[input]

# --- File paths ---
msp_files : /lclhome/mtari008/DeepSNAP/data/msp
mgf_dir : /lclhome/mtari008/data/spectra/labeled
prep_dir : /lclhome/mtari008/data/deepatles/train_ready/nist-masive-deepnovo-5k-ch1-3-len7-30-200-mod-mass
# Alternative prep_dir: ProteomeTools data, which most models are trained on.
# prep_dir : /lclhome/mtari008/data/deepatles/train_ready/pt-5k-ch1-3-len7-20-200-mod
# NOTE(review): val_dir currently points to the same directory as prep_dir;
# confirm that this is intended.
val_dir : /lclhome/mtari008/data/deepatles/train_ready/nist-masive-deepnovo-5k-ch1-3-len7-30-200-mod-mass

# Size of the array used to store a spectrum.
spec_size : 50000

# Maximum charge value used when reading spectrum files.
charge : 5

# Whether modifications are used.
use_mods : True

# Maximum number of modifications per peptide.
num_mods : 5

# Number of species contained in the training dataset.
num_species : 9

# NOTE(review): presumably the rendezvous port for distributed training;
# confirm against the code that reads it.
master_port : 12346

# NOTE(review): presumably the process/node rank for distributed training;
# confirm against the code that reads it.
rank : 1

############ DATABASE SEARCH PARAMETERS ############
# Comments in this section are kept on their own lines: parsers without
# inline-comment support (e.g. Python configparser with default settings)
# read any text after the value as part of the value.
[search]
mgf_dir : /lclhome/mtari008/data/spectra/unlabeled/uti-pxd004713/
prep_path : /lclhome/mtari008/data/deepatles/prep_spectra/uti-pxd004713
pep_dir : /lclhome/mtari008/data/peps/refup-single
out_pin_dir : /lclhome/mtari008/DeepAtles/percolator-refup-no-filt-uti-pxd004713
index_path : /lclhome/mtari008/DeepAtles/index

model_name : 512-embed-2-lstm-SnapLoss2D-80k-nist-massive-no-mc-semi-r2r2r-22.pt

# Batch sizes for forward pass through the network
spec_batch_size : 16384
pep_batch_size : 16384

# Batch size for database search
search_spec_batch_size : 256

# Precursor tolerance to use during database search (Da or ppm)
precursor_tolerance : 7
# Unit of precursor_tolerance: either ppm or Da
precursor_tolerance_type : ppm

# Number of top scoring psms to keep
keep_psms : 5

# Number of modified peptides to be generated to search against.
# Different from num_mods in the [input] section.
num_mods : 1

# Charge to be used during search
charge : 4

############ FILTERING PARAMETERS ############
[filter]
# On/off switch for the length filter.
length_filter : False
# NOTE(review): presumably the negative / positive length tolerances applied
# when length_filter is enabled; confirm against the reading code.
len_tol_neg : 0
len_tol_pos : 0
# On/off switches for the missed-cleavages and modification filters.
missed_cleavages_filter : False
modification_filter : False

############### OUT OF CORE PARAMETERS ##############
[ooc]
# NOTE(review): the unit of chunk_size (items vs. bytes) is not stated in this
# file; confirm against the code that reads it.
chunk_size : 10000000

############ MACHINE LEARNING PARAMETERS ############
[ml]

# --- Batching and data split ---
batch_size : 1024

# NOTE(review): presumably the held-out fraction of the data; confirm.
test_size : 0.2

# --- Spectrum / peptide size limits ---
max_spec_len : 200
min_pep_len : 7
max_pep_len : 30
# Slightly larger than max_pep_len to leave room for modifications.
pep_seq_len : 36
max_clvs : 2

# --- Model dimensions ---
embedding_dim : 1024
encoder_layers : 4
num_heads : 16

train_count : 0

# --- Loss weights ---
# NOTE(review): presumably cross-entropy weights for the cleavage and
# modification terms and a weight for the MSE term; confirm.
ce_weight_clv : 1
ce_weight_mod : 1
mse_weight : 3

# --- Regularisation and optimisation ---
dropout : 0.3

lr : 0.0001

weight_decay : 0.0001

epochs : 500

margin : 0.2

read_split_listing : False

############ DEFAULT VALUES ############
# DO NOT CHANGE
# NOTE(review): Python's configparser only treats the upper-case [DEFAULT]
# section as an automatic fallback for other sections; this lower-case
# [default] is an ordinary section. Confirm the reading code looks these
# values up explicitly.
# NOTE(review): the key names msp_file and mgf_files differ from msp_files and
# mgf_dir in [input]; confirm which names the code actually reads.
[default]
msp_file : /data/human_consensus_final_true_lib.msp
mgf_files : /data/
spec_size : 8000
charge : 2
use_mods : False
batch_size : 1024