adding model selection in config.txt for slurm job submission

jeffersonfparil · Jun 24, 2024 · 7f713cd · 7f713cd
1 parent 1006ee9
commit 7f713cd
Show file tree

Hide file tree

Showing 7 changed files with 21 additions and 16 deletions.
diff --git a/R/main.R b/R/main.R
@@ -38,7 +38,7 @@
 #'      (see ?fn_load_phenotype for details)
 #'  - $pheno_idx_col_y: column number in the phenotype file corresponding to the numeric phenotype data
 #'      (see ?fn_load_phenotype for details)
-#'  - $pheno_na_strings: strings of characters corresponding to missing data in the phenotype file
+#'  - $pheno_vec_na_strings: strings of characters corresponding to missing data in the phenotype file
 #'      (see ?fn_load_phenotype for details)
 #'  - $pheno_bool_remove_outliers: remove outliers from the phenotype file?
 #'  - $pheno_bool_remove_NA: remove samples missing phenotype data in the phenotype file?
@@ -236,7 +236,7 @@
 #'     pheno_idx_col_id=1,
 #'     pheno_idx_col_pop=2,
 #'     pheno_idx_col_y=3,
-#'     pheno_na_strings=c("", "-", "NA", "na", "NaN", "missing", "MISSING"),
+#'     pheno_vec_na_strings=c("", "-", "NA", "na", "NaN", "missing", "MISSING"),
 #'     pheno_bool_remove_outliers=TRUE,
 #'     pheno_bool_remove_NA=FALSE,
 #'     bool_within=TRUE,
@@ -286,7 +286,7 @@ gp = function(args) {
     #     pheno_idx_col_id=1,
     #     pheno_idx_col_pop=2,
     #     pheno_idx_col_y=3,
-    #     pheno_na_strings=c("", "-", "NA", "na", "NaN", "missing", "MISSING"),
+    #     pheno_vec_na_strings=c("", "-", "NA", "na", "NaN", "missing", "MISSING"),
     #     pheno_bool_remove_outliers=FALSE,
     #     pheno_bool_remove_NA=FALSE,
     #     bool_within=TRUE,
@@ -318,7 +318,7 @@ gp = function(args) {
         idx_col_id=args$pheno_idx_col_id,
         idx_col_pop=args$pheno_idx_col_pop,
         idx_col_y=args$pheno_idx_col_y,
-        na_strings=args$pheno_na_strings,
+        na_strings=args$pheno_vec_na_strings,
         verbose=args$verbose
     )
     if (methods::is(list_pheno, "gpError")) {return(list_pheno)}

diff --git a/inst/exec_Rscript/0-submit.sh b/inst/exec_Rscript/0-submit.sh
@@ -9,10 +9,10 @@ CONFIG_NREPS=$(sed "s/\"/'/g" config.txt | sed -n '4p')
 CONFIG_DIR_OUT=$(sed "s/\"/'/g" config.txt | sed -n '5p')
 CONFIG_JOB_NAME=$(sed "s/\"/'/g" config.txt | sed -n '6p')
 CONFIG_ACCOUNT_NAME=$(sed "s/\"/'/g" config.txt | sed -n '7p')
-CONFIG_NTASKS=$(sed "s/\"/'/g" config.txt | sed -n '8p')
-CONFIG_NCPUS=$(sed "s/\"/'/g" config.txt | sed -n '9p')
-CONFIG_MEM=$(sed "s/\"/'/g" config.txt | sed -n '10p')
-CONFIG_TIME_LIMIT=$(sed "s/\"/'/g" config.txt | sed -n '11p')
+CONFIG_NCPUS=$(sed "s/\"/'/g" config.txt | sed -n '8p')
+CONFIG_MEM=$(sed "s/\"/'/g" config.txt | sed -n '9p')
+CONFIG_TIME_LIMIT=$(sed "s/\"/'/g" config.txt | sed -n '10p')
+CONFIG_MODELS=$(sed "s/\"/'/g" config.txt | sed -n '11p')
 ### Create the checks and submission scripts using the config variables
 sed "s|GENOTYPE_DATA_RDS=\${DIR_SRC}/input/test_geno.Rds|$CONFIG_GENO|g" 1-checks_and_submision.sh | \
     sed "s|PHENOTYPE_DATA_TSV=\${DIR_SRC}/input/test_pheno.tsv|$CONFIG_PHENO|g" | \
@@ -24,10 +24,10 @@ sed "s|GENOTYPE_DATA_RDS=\${DIR_SRC}/input/test_geno.Rds|$CONFIG_GENO|g" 1-check
 ### Create the slurm job scripts using the config variables
 sed "s|SBATCH --job-name='GS'|$CONFIG_JOB_NAME|g" 2-gp_slurm_job.sh | \
     sed "s|SBATCH --account='dbiopast1'|$CONFIG_ACCOUNT_NAME|g" | \
-    sed "s|SBATCH --ntasks=1|$CONFIG_NTASKS|g" | \
     sed "s|SBATCH --cpus-per-task=16|$CONFIG_NCPUS|g" | \
     sed "s|SBATCH --mem=100G|$CONFIG_MEM|g" | \
-    sed "s|SBATCH --time=1-0:0:00|$CONFIG_TIME_LIMIT|g" \
+    sed "s|SBATCH --time=1-0:0:00|$CONFIG_TIME_LIMIT|g" | \
+    sed "s|--vec-models-to-test ridge,lasso,elastic_net,Bayes_A,Bayes_B,Bayes_C,gBLUP|--vec-models-to-test $CONFIG_MODELS|g" \
 > 2-gp_slurm_job-${RUN_NAME}.sh
 ### Check input and submit the slurm job
 chmod +x 1-checks_and_submision-${RUN_NAME}.sh

diff --git a/inst/exec_Rscript/2-gp_slurm_job.sh b/inst/exec_Rscript/2-gp_slurm_job.sh
@@ -110,6 +110,7 @@ Rscript ${DIR_SRC}/gp.R \
     --pheno-idx-col-y $COLUMN_ID \
     --bool-within TRUE \
     --bool-across $BOOL_ACROSS \
+    --vec-models-to-test ridge,lasso,elastic_net,Bayes_A,Bayes_B,Bayes_C,gBLUP \
     --n-folds $KFOLDS \
     --n-reps $NREPS \
     --bool-parallel TRUE \

diff --git a/inst/exec_Rscript/config.txt b/inst/exec_Rscript/config.txt
@@ -5,7 +5,7 @@ NREPS=2
 DIR_OUT=${DIR_SRC}
 SBATCH --job-name="test"
 SBATCH --account="dbiopast2"
-SBATCH --ntasks=1
 SBATCH --cpus-per-task=4
 SBATCH --mem=10G
-SBATCH --time=0-0:10:00
+SBATCH --time=0-0:10:00
+ridge,Bayes_A,Bayes_B,Bayes_C,gBLUP
diff --git a/inst/exec_Rscript/gp.R b/inst/exec_Rscript/gp.R
@@ -28,7 +28,7 @@ parser$add_argument("--pheno-header",                                    dest="p
 parser$add_argument("--pheno-idx-col-id",                                dest="pheno_idx_col_id",                                type="integer", default=1,                                                                         help="Column number in the phenotype file corresponding to the sample names [default=1].")
 parser$add_argument("--pheno-idx-col-pop",                               dest="pheno_idx_col_pop",                               type="integer", default=2,                                                                         help="Column number in the phenotype file corresponding to the population/grouping names [default=2].")
 parser$add_argument("--pheno-idx-col-y",                                 dest="pheno_idx_col_y",                                 type="integer", default=3,                                                                         help="Column number in the phenotype file corresponding to the numeric phenotype data [default=3].")
-parser$add_argument("--pheno-na-strings",                                dest="pheno_na_strings",                                type="character", default=c("", "-", "NA", "na", "NaN", "missing", "MISSING"),                     help="Strings of characters corresponding to missing data in the phenotype file [default=c('', '-', 'NA', 'na', 'NaN', 'missing', 'MISSING')].")
+parser$add_argument("--pheno-na-strings",                                dest="pheno_vec_na_strings",                                type="character", default=c("", "-", "NA", "na", "NaN", "missing", "MISSING"),                     help="Strings of characters corresponding to missing data in the phenotype file [default=c('', '-', 'NA', 'na', 'NaN', 'missing', 'MISSING')].")
 parser$add_argument("--pheno-bool-remove-outliers",                      dest="pheno_bool_remove_outliers",                      type="logical", default=FALSE,                                                                      help="Remove outliers from the phenotype file [default=FALSE]?")
 parser$add_argument("--pheno-bool-remove-NA",                            dest="pheno_bool_remove_NA",                            type="logical", default=FALSE,                                                                     help="Remove samples missing phenotype data in the phenotype file? [default=FALSE].")
 parser$add_argument("--bool-within",                                     dest="bool_within",                                     type="logical", default=TRUE,                                                                      help="Perform within population k-fold cross-validation? [default=TRUE].")
@@ -63,7 +63,11 @@ print(paste0("     - with a total of ", args$n_threads, " threads available and
 print(paste0("       a total memory of ", args$max_mem_Gb, " Gb."))
 print(paste0("Start time: ", time_ini))
 print("Input parameters:")
+### Parse comma-separated CLI inputs into character vectors (spaces stripped); the NA strings live in args$pheno_vec_na_strings, i.e. the dest of --pheno-na-strings and the field read by gp::gp()
+args$vec_models_to_test = unlist(strsplit(gsub(" ", "", args$vec_models_to_test), ","))
+args$pheno_vec_na_strings = unlist(lapply(gsub(" ", "", args$pheno_vec_na_strings), function(x) if (nzchar(x)) strsplit(x, ",")[[1]] else x)) # keep a bare "" entry, which strsplit() alone would turn into character(0) and drop
 print(args)
+### Run
 fname_out_Rds = gp::gp(args=args)
 time_fin = Sys.time()
 time_duration_minutes = as.numeric(difftime(time_fin, time_ini, units="min"))

diff --git a/man/gp.Rd b/man/gp.Rd
diff --git a/tests/testthat/test-main.R b/tests/testthat/test-main.R
@@ -31,7 +31,7 @@ test_that("gp", {
         pheno_idx_col_id=1,
         pheno_idx_col_pop=2,
         pheno_idx_col_y=3,
-        pheno_na_strings=c("", "-", "NA", "na", "NaN", "missing", "MISSING"),
+        pheno_vec_na_strings=c("", "-", "NA", "na", "NaN", "missing", "MISSING"),
         pheno_bool_remove_outliers=TRUE,
         pheno_bool_remove_NA=FALSE,
         bool_within=TRUE,