Commit

Feedback changes added

Anshuman Raina committed Nov 12, 2024
1 parent 8237878 commit 054ccbb

Showing 3 changed files with 12 additions and 40 deletions.
9 changes: 5 additions & 4 deletions .github/workflows/benchmark.yml
@@ -16,18 +16,19 @@ jobs:
      - name: Set Up SSH Access
        run: |
          mkdir -p ~/.ssh
-         echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_rsa
+         touch ~/.ssh/id_rsa
          chmod 600 ~/.ssh/id_rsa
-         ssh-keyscan -H login-00.discovery.neu.edu >> ~/.ssh/known_hosts
+         echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_rsa
+         ssh-keyscan -H login-00.discovery.neu.edu >> ~/.ssh/known_hosts || exit 1
      - name: Transfer Files to HPC
        run: |
-         scp benchmark/benchmark.R benchmark/config.slurm raina.ans@login-00.discovery.neu.edu:/home/raina.ans/R
+         scp benchmark/benchmark.R benchmark/config.slurm raina.ans@login-00.discovery.neu.edu:/work/VitekLab/Projects/Benchmarking || exit 1
      - name: Submit Slurm Job and Capture Job ID
        id: submit_job
        run: |
-         ssh raina.ans@login-00.discovery.neu.edu "cd R && sbatch config.slurm" | tee slurm_job_id.txt
+         ssh raina.ans@login-00.discovery.neu.edu "cd R && sbatch config.slurm" | tee slurm_job_id.txt
          slurm_job_id=$(grep -oP '\d+' slurm_job_id.txt)
          echo "Slurm Job ID is $slurm_job_id"
          echo "slurm_job_id=$slurm_job_id" >> $GITHUB_ENV
41 changes: 6 additions & 35 deletions benchmark/benchmark.R
@@ -26,27 +26,10 @@ calculateResult <- function(summarized, label){
cat("Expected Log Change Ecoli:", ecoli_median, "\n")
cat("Expected Log Change Yeast:", yeast_median, "\n")

#calculate SD and mean


# Kept the code for Individual Boxplots

# boxplot(human_comparisonResult$log2FC,
# main = "Boxplot of log2FC for Human",
# ylab = "log2FC",
# col = "lightblue")
#
#
boxplot(ecoli_comparisonResult$log2FC,
main = "Boxplot of log2FC for E. coli",
ylab = "log2FC",
col = "lightgreen")
#
# boxplot(yeast_comparisonResult$log2FC,
# main = "Boxplot of log2FC for Yeast",
# ylab = "log2FC",
# col = "lightpink")

combined_data <- list(
Human = human_comparisonResult$log2FC,
Ecoli = ecoli_comparisonResult$log2FC,
@@ -57,14 +40,14 @@ calculateResult <- function(summarized, label){
  unique_ecoli_proteins <- unique(ecoli_comparisonResult$Protein)
  unique_yeast_proteins <- unique(yeast_comparisonResult$Protein)

- all_proteins <- c(union(unique_ecoli_proteins, unique_yeast_proteins)) # find out the significant proteins in FragData
+ all_proteins <- c(union(unique_ecoli_proteins, unique_yeast_proteins))

  extracted_proteins <- sapply(all_proteins, function(x) {
-   split_string <- strsplit(x, "\\|")[[1]] # Split the string by '|'
+   split_string <- strsplit(x, "\\|")[[1]]
    if (length(split_string) >= 2) {
-     return(split_string[2]) # Return the second element
+     return(split_string[2])
    } else {
-     return(NA) # Return NA if there's no second element
+     return(NA)
    }
  })

@@ -114,8 +97,7 @@

  start_time <- Sys.time()

- # Use fread directly to read the CSV
- fragpipe_raw = data.table::fread("..//data//FragPipeMsStatsBenchmarking.csv")
+ fragpipe_raw = data.table::fread("/work/VitekLab/Data/MS/Benchmarking/DDA-Puyvelde2022/DDA-Puyvelde2022-HYE5600735_LFQ/FragPipe/TOP0/MSstats.csv")

  head(fragpipe_raw)

@@ -127,11 +109,9 @@ fragpipe_raw$BioReplicate = unlist(lapply(fragpipe_raw$Run, function(x){
    paste(str_split(x, "\\_")[[1]][4:7], collapse="_")
  }))

- # Convert to MSstats format
  msstats_format = MSstatsConvert::FragPipetoMSstatsFormat(fragpipe_raw, use_log_file = FALSE)


- # Define the tasks with descriptive labels
  data_process_tasks <- list(
    list(
      label = "Data process with Normalized Data",
@@ -151,29 +131,20 @@ data_process_tasks <- list(
    )
  )

- # Start the timer
  start_time <- Sys.time()

- # Use mclapply to run the dataProcess tasks in parallel
- num_cores <- detectCores() - 1 # Use one less than the total cores available
+ num_cores <- detectCores() - 1

- # Run data processing tasks in parallel and collect results with labels
  summarized_results <- mclapply(data_process_tasks, function(task) {
    list(label = task$label, summarized = task$result())
  }, mc.cores = num_cores)

- # Run calculateResult on each summarized result in parallel
  results_list <- mclapply(summarized_results, function(res) {
    calculateResult(res$summarized, res$label)
  }, mc.cores = num_cores)

- # Combine all results into a single data frame
  final_results <- do.call(rbind, results_list)

- # End the timer
  end_time <- Sys.time()
  total_time <- end_time - start_time

- # Display the final results and execution time
  print(final_results)
  print(paste("Total Execution Time:", total_time))
2 changes: 1 addition & 1 deletion benchmark/config.slurm
@@ -5,7 +5,7 @@
  #SBATCH --time=01:00:00          # Set the maximum run time
  #SBATCH --ntasks=1               # Number of tasks (one process)
  #SBATCH --cpus-per-task=8        # Use 8 CPU cores for the task
- #SBATCH --mem=128G               # Request 256GB of memory
+ #SBATCH --mem=256G               # Request 256GB of memory
  #SBATCH --partition=short        # Use the 'short' partition (or change as needed)

module load R-geospatial
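
One caveat on sizing the worker pool: config.slurm requests --cpus-per-task=8, while benchmark.R sizes its pool with detectCores() - 1, which reflects the node's physical cores rather than the Slurm allocation. A possible alternative in R — an assumption, not part of this commit — that respects the allocation when the script runs under sbatch:

    # Use the Slurm CPU allocation if present; fall back for interactive runs.
    slurm_cpus <- Sys.getenv("SLURM_CPUS_PER_TASK")
    num_cores <- if (nzchar(slurm_cpus)) as.integer(slurm_cpus) else parallel::detectCores() - 1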
