Commit

Feedback changes added

Anshuman Raina committed Nov 12, 2024
1 parent 8237878 commit 054ccbb

Showing 3 changed files with 12 additions and 40 deletions.
9 changes: 5 additions & 4 deletions .github/workflows/benchmark.yml
@@ -16,18 +16,19 @@ jobs:
      - name: Set Up SSH Access
        run: |
          mkdir -p ~/.ssh
-         echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_rsa
+         touch ~/.ssh/id_rsa
          chmod 600 ~/.ssh/id_rsa
-         ssh-keyscan -H login-00.discovery.neu.edu >> ~/.ssh/known_hosts
+         echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_rsa
+         ssh-keyscan -H login-00.discovery.neu.edu >> ~/.ssh/known_hosts || exit 1
      - name: Transfer Files to HPC
        run: |
-         scp benchmark/benchmark.R benchmark/config.slurm raina.ans@login-00.discovery.neu.edu:/home/raina.ans/R
+         scp benchmark/benchmark.R benchmark/config.slurm raina.ans@login-00.discovery.neu.edu:/work/VitekLab/Projects/Benchmarking || exit 1
      - name: Submit Slurm Job and Capture Job ID
        id: submit_job
        run: |
-         ssh raina.ans@login-00.discovery.neu.edu "cd R && sbatch config.slurm" | tee slurm_job_id.txt
+         ssh raina.ans@login-00.discovery.neu.edu "cd R && sbatch config.slurm" | tee slurm_job_id.txt
          slurm_job_id=$(grep -oP '\d+' slurm_job_id.txt)
          echo "Slurm Job ID is $slurm_job_id"
          echo "slurm_job_id=$slurm_job_id" >> $GITHUB_ENV
41 changes: 6 additions & 35 deletions benchmark/benchmark.R
@@ -26,27 +26,10 @@ calculateResult <- function(summarized, label){
cat("Expected Log Change Ecoli:", ecoli_median, "\n")
cat("Expected Log Change Yeast:", yeast_median, "\n")

#calculate SD and mean


# Kept the code for Individual Boxplots

# boxplot(human_comparisonResult$log2FC,
# main = "Boxplot of log2FC for Human",
# ylab = "log2FC",
# col = "lightblue")
#
#
boxplot(ecoli_comparisonResult$log2FC,
main = "Boxplot of log2FC for E. coli",
ylab = "log2FC",
col = "lightgreen")
#
# boxplot(yeast_comparisonResult$log2FC,
# main = "Boxplot of log2FC for Yeast",
# ylab = "log2FC",
# col = "lightpink")

combined_data <- list(
Human = human_comparisonResult$log2FC,
Ecoli = ecoli_comparisonResult$log2FC,
@@ -57,14 +40,14 @@ calculateResult <- function(summarized, label){
  unique_ecoli_proteins <- unique(ecoli_comparisonResult$Protein)
  unique_yeast_proteins <- unique(yeast_comparisonResult$Protein)

- all_proteins <- c(union(unique_ecoli_proteins, unique_yeast_proteins)) # find out the significant proteins in FragData
+ all_proteins <- c(union(unique_ecoli_proteins, unique_yeast_proteins))

  extracted_proteins <- sapply(all_proteins, function(x) {
-   split_string <- strsplit(x, "\\|")[[1]] # Split the string by '|'
+   split_string <- strsplit(x, "\\|")[[1]]
    if (length(split_string) >= 2) {
-     return(split_string[2]) # Return the second element
+     return(split_string[2])
    } else {
-     return(NA) # Return NA if there's no second element
+     return(NA)
    }
  })

@@ -114,8 +97,7 @@

  start_time <- Sys.time()

- # Use fread directly to read the CSV
- fragpipe_raw = data.table::fread("..//data//FragPipeMsStatsBenchmarking.csv")
+ fragpipe_raw = data.table::fread("/work/VitekLab/Data/MS/Benchmarking/DDA-Puyvelde2022/DDA-Puyvelde2022-HYE5600735_LFQ/FragPipe/TOP0/MSstats.csv")

  head(fragpipe_raw)

@@ -127,11 +109,9 @@ fragpipe_raw$BioReplicate = unlist(lapply(fragpipe_raw$Run, function(x){
    paste(str_split(x, "\\_")[[1]][4:7], collapse="_")
  }))

- # Convert to MSstats format
  msstats_format = MSstatsConvert::FragPipetoMSstatsFormat(fragpipe_raw, use_log_file = FALSE)


- # Define the tasks with descriptive labels
  data_process_tasks <- list(
    list(
      label = "Data process with Normalized Data",
@@ -151,29 +131,20 @@ data_process_tasks <- list(
    )
  )

- # Start the timer
  start_time <- Sys.time()

- # Use mclapply to run the dataProcess tasks in parallel
- num_cores <- detectCores() - 1 # Use one less than the total cores available
+ num_cores <- detectCores() - 1

- # Run data processing tasks in parallel and collect results with labels
  summarized_results <- mclapply(data_process_tasks, function(task) {
    list(label = task$label, summarized = task$result())
  }, mc.cores = num_cores)

- # Run calculateResult on each summarized result in parallel
  results_list <- mclapply(summarized_results, function(res) {
    calculateResult(res$summarized, res$label)
  }, mc.cores = num_cores)

- # Combine all results into a single data frame
  final_results <- do.call(rbind, results_list)

- # End the timer
  end_time <- Sys.time()
  total_time <- end_time - start_time

- # Display the final results and execution time
  print(final_results)
  print(paste("Total Execution Time:", total_time))
2 changes: 1 addition & 1 deletion benchmark/config.slurm
@@ -5,7 +5,7 @@
  #SBATCH --time=01:00:00          # Set the maximum run time
  #SBATCH --ntasks=1               # Number of tasks (one process)
  #SBATCH --cpus-per-task=8        # Use 8 CPU cores for the task
- #SBATCH --mem=128G               # Request 256GB of memory
+ #SBATCH --mem=256G               # Request 256GB of memory
  #SBATCH --partition=short        # Use the 'short' partition (or change as needed)

module load R-geospatial
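
One caveat on sizing the worker pool: config.slurm requests --cpus-per-task=8, while benchmark.R sizes its pool with detectCores() - 1, which reflects the node's physical cores rather than the Slurm allocation. A possible alternative in R — an assumption, not part of this commit — that respects the allocation when the script runs under sbatch:

    # Use the Slurm CPU allocation if present; fall back for interactive runs.
    slurm_cpus <- Sys.getenv("SLURM_CPUS_PER_TASK")
    num_cores <- if (nzchar(slurm_cpus)) as.integer(slurm_cpus) else parallel::detectCores() - 1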
