Merge pull request #2 from eipm/bio-330-star-storage-fix

Bio 330 star storage fix
eipm · Oct 27, 2020 · eace820 · eace820
2 parents 0845d24 + 3e7d8cf
commit eace820
Show file tree

Hide file tree

Showing 6 changed files with 58 additions and 56 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -70,8 +70,8 @@ COPY --from=bioinformatics_base ${star_dir}/source/STAR ${star_dir}/source/STAR
 #===========================#
 # Installing tools          #
 #===========================#
-RUN mkdir -p /scripts /resources /results
-RUN chmod ugo+wx /results
+RUN mkdir -p /scripts /resources /results /STAR_tmp
+RUN chmod ugo+wx /results /STAR_tmp
 COPY ERVmapping.sh /scripts
 COPY templates/ERValign.sh /scripts
 COPY templates/ERVcount.sh /scripts

diff --git a/README.md b/README.md
@@ -68,8 +68,7 @@ This option can only have 3 values: { `ALL`, `STAR`, `BED` }:
 * `STAR` to only perform the alignment;
 * `BED` to only run the ERV quantification.
 
-### Optional parameters (recommended)
-
+### <a id='optparam'></a>Optional parameters (*recommended*) 
 There are a few parameters that can be added to the ERVmap image to make the process more efficient.
 
 * `--cpus 20`: if you have a multi-core system (and you should have one), you can specify the number of CPUs to use (e.g. 20);
@@ -86,22 +85,25 @@ There are also other parameters from Docker that should be included before `ervm
 ## Nextflow version
 
 To run this pipeline using [Nextflow](https://www.nextflow.io/), simply run the following:
-`nextflow -C nextflow.config  run main.nf`
+`nextflow -C nextflow.config run main.nf`
 where `nextflow.config` includes the minimum set of parameters to run ERVmap within the docker container. Specifically:
 
 ```bash
 params {
-    inputDir='' # external path of the input data
-    outputDir='' # external path of the output results
-    localOutDir='' # internal path of the results
-    outPrefix='test.' # [optional] it defines the prefix for the results
-    cpus=1 # Number of cpus/threads to use for the alignment
-    limitMemory=32000000 # memory limit for samtools
-    debug='off' # either [on|off]
+    genome='/path/to/genome'               # external path to the indexed genome for the STAR aligner
+    inputDir='path/to/input/folder'        # external path of the input data
+    inputPattern="*{1,2}.fastq.gz"         # pattern to search for input FASTQ files
+    outputDir='/path/to/output/folder'     # external path of the output results
+    starTmpDir='/path/to/STAR/temp/folder' # external path of the STAR aligner temporary folder. REQUIRED
+    localOutputDir='.'                     # internal path of the results
+    cpus=20                                # Number of cpus/threads to use for the alignment 
+    limitMemory=1850861158                 # memory limit for STAR
+    debug='off'                            # either [on|off] 
 }
 ```
+**NOTE:** Adjust the memory settings of the docker container if needed, but recall that STAR requires about 32G of RAM (see [Optional Parameters](#optparam)).
 
-**NOTE**: another critical parameter is the location of the indexed genome. This information must be included in the config as `containerOptions = '--memory 35G --memory-swap 100G -v /path/to/genome:/genome:ro'` (replace `/path/to/genome` with the external location of the genome)
+**NOTE:** If you need to keep the generated BAM files, please make sure to make a copy. By default, the BAMs remain in the Nextflow `work` folder and only symbolic links are available in `outputDir`. Cleaning up the `work` folder, e.g. by running `nextflow clean`, removes the BAM files. The ERVmap results are copied into `outputDir` and thus are permanent.
 
 ----
 

diff --git a/main.nf b/main.nf
@@ -3,50 +3,56 @@
 
 // check parameters
 if (!params.inputDir) {
-    exit 1, "inputDir parameter is missing."
+    exit 1, 'inputDir parameter is missing.'
+}
+if (!new File(params.inputDir).exists()) {
+    exit 1, "The input folder does not exists."+params.inputDir+"\n"
 }
 if (!params.outputDir) {
     exit 1, "outputDir parameter is missing."
 }
-if ( !new File(params.inputDir).exists()) {
-    exit 1, "The input folder does not exists."+params.inputDir+"\n"
+if (!params.starTmpDir) {
+    exit 1, "starTmpDir parameter is missing."
+}
+
+if (!new File(params.starTmpDir).exists()) {
+    exit 1, 'The STAR temporary folder does not exists. ('+params.starTmpDir+')\n'
 }
-if (!params.outPrefix) {
-    exit 1, "Output prefix parameter is missing."
+if (!params.localOutputDir) {
+    params.localOutputDir='bam'
 }
 if (!params.debug) {
     exit 1, "Debug prefix parameter is missing."
 }
-if (!params.localOutDir) {
-    params.localOutDir='bam'
-}
 
-pairFiles_ch = Channel.fromFilePairs( params.inputDir+"/*{1,2}.fastq.gz", size: 2, checkIfExists: true )
+
+pairFiles_ch = Channel.fromFilePairs( params.inputDir+"/"+params.inputPattern, size: 2, checkIfExists: true )
+
 
 process ERValign {
     tag "${sample}"
 
     // executor configuration
     time '8h'
-    // memory '35.GB'
+    memory '35 GB'
     scratch true
     cpus params.cpus
-    storeDir params.outputDir
+    publishDir params.outputDir
 
     // other configuration
     echo true
     errorStrategy 'terminate'
 
     input:
-    val(outPrefix) from params.outPrefix
-    val(localOutDir) from params.localOutDir
-    val(limitMemory) from params.limitMemory
-    val(debug) from params.debug
-    tuple val(sample), file(reads) from pairFiles_ch
+    tuple val(sample), file (reads) from pairFiles_ch
+    val (localOutputDir) from params.localOutputDir
+    val (limitMemory) from params.limitMemory
+    val (debug) from params.debug
 
     output:
-    path ( "${localOutDir}/${outPrefix}Aligned.sortedByCoord.out.bam" ) into bam_ch
-    path ( "${localOutDir}/${outPrefix}Aligned.sortedByCoord.out.bam.bai" ) into bai_ch
+    path ("${localOutputDir}/${sample}.Aligned.sortedByCoord.out.bam") into bam_ch
+    path ("${localOutputDir}/${sample}.Aligned.sortedByCoord.out.bam.bai") into bai_ch
+    val (sample) into prefix_ch
 
     shell:
     template 'ERValign.sh'
@@ -64,20 +70,19 @@ process ERVcount {
     // other configuration
     echo true
     errorStrategy 'terminate'
-    mode = 'BED'
     stageInMode 'symlink'
 
     input:
-    val(outPrefix) from params.outPrefix
-    val(debug) from params.debug
+    val (sample) from prefix_ch
+    val (debug) from params.debug
     path (bam) from bam_ch
     path (bai) from bai_ch
 
     output: 
-    path (params.outPrefix+'ERVresults.txt') into final_results_ch
+    path ("${sample}"+'.ERVresults.txt') into final_results_ch
 
     shell:
-    template "ERVcount.sh"
+    template 'ERVcount.sh'
 }
 
 // ~~~~~~~~~~~~~~~ PIPELINE COMPLETION EVENTS ~~~~~~~~~~~~~~~~~~~ //

diff --git a/nextflow.config b/nextflow.config
@@ -9,19 +9,21 @@ docker {
 }
 
 params {
-    inputDir=''
-    outputDir=''
-    localOutDir=''
-    outPrefix='test.'
-    cpus=1
-    limitMemory=32000000
+    genome='/path/to/genome'                 // host path of the STAR genome index; mounted read-only at /genome
+    inputDir='path/to/input/folder'          // host folder searched for the paired FASTQ files
+    inputPattern="*{1,2}.fastq.gz"           // glob appended to inputDir when pairing reads
+    outputDir='/path/to/output/folder'       // publishDir target of the ERValign process
+    starTmpDir='/path/to/STAR/temp/folder'   // host folder mounted at /STAR_tmp for STAR --outTmpDir
+    localOutputDir='.'                       // key must be localOutputDir: that is the name main.nf reads
+    cpus=3                                   // threads handed to STAR via task.cpus
+    limitMemory=1850861158                   // value passed to STAR --limitBAMsortRAM
     debug='off'
 }
 
 process {
     withName: ERValign {
         container = 'eipm/ervmap:latest'
-        containerOptions = '--memory 35G --memory-swap 100G -v /path/to/genome:/genome:ro'
+        containerOptions = "--memory 35G --memory-swap 100G -v ${params.genome}:/genome:ro -v ${params.starTmpDir}:/STAR_tmp"
     }
 
     withName: ERVcount {

diff --git a/templates/ERValign.sh b/templates/ERValign.sh
@@ -24,16 +24,12 @@ logMsg() {
     fi
 } 
 
-function usage() {
-    echo "Usage: ERValign.sh <-r1|--read1> SAMPLE_1.fastq.gz <-r2|--read2> SAMPLE_1.fastq.gz [-o|--output] results/SAMPLE [-c|--cpus] Ncpus [-l|--limit-ram] 35129075129 [-d|--debug {off|on}]\nA genome folder should be present in /genome"
-}
-
 # initializing parameters for STAR
 READS="!{reads}"
 CPUS=!{task.cpus}
 LIMIT_RAM=!{limitMemory}
-OUT_PREFIX="!{outPrefix}"
-LOCAL_OUTDIR="!{localOutDir}"
+OUT_PREFIX="!{sample}."
+LOCAL_OUTDIR="!{localOutputDir}"
 
 # checking the prefix of the output BAM
 if [ -z ${OUT_PREFIX+x} ];then
@@ -46,7 +42,8 @@ if [ -z ${LIMIT_RAM+x} ];then export LIMIT_RAM=35129075129;fi
 [ -e "/genome/genomeParameters.txt" ] || logMsg "ERROR" "The indexed genome cannot be found. Check that it is present and you have read permissions."
 
 logMsg "DEBUG" "OUT_PREFIX:($OUT_PREFIX)"
-logMsg "DEBUG" "Reads: ($READS)"l
+logMsg "DEBUG" "Local OutDir: $(pwd)/$LOCAL_OUTDIR"
+logMsg "DEBUG" "Reads: ($READS)"
 logMsg "DEBUG" "CPUs:($CPUS)"
 logMsg "DEBUG" "Limit RAM:($LIMIT_RAM)"
 
@@ -55,7 +52,7 @@ logMsg "INFO" "-------- START ERValign ---------"
 BAM="$LOCAL_OUTDIR/$OUT_PREFIX""Aligned.sortedByCoord.out.bam"
 
 logMsg "INFO" "---- Alignment ----"
-STAR --genomeDir /genome --runThreadN $CPUS --outSAMtype BAM SortedByCoordinate --limitBAMsortRAM $LIMIT_RAM --outFilterMultimapNmax 1 --outFilterMismatchNmax 999 --outFilterMismatchNoverLmax 0.02 --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --readFilesIn $READS --readFilesCommand zcat --outFileNamePrefix $LOCAL_OUTDIR/$OUT_PREFIX
+STAR --genomeDir /genome --runThreadN $CPUS --outSAMtype BAM SortedByCoordinate --limitBAMsortRAM $LIMIT_RAM --outFilterMultimapNmax 1 --outFilterMismatchNmax 999 --outFilterMismatchNoverLmax 0.02 --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --readFilesIn $READS --readFilesCommand zcat --outFileNamePrefix $LOCAL_OUTDIR/$OUT_PREFIX --outTmpDir /STAR_tmp/$OUT_PREFIX""tmp
 [ $? == 0 ] || logMsg  "ERROR" "The alignment didn't complete succesfully. Check the logs."
 
 logMsg "INFO" "---- Alignment Complete ----"

diff --git a/templates/ERVcount.sh b/templates/ERVcount.sh
@@ -1,9 +1,5 @@
 #!/bin/bash
 
-function usage() {
-    echo "Usage: ERVcount.sh <-b|--bam> Aligned.bam [-o|--output] results/SAMPLE [-d|--debug {off|on}]"
-}
-
 BAM="!{bam}"
 _DEBUG="!{debug}"
 
@@ -28,7 +24,7 @@ logMsg() {
     fi
 }  
 
-OUT_PREFIX=!{outPrefix}
+OUT_PREFIX="!{sample}."
 if [ -z ${OUT_PREFIX+x} ];then
     OUT_PREFIX="$RANDOM""_"
     logMsg "WARN" "This prefix will be used as output: $OUT_PREFIX"