Skip to content

Commit

Permalink
Merge pull request #2 from eipm/bio-330-star-storage-fix
Browse files Browse the repository at this point in the history
Bio 330 star storage fix
  • Loading branch information
asboner authored Oct 27, 2020
2 parents 0845d24 + 3e7d8cf commit eace820
Show file tree
Hide file tree
Showing 6 changed files with 58 additions and 56 deletions.
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,8 @@ COPY --from=bioinformatics_base ${star_dir}/source/STAR ${star_dir}/source/STAR
#===========================#
# Installing tools #
#===========================#
RUN mkdir -p /scripts /resources /results
RUN chmod ugo+wx /results
RUN mkdir -p /scripts /resources /results /STAR_tmp
RUN chmod ugo+wx /results /STAR_tmp
COPY ERVmapping.sh /scripts
COPY templates/ERValign.sh /scripts
COPY templates/ERVcount.sh /scripts
Expand Down
24 changes: 13 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,7 @@ This option can only have 3 values: { `ALL`, `STAR`, `BED` }:
* `STAR` to only perform the alignment;
* `BED` to only run the ERV quantification.

### Optional parameters (recommended)

### <a id='optparam'></a>Optional parameters (*recommended*)
There are a few parameters that can be added to the ERVmap image to make the process more efficient.

* `--cpus 20`: if you have a multi-core system (and you should have one), you can specify the number of CPUs to use (e.g. 20);
Expand All @@ -86,22 +85,25 @@ There are also other parameters from Docker that should be included before `ervm
## Nextflow version

To run this pipeline using [Nextflow](https://www.nextflow.io/), simply run the following:
`nextflow -C nextflow.config run main.nf`
`nextflow -C nextflow.config run main.nf`
where `nextflow.config` includes the minimum set of parameters to run ERVmap within the Docker container. Specifically:

```bash
params {
inputDir='' # external path of the input data
outputDir='' # external path of the output results
localOutDir='' # internal path of the results
outPrefix='test.' # [optional] it defines the prefix for the results
cpus=1 # Number of cpus/threads to use for the alignment
limitMemory=32000000 # memory limit for samtools
debug='off' # either [on|off]
genome='/path/to/genome' # external path to the indexed genome for the STAR aligner
inputDir='path/to/input/folder' # external path of the input data
inputPattern="*{1,2}.fastq.gz" # pattern to search for input FASTQ files
outputDir='/path/to/output/folder' # external path of the output results
starTmpDir='/path/to/STAR/temp/folder' # external path of the STAR aligner temporary folder. REQUIRED
localOutDir='.' # internal path of the results
cpus=20 # Number of cpus/threads to use for the alignment
limitMemory=1850861158 # memory limit for STAR
debug='off' # either [on|off]
}
```
**NOTE:** Adjust the memory settings of the docker container if needed, but recall that STAR requires about 32G of RAM (see [Optional Parameters](#optparam)).

**NOTE**: another critical parameter is the location of the indexed genome. This information must be included in the config as `containerOptions = '--memory 35G --memory-swap 100G -v /path/to/genome:/genome:ro'` (replace `/path/to/genome` with the external location of the genome)
**NOTE:** If you need to keep the generated BAM files, please make sure to make a copy. By default, the BAM files remain in the Nextflow `work` folder and only symbolic links are available in `outputDir`. Cleaning up the `work` folder, e.g. by running `nextflow clean`, will remove the BAM files. The ERVmap results are copied into `outputDir` and thus are permanent.

----

Expand Down
51 changes: 28 additions & 23 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -3,50 +3,56 @@

// check parameters
if (!params.inputDir) {
exit 1, "inputDir parameter is missing."
exit 1, 'inputDir parameter is missing.'
}
if (!new File(params.inputDir).exists()) {
exit 1, "The input folder does not exists."+params.inputDir+"\n"
}
if (!params.outputDir) {
exit 1, "outputDir parameter is missing."
}
if ( !new File(params.inputDir).exists()) {
exit 1, "The input folder does not exists."+params.inputDir+"\n"
if (!params.starTmpDir) {
exit 1, "starTmpDir parameter is missing."
}

if (!new File(params.starTmpDir).exists()) {
exit 1, 'The STAR temporary folder does not exists. ('+params.starTmpDir+')\n'
}
if (!params.outPrefix) {
exit 1, "Output prefix parameter is missing."
if (!params.localOutputDir) {
params.localOutputDir='bam'
}
if (!params.debug) {
exit 1, "Debug prefix parameter is missing."
}
if (!params.localOutDir) {
params.localOutDir='bam'
}

pairFiles_ch = Channel.fromFilePairs( params.inputDir+"/*{1,2}.fastq.gz", size: 2, checkIfExists: true )

pairFiles_ch = Channel.fromFilePairs( params.inputDir+"/"+params.inputPattern, size: 2, checkIfExists: true )


process ERValign {
tag "${sample}"

// executor configuration
time '8h'
// memory '35.GB'
memory '35 GB'
scratch true
cpus params.cpus
storeDir params.outputDir
publishDir params.outputDir

// other configuration
echo true
errorStrategy 'terminate'

input:
val(outPrefix) from params.outPrefix
val(localOutDir) from params.localOutDir
val(limitMemory) from params.limitMemory
val(debug) from params.debug
tuple val(sample), file(reads) from pairFiles_ch
tuple val(sample), file (reads) from pairFiles_ch
val (localOutputDir) from params.localOutputDir
val (limitMemory) from params.limitMemory
val (debug) from params.debug

output:
path ( "${localOutDir}/${outPrefix}Aligned.sortedByCoord.out.bam" ) into bam_ch
path ( "${localOutDir}/${outPrefix}Aligned.sortedByCoord.out.bam.bai" ) into bai_ch
path ("${localOutputDir}/${sample}.Aligned.sortedByCoord.out.bam") into bam_ch
path ("${localOutputDir}/${sample}.Aligned.sortedByCoord.out.bam.bai") into bai_ch
val (sample) into prefix_ch

shell:
template 'ERValign.sh'
Expand All @@ -64,20 +70,19 @@ process ERVcount {
// other configuration
echo true
errorStrategy 'terminate'
mode = 'BED'
stageInMode 'symlink'

input:
val(outPrefix) from params.outPrefix
val(debug) from params.debug
val (sample) from prefix_ch
val (debug) from params.debug
path (bam) from bam_ch
path (bai) from bai_ch

output:
path (params.outPrefix+'ERVresults.txt') into final_results_ch
path ("${sample}"+'.ERVresults.txt') into final_results_ch

shell:
template "ERVcount.sh"
template 'ERVcount.sh'
}

// ~~~~~~~~~~~~~~~ PIPELINE COMPLETION EVENTS ~~~~~~~~~~~~~~~~~~~ //
Expand Down
16 changes: 9 additions & 7 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,21 @@ docker {
}

params {
inputDir=''
outputDir=''
localOutDir=''
outPrefix='test.'
cpus=1
limitMemory=32000000
genome='/path/to/genome'
inputDir='path/to/input/folder'
inputPattern="*{1,2}.fastq.gz"
outputDir='/path/to/output/folder'
starTmpDir='/path/to/STAR/temp/folder'
localOutDir='.'
cpus=3
limitMemory=1850861158
debug='off'
}

process {
withName: ERValign {
container = 'eipm/ervmap:latest'
containerOptions = '--memory 35G --memory-swap 100G -v /path/to/genome:/genome:ro'
containerOptions = "--memory 35G --memory-swap 100G -v ${params.genome}:/genome:ro -v ${params.starTmpDir}:/STAR_tmp"
}

withName: ERVcount {
Expand Down
13 changes: 5 additions & 8 deletions templates/ERValign.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,12 @@ logMsg() {
fi
}

function usage() {
echo "Usage: ERValign.sh <-r1|--read1> SAMPLE_1.fastq.gz <-r2|--read2> SAMPLE_1.fastq.gz [-o|--output] results/SAMPLE [-c|--cpus] Ncpus [-l|--limit-ram] 35129075129 [-d|--debug {off|on}]\nA genome folder should be present in /genome"
}

# initializing parameters for STAR
READS="!{reads}"
CPUS=!{task.cpus}
LIMIT_RAM=!{limitMemory}
OUT_PREFIX="!{outPrefix}"
LOCAL_OUTDIR="!{localOutDir}"
OUT_PREFIX="!{sample}."
LOCAL_OUTDIR="!{localOutputDir}"

# checking the prefix of the output BAM
if [ -z ${OUT_PREFIX+x} ];then
Expand All @@ -46,7 +42,8 @@ if [ -z ${LIMIT_RAM+x} ];then export LIMIT_RAM=35129075129;fi
[ -e "/genome/genomeParameters.txt" ] || logMsg "ERROR" "The indexed genome cannot be found. Check that it is present and you have read permissions."

logMsg "DEBUG" "OUT_PREFIX:($OUT_PREFIX)"
logMsg "DEBUG" "Reads: ($READS)"l
logMsg "DEBUG" "Local OutDir: $(pwd)/$LOCAL_OUTDIR"
logMsg "DEBUG" "Reads: ($READS)"
logMsg "DEBUG" "CPUs:($CPUS)"
logMsg "DEBUG" "Limit RAM:($LIMIT_RAM)"

Expand All @@ -55,7 +52,7 @@ logMsg "INFO" "-------- START ERValign ---------"
BAM="$LOCAL_OUTDIR/$OUT_PREFIX""Aligned.sortedByCoord.out.bam"

logMsg "INFO" "---- Alignment ----"
STAR --genomeDir /genome --runThreadN $CPUS --outSAMtype BAM SortedByCoordinate --limitBAMsortRAM $LIMIT_RAM --outFilterMultimapNmax 1 --outFilterMismatchNmax 999 --outFilterMismatchNoverLmax 0.02 --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --readFilesIn $READS --readFilesCommand zcat --outFileNamePrefix $LOCAL_OUTDIR/$OUT_PREFIX
STAR --genomeDir /genome --runThreadN $CPUS --outSAMtype BAM SortedByCoordinate --limitBAMsortRAM $LIMIT_RAM --outFilterMultimapNmax 1 --outFilterMismatchNmax 999 --outFilterMismatchNoverLmax 0.02 --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --readFilesIn $READS --readFilesCommand zcat --outFileNamePrefix $LOCAL_OUTDIR/$OUT_PREFIX --outTmpDir /STAR_tmp/$OUT_PREFIX""tmp
[ $? == 0 ] || logMsg "ERROR" "The alignment didn't complete succesfully. Check the logs."

logMsg "INFO" "---- Alignment Complete ----"
Expand Down
6 changes: 1 addition & 5 deletions templates/ERVcount.sh
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
#!/bin/bash

function usage() {
echo "Usage: ERVcount.sh <-b|--bam> Aligned.bam [-o|--output] results/SAMPLE [-d|--debug {off|on}]"
}

BAM="!{bam}"
_DEBUG="!{debug}"

Expand All @@ -28,7 +24,7 @@ logMsg() {
fi
}

OUT_PREFIX=!{outPrefix}
OUT_PREFIX="!{sample}."
if [ -z ${OUT_PREFIX+x} ];then
OUT_PREFIX="$RANDOM""_"
logMsg "WARN" "This prefix will be used as output: $OUT_PREFIX"
Expand Down

0 comments on commit eace820

Please sign in to comment.