diff --git a/CHANGELOG.md b/CHANGELOG.md index 09cffbc..989e890 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ Initial release of nf-core/seqinspector, created with the [nf-core](https://nf-c - [#50](https://github.com/nf-core/seqinspector/pull/50) Add an optional subsampling step. - [#51](https://github.com/nf-core/seqinspector/pull/51) Add nf-test to CI. - [#63](https://github.com/nf-core/seqinspector/pull/63) Contribution guidelines added about displaying results for new tools +- [#67](https://github.com/nf-core/seqinspector/pull/67) Add FASTQ linting for early validation ### `Fixed` diff --git a/CITATIONS.md b/CITATIONS.md index 8a4e350..954c017 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -10,6 +10,8 @@ ## Pipeline tools +- [FQ](https://github.com/stjude-rust-labs/fq) + - [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) > Andrews, S. (2010). FastQC: A Quality Control Tool for High Throughput Sequence Data [Online]. diff --git a/README.md b/README.md index 6cf36dc..6f25df4 100644 --- a/README.md +++ b/README.md @@ -31,9 +31,10 @@ workflows use the "tube map" design for that. See https://nf-co.re/docs/contributing/design_guidelines#examples for examples. --> +1. Lint FASTQs ([`fq`](https://github.com/stjude-rust-labs/fq)) 1. Subsample reads ([`Seqtk`](https://github.com/lh3/seqtk)) -2. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) -3. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) +1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) +1. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) ## Usage diff --git a/conf/modules.config b/conf/modules.config index d3c597b..df0411e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -18,6 +18,15 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] + withName: 'FQ_LINT' { + ext.args = { params.fq_lint_args } + errorStrategy = { + task.exitStatus in ((130..145) + 104) ? 'retry' : + params.continue_with_lint_fail ? 'ignore' : + 'finish' + } + } + withName: SEQTK_SAMPLE { ext.args = '-s100' } diff --git a/modules.json b/modules.json index 7e57ea1..c78482c 100644 --- a/modules.json +++ b/modules.json @@ -10,6 +10,11 @@ "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", "installed_by": ["modules"] }, + "fq/lint": { + "branch": "master", + "git_sha": "a1abf90966a2a4016d3c3e41e228bfcbd4811ccc", + "installed_by": ["modules"] + }, "multiqc": { "branch": "master", "git_sha": "cf17ca47590cc578dfb47db1c2a44ef86f89976d", diff --git a/modules/nf-core/fq/lint/environment.yml b/modules/nf-core/fq/lint/environment.yml new file mode 100644 index 0000000..74b1460 --- /dev/null +++ b/modules/nf-core/fq/lint/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::fq=0.12.0 diff --git a/modules/nf-core/fq/lint/main.nf b/modules/nf-core/fq/lint/main.nf new file mode 100644 index 0000000..943314c --- /dev/null +++ b/modules/nf-core/fq/lint/main.nf @@ -0,0 +1,33 @@ +process FQ_LINT { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/fq:0.12.0--h9ee0642_0': + 'biocontainers/fq:0.12.0--h9ee0642_0' }" + + input: + tuple val(meta), path(fastq) + + output: + tuple val(meta), path("*.fq_lint.txt"), emit: lint + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + fq lint \\ + $args \\ + $fastq > ${prefix}.fq_lint.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fq: \$(echo \$(fq lint --version | sed 's/fq-lint //g')) + END_VERSIONS + """ +} diff --git a/modules/nf-core/fq/lint/meta.yml b/modules/nf-core/fq/lint/meta.yml new file mode 100644 index 0000000..7240fb5 --- /dev/null +++ b/modules/nf-core/fq/lint/meta.yml @@ -0,0 +1,43 @@ +name: "fq_lint" +description: fq lint is a FASTQ file pair validator. +keywords: + - lint + - fastq + - validate +tools: + - "fq": + description: "fq is a library to generate and validate FASTQ file pairs." + homepage: "https://github.com/stjude-rust-labs/fq" + documentation: "https://github.com/stjude-rust-labs/fq" + tool_dev_url: "https://github.com/stjude-rust-labs/fq" + licence: ["MIT"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - fastq: + type: file + description: FASTQ file list + pattern: "*.fastq{,.gz}" +output: + - lint: + - meta: + type: file + description: Lint output + pattern: "*.fq_lint.txt" + - "*.fq_lint.txt": + type: file + description: Lint output + pattern: "*.fq_lint.txt" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@adamrtalbot" +maintainers: + - "@adamrtalbot" diff --git a/modules/nf-core/fq/lint/tests/main.nf.test b/modules/nf-core/fq/lint/tests/main.nf.test new file mode 100644 index 0000000..ec2eaf8 --- /dev/null +++ b/modules/nf-core/fq/lint/tests/main.nf.test @@ -0,0 +1,63 @@ +nextflow_process { + + name "Test Process FQ_LINT" + script "../main.nf" + process "FQ_LINT" + + tag "modules" + tag "modules_nfcore" + tag "fq" + tag "fq/lint" + + test("test_fq_lint_success") { + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert process.out.lint.get(0).get(1) ==~ ".*/test.fq_lint.txt" }, + { assert path(process.out.lint.get(0).get(1)).getText().contains("fq-lint start") }, + { assert path(process.out.lint.get(0).get(1)).getText().contains("read 100 records") }, + { assert path(process.out.lint.get(0).get(1)).getText().contains("fq-lint end") }, + ) + } + + } + + test("test_fq_lint_fail") { + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + 
file(params.modules_testdata_base_path + 'genomics/prokaryotes/candidatus_portiera_aleyrodidarum/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll ( + { assert !process.success }, + { assert snapshot(process.out).match() }, + ) + } + + } + +} diff --git a/modules/nf-core/fq/lint/tests/main.nf.test.snap b/modules/nf-core/fq/lint/tests/main.nf.test.snap new file mode 100644 index 0000000..fec8e52 --- /dev/null +++ b/modules/nf-core/fq/lint/tests/main.nf.test.snap @@ -0,0 +1,25 @@ +{ + "test_fq_lint_fail": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "lint": [ + + ], + "versions": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-19T16:37:02.133847389" + } +} \ No newline at end of file diff --git a/modules/nf-core/fq/lint/tests/tags.yml b/modules/nf-core/fq/lint/tests/tags.yml new file mode 100644 index 0000000..9c9c323 --- /dev/null +++ b/modules/nf-core/fq/lint/tests/tags.yml @@ -0,0 +1,2 @@ +fq/lint: + - modules/nf-core/fq/lint/** diff --git a/nextflow.config b/nextflow.config index 38eb312..5a57625 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,6 +13,13 @@ params { // Input options input = null sample_size = 0 + + // Options + skip_linting = false + fq_lint_args = "" + continue_with_lint_fail = false + + // References genome = null fasta = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 49742b2..669f2ae 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -31,7 +31,6 @@ }, "outdir": { "type": "string", - "default": null, "format": "directory-path", "description": "The output directory where the results will be saved. 
You have to use absolute paths to storage on Cloud infrastructure.", "fa_icon": "fas fa-folder-open" @@ -50,6 +49,30 @@ } } }, + "validation_options": { + "title": "Validation options", + "type": "object", + "description": "Options for validating and screening FASTQ files.", + "default": "", + "properties": { + "skip_linting": { + "type": "boolean", + "default": false, + "description": "Skip linting of the FASTQ files before performing QC on the sequences.", + "help_text": "By default, FASTQ files are linted with FQ early in the pipeline. If a file fails validation, the pipeline terminates, preventing expensive quality control steps from being performed on the other samples. Set this parameter to skip the linting step entirely. To lint but carry on with the remaining samples when a file fails, use `--continue_with_lint_fail` instead." + }, + "fq_lint_args": { + "type": "string", + "description": "Arguments to pass to FQ lint", + "help_text": "Arguments to pass to FQ lint. This can be used to disable overly strict linting. See https://github.com/stjude-rust-labs/fq?tab=readme-ov-file#lint for more information." + }, + "continue_with_lint_fail": { + "type": "boolean", + "description": "Whether to continue with the pipeline if linting fails for a single sample.", + "help_text": "If set to true, the pipeline will continue with the remaining samples if linting fails for a single sample. If set to false, the pipeline will terminate if linting fails for a single sample." 
+ } + } + }, "reference_genome_options": { "title": "Reference genome options", "type": "object", @@ -233,6 +256,9 @@ { "$ref": "#/$defs/input_output_options" }, + { + "$ref": "#/$defs/validation_options" + }, { "$ref": "#/$defs/reference_genome_options" }, diff --git a/tests/rnaseq.main.nf.test b/tests/rnaseq.main.nf.test new file mode 100644 index 0000000..411e58c --- /dev/null +++ b/tests/rnaseq.main.nf.test @@ -0,0 +1,87 @@ +nextflow_pipeline { + + name "Test Workflow main.nf on rnaseq data" + script "../main.nf" + tag "seqinspector" + tag "PIPELINE" + + test("rnaseq data test fail linting") { + + when { + config "./rnaseq.main.nf.test.config" + params { + outdir = "$outputDir" + } + } + + then { + assertAll( + // Linting should fail! + { assert workflow.failed } + ) + } + } + + test("rnaseq data test skip linting") { + + when { + config "./rnaseq.main.nf.test.config" + params { + outdir = "$outputDir" + skip_linting = true + } + } + + then { + assertAll( + { assert workflow.success } + ) + } + } + + test("rnaseq data test ignore linting") { + + when { + config "./rnaseq.main.nf.test.config" + params { + outdir = "$outputDir" + continue_with_lint_fail = true + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot( + path("$outputDir/multiqc/global_report/multiqc_data/multiqc_citations.txt"), + path("$outputDir/multiqc/global_report/multiqc_data/multiqc_fastqc.txt"), + path("$outputDir/multiqc/global_report/multiqc_data/multiqc_general_stats.txt") + ).match() // compare with the stored snapshot; commit the generated tests/rnaseq.main.nf.test.snap + }, + ) + } + } + + test("rnaseq data test add args to fq/lint") { + + when { + config "./rnaseq.main.nf.test.config" + params { + outdir = "$outputDir" + fq_lint_args = "--disable-validator P001" + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot( + path("$outputDir/multiqc/global_report/multiqc_data/multiqc_citations.txt"), + path("$outputDir/multiqc/global_report/multiqc_data/multiqc_fastqc.txt"), + 
path("$outputDir/multiqc/global_report/multiqc_data/multiqc_general_stats.txt") + ).match() // compare with the stored snapshot; commit the generated tests/rnaseq.main.nf.test.snap + }, + ) + } + } +} diff --git a/tests/rnaseq.main.nf.test.config b/tests/rnaseq.main.nf.test.config new file mode 100644 index 0000000..f85acb5 --- /dev/null +++ b/tests/rnaseq.main.nf.test.config @@ -0,0 +1,8 @@ +// Load the basic test config +includeConfig 'nextflow.config' + +// Load the correct samplesheet for that test +params { + input = params.pipelines_testdata_base_path + '626c8fab639062eade4b10747e919341cbf9b41a/samplesheet/v3.10/samplesheet_test.csv' + +} diff --git a/workflows/seqinspector.nf b/workflows/seqinspector.nf index 7a2dfae..88fb06c 100644 --- a/workflows/seqinspector.nf +++ b/workflows/seqinspector.nf @@ -5,6 +5,7 @@ */ include { SEQTK_SAMPLE } from '../modules/nf-core/seqtk/sample/main' +include { FQ_LINT } from '../modules/nf-core/fq/lint/main' include { FASTQC } from '../modules/nf-core/fastqc/main' include { MULTIQC as MULTIQC_GLOBAL } from '../modules/nf-core/multiqc/main' @@ -32,6 +33,24 @@ workflow SEQINSPECTOR { ch_multiqc_extra_files = Channel.empty() ch_multiqc_reports = Channel.empty() + // + // MODULE: Run FQ_LINT to catch early errors + // + if ( !params.skip_linting ) { + FQ_LINT ( + ch_samplesheet + ) + ch_versions = ch_versions.mix(FQ_LINT.out.versions.first()) + // This catches all FASTQs that pass linting + // If you use an error strategy that allows FQ_LINT to fail, + // only valid FASTQ files will be passed to the next module + ch_samplesheet = FQ_LINT.out.lint + .join(ch_samplesheet) + .map { meta, fq_lint, reads -> + [meta, reads] + } + } + // // MODULE: Run Seqtk sample to perform subsampling //