From ae434815ac53df40d229299857e8475f19dbae73 Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Wed, 12 Jun 2024 16:34:06 -0700 Subject: [PATCH] Add empty fastqs to test dataset (#25) * Add empty fastqs to test dataset * handle failed qualimap * Print results of check_outputs * bump version --- .github/scripts/check_outputs.py | 10 ++- .github/scripts/run_pipeline.sh | 22 +++-- .github/scripts/run_tests_locally.sh | 9 ++ .github/scripts/simulate_reads.sh | 5 ++ bin/qualimap_bamqc_genome_results_to_csv.py | 94 +++++++++++++-------- modules/alignment_variants.nf | 33 ++++++-- nextflow.config | 2 +- 7 files changed, 123 insertions(+), 52 deletions(-) create mode 100755 .github/scripts/run_tests_locally.sh diff --git a/.github/scripts/check_outputs.py b/.github/scripts/check_outputs.py index 88c1f55..e788996 100755 --- a/.github/scripts/check_outputs.py +++ b/.github/scripts/check_outputs.py @@ -5,6 +5,7 @@ import glob import json import os +import sys import urllib.request from jsonschema import validate @@ -94,14 +95,17 @@ def main(args): output_path = args.output with open(output_path, 'w') as f: - writer = csv.DictWriter(f, fieldnames=output_fields, extrasaction='ignore') - writer.writeheader() + file_writer = csv.DictWriter(f, fieldnames=output_fields, extrasaction='ignore') + stdout_writer = csv.DictWriter(sys.stdout, fieldnames=output_fields, extrasaction='ignore') + stdout_writer.writeheader() + file_writer.writeheader() for test in tests: if test["test_passed"]: test["test_result"] = "PASS" else: test["test_result"] = "FAIL" - writer.writerow(test) + stdout_writer.writerow(test) + file_writer.writerow(test) for test in tests: if not test['test_passed']: diff --git a/.github/scripts/run_pipeline.sh b/.github/scripts/run_pipeline.sh index 63c7681..eb99308 100755 --- a/.github/scripts/run_pipeline.sh +++ b/.github/scripts/run_pipeline.sh @@ -2,15 +2,25 @@ set -eo pipefail -sed -i 's/cpus = 8/cpus = 4/g' nextflow.config -sed -i 's/cpus = 12/cpus = 4/g' nextflow.config -sed -i 's/cpus = 16/cpus = 4/g' nextflow.config -sed -i 's/cpus = 24/cpus = 4/g' nextflow.config -sed -i 's/cpus = 24/cpus = 4/g' nextflow.config -sed -i "s/memory = '36G'/memory = '2G'/g" nextflow.config +# Check for a sign that we're in the GitHub Actions environment. +# Prevents these settings from being applied in other environments. +if [ -z "${GITHUB_ACTIONS}" ]; then + echo "Not running in GitHub Actions environment." +else + echo "Running in GitHub Actions environment." + echo "Adjusting nextflow configuration for GitHub Actions environment." + sed -i 's/cpus = 8/cpus = 4/g' nextflow.config + sed -i 's/cpus = 12/cpus = 4/g' nextflow.config + sed -i 's/cpus = 16/cpus = 4/g' nextflow.config + sed -i 's/cpus = 24/cpus = 4/g' nextflow.config + sed -i 's/cpus = 24/cpus = 4/g' nextflow.config + sed -i "s/memory = '36G'/memory = '2G'/g" nextflow.config +fi + nextflow run main.nf \ -profile conda \ + -resume \ --cache ${HOME}/.conda/envs \ --fastq_input .github/data/fastq \ --outdir .github/data/test_output \ diff --git a/.github/scripts/run_tests_locally.sh b/.github/scripts/run_tests_locally.sh new file mode 100755 index 0000000..0d8863e --- /dev/null +++ b/.github/scripts/run_tests_locally.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +.github/scripts/download_refs.sh + +.github/scripts/simulate_reads.sh + +.github/scripts/run_pipeline.sh + +.github/scripts/check_outputs.sh diff --git a/.github/scripts/simulate_reads.sh b/.github/scripts/simulate_reads.sh index 7001cce..7c6daca 100755 --- a/.github/scripts/simulate_reads.sh +++ b/.github/scripts/simulate_reads.sh @@ -33,3 +33,8 @@ while IFS=',' read -r sample_id assembly; do done < .github/data/reads_to_simulate.csv +touch .github/data/fastq/empty_R1.fastq +touch .github/data/fastq/empty_R2.fastq + +gzip -f .github/data/fastq/empty_R1.fastq +gzip -f .github/data/fastq/empty_R2.fastq diff --git a/bin/qualimap_bamqc_genome_results_to_csv.py b/bin/qualimap_bamqc_genome_results_to_csv.py index d52ec42..51a3e48 100755 --- a/bin/qualimap_bamqc_genome_results_to_csv.py +++ b/bin/qualimap_bamqc_genome_results_to_csv.py @@ -5,78 +5,86 @@ import json import sys -def main(args): - with open(args.qualimap_bamqc_genome_results, 'r') as f: - output_data = { - 'sample_id': args.sample_id, - 'read_type': args.read_type, - } + +def parse_qualimap_bamqc_genome_results(qualimap_bamqc_genome_results): + """ + Parse the Qualimap BAMQC genome results file. + + :param qualimap_bamqc_genome_results: Path to the Qualimap BAMQC genome results file + :type qualimap_bamqc_genome_results: str + :return: The parsed Qualimap BAMQC genome results + :rtype: dict + """ + qualimap_bamqc_genome_results_data = {} + with open(qualimap_bamqc_genome_results, 'r') as f: for line in f: line = line.strip() if line.startswith('number of reads'): number_of_reads = line.split('=')[1].strip().replace(',', '') - output_data['num_reads'] = int(number_of_reads) + qualimap_bamqc_genome_results_data['num_reads'] = int(number_of_reads) if line.startswith('number of mapped reads'): num_mapped_reads = line.split('=')[1].strip().split(' ')[0].replace(',', '') - output_data['num_mapped_reads'] = int(num_mapped_reads) + qualimap_bamqc_genome_results_data['num_mapped_reads'] = int(num_mapped_reads) percent_mapped_reads = line.split('=')[1].strip().split(' ')[1].strip().replace('(', '').replace(')', '').replace('%', '') - output_data['percent_mapped_reads'] = round(float(percent_mapped_reads), 2) + qualimap_bamqc_genome_results_data['percent_mapped_reads'] = round(float(percent_mapped_reads), 2) if line.startswith('number of secondary alignments'): num_secondary_alignments = int(line.split('=')[1].strip().replace(',', '')) - output_data['num_secondary_alignments'] = num_secondary_alignments + qualimap_bamqc_genome_results_data['num_secondary_alignments'] = num_secondary_alignments if line.startswith('duplication rate'): duplication_rate = line.split('=')[1].strip().replace('%', '') - output_data['duplication_rate_percent'] = round(float(duplication_rate), 2) + qualimap_bamqc_genome_results_data['duplication_rate_percent'] = round(float(duplication_rate), 2) if line.startswith('mean coverageData'): mean_coverage = line.split('=')[1].strip().strip('X').replace(',', '') - output_data['mean_depth_coverage'] = round(float(mean_coverage), 2) + qualimap_bamqc_genome_results_data['mean_depth_coverage'] = round(float(mean_coverage), 2) if line.startswith('std coverageData'): stdev_coverage = line.split('=')[1].strip().strip('X').replace(',', '') - output_data['stdev_depth_coverage'] = round(float(stdev_coverage), 2) + qualimap_bamqc_genome_results_data['stdev_depth_coverage'] = round(float(stdev_coverage), 2) if line.startswith('mean mapping quality'): mean_mapping_quality = line.split('=')[1].strip() - output_data['mean_mapping_quality'] = round(float(mean_mapping_quality), 2) + qualimap_bamqc_genome_results_data['mean_mapping_quality'] = round(float(mean_mapping_quality), 2) if line.startswith('general error rate'): general_error_rate = line.split('=')[1].strip() - output_data['error_rate'] = round(float(general_error_rate), 2) + qualimap_bamqc_genome_results_data['error_rate'] = round(float(general_error_rate), 2) if line.startswith('number of mismatches'): number_of_mismatches = line.split('=')[1].strip().replace(',', '') - output_data['number_of_mismatches'] = int(number_of_mismatches) + qualimap_bamqc_genome_results_data['number_of_mismatches'] = int(number_of_mismatches) if line.startswith('number of insertions'): number_of_insertions = line.split('=')[1].strip().replace(',', '') - output_data['number_of_insertions'] = int(number_of_insertions) + qualimap_bamqc_genome_results_data['number_of_insertions'] = int(number_of_insertions) if line.startswith('mapped reads with insertion percentage'): mapped_reads_with_insertion_percentage = line.split('=')[1].strip().replace('%', '') - output_data['mapped_reads_with_insertion_percentage'] = round(float(mapped_reads_with_insertion_percentage), 2) + qualimap_bamqc_genome_results_data['mapped_reads_with_insertion_percentage'] = round(float(mapped_reads_with_insertion_percentage), 2) if line.startswith('number of deletions'): number_of_deletions = line.split('=')[1].strip().replace(',', '') - output_data['number_of_deletions'] = int(number_of_deletions) + qualimap_bamqc_genome_results_data['number_of_deletions'] = int(number_of_deletions) if line.startswith('mapped reads with deletion percentage'): mapped_reads_with_deletion_percentage = line.split('=')[1].strip().replace('%', '') - output_data['mapped_reads_with_deletion_percentage'] = round(float(mapped_reads_with_deletion_percentage), 2) + qualimap_bamqc_genome_results_data['mapped_reads_with_deletion_percentage'] = round(float(mapped_reads_with_deletion_percentage), 2) if 'reference with a coverageData >= 5X' in line: proportion_genome_covered_over_5x = float(line.split(' ')[3].strip('%')) / 100 - output_data['proportion_genome_covered_over_5x'] = round(proportion_genome_covered_over_5x, 4) + qualimap_bamqc_genome_results_data['proportion_genome_covered_over_5x'] = round(proportion_genome_covered_over_5x, 4) if 'reference with a coverageData >= 10X' in line: proportion_genome_covered_over_10x = float(line.split(' ')[3].strip('%')) / 100 - output_data['proportion_genome_covered_over_10x'] = round(proportion_genome_covered_over_10x, 4) + qualimap_bamqc_genome_results_data['proportion_genome_covered_over_10x'] = round(proportion_genome_covered_over_10x, 4) if 'reference with a coverageData >= 20X' in line: proportion_genome_covered_over_20x = float(line.split(' ')[3].strip('%')) / 100 - output_data['proportion_genome_covered_over_20x'] = round(proportion_genome_covered_over_20x, 4) + qualimap_bamqc_genome_results_data['proportion_genome_covered_over_20x'] = round(proportion_genome_covered_over_20x, 4) if 'reference with a coverageData >= 30X' in line: proportion_genome_covered_over_30x = float(line.split(' ')[3].strip('%')) / 100 - output_data['proportion_genome_covered_over_30x'] = round(proportion_genome_covered_over_30x, 4) + qualimap_bamqc_genome_results_data['proportion_genome_covered_over_30x'] = round(proportion_genome_covered_over_30x, 4) if 'reference with a coverageData >= 40X' in line: proportion_genome_covered_over_40x = float(line.split(' ')[3].strip('%')) / 100 - output_data['proportion_genome_covered_over_40x'] = round(proportion_genome_covered_over_40x, 4) + qualimap_bamqc_genome_results_data['proportion_genome_covered_over_40x'] = round(proportion_genome_covered_over_40x, 4) if 'reference with a coverageData >= 50X' in line: proportion_genome_covered_over_50x = float(line.split(' ')[3].strip('%')) / 100 - output_data['proportion_genome_covered_over_50x'] = round(proportion_genome_covered_over_50x, 4) - + qualimap_bamqc_genome_results_data['proportion_genome_covered_over_50x'] = round(proportion_genome_covered_over_50x, 4) + + return qualimap_bamqc_genome_results_data + - output_fieldnames = [ - 'sample_id', - 'read_type', +def main(args): + + parsed_qualimap_bamqc_genome_results_fieldnames = [ 'mean_depth_coverage', 'stdev_depth_coverage', 'num_reads', @@ -99,6 +107,25 @@ def main(args): 'proportion_genome_covered_over_50x', ] + output_data = { + 'sample_id': args.sample_id, + 'read_type': args.read_type, + } + + if args.failed: + for field in parsed_qualimap_bamqc_genome_results_fieldnames: + output_data[field] = None + else: + qualimap_bamqc_genome_results_data = parse_qualimap_bamqc_genome_results(args.qualimap_bamqc_genome_results) + output_data = { + 'sample_id': args.sample_id, + 'read_type': args.read_type, + } + for field in parsed_qualimap_bamqc_genome_results_fieldnames: + output_data[field] = qualimap_bamqc_genome_results_data.get(field, None) + + output_fieldnames = ['sample_id', 'read_type'] + parsed_qualimap_bamqc_genome_results_fieldnames + writer = csv.DictWriter(sys.stdout, fieldnames=output_fieldnames, dialect='unix', extrasaction='ignore', quoting=csv.QUOTE_MINIMAL) writer.writeheader() writer.writerow(output_data) @@ -106,8 +133,9 @@ def main(args): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('qualimap_bamqc_genome_results') - parser.add_argument('-s', '--sample-id') - parser.add_argument('-t', '--read-type', choices=['short', 'long'], default='short') + parser.add_argument('-q', '--qualimap-bamqc-genome-results', type=str, help='Path to the Qualimap BAMQC genome results file. May be omitted when using the --failed flag to generate a row with all stats set to None (blank)') + parser.add_argument('-s', '--sample-id', type=str, help='Sample ID') + parser.add_argument('-t', '--read-type', choices=['short', 'long'], default='short', help='Read type') + parser.add_argument('-f', '--failed', action='store_true', help='Flag to indicate that the sample failed QC and no input data was provided') args = parser.parse_args() main(args) diff --git a/modules/alignment_variants.nf b/modules/alignment_variants.nf index 65639dc..a78e097 100644 --- a/modules/alignment_variants.nf +++ b/modules/alignment_variants.nf @@ -150,8 +150,8 @@ process qualimap_bamqc { output: tuple val(sample_id), val(short_long), path("${sample_id}_${short_long}_qualimap_alignment_qc.csv"), emit: alignment_qc - tuple val(sample_id), val(short_long), path("${sample_id}_${short_long}_qualimap_report.pdf"), emit: report - tuple val(sample_id), val(short_long), path("${sample_id}_${short_long}_qualimap_genome_results.txt"), emit: genome_results + tuple val(sample_id), val(short_long), path("${sample_id}_${short_long}_qualimap_report.pdf"), emit: report, optional: true + tuple val(sample_id), val(short_long), path("${sample_id}_${short_long}_qualimap_genome_results.txt"), emit: genome_results, optional: true tuple val(sample_id), path("${sample_id}_${short_long}_qualimap_bamqc_provenance.yml"), emit: provenance script: @@ -165,7 +165,12 @@ process qualimap_bamqc { printf -- " value: null\\n" >> ${sample_id}_${short_long}_qualimap_bamqc_provenance.yml printf -- " - parameter: --cov-hist-lim\\n" >> ${sample_id}_${short_long}_qualimap_bamqc_provenance.yml printf -- " value: ${params.qualimap_coverage_histogram_limit}\\n" >> ${sample_id}_${short_long}_qualimap_bamqc_provenance.yml - + + # Assume qualimap exits successfully + # If it fails we will re-assign the exit code + # and generate an empty qualimap alignment qc + qualimap_exit_code=0 + qualimap \ --java-mem-size=${params.qualimap_memory} \ bamqc \ @@ -176,16 +181,26 @@ process qualimap_bamqc { -nt ${task.cpus} \ -bam ${alignment[0]} \ -outformat PDF \ - --outdir ${sample_id}_${short_long}_bamqc + --outdir ${sample_id}_${short_long}_bamqc \ + || qualimap_exit_code=\$? - qualimap_bamqc_genome_results_to_csv.py \ + if [ \${qualimap_exit_code} -ne 0 ]; then + echo "Qualimap failed with exit code \${qualimap_exit_code}" + echo "Generating empty qualimap alignment qc" + qualimap_bamqc_genome_results_to_csv.py \ -s ${sample_id} \ --read-type ${short_long} \ - ${sample_id}_${short_long}_bamqc/genome_results.txt \ + --failed \ > ${sample_id}_${short_long}_qualimap_alignment_qc.csv - - cp ${sample_id}_${short_long}_bamqc/report.pdf ${sample_id}_${short_long}_qualimap_report.pdf - cp ${sample_id}_${short_long}_bamqc/genome_results.txt ${sample_id}_${short_long}_qualimap_genome_results.txt + else + qualimap_bamqc_genome_results_to_csv.py \ + -s ${sample_id} \ + --read-type ${short_long} \ + --qualimap-bamqc-genome-results ${sample_id}_${short_long}_bamqc/genome_results.txt \ + > ${sample_id}_${short_long}_qualimap_alignment_qc.csv + cp ${sample_id}_${short_long}_bamqc/report.pdf ${sample_id}_${short_long}_qualimap_report.pdf + cp ${sample_id}_${short_long}_bamqc/genome_results.txt ${sample_id}_${short_long}_qualimap_genome_results.txt + fi """ } diff --git a/nextflow.config b/nextflow.config index 4b71980..06aa433 100644 --- a/nextflow.config +++ b/nextflow.config @@ -4,7 +4,7 @@ manifest { description = 'Alignment and variant calling pipeline' mainScript = 'main.nf' nextflowVersion = '>=20.01.0' - version = '0.1.8' + version = '0.1.9' } params {