Skip to content

Commit

Permalink
Add empty fastqs to test dataset (#25)
Browse files Browse the repository at this point in the history
* Add empty fastqs to test dataset

* handle failed qualimap

* Print results of check_outputs

* bump version
  • Loading branch information
dfornika authored Jun 12, 2024
1 parent 1dda68a commit ae43481
Show file tree
Hide file tree
Showing 7 changed files with 123 additions and 52 deletions.
10 changes: 7 additions & 3 deletions .github/scripts/check_outputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import glob
import json
import os
import sys
import urllib.request

from jsonschema import validate
Expand Down Expand Up @@ -94,14 +95,17 @@ def main(args):

output_path = args.output
with open(output_path, 'w') as f:
writer = csv.DictWriter(f, fieldnames=output_fields, extrasaction='ignore')
writer.writeheader()
file_writer = csv.DictWriter(f, fieldnames=output_fields, extrasaction='ignore')
stdout_writer = csv.DictWriter(sys.stdout, fieldnames=output_fields, extrasaction='ignore')
stdout_writer.writeheader()
file_writer.writeheader()
for test in tests:
if test["test_passed"]:
test["test_result"] = "PASS"
else:
test["test_result"] = "FAIL"
writer.writerow(test)
stdout_writer.writerow(test)
file_writer.writerow(test)

for test in tests:
if not test['test_passed']:
Expand Down
22 changes: 16 additions & 6 deletions .github/scripts/run_pipeline.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,25 @@

set -eo pipefail

sed -i 's/cpus = 8/cpus = 4/g' nextflow.config
sed -i 's/cpus = 12/cpus = 4/g' nextflow.config
sed -i 's/cpus = 16/cpus = 4/g' nextflow.config
sed -i 's/cpus = 24/cpus = 4/g' nextflow.config
sed -i 's/cpus = 24/cpus = 4/g' nextflow.config
sed -i "s/memory = '36G'/memory = '2G'/g" nextflow.config
# Check for a sign that we're in the GitHub Actions environment.
# Prevents these settings from being applied in other environments.
if [ -z "${GITHUB_ACTIONS}" ]; then
echo "Not running in GitHub Actions environment."
else
echo "Running in GitHub Actions environment."
echo "Adjusting nextflow configuration for GitHub Actions environment."
sed -i 's/cpus = 8/cpus = 4/g' nextflow.config
sed -i 's/cpus = 12/cpus = 4/g' nextflow.config
sed -i 's/cpus = 16/cpus = 4/g' nextflow.config
sed -i 's/cpus = 24/cpus = 4/g' nextflow.config
sed -i 's/cpus = 24/cpus = 4/g' nextflow.config
sed -i "s/memory = '36G'/memory = '2G'/g" nextflow.config
fi


nextflow run main.nf \
-profile conda \
-resume \
--cache ${HOME}/.conda/envs \
--fastq_input .github/data/fastq \
--outdir .github/data/test_output \
Expand Down
9 changes: 9 additions & 0 deletions .github/scripts/run_tests_locally.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash

.github/scripts/download_refs.sh

.github/scripts/simulate_reads.sh

.github/scripts/run_pipeline.sh

.github/scripts/check_outputs.sh
5 changes: 5 additions & 0 deletions .github/scripts/simulate_reads.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,8 @@ while IFS=',' read -r sample_id assembly; do

done < .github/data/reads_to_simulate.csv

touch .github/data/fastq/empty_R1.fastq
touch .github/data/fastq/empty_R2.fastq

gzip -f .github/data/fastq/empty_R1.fastq
gzip -f .github/data/fastq/empty_R2.fastq
94 changes: 61 additions & 33 deletions bin/qualimap_bamqc_genome_results_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,78 +5,86 @@
import json
import sys

def main(args):
with open(args.qualimap_bamqc_genome_results, 'r') as f:
output_data = {
'sample_id': args.sample_id,
'read_type': args.read_type,
}

def parse_qualimap_bamqc_genome_results(qualimap_bamqc_genome_results):
"""
Parse the Qualimap BAMQC genome results file.
:param qualimap_bamqc_genome_results: Path to the Qualimap BAMQC genome results file
:type qualimap_bamqc_genome_results: str
:return: The parsed Qualimap BAMQC genome results
:rtype: dict
"""
qualimap_bamqc_genome_results_data = {}
with open(qualimap_bamqc_genome_results, 'r') as f:
for line in f:
line = line.strip()
if line.startswith('number of reads'):
number_of_reads = line.split('=')[1].strip().replace(',', '')
output_data['num_reads'] = int(number_of_reads)
qualimap_bamqc_genome_results_data['num_reads'] = int(number_of_reads)
if line.startswith('number of mapped reads'):
num_mapped_reads = line.split('=')[1].strip().split(' ')[0].replace(',', '')
output_data['num_mapped_reads'] = int(num_mapped_reads)
qualimap_bamqc_genome_results_data['num_mapped_reads'] = int(num_mapped_reads)
percent_mapped_reads = line.split('=')[1].strip().split(' ')[1].strip().replace('(', '').replace(')', '').replace('%', '')
output_data['percent_mapped_reads'] = round(float(percent_mapped_reads), 2)
qualimap_bamqc_genome_results_data['percent_mapped_reads'] = round(float(percent_mapped_reads), 2)
if line.startswith('number of secondary alignments'):
num_secondary_alignments = int(line.split('=')[1].strip().replace(',', ''))
output_data['num_secondary_alignments'] = num_secondary_alignments
qualimap_bamqc_genome_results_data['num_secondary_alignments'] = num_secondary_alignments
if line.startswith('duplication rate'):
duplication_rate = line.split('=')[1].strip().replace('%', '')
output_data['duplication_rate_percent'] = round(float(duplication_rate), 2)
qualimap_bamqc_genome_results_data['duplication_rate_percent'] = round(float(duplication_rate), 2)
if line.startswith('mean coverageData'):
mean_coverage = line.split('=')[1].strip().strip('X').replace(',', '')
output_data['mean_depth_coverage'] = round(float(mean_coverage), 2)
qualimap_bamqc_genome_results_data['mean_depth_coverage'] = round(float(mean_coverage), 2)
if line.startswith('std coverageData'):
stdev_coverage = line.split('=')[1].strip().strip('X').replace(',', '')
output_data['stdev_depth_coverage'] = round(float(stdev_coverage), 2)
qualimap_bamqc_genome_results_data['stdev_depth_coverage'] = round(float(stdev_coverage), 2)
if line.startswith('mean mapping quality'):
mean_mapping_quality = line.split('=')[1].strip()
output_data['mean_mapping_quality'] = round(float(mean_mapping_quality), 2)
qualimap_bamqc_genome_results_data['mean_mapping_quality'] = round(float(mean_mapping_quality), 2)
if line.startswith('general error rate'):
general_error_rate = line.split('=')[1].strip()
output_data['error_rate'] = round(float(general_error_rate), 2)
qualimap_bamqc_genome_results_data['error_rate'] = round(float(general_error_rate), 2)
if line.startswith('number of mismatches'):
number_of_mismatches = line.split('=')[1].strip().replace(',', '')
output_data['number_of_mismatches'] = int(number_of_mismatches)
qualimap_bamqc_genome_results_data['number_of_mismatches'] = int(number_of_mismatches)
if line.startswith('number of insertions'):
number_of_insertions = line.split('=')[1].strip().replace(',', '')
output_data['number_of_insertions'] = int(number_of_insertions)
qualimap_bamqc_genome_results_data['number_of_insertions'] = int(number_of_insertions)
if line.startswith('mapped reads with insertion percentage'):
mapped_reads_with_insertion_percentage = line.split('=')[1].strip().replace('%', '')
output_data['mapped_reads_with_insertion_percentage'] = round(float(mapped_reads_with_insertion_percentage), 2)
qualimap_bamqc_genome_results_data['mapped_reads_with_insertion_percentage'] = round(float(mapped_reads_with_insertion_percentage), 2)
if line.startswith('number of deletions'):
number_of_deletions = line.split('=')[1].strip().replace(',', '')
output_data['number_of_deletions'] = int(number_of_deletions)
qualimap_bamqc_genome_results_data['number_of_deletions'] = int(number_of_deletions)
if line.startswith('mapped reads with deletion percentage'):
mapped_reads_with_deletion_percentage = line.split('=')[1].strip().replace('%', '')
output_data['mapped_reads_with_deletion_percentage'] = round(float(mapped_reads_with_deletion_percentage), 2)
qualimap_bamqc_genome_results_data['mapped_reads_with_deletion_percentage'] = round(float(mapped_reads_with_deletion_percentage), 2)
if 'reference with a coverageData >= 5X' in line:
proportion_genome_covered_over_5x = float(line.split(' ')[3].strip('%')) / 100
output_data['proportion_genome_covered_over_5x'] = round(proportion_genome_covered_over_5x, 4)
qualimap_bamqc_genome_results_data['proportion_genome_covered_over_5x'] = round(proportion_genome_covered_over_5x, 4)
if 'reference with a coverageData >= 10X' in line:
proportion_genome_covered_over_10x = float(line.split(' ')[3].strip('%')) / 100
output_data['proportion_genome_covered_over_10x'] = round(proportion_genome_covered_over_10x, 4)
qualimap_bamqc_genome_results_data['proportion_genome_covered_over_10x'] = round(proportion_genome_covered_over_10x, 4)
if 'reference with a coverageData >= 20X' in line:
proportion_genome_covered_over_20x = float(line.split(' ')[3].strip('%')) / 100
output_data['proportion_genome_covered_over_20x'] = round(proportion_genome_covered_over_20x, 4)
qualimap_bamqc_genome_results_data['proportion_genome_covered_over_20x'] = round(proportion_genome_covered_over_20x, 4)
if 'reference with a coverageData >= 30X' in line:
proportion_genome_covered_over_30x = float(line.split(' ')[3].strip('%')) / 100
output_data['proportion_genome_covered_over_30x'] = round(proportion_genome_covered_over_30x, 4)
qualimap_bamqc_genome_results_data['proportion_genome_covered_over_30x'] = round(proportion_genome_covered_over_30x, 4)
if 'reference with a coverageData >= 40X' in line:
proportion_genome_covered_over_40x = float(line.split(' ')[3].strip('%')) / 100
output_data['proportion_genome_covered_over_40x'] = round(proportion_genome_covered_over_40x, 4)
qualimap_bamqc_genome_results_data['proportion_genome_covered_over_40x'] = round(proportion_genome_covered_over_40x, 4)
if 'reference with a coverageData >= 50X' in line:
proportion_genome_covered_over_50x = float(line.split(' ')[3].strip('%')) / 100
output_data['proportion_genome_covered_over_50x'] = round(proportion_genome_covered_over_50x, 4)

qualimap_bamqc_genome_results_data['proportion_genome_covered_over_50x'] = round(proportion_genome_covered_over_50x, 4)

return qualimap_bamqc_genome_results_data


output_fieldnames = [
'sample_id',
'read_type',
def main(args):

parsed_qualimap_bamqc_genome_results_fieldnames = [
'mean_depth_coverage',
'stdev_depth_coverage',
'num_reads',
Expand All @@ -99,15 +107,35 @@ def main(args):
'proportion_genome_covered_over_50x',
]

output_data = {
'sample_id': args.sample_id,
'read_type': args.read_type,
}

if args.failed:
for field in parsed_qualimap_bamqc_genome_results_fieldnames:
output_data[field] = None
else:
qualimap_bamqc_genome_results_data = parse_qualimap_bamqc_genome_results(args.qualimap_bamqc_genome_results)
output_data = {
'sample_id': args.sample_id,
'read_type': args.read_type,
}
for field in parsed_qualimap_bamqc_genome_results_fieldnames:
output_data[field] = qualimap_bamqc_genome_results_data.get(field, None)

output_fieldnames = ['sample_id', 'read_type'] + parsed_qualimap_bamqc_genome_results_fieldnames

writer = csv.DictWriter(sys.stdout, fieldnames=output_fieldnames, dialect='unix', extrasaction='ignore', quoting=csv.QUOTE_MINIMAL)
writer.writeheader()
writer.writerow(output_data)


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('qualimap_bamqc_genome_results')
parser.add_argument('-s', '--sample-id')
parser.add_argument('-t', '--read-type', choices=['short', 'long'], default='short')
parser.add_argument('-q', '--qualimap-bamqc-genome-results', type=str, help='Path to the Qualimap BAMQC genome results file. May be omitted when using the --failed flag to generate a row with all stats set to None (blank)')
parser.add_argument('-s', '--sample-id', type=str, help='Sample ID')
parser.add_argument('-t', '--read-type', choices=['short', 'long'], default='short', help='Read type')
parser.add_argument('-f', '--failed', action='store_true', help='Flag to indicate that the sample failed QC and no input data was provided')
args = parser.parse_args()
main(args)
33 changes: 24 additions & 9 deletions modules/alignment_variants.nf
Original file line number Diff line number Diff line change
Expand Up @@ -150,8 +150,8 @@ process qualimap_bamqc {

output:
tuple val(sample_id), val(short_long), path("${sample_id}_${short_long}_qualimap_alignment_qc.csv"), emit: alignment_qc
tuple val(sample_id), val(short_long), path("${sample_id}_${short_long}_qualimap_report.pdf"), emit: report
tuple val(sample_id), val(short_long), path("${sample_id}_${short_long}_qualimap_genome_results.txt"), emit: genome_results
tuple val(sample_id), val(short_long), path("${sample_id}_${short_long}_qualimap_report.pdf"), emit: report, optional: true
tuple val(sample_id), val(short_long), path("${sample_id}_${short_long}_qualimap_genome_results.txt"), emit: genome_results, optional: true
tuple val(sample_id), path("${sample_id}_${short_long}_qualimap_bamqc_provenance.yml"), emit: provenance

script:
Expand All @@ -165,7 +165,12 @@ process qualimap_bamqc {
printf -- " value: null\\n" >> ${sample_id}_${short_long}_qualimap_bamqc_provenance.yml
printf -- " - parameter: --cov-hist-lim\\n" >> ${sample_id}_${short_long}_qualimap_bamqc_provenance.yml
printf -- " value: ${params.qualimap_coverage_histogram_limit}\\n" >> ${sample_id}_${short_long}_qualimap_bamqc_provenance.yml
# Assume qualimap exits successfully
# If it fails we will re-assign the exit code
# and generate an empty qualimap alignment qc
qualimap_exit_code=0
qualimap \
--java-mem-size=${params.qualimap_memory} \
bamqc \
Expand All @@ -176,16 +181,26 @@ process qualimap_bamqc {
-nt ${task.cpus} \
-bam ${alignment[0]} \
-outformat PDF \
--outdir ${sample_id}_${short_long}_bamqc
--outdir ${sample_id}_${short_long}_bamqc \
|| qualimap_exit_code=\$?
qualimap_bamqc_genome_results_to_csv.py \
if [ \${qualimap_exit_code} -ne 0 ]; then
echo "Qualimap failed with exit code \${qualimap_exit_code}"
echo "Generating empty qualimap alignment qc"
qualimap_bamqc_genome_results_to_csv.py \
-s ${sample_id} \
--read-type ${short_long} \
${sample_id}_${short_long}_bamqc/genome_results.txt \
--failed \
> ${sample_id}_${short_long}_qualimap_alignment_qc.csv
cp ${sample_id}_${short_long}_bamqc/report.pdf ${sample_id}_${short_long}_qualimap_report.pdf
cp ${sample_id}_${short_long}_bamqc/genome_results.txt ${sample_id}_${short_long}_qualimap_genome_results.txt
else
qualimap_bamqc_genome_results_to_csv.py \
-s ${sample_id} \
--read-type ${short_long} \
--qualimap-bamqc-genome-results ${sample_id}_${short_long}_bamqc/genome_results.txt \
> ${sample_id}_${short_long}_qualimap_alignment_qc.csv
cp ${sample_id}_${short_long}_bamqc/report.pdf ${sample_id}_${short_long}_qualimap_report.pdf
cp ${sample_id}_${short_long}_bamqc/genome_results.txt ${sample_id}_${short_long}_qualimap_genome_results.txt
fi
"""
}

Expand Down
2 changes: 1 addition & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ manifest {
description = 'Alignment and variant calling pipeline'
mainScript = 'main.nf'
nextflowVersion = '>=20.01.0'
version = '0.1.8'
version = '0.1.9'
}

params {
Expand Down

0 comments on commit ae43481

Please sign in to comment.