Skip to content

Commit

Permalink
[WIP]: switch min_mapq to a proper MappingQuality
Browse files Browse the repository at this point in the history
  • Loading branch information
a-frantz committed Feb 9, 2024
1 parent b084993 commit 7767da4
Show file tree
Hide file tree
Showing 8 changed files with 243 additions and 98 deletions.
4 changes: 3 additions & 1 deletion src/derive/command/junction_annotation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ use std::path::PathBuf;

use anyhow::Context;
use clap::Args;
use noodles::sam::record::MappingQuality;
use num_format::Locale;
use num_format::ToFormattedString;
use tracing::debug;
Expand Down Expand Up @@ -45,7 +46,7 @@ pub struct JunctionAnnotationArgs {
/// Set to 0 to disable this filter and allow reads _without_
/// a mapping quality to be considered.
#[arg(short, long, value_name = "U8", default_value = "30")]
min_mapq: u8,
min_mapq: Option<MappingQuality>,

/// Do not count supplementary alignments.
#[arg(long)]
Expand Down Expand Up @@ -95,6 +96,7 @@ pub fn derive(args: JunctionAnnotationArgs) -> anyhow::Result<()> {

debug!("Done reading GFF.");

// (1.5) Initialize variables (including opening the BAM).
let mut counter = RecordCounter::new();
let mut results = JunctionAnnotationResults::default();
let params = compute::JunctionAnnotationParameters {
Expand Down
24 changes: 14 additions & 10 deletions src/derive/command/strandedness.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,15 @@ use anyhow::Context;
use clap::Args;
use noodles::bam;
use noodles::gff;
use noodles::sam::record::MappingQuality;
use rust_lapper::{Interval, Lapper};
use tracing::debug;
use tracing::info;

use crate::derive::strandedness::compute;
use crate::derive::strandedness::compute::ParsedBAMFile;
use crate::derive::strandedness::results;
use crate::utils::args::parse_min_mapq;
use crate::utils::formats;

/// Clap arguments for the `ngs derive strandedness` subcommand.
Expand Down Expand Up @@ -45,10 +47,10 @@ pub struct DeriveStrandednessArgs {
num_genes: usize,

/// Minimum mapping quality for a record to be considered.
/// Set to 0 to disable this filter and allow reads _without_
/// a mapping quality to be considered.
#[arg(long, value_name = "U8", default_value = "30")]
min_mapq: u8,
/// Default is to ignore MAPQ values (allowing <missing> MAPQs to be considered).
/// Specify any u8 value to enable this filter.
#[arg(long, value_name = "U8", default_value = "30", value_parser = parse_min_mapq)]
min_mapq: Option<MappingQuality>,

/// Consider all genes, not just protein coding genes.
#[arg(long)]
Expand Down Expand Up @@ -88,7 +90,7 @@ pub struct DeriveStrandednessArgs {
pub fn derive(args: DeriveStrandednessArgs) -> anyhow::Result<()> {
info!("Starting derive strandedness subcommand.");

// (1) Parse the GFF file and collect all gene features.
// (1) Parse the GFF file and collect all gene and exon features.
debug!("Reading all records in GFF.");
let mut gff = formats::gff::open(&args.features_gff)
.with_context(|| format!("opening GFF file: {}", args.features_gff.display()))?;
Expand Down Expand Up @@ -130,9 +132,6 @@ pub fn derive(args: DeriveStrandednessArgs) -> anyhow::Result<()> {
exon_records.push(record);
}
}

debug!("Tabulating GFF gene and exon features.");

if gene_records.is_empty() {
bail!("No gene records matched criteria. Check your GFF file and `--gene-feature-name` and `--all-genes` options.");
}
Expand All @@ -145,6 +144,9 @@ pub fn derive(args: DeriveStrandednessArgs) -> anyhow::Result<()> {
exon_records.len()
);

// (2) Parse exon features into proper data structure.
debug!("Tabulating GFF exon features.");

let mut exon_intervals: HashMap<&str, Vec<Interval<usize, gff::record::Strand>>> =
HashMap::new();
for record in &exon_records {
Expand Down Expand Up @@ -180,6 +182,7 @@ pub fn derive(args: DeriveStrandednessArgs) -> anyhow::Result<()> {

debug!("Done reading GFF.");

// (3) Initialize variables (including opening the BAM).
let mut reader = File::open(&args.src)
.map(bam::Reader::new)
.with_context(|| format!("opening BAM file: {}", args.src.display()))?;
Expand Down Expand Up @@ -219,8 +222,9 @@ pub fn derive(args: DeriveStrandednessArgs) -> anyhow::Result<()> {
exons: exon_metrics,
reads: results::ReadRecordMetrics::default(),
};

let mut result: Option<results::DerivedStrandednessResult> = None;

// (4) Run the strandedness test.
for try_num in 1..=args.max_tries {
info!("Starting try {} of {}", try_num, args.max_tries);

Expand All @@ -247,7 +251,7 @@ pub fn derive(args: DeriveStrandednessArgs) -> anyhow::Result<()> {
info!("Strandedness test failed after {} tries.", args.max_tries);
}

// (4) Print the output to stdout as JSON (more support for different output
// (5) Print the output to stdout as JSON (more support for different output
// types may be added in the future, but for now, only JSON).
let output = serde_json::to_string_pretty(&result).unwrap();
print!("{}", output);
Expand Down
178 changes: 137 additions & 41 deletions src/derive/junction_annotation/compute.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@ use anyhow::Ok;
use noodles::core::Position;
use noodles::sam::alignment::Record;
use noodles::sam::record::cigar::op::Kind;
use noodles::sam::record::MappingQuality;
use noodles::sam::Header;
use std::collections::HashMap;
use std::collections::HashSet;

use crate::derive::junction_annotation::results;
use crate::utils::alignment::filter_by_mapq;

/// Struct to hold starts and ends of exons.
pub struct ExonSets<'a> {
Expand All @@ -30,7 +32,7 @@ pub struct JunctionAnnotationParameters {

/// Minumum mapping quality for a record to be considered.
/// 0 if MAPQ shouldn't be considered.
pub min_mapq: u8,
pub min_mapq: Option<MappingQuality>,

/// Do not count supplementary alignments.
pub no_supplementary: bool,
Expand Down Expand Up @@ -65,6 +67,27 @@ fn increment_junction_map(
);
}

/// Function to filter out records based on their flags.
fn filter_by_flags(record: &Record, params: &JunctionAnnotationParameters) -> bool {
let flags = record.flags();
if flags.is_unmapped()
|| (params.no_supplementary && flags.is_supplementary())
|| (!params.count_secondary && flags.is_secondary())
|| (!params.count_duplicates && flags.is_duplicate())
{
return true;
}
false
}

/// Function to filter out records that don't have introns.
fn filter_by_cigar(record: &Record) -> bool {
!record
.cigar()
.iter()
.any(|op| matches!(op.kind(), Kind::Skip))
}

/// Main function to annotate junctions one record at a time.
pub fn process(
record: &Record,
Expand All @@ -79,42 +102,24 @@ pub fn process(
_ => bail!("Could not parse read name"),
};

// (2) Parse the flags so we can see if the read should be ignored.
let flags = record.flags();

if flags.is_unmapped()
|| (params.no_supplementary && flags.is_supplementary())
|| (!params.count_secondary && flags.is_secondary())
|| (!params.count_duplicates && flags.is_duplicate())
{
// (2) Filter by record flags.
if filter_by_flags(record, params) {
results.records.filtered_by_flags += 1;
return Ok(());
}

// (3) Parse the CIGAR string from the record.
// (3) Filter by CIGAR.
// We only care about reads with introns, so if there are no introns
// we can skip this read.
let cigar = record.cigar();
if !cigar.iter().any(|op| matches!(op.kind(), Kind::Skip)) {
if filter_by_cigar(record) {
results.records.not_spliced += 1;
return Ok(());
}

// (4) If the user is filtering by MAPQ, check if this read passes.
// Log if the read is filtered out for a too low MAPQ or a missing MAPQ.
if params.min_mapq > 0 {
match record.mapping_quality() {
Some(mapq) => {
if mapq.get() < params.min_mapq {
results.records.low_mapq += 1;
return Ok(());
}
}
None => {
results.records.missing_mapq += 1;
return Ok(());
}
}
// (4) Filter by MAPQ
if filter_by_mapq(record, params.min_mapq) {
results.records.bad_mapq += 1;
return Ok(());
}

// (5) Parse the reference sequence from the record.
Expand Down Expand Up @@ -144,7 +149,7 @@ pub fn process(

// (8) Find introns
let cur_pos = start;
for op in cigar.iter() {
for op in record.cigar().iter() {
match op.kind() {
// This is an intron.
Kind::Skip => {
Expand Down Expand Up @@ -355,14 +360,111 @@ mod tests {
use noodles::sam::record::ReadName;
use std::num::NonZeroUsize;

#[test]
fn test_filter_by_flags() {
// Setup
let mut record = Record::default();
let params = JunctionAnnotationParameters {
min_intron_length: 10,
min_read_support: 2,
min_mapq: Some(MappingQuality::new(30).unwrap()),
no_supplementary: false,
count_secondary: false,
count_duplicates: false,
};

// Test that records are filtered out correctly
record.flags_mut().set(0x4.into(), true);
assert!(filter_by_flags(&record, &params));
record.flags_mut().set(0x4.into(), false);
record.flags_mut().set(0x800.into(), true);
assert!(!filter_by_flags(&record, &params));
record.flags_mut().set(0x800.into(), false);
record.flags_mut().set(0x100.into(), true);
assert!(filter_by_flags(&record, &params));
record.flags_mut().set(0x100.into(), false);
record.flags_mut().set(0x400.into(), true);
assert!(filter_by_flags(&record, &params));
record.flags_mut().set(0x400.into(), false);
assert!(!filter_by_flags(&record, &params));
}

#[test]
fn test_filter_by_cigar() {
// Setup
let mut record = Record::default();

// Test that records are filtered out correctly
*record.cigar_mut() = "10M10N10M".parse().unwrap();
assert!(!filter_by_cigar(&record));
*record.cigar_mut() = "10M".parse().unwrap();
assert!(filter_by_cigar(&record));
}

#[test]
fn test_filter_junction_map() {
// Setup
let mut junction_map = results::JunctionsMap::default();
junction_map.insert(
"sq1".to_string(),
HashMap::from([
((Position::new(1).unwrap(), Position::new(11).unwrap()), 1),
((Position::new(1).unwrap(), Position::new(5).unwrap()), 1),
]),
);
junction_map.insert(
"sq2".to_string(),
HashMap::from([((Position::new(1).unwrap(), Position::new(11).unwrap()), 2)]),
);
let min_intron_length = 10;
let min_read_support = 2;
let mut metrics = results::SummaryResults::default();

// Test that junctions are filtered out correctly
filter_junction_map(
&mut junction_map,
min_intron_length,
min_read_support,
&mut metrics,
);
assert_eq!(junction_map.len(), 1);
assert_eq!(junction_map.get("sq1"), None);
assert_eq!(junction_map.get("sq2").unwrap().len(), 1);
assert_eq!(metrics.intron_too_short, 1);
assert_eq!(metrics.junctions_with_not_enough_read_support, 2);
assert_eq!(metrics.total_rejected_junctions, 2);
}

#[test]
fn test_tally_junctions_and_support() {
// Setup
let mut junction_map = results::JunctionsMap::default();
junction_map.insert(
"sq1".to_string(),
HashMap::from([
((Position::new(1).unwrap(), Position::new(11).unwrap()), 1),
((Position::new(1).unwrap(), Position::new(5).unwrap()), 1),
]),
);
junction_map.insert(
"sq2".to_string(),
HashMap::from([((Position::new(1).unwrap(), Position::new(11).unwrap()), 2)]),
);

// Test that junctions are tallied correctly
let (juncs, support) = tally_junctions_and_support(&junction_map);
assert_eq!(juncs, 3);
assert_eq!(support, 4);
}

#[test]
fn test_process_and_summarize() {
// Setup
let mut results = results::JunctionAnnotationResults::default();
let params = JunctionAnnotationParameters {
min_intron_length: 10,
min_read_support: 2,
min_mapq: 30,
min_mapq: Some(MappingQuality::new(30).unwrap()),
no_supplementary: false,
count_secondary: false,
count_duplicates: false,
Expand Down Expand Up @@ -413,8 +515,7 @@ mod tests {
assert_eq!(results.records.processed, 1);
assert_eq!(results.records.filtered_by_flags, 0);
assert_eq!(results.records.not_spliced, 0);
assert_eq!(results.records.low_mapq, 0);
assert_eq!(results.records.missing_mapq, 0);
assert_eq!(results.records.bad_mapq, 0);

// Test that unmapped gets ignored
let mut record = Record::default();
Expand All @@ -429,8 +530,7 @@ mod tests {
assert_eq!(results.records.processed, 1);
assert_eq!(results.records.filtered_by_flags, 1);
assert_eq!(results.records.not_spliced, 0);
assert_eq!(results.records.low_mapq, 0);
assert_eq!(results.records.missing_mapq, 0);
assert_eq!(results.records.bad_mapq, 0);

// Test partial novel junction
let mut record = Record::default();
Expand All @@ -445,8 +545,7 @@ mod tests {
assert_eq!(results.records.processed, 2);
assert_eq!(results.records.filtered_by_flags, 1);
assert_eq!(results.records.not_spliced, 0);
assert_eq!(results.records.low_mapq, 0);
assert_eq!(results.records.missing_mapq, 0);
assert_eq!(results.records.bad_mapq, 0);

// Test partial novel junction (again for more read support)
let mut record = Record::default();
Expand All @@ -461,8 +560,7 @@ mod tests {
assert_eq!(results.records.processed, 3);
assert_eq!(results.records.filtered_by_flags, 1);
assert_eq!(results.records.not_spliced, 0);
assert_eq!(results.records.low_mapq, 0);
assert_eq!(results.records.missing_mapq, 0);
assert_eq!(results.records.bad_mapq, 0);

// Test that supplementary alignments get counted
let mut record = Record::default();
Expand All @@ -478,8 +576,7 @@ mod tests {
assert_eq!(results.records.processed, 4);
assert_eq!(results.records.filtered_by_flags, 1);
assert_eq!(results.records.not_spliced, 0);
assert_eq!(results.records.low_mapq, 0);
assert_eq!(results.records.missing_mapq, 0);
assert_eq!(results.records.bad_mapq, 0);

// Test that secondary alignments don't get counted
let mut record = Record::default();
Expand All @@ -495,8 +592,7 @@ mod tests {
assert_eq!(results.records.processed, 4);
assert_eq!(results.records.filtered_by_flags, 2);
assert_eq!(results.records.not_spliced, 0);
assert_eq!(results.records.low_mapq, 0);
assert_eq!(results.records.missing_mapq, 0);
assert_eq!(results.records.bad_mapq, 0);

// TODO: Below tests are not working as expected. Need to fix them.
// Test complete novel junction
Expand Down
Loading

0 comments on commit 7767da4

Please sign in to comment.