feat(derive): junction-annotation subcommand
a-frantz committed Dec 26, 2023
1 parent 489c2bb commit 14db813
Showing 3 changed files with 103 additions and 29 deletions.
25 changes: 21 additions & 4 deletions src/derive/command/junction_annotation.rs
@@ -29,7 +29,7 @@ pub struct JunctionAnnotationArgs {
features_gff: PathBuf,

/// Name of the exon region feature for the gene model used.
#[arg(short, long, value_name = "STRING", default_value = "exon")]
#[arg(long, value_name = "STRING", default_value = "exon")]
pub exon_feature_name: String,

/// Minimum intron length to consider.
@@ -46,8 +46,22 @@ pub struct JunctionAnnotationArgs {
pub min_read_support: u8,

/// Minimum mapping quality for a record to be considered.
/// Set to 0 to disable this filter and allow reads _without_
/// a mapping quality to be considered.
#[arg(short, long, value_name = "U8", default_value = "30")]
pub min_mapq: u8,

/// Do not count supplementary alignments.
#[arg(long)]
pub no_supplementary: bool,

/// Do count secondary alignments.
#[arg(long)]
pub count_secondary: bool,

/// Do count duplicates.
#[arg(long)]
pub count_duplicates: bool,
}
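
A minimal, self-contained sketch of how these boolean toggles parse with clap's derive API (a hypothetical DemoArgs stand-in, not the crate's actual type; assumes clap with the "derive" feature):

use clap::Parser;

/// Hypothetical stand-in for JunctionAnnotationArgs, for illustration only.
#[derive(Parser)]
struct DemoArgs {
    /// Do not count supplementary alignments.
    #[arg(long)]
    no_supplementary: bool,

    /// Do count secondary alignments.
    #[arg(long)]
    count_secondary: bool,

    /// Do count duplicates.
    #[arg(long)]
    count_duplicates: bool,

    /// Minimum mapping quality; 0 disables the filter.
    #[arg(short, long, value_name = "U8", default_value = "30")]
    min_mapq: u8,
}

fn main() {
    let args = DemoArgs::parse_from(["demo", "--count-secondary", "--min-mapq", "0"]);
    assert!(args.count_secondary && !args.no_supplementary && !args.count_duplicates);
    assert_eq!(args.min_mapq, 0);
}

By default all three booleans are false, so supplementary alignments are counted while secondary alignments and duplicates are skipped; each flag flips exactly one of those decisions.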

/// Main function for the `ngs derive junction_annotation` subcommand.
@@ -74,11 +88,11 @@ pub fn derive(args: JunctionAnnotationArgs) -> anyhow::Result<()> {
debug!("Tabulating GFF exon features.");
for record in &exon_records {
let seq_name = record.reference_sequence_name();
let start = record.start().into();
let end = record.end().into();
let start: usize = record.start().into();
let end: usize = record.end().into();

exon_starts.entry(seq_name).or_default().push(start);
exon_ends.entry(seq_name).or_default().push(end);
exon_ends.entry(seq_name).or_default().push(end + 1); // TODO why +1? It works
}
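
One hedged reading of the `+ 1` (the TODO above leaves the reason open): GFF3 end coordinates are 1-based and inclusive, while an intron's start derived from the CIGAR is the first base after the exon, so storing `end + 1` lets exon ends be compared directly against intron starts. A toy illustration under that assumption:

fn main() {
    // An exon spanning 1-based, inclusive coordinates [100, 200] (GFF3 convention).
    let exon_end_gff: usize = 200;

    // A read starting at position 100 whose CIGAR begins 101M...N covers
    // positions 100..=200, so the skipped region (the intron) begins at
    // 100 + 101 = 201, the first base after the exon.
    let read_start: usize = 100;
    let matched: usize = 101;
    let intron_start = read_start + matched;

    // Storing `end + 1` in `exon_ends` makes the two coordinates coincide.
    assert_eq!(exon_end_gff + 1, intron_start);
}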

debug!("Finalizing GFF features lookup.");
@@ -100,6 +114,9 @@ pub fn derive(args: JunctionAnnotationArgs) -> anyhow::Result<()> {
fuzzy_junction_match_range: args.fuzzy_junction_match_range,
min_read_support: args.min_read_support,
min_mapq: args.min_mapq,
no_supplementary: args.no_supplementary,
count_secondary: args.count_secondary,
count_duplicates: args.count_duplicates,
};

let ParsedBAMFile {
50 changes: 34 additions & 16 deletions src/derive/junction_annotation/compute.rs
@@ -21,7 +21,17 @@ pub struct JunctionAnnotationParameters {
pub min_read_support: u8,

/// Minimum mapping quality for a record to be considered.
/// 0 if MAPQ shouldn't be considered.
pub min_mapq: u8,

/// Do not count supplementary alignments.
pub no_supplementary: bool,

/// Do count secondary alignments.
pub count_secondary: bool,

/// Do count duplicates.
pub count_duplicates: bool,
}

/// Main function to annotate junctions one record at a time.
@@ -39,16 +49,19 @@ pub fn process(
_ => bail!("Could not parse read name"),
};

// (2) Parse the flags so we can see if the read is mapped.
// (2) Parse the flags so we can see if the read should be ignored.
let flags = record.flags();

// (3) If the read is unmapped, just return—no need to throw an error.
if flags.is_unmapped() {
if flags.is_unmapped()
|| (params.no_supplementary && flags.is_supplementary())
|| (!params.count_secondary && flags.is_secondary())
|| (!params.count_duplicates && flags.is_duplicate())
{
results.records.ignored_flags += 1;
return Ok(());
}
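
The skip condition, restated as a standalone predicate over plain booleans (an illustrative sketch; the real code reads these bits off the record's SAM flags):

/// Returns true if a record should be ignored, given its flag bits and the
/// user-facing toggles. Mirrors the condition above; illustrative only.
fn should_skip(
    unmapped: bool,
    supplementary: bool,
    secondary: bool,
    duplicate: bool,
    no_supplementary: bool,
    count_secondary: bool,
    count_duplicates: bool,
) -> bool {
    unmapped
        || (no_supplementary && supplementary)
        || (!count_secondary && secondary)
        || (!count_duplicates && duplicate)
}

fn main() {
    // Defaults: supplementary alignments counted, secondary and duplicates skipped.
    assert!(!should_skip(false, true, false, false, false, false, false));
    assert!(should_skip(false, false, true, false, false, false, false));
    // --count-secondary opts secondary alignments back in.
    assert!(!should_skip(false, false, true, false, false, true, false));
}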

// (4) Parse the CIGAR string from the record.
// (3) Parse the CIGAR string from the record.
// We only care about reads with introns, so if there are no introns
// we can skip this read.
let cigar = record.cigar();
@@ -57,19 +70,24 @@ pub fn process(
return Ok(());
}

// (5) If the read has a MAPQ below our threshold, just return.
// No need to throw an error, unless the MAPQ could not be parsed.
match record.mapping_quality() {
Some(mapq) => {
if mapq.get() < params.min_mapq {
results.records.low_mapq += 1;
// (4) If the user is filtering by MAPQ, check if this read passes.
// Log if the read is filtered out for a too low MAPQ or a missing MAPQ.
if params.min_mapq > 0 {
match record.mapping_quality() {
Some(mapq) => {
if mapq.get() < params.min_mapq {
results.records.low_mapq += 1;
return Ok(());
}
}
None => {
results.records.missing_mapq += 1;
return Ok(());
}
}
_ => results.records.couldnt_parse += 1,
}
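
The MAPQ gate can be summarized as a small helper (hypothetical, not the crate's API): a threshold of 0 disables the filter entirely, a present-but-low MAPQ increments low_mapq, and a missing MAPQ increments missing_mapq.

/// Hypothetical helper mirroring the MAPQ gate above; `mapq` is None when the
/// record carries no usable mapping quality.
#[derive(Debug, PartialEq)]
enum MapqVerdict {
    Keep,
    LowMapq,
    MissingMapq,
}

fn mapq_gate(mapq: Option<u8>, min_mapq: u8) -> MapqVerdict {
    if min_mapq == 0 {
        // Filter disabled: records without a MAPQ also pass through.
        return MapqVerdict::Keep;
    }
    match mapq {
        Some(q) if q >= min_mapq => MapqVerdict::Keep,
        Some(_) => MapqVerdict::LowMapq,
        None => MapqVerdict::MissingMapq,
    }
}

fn main() {
    assert_eq!(mapq_gate(Some(60), 30), MapqVerdict::Keep);
    assert_eq!(mapq_gate(Some(10), 30), MapqVerdict::LowMapq);
    assert_eq!(mapq_gate(None, 30), MapqVerdict::MissingMapq);
    assert_eq!(mapq_gate(None, 0), MapqVerdict::Keep);
}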

// (6) Parse the reference sequence id from the record.
// (5) Parse the reference sequence id from the record.
let id = match record.reference_sequence_id() {
Some(id) => id,
_ => {
@@ -80,7 +98,7 @@ pub fn process(
}
};

// (7) Map the parsed reference sequence id to a reference sequence name.
// (6) Map the parsed reference sequence id to a reference sequence name.
let seq_name = match header
.reference_sequences()
.get_index(id)
@@ -95,20 +113,20 @@ pub fn process(
}
};

// (8) Check if there will be annotations for this reference sequence.
// (7) Check if there will be annotations for this reference sequence.
let mut ref_is_annotated = true;
if !exon_starts.contains_key(&seq_name) || !exon_ends.contains_key(&seq_name) {
ref_is_annotated = false;
}

// (9) Calculate the start position of this read. This will
// (8) Calculate the start position of this read. This will
// later be used to find the position of any introns.
let start = match record.alignment_start() {
Some(s) => usize::from(s),
_ => bail!("Could not parse record's start position."),
};

// (10) Find introns
// (9) Find introns
let mut cur_pos = start;
for op in cigar.iter() {
match op.kind() {
57 changes: 48 additions & 9 deletions src/derive/junction_annotation/results.rs
@@ -1,12 +1,13 @@
//! Results related to the `ngs derive junction_annotation` subcommand.

use serde::Deserialize;
use serde::ser::SerializeStruct;
use serde::Serialize;
use serde::Serializer;
use std::collections::HashMap;
use std::num::NonZeroUsize;

/// Lists of annotated junctions.
#[derive(Clone, Default, Serialize, Deserialize)]
#[derive(Clone, Default)]
pub struct JunctionAnnotations {
/// Known junctions. The outer key is the reference name, and the value is another
/// HashMap. The inner key is the (start, end) coordinates of the junction,
@@ -30,17 +31,52 @@ pub struct JunctionAnnotations {
pub unannotated_reference: HashMap<String, HashMap<(NonZeroUsize, NonZeroUsize), usize>>,
}

impl Serialize for JunctionAnnotations {
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
let mut known = Vec::new();
for (ref_name, junctions) in &self.known {
for ((start, end), count) in junctions {
known.push((ref_name, start, end, count));
}
}

let mut partial_novel = Vec::new();
for (ref_name, junctions) in &self.partial_novel {
for ((start, end), count) in junctions {
partial_novel.push((ref_name, start, end, count));
}
}

let mut complete_novel = Vec::new();
for (ref_name, junctions) in &self.complete_novel {
for ((start, end), count) in junctions {
complete_novel.push((ref_name, start, end, count));
}
}

let mut unannotated_reference = Vec::new();
for (ref_name, junctions) in &self.unannotated_reference {
for ((start, end), count) in junctions {
unannotated_reference.push((ref_name, start, end, count));
}
}

let mut s = serializer.serialize_struct("JunctionAnnotations", 4)?;
s.serialize_field("known", &known)?;
s.serialize_field("partial_novel", &partial_novel)?;
s.serialize_field("complete_novel", &complete_novel)?;
s.serialize_field("unannotated_reference", &unannotated_reference)?;
s.end()
}
}
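
This hand-written impl is needed because JSON object keys must be strings, so a HashMap keyed by a (start, end) tuple cannot go through serde_json directly; flattening each junction to a ["ref_name", start, end, count] entry sidesteps that. A hedged usage sketch (the import path and the serde_json dependency are assumptions about the crate layout):

// Illustrative only; adjust the path to wherever the results module is exposed.
use ngs::derive::junction_annotation::results::JunctionAnnotations;

fn main() -> Result<(), serde_json::Error> {
    let annotations = JunctionAnnotations::default();
    let json = serde_json::to_string_pretty(&annotations)?;
    // With data present, each category serializes as an array of
    // ["ref_name", start, end, count] entries.
    println!("{}", json);
    Ok(())
}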

/// General record metrics that are tallied as a part of the
/// junction_annotation subcommand.
#[derive(Clone, Default, Serialize, Deserialize)]
#[derive(Clone, Default, Serialize)]
pub struct RecordMetrics {
/// The number of records that have been fully processed.
/// Should equal Metrics::SummaryMetrics::total_spliced_reads.
pub processed: usize,

/// The number of records that couldn't be parsed.
pub couldnt_parse: usize,

/// The number of records that have been ignored because of their flags.
/// (i.e. they were unmapped, duplicates, secondary, or supplementary)
/// The last 3 conditions can be toggled on/off with CL flags
@@ -53,10 +89,13 @@ pub struct RecordMetrics {
/// The number of records with junctions that have been ignored because
/// they failed the MAPQ filter.
pub low_mapq: usize,

/// The number of records whose MAPQ couldn't be parsed and were thus ignored.
pub missing_mapq: usize,
}

/// Summary statistics for the junction_annotation subcommand.
#[derive(Clone, Default, Serialize, Deserialize)]
#[derive(Clone, Default, Serialize)]
pub struct SummaryResults {
/// The total number of junctions observed in the file.
pub total_junctions: usize,
@@ -101,7 +140,7 @@ pub struct SummaryResults {

/// Main Results struct. This struct aggregates all of the minor metrics structs
/// outlined in this file so they can be kept track of as a unit.
#[derive(Clone, Default, Serialize, Deserialize)]
#[derive(Clone, Default, Serialize)]
pub struct JunctionAnnotationResults {
/// Lists of annotated junctions.
pub junction_annotations: JunctionAnnotations,
