Skip to content

Commit

Permalink
feat: min_coverage_breadth
Browse files Browse the repository at this point in the history
fix: outdated tests

replaced keep_if_any_meets_coverage with min_coverage_breadth, which
allows specifying frequency of pools that need to meet coverage
requirement. (renamed min_coverage to min_coverage_depth for clarity)

made tests up to date with new flags
  • Loading branch information
cjdjpj committed Jul 1, 2024
1 parent 8d5a45c commit aed3abc
Show file tree
Hide file tree
Showing 15 changed files with 102 additions and 55 deletions.
48 changes: 20 additions & 28 deletions src/base/pileup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

use crate::base::*;
use ndarray::prelude::*;
use std::collections::HashSet;
use std::fs::{File, OpenOptions};
use std::io::{self, prelude::*, BufReader, BufWriter, Error, ErrorKind, SeekFrom};
use std::str;
Expand Down Expand Up @@ -267,39 +266,32 @@ impl Filter for PileupLine {
}
}

if filter_stats.keep_if_any_meets_coverage {
// At least 1 pool needs to have been covered min_coverage times
let mut met_coverage_requirement = false;
for c in &self.coverages {
if c >= &filter_stats.min_coverage {
met_coverage_requirement = true;
break;
}
}
if !met_coverage_requirement{
return Err(Error::new(ErrorKind::Other, "Filtered out."));
}
// Coverage depth and breadth requirement
let min_coverage_breadth = (filter_stats.min_coverage_breadth * filter_stats.pool_sizes.len() as f64).ceil() as u32;
let pools_covered = self.coverages.iter()
.filter(|&&c| c >= filter_stats.min_coverage_depth)
.take(min_coverage_breadth as usize)
.count();

} else {
// All the pools need to have been covered at least min_coverage times
for c in &self.coverages {
if c < &filter_stats.min_coverage {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
}
}
if pools_covered != min_coverage_breadth as usize {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
}

// Remove monoallelic loci (each loci must have coverage of at least 2 alleles)
if filter_stats.remove_monoallelic {
let mut covered_alleles = HashSet::new();
let mut unique_alleles = Vec::new();

for pool in &self.read_codes{
for read in pool{
covered_alleles.insert(read);
'outer: for pool in &self.read_codes {
for &read in pool {
if !unique_alleles.contains(&read) {
unique_alleles.push(read);
if unique_alleles.len() == 2 {
break 'outer;
}
}
}
}

if covered_alleles.len() < 2{
if unique_alleles.len() < 2 {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
}
}
Expand Down Expand Up @@ -591,10 +583,10 @@ mod tests {
let filter_stats = FilterStats {
remove_ns: true,
remove_monoallelic: false,
keep_if_any_meets_coverage: false,
keep_lowercase_reference: false,
max_base_error_rate: 0.005,
min_coverage: 1,
min_coverage_depth: 1,
min_coverage_breadth: 1.0,
min_allele_frequency: 0.0,
max_missingness_rate: 0.0,
pool_sizes: vec![0.2, 0.2, 0.2, 0.2, 0.2],
Expand Down
4 changes: 2 additions & 2 deletions src/base/structs_and_traits.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,10 @@ pub struct FileSyncPhen {
pub struct FilterStats {
pub remove_ns: bool,
pub remove_monoallelic: bool,
pub keep_if_any_meets_coverage: bool,
pub keep_lowercase_reference: bool,
pub max_base_error_rate: f64,
pub min_coverage: u64,
pub min_coverage_depth: u64,
pub min_coverage_breadth: f64,
pub min_allele_frequency: f64,
pub max_missingness_rate: f64,
pub pool_sizes: Vec<f64>,
Expand Down
14 changes: 10 additions & 4 deletions src/base/sync.rs
Original file line number Diff line number Diff line change
Expand Up @@ -224,12 +224,12 @@ impl Filter for LocusCounts {
sum_coverage
.iter()
.fold(sum_coverage[0], |min, &x| if x < min { x } else { min });
if min_sum_coverage < filter_stats.min_coverage as f64 {
if min_sum_coverage < filter_stats.min_coverage_depth as f64 {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
}
// // TODO: convert loci failing the minimum coverage threshold into missing instead of omitting the entire locus
// for i in 0..self.matrix.nrows() {
// if sum_coverage[i] < filter_stats.min_coverage as f64 {
// if sum_coverage[i] < filter_stats.min_coverage_depth as f64 {
// for j in 0..self.matrix.ncols() {
// self.matrix[(i, j)] = f64::NAN as u64;
// }
Expand Down Expand Up @@ -1569,8 +1569,11 @@ mod tests {
let frequencies = *(counts.to_frequencies().unwrap());
let filter_stats = FilterStats {
remove_ns: true,
remove_monoallelic: false,
keep_lowercase_reference: false,
max_base_error_rate: 0.005,
min_coverage: 1,
min_coverage_depth: 1,
min_coverage_breadth: 1.0,
min_allele_frequency: 0.005,
max_missingness_rate: 0.0,
pool_sizes: vec![20., 20., 20., 20., 20.],
Expand Down Expand Up @@ -1675,8 +1678,11 @@ mod tests {
// let phen = file_phen.lparse().unwrap();
// let filter_stats = FilterStats {
// remove_ns: true,
// remove_monoallelic: false,
// keep_lowercase_reference: false,
// max_base_error_rate: 0.005,
// min_coverage: 0,
// min_coverage_depth: 0,
// min_coverage_breadth: 1.0,
// min_allele_frequency: 0.0001,
// max_missingness_rate: 0.2,
// pool_sizes: phen.pool_sizes,
Expand Down
30 changes: 26 additions & 4 deletions src/base/vcf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,13 +115,29 @@ impl Filter for VcfLine {
/// - removing the entire locus if the locus is fixed, i.e. only 1 allele was found or retained after filterings
/// Note that we are not removing alleles per locus if they fail the minimum allele frequency threshold, only if all alleles fail this threshold, i.e. when the locus is close to being fixed
fn filter(&mut self, filter_stats: &FilterStats) -> io::Result<&mut Self> {
// All the pools needs be have been covered at least min_coverage times
// Coverage depth and breadth requirement
let min_coverage_breadth = (filter_stats.min_coverage_breadth * filter_stats.pool_sizes.len() as f64).ceil() as u32;
let mut pools_covered = 0;
for i in 0..self.allele_depths.len() {
let c = &self.allele_depths[i].iter().fold(0, |sum, x| sum + x);
if c < &filter_stats.min_coverage {
if c >= &filter_stats.min_coverage_depth {
pools_covered += 1;
}
if pools_covered == min_coverage_breadth {
break;
}
}
if pools_covered != min_coverage_breadth {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
}

// Remove monoallelic loci (each loci must have coverage of at least 2 alleles)
if filter_stats.remove_monoallelic {
if self.alternative_alleles.len() == 0 {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
}
}

// Filter by minimum allele frequency,
//// First convert the counts per pool into frequencies
let allele_frequencies = match self.to_frequencies() {
Expand Down Expand Up @@ -509,16 +525,22 @@ mod tests {
);
let filter_stats_1 = FilterStats {
remove_ns: true,
remove_monoallelic: false,
keep_lowercase_reference: false,
max_base_error_rate: 0.005,
min_coverage: 1,
min_coverage_depth: 1,
min_coverage_breadth: 1.0,
min_allele_frequency: 0.0,
max_missingness_rate: 0.0,
pool_sizes: vec![0.2; 10],
};
let filter_stats_2 = FilterStats {
remove_ns: true,
remove_monoallelic: false,
keep_lowercase_reference: false,
max_base_error_rate: 0.005,
min_coverage: 10,
min_coverage_depth: 10,
min_coverage_breadth: 1.0,
min_allele_frequency: 0.0,
max_missingness_rate: 0.0,
pool_sizes: vec![0.2; 10],
Expand Down
5 changes: 4 additions & 1 deletion src/gp/cv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -444,8 +444,11 @@ mod tests {
let n_threads = 2;
let filter_stats = FilterStats {
remove_ns: true,
remove_monoallelic: false,
keep_lowercase_reference: false,
max_base_error_rate: 0.005,
min_coverage: 1,
min_coverage_depth: 1,
min_coverage_breadth: 1.0,
min_allele_frequency: 0.005,
max_missingness_rate: 0.0,
pool_sizes: vec![0.2, 0.2, 0.2, 0.2, 0.2],
Expand Down
5 changes: 4 additions & 1 deletion src/gwas/correlation_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -148,8 +148,11 @@ mod tests {
Array2::from_shape_vec((5, 2), vec![1, 9, 2, 8, 3, 7, 4, 6, 5, 5]).unwrap();
let filter_stats = FilterStats {
remove_ns: true,
remove_monoallelic: false,
keep_lowercase_reference: false,
max_base_error_rate: 0.005,
min_coverage: 1,
min_coverage_depth: 1,
min_coverage_breadth: 1.0,
min_allele_frequency: 0.005,
max_missingness_rate: 0.0,
pool_sizes: vec![20.0, 20.0, 20.0, 20.0, 20.0],
Expand Down
5 changes: 4 additions & 1 deletion src/gwas/gwalpha.rs
Original file line number Diff line number Diff line change
Expand Up @@ -416,8 +416,11 @@ mod tests {
.reversed_axes();
let filter_stats = FilterStats {
remove_ns: true,
remove_monoallelic: false,
keep_lowercase_reference: false,
max_base_error_rate: 0.005,
min_coverage: 1,
min_coverage_depth: 1,
min_coverage_breadth: 1.0,
min_allele_frequency: 0.005,
max_missingness_rate: 0.0,
pool_sizes: vec![20.0, 20.0, 20.0, 20.0, 20.0],
Expand Down
2 changes: 1 addition & 1 deletion src/gwas/ols.rs
Original file line number Diff line number Diff line change
Expand Up @@ -549,7 +549,7 @@ mod tests {
// let filter_stats = FilterStats {
// remove_ns: true,
// max_base_error_rate: 0.005,
// min_coverage: 1,
// min_coverage_depth: 1,
// min_allele_frequency: 0.005,
// pool_sizes: vec![20.0, 20.0, 20.0, 20.0, 20.0],
// };
Expand Down
5 changes: 4 additions & 1 deletion src/imputation/adaptive_ld_knn_imputation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -497,8 +497,11 @@ mod tests {
let file_sync_phen = *(file_sync, file_phen).lparse().unwrap();
let filter_stats = FilterStats {
remove_ns: true,
remove_monoallelic: false,
keep_lowercase_reference: false,
max_base_error_rate: 0.005,
min_coverage: 1,
min_coverage_depth: 1,
min_coverage_breadth: 1.0,
min_allele_frequency: 0.005,
max_missingness_rate: 0.0,
pool_sizes: vec![20., 20., 20., 20., 20.],
Expand Down
5 changes: 4 additions & 1 deletion src/imputation/filtering_missing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -255,8 +255,11 @@ mod tests {
////////////////////////////////////////////////////////////////////////////////////////////////////////
let filter_stats = FilterStats {
remove_ns: false,
remove_monoallelic: false,
keep_lowercase_reference: false,
max_base_error_rate: 1.0,
min_coverage: 0,
min_coverage_depth: 0,
min_coverage_breadth: 1.0,
min_allele_frequency: 0.000001,
max_missingness_rate: 0.0,
pool_sizes: vec![20., 20., 20., 20., 20.],
Expand Down
5 changes: 4 additions & 1 deletion src/imputation/mean_imputation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -182,8 +182,11 @@ mod tests {
let file_sync_phen = *(file_sync, file_phen).lparse().unwrap();
let filter_stats = FilterStats {
remove_ns: true,
remove_monoallelic: false,
keep_lowercase_reference: false,
max_base_error_rate: 0.005,
min_coverage: 1,
min_coverage_depth: 1,
min_coverage_breadth: 1.0,
min_allele_frequency: 0.005,
max_missingness_rate: 0.0,
pool_sizes: vec![20., 20., 20., 20., 20.],
Expand Down
14 changes: 7 additions & 7 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,12 @@ struct Args {
/// Maximum base sequencing error rate
#[clap(long, default_value_t = 0.01)]
max_base_error_rate: f64,
/// Minimum depth of coverage (loci with at least one pool below this threshold will be omitted)
/// Minimum breadth of coverage (loci with less than this proportion of pools below min_coverage_depth will be omitted)
#[clap(long, default_value_t = 1.0)]
min_coverage_breadth: f64,
/// Minimum depth of coverage (loci with less than min_coverage_breadth pools below this threshold will be omitted)
#[clap(long, default_value_t = 1)]
min_coverage: u64,
min_coverage_depth: u64,
/// Minimum allele frequency (per locus, alleles which fail to pass this threshold will be omitted allowing control over multiallelic loci)
#[clap(long, default_value_t = 0.001)]
min_allele_frequency: f64,
Expand All @@ -55,9 +58,6 @@ struct Args {
/// Keep ambiguous reads during SNP filtering, i.e. keep them coded as Ns
#[clap(long, action)]
keep_ns: bool,
/// Keep loci if any population meets min_coverage, verses only keeping if ALL loci meet coverage (default)
#[clap(long, action)]
keep_if_any_meets_coverage: bool,
/// Remove monoallelic loci (each loci must have coverage of at least 2 alleles)
#[clap(long, action)]
remove_monoallelic: bool,
Expand Down Expand Up @@ -196,10 +196,10 @@ fn main() {
let filter_stats = base::FilterStats {
remove_ns: !args.keep_ns,
remove_monoallelic: args.remove_monoallelic,
keep_if_any_meets_coverage: args.keep_if_any_meets_coverage,
keep_lowercase_reference: args.keep_lowercase_reference,
max_base_error_rate: args.max_base_error_rate,
min_coverage: args.min_coverage,
min_coverage_breadth: args.min_coverage_breadth,
min_coverage_depth: args.min_coverage_depth,
min_allele_frequency: args.min_allele_frequency,
max_missingness_rate: args.max_missingness_rate,
pool_sizes: phen.pool_sizes.clone(),
Expand Down
5 changes: 4 additions & 1 deletion src/popgen/gudmc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -484,8 +484,11 @@ mod tests {
let file_sync_phen = *(file_sync, file_phen).lparse().unwrap();
let filter_stats = FilterStats {
remove_ns: false,
remove_monoallelic: false,
keep_lowercase_reference: false,
max_base_error_rate: 0.01,
min_coverage: 1,
min_coverage_depth: 1,
min_coverage_breadth: 1.0,
min_allele_frequency: 0.001,
max_missingness_rate: 0.0,
pool_sizes: vec![42.0, 42.0, 42.0, 42.0, 42.0],
Expand Down
5 changes: 4 additions & 1 deletion src/tables/chisq_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,11 @@ mod tests {
};
let filter_stats = FilterStats {
remove_ns: true,
remove_monoallelic: false,
keep_lowercase_reference: false,
max_base_error_rate: 0.01,
min_coverage: 1,
min_coverage_depth: 1,
min_coverage_breadth: 1.0,
min_allele_frequency: 0.005,
max_missingness_rate: 0.0,
pool_sizes: vec![0.2, 0.2, 0.2, 0.2],
Expand Down
5 changes: 4 additions & 1 deletion src/tables/fisher_exact_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -148,8 +148,11 @@ mod tests {
Array2::from_shape_vec((3, 2), vec![0, 3, 1, 5, 2, 6]).unwrap();
let filter_stats = FilterStats {
remove_ns: true,
remove_monoallelic: false,
keep_lowercase_reference: false,
max_base_error_rate: 0.005,
min_coverage: 1,
min_coverage_depth: 1,
min_coverage_breadth: 1.0,
min_allele_frequency: 0.005,
max_missingness_rate: 0.0,
pool_sizes: vec![0.2, 0.2, 0.2],
Expand Down

0 comments on commit aed3abc

Please sign in to comment.