From 2338ac0a10ec2b0437aff6068d43339a75fc68f0 Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Wed, 12 Jun 2024 19:31:29 -0400 Subject: [PATCH 01/28] work on tokenizer config --- gtars/Cargo.toml | 5 +- gtars/docs/universes.svg | 905 +++++++++++++++++++++++++ gtars/src/tokenizers/config.rs | 7 +- gtars/src/tokenizers/mod.rs | 4 +- gtars/src/tokenizers/tree_tokenizer.rs | 137 +++- gtars/tests/data/chroms.bed | 24 + gtars/tests/data/tokenizer.toml | 6 + gtars/tests/data/tokenizer.yaml | 10 - 8 files changed, 1056 insertions(+), 42 deletions(-) create mode 100644 gtars/docs/universes.svg create mode 100644 gtars/tests/data/chroms.bed create mode 100644 gtars/tests/data/tokenizer.toml delete mode 100644 gtars/tests/data/tokenizer.yaml diff --git a/gtars/Cargo.toml b/gtars/Cargo.toml index 5fa382a..b2cfef8 100644 --- a/gtars/Cargo.toml +++ b/gtars/Cargo.toml @@ -4,6 +4,7 @@ version = "0.0.14" edition = "2021" description = "Performance-critical tools to manipulate, analyze, and process genomic interval data. Primarily focused on building tools for geniml - our genomic machine learning python package." license = "MIT" +readme = "../README.md" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html @@ -13,8 +14,8 @@ bytes = "1.6.0" clap = { version = "4.4.7", features = ["derive"] } flate2 = "1.0.28" rust-lapper = "1.1.0" -serde = { version = "^1.0", features = ["derive"] } -serde_yaml = "^0.9" +serde = {version = "1.0.203", features=["derive"]} +toml = "0.8.14" # polars = { version = "0.35.4", features = ["decompress", "decompress-fast", "ndarray"] } diff --git a/gtars/docs/universes.svg b/gtars/docs/universes.svg new file mode 100644 index 0000000..8d16ba7 --- /dev/null +++ b/gtars/docs/universes.svg @@ -0,0 +1,905 @@ + + + +...U1U2SEU1U2SE++0VUNprimary(required)secondary(not required)specialtokensexcluderangesQueryU1U2Tokens?n-tiary(not required)U1U2S[42, 101, 256, 999]&ADBC diff --git a/gtars/src/tokenizers/config.rs b/gtars/src/tokenizers/config.rs index a89191d..5f2b54a 100644 --- a/gtars/src/tokenizers/config.rs +++ b/gtars/src/tokenizers/config.rs @@ -1,7 +1,8 @@ use serde::{Deserialize, Serialize}; -#[derive(Deserialize, Serialize, Debug, PartialEq )] +#[derive(Deserialize, Serialize, Debug, PartialEq)] pub struct TokenizerConfig { pub universe: String, - pub excluderanges: Option, -} \ No newline at end of file + pub hierarchical_universes: Option>, + pub exclude_ranges: Option, +} diff --git a/gtars/src/tokenizers/mod.rs b/gtars/src/tokenizers/mod.rs index 8b981c7..4e865ad 100644 --- a/gtars/src/tokenizers/mod.rs +++ b/gtars/src/tokenizers/mod.rs @@ -2,12 +2,12 @@ //! //! There is currently only one tokenizer - the `TreeTokenizer` pub mod cli; +pub mod config; pub mod fragment_tokenizer; pub mod soft_tokenizer; pub mod special_tokens; pub mod traits; pub mod tree_tokenizer; -pub mod config; /// constants for the tokenizer module. 
pub mod consts { @@ -17,7 +17,7 @@ pub mod consts { } // expose the TreeTokenizer struct to users of this crate +pub use config::TokenizerConfig; pub use fragment_tokenizer::FragmentTokenizer; pub use traits::{SingleCellTokenizer, Tokenizer}; pub use tree_tokenizer::TreeTokenizer; -pub use config::TokenizerConfig; \ No newline at end of file diff --git a/gtars/src/tokenizers/tree_tokenizer.rs b/gtars/src/tokenizers/tree_tokenizer.rs index 8fc922c..73220db 100644 --- a/gtars/src/tokenizers/tree_tokenizer.rs +++ b/gtars/src/tokenizers/tree_tokenizer.rs @@ -1,4 +1,5 @@ use std::collections::HashMap; +use std::fs::read_to_string; use std::path::Path; use anyhow::Result; @@ -7,11 +8,14 @@ use rust_lapper::{Interval, Lapper}; use crate::common::consts::special_tokens::*; use crate::common::models::{Region, RegionSet, TokenizedRegionSet, Universe}; use crate::common::utils::extract_regions_from_bed_file; +use crate::tokenizers::config::TokenizerConfig; use crate::tokenizers::traits::{Pad, SpecialTokens, Tokenizer}; pub struct TreeTokenizer { pub universe: Universe, tree: HashMap>, + secondary_trees: Option>>>, + exclude_ranges: Option>>>>, } impl TryFrom<&Path> for TreeTokenizer { @@ -23,7 +27,100 @@ impl TryFrom<&Path> for TreeTokenizer { /// # Returns /// A new TreeTokenizer fn try_from(value: &Path) -> Result { - let mut universe = Universe::try_from(value)?; + // read in yaml from file + let yaml_str = read_to_string(value)?; + let config: TokenizerConfig = toml::from_str(&yaml_str)?; + + // create initial universe from the *required* universe field + let mut universe = Universe::try_from(Path::new(&config.universe))?; + + let mut tree: HashMap> = HashMap::new(); + let mut intervals: HashMap>> = HashMap::new(); + + for region in universe.regions.iter() { + // create interval + let interval = Interval { + start: region.start, + stop: region.end, + val: universe.convert_region_to_id(region).unwrap(), + }; + + // use chr to get the vector of intervals + let chr_intervals = intervals.entry(region.chr.to_owned()).or_default(); + + // push interval to vector + chr_intervals.push(interval); + } + + for (chr, chr_intervals) in intervals.iter() { + let lapper: Lapper = Lapper::new(chr_intervals.to_owned()); + tree.insert(chr.to_string(), lapper); + } + + // create secondary trees if they exist + let mut secondary_trees: Vec>> = Vec::new(); + if let Some(hierarchical_universes) = config.hierarchical_universes { + for universe_path in hierarchical_universes { + let tree: HashMap> = HashMap::new(); + + // extract regions from the bed file + let path_to_bed = Path::new(&universe_path); + let regions = extract_regions_from_bed_file(path_to_bed)?; + + // insert these into the universe so they can get id's assigned + for region in regions { + universe.insert_token(®ion); + + // create interval + let interval = Interval { + start: region.start, + stop: region.end, + val: universe.convert_region_to_id(®ion).unwrap(), + }; + + // use chr to get the vector of intervals + let chr_intervals = intervals.entry(region.chr.to_owned()).or_default(); + + // push interval to vector + chr_intervals.push(interval); + } + + let mut s_tree: HashMap> = HashMap::new(); + for (chr, chr_intervals) in intervals.iter() { + let lapper: Lapper = Lapper::new(chr_intervals.to_owned()); + s_tree.insert(chr.to_string(), lapper); + } + + secondary_trees.push(tree); + } + }; + + // create exclude ranges if they exist (no need to increment universe, since these are completely ignored) + let mut exclude_ranges: Vec>>> = Vec::new(); + + if 
let Some(exclude_ranges_path) = config.exclude_ranges { + let path_to_bed = Path::new(&exclude_ranges_path); + let regions = extract_regions_from_bed_file(path_to_bed)?; + + let mut exclude_intervals: HashMap>> = HashMap::new(); + + for region in regions { + // create interval + let interval = Interval { + start: region.start, + stop: region.end, + val: 0, + }; + + // use chr to get the vector of intervals + let chr_intervals = exclude_intervals.entry(region.chr.to_owned()).or_default(); + + // push interval to vector + chr_intervals.push(interval); + } + + exclude_ranges.push(exclude_intervals); + } // add special tokens to the universe // unk @@ -75,30 +172,20 @@ impl TryFrom<&Path> for TreeTokenizer { end: SEP_END as u32, }); - let mut tree: HashMap> = HashMap::new(); - let mut intervals: HashMap>> = HashMap::new(); - - for region in universe.regions.iter() { - // create interval - let interval = Interval { - start: region.start, - stop: region.end, - val: universe.convert_region_to_id(region).unwrap(), - }; - - // use chr to get the vector of intervals - let chr_intervals = intervals.entry(region.chr.to_owned()).or_default(); - - // push interval to vector - chr_intervals.push(interval); - } - - for (chr, chr_intervals) in intervals.iter() { - let lapper: Lapper = Lapper::new(chr_intervals.to_owned()); - tree.insert(chr.to_string(), lapper); - } - - Ok(TreeTokenizer { universe, tree }) + Ok(TreeTokenizer { + universe, + tree, + secondary_trees: if !secondary_trees.is_empty() { + Some(secondary_trees) + } else { + None + }, + exclude_ranges: if !exclude_ranges.is_empty() { + Some(exclude_ranges) + } else { + None + }, + }) } } diff --git a/gtars/tests/data/chroms.bed b/gtars/tests/data/chroms.bed new file mode 100644 index 0000000..e88deb7 --- /dev/null +++ b/gtars/tests/data/chroms.bed @@ -0,0 +1,24 @@ +ch1 0 248956422 +ch2 0 242193529 +ch3 0 198295559 +ch4 0 190214555 +ch5 0 181538259 +ch6 0 170805979 +ch7 0 159345973 +ch8 0 145138636 +ch9 0 138394717 +ch10 0 133797422 +ch11 0 135086622 +ch12 0 133275309 +ch13 0 114364328 +ch14 0 107043718 +ch15 0 101991189 +ch16 0 90338345 +ch17 0 83257441 +ch18 0 80373285 +ch19 0 58617616 +ch20 0 64444167 +ch21 0 46709983 +ch22 0 50818468 +chX 0 156040895 +chY 0 57227415 \ No newline at end of file diff --git a/gtars/tests/data/tokenizer.toml b/gtars/tests/data/tokenizer.toml new file mode 100644 index 0000000..9573b0b --- /dev/null +++ b/gtars/tests/data/tokenizer.toml @@ -0,0 +1,6 @@ +universe = "peaks.bed.gz" + +[[hierarchical_universe]] +file = "chroms.bed" + +exclude_ranges = "excluderanges.bed.gz" \ No newline at end of file diff --git a/gtars/tests/data/tokenizer.yaml b/gtars/tests/data/tokenizer.yaml deleted file mode 100644 index 8c7c288..0000000 --- a/gtars/tests/data/tokenizer.yaml +++ /dev/null @@ -1,10 +0,0 @@ -universe: peaks.bed.gz -exclude_ranges: excluderanges.bed.gz -special_tokens: - unk: 0 - pad: 1 - mask: 2 - eos: 3 - bos: 4 - cls: 5 - sep: 6 \ No newline at end of file From 47a33ae6cd23c7240f965a1d803bd48085907e48 Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Thu, 13 Jun 2024 12:50:55 -0400 Subject: [PATCH 02/28] update tests --- gtars/Cargo.toml | 3 +- gtars/src/common/mod.rs | 122 ++++++++++++++ gtars/src/common/utils.rs | 33 ++++ gtars/src/tokenizers/mod.rs | 55 ++++++ gtars/src/tokenizers/tree_tokenizer.rs | 152 +++++++---------- gtars/tests/README.md | 1 + gtars/tests/data/chroms.bed | 48 +++--- gtars/tests/data/tokenizer.toml | 7 +- gtars/tests/test.rs | 221 ------------------------- 9 files changed, 300 
insertions(+), 342 deletions(-) create mode 100644 gtars/tests/README.md delete mode 100644 gtars/tests/test.rs diff --git a/gtars/Cargo.toml b/gtars/Cargo.toml index b2cfef8..0dfe4cc 100644 --- a/gtars/Cargo.toml +++ b/gtars/Cargo.toml @@ -18,8 +18,7 @@ serde = {version = "1.0.203", features=["derive"]} toml = "0.8.14" # polars = { version = "0.35.4", features = ["decompress", "decompress-fast", "ndarray"] } - - [dev-dependencies] rstest = "0.18.2" tempfile = "3.8.1" +pretty_assertions = "1.4.0" \ No newline at end of file diff --git a/gtars/src/common/mod.rs b/gtars/src/common/mod.rs index f61f1d9..09fe2e0 100644 --- a/gtars/src/common/mod.rs +++ b/gtars/src/common/mod.rs @@ -1,3 +1,125 @@ pub mod consts; pub mod models; pub mod utils; + +#[cfg(test)] +mod tests { + use pretty_assertions::assert_eq; + use rstest::*; + use tempfile::NamedTempFile; + + use super::models::{Region, RegionSet}; + use super::utils::extract_regions_from_bed_file; + use std::io::Read; + use std::path::Path; + + #[fixture] + fn path_to_data() -> &'static str { + "tests/data" + } + + #[fixture] + fn path_to_bed_file() -> &'static str { + "tests/data/peaks.bed" + } + + #[fixture] + fn path_to_bed_file_gzipped() -> &'static str { + "tests/data/peaks.bed.gz" + } + + #[fixture] + fn path_to_anndata_file() -> &'static str { + "tests/data/pbmc_hg38.h5ad" + } + + #[fixture] + fn path_to_r2v_repo() -> &'static str { + "databio/r2v-luecken2021-hg38-v2" + } + + #[fixture] + fn bb_bed_id() -> &'static str { + "fa09672b962809b408b356728d81640e" + } + + #[fixture] + fn path_to_gtok_file() -> &'static str { + "tests/data/out/tokens.gtok" + } + + #[rstest] + fn test_region() { + let region = Region { + chr: "chr1".to_string(), + start: 100, + end: 200, + }; + + assert_eq!(region.chr, "chr1"); + assert_eq!(region.start, 100); + assert_eq!(region.end, 200); + } + + #[rstest] + fn test_extract_regions_from_bed_file(path_to_bed_file: &str) { + let path = Path::new(path_to_bed_file); + let regions = extract_regions_from_bed_file(path); + assert!(regions.is_ok(), "Failed to extract regions from BED file"); + let regions = regions.unwrap(); + assert!(regions.len() == 25); + } + + #[rstest] + fn test_extract_regions_from_bed_file_gzipped(path_to_bed_file_gzipped: &str) { + let path = Path::new(path_to_bed_file_gzipped); + let regions = extract_regions_from_bed_file(path); + assert!(regions.is_ok(), "Failed to extract regions from BED file"); + let regions = regions.unwrap(); + assert_eq!(regions.len(), 25); + } + + #[rstest] + fn test_region_set_from_bed(path_to_bed_file: &str) { + let path = Path::new(path_to_bed_file); + let rs = RegionSet::try_from(path).unwrap(); + + assert_eq!(rs.len(), 25); + } + + #[rstest] + fn test_region_set_from_bytes(path_to_bed_file: &str) { + let path = Path::new(path_to_bed_file); + let rs = RegionSet::try_from(path).unwrap(); + + let mut bytes: Vec = Vec::new(); + + std::fs::File::open(path) + .unwrap() + .read_to_end(&mut bytes) + .unwrap(); + + let rs2 = RegionSet::from(bytes.as_slice()); + + assert_eq!(rs2.len(), rs.len()); + } + + #[rstest] + fn test_region_set_to_bed(path_to_bed_file: &str) { + let path = Path::new(path_to_bed_file); + let rs = RegionSet::try_from(path).unwrap(); + + // create a temporary file + let tmp_file = NamedTempFile::new().unwrap(); + let tmp_path = tmp_file.into_temp_path(); + let tmp_path = Path::new(tmp_path.to_str().unwrap()); + + // write the region set to the temporary file + rs.to_bed(tmp_path).unwrap(); + + // read the temporary file back in as a region set + let rs2 
= RegionSet::try_from(tmp_path).unwrap(); + + assert_eq!(rs2.len(), 25); + } +} diff --git a/gtars/src/common/utils.rs b/gtars/src/common/utils.rs index 8c28c35..8d2df25 100644 --- a/gtars/src/common/utils.rs +++ b/gtars/src/common/utils.rs @@ -7,8 +7,10 @@ use std::path::Path; use anyhow::{Context, Result}; use flate2::read::GzDecoder; +use rust_lapper::{Interval, Lapper}; use crate::common::models::region::Region; +use crate::common::models::universe::Universe; pub fn get_dynamic_reader(path: &Path) -> Result>> { let is_gzipped = path.extension() == Some(OsStr::new("gz")); @@ -85,3 +87,34 @@ pub fn extract_regions_from_bed_file(path: &Path) -> Result> { Ok(regions) } + +pub fn create_interval_tree_from_universe( + universe: &Universe, +) -> HashMap> { + // instantiate the tree and list of intervals + let mut tree: HashMap> = HashMap::new(); + let mut intervals: HashMap>> = HashMap::new(); + + for region in universe.regions.iter() { + // create interval + let interval = Interval { + start: region.start, + stop: region.end, + val: universe.convert_region_to_id(region).unwrap(), + }; + + // use chr to get the vector of intervals + let chr_intervals = intervals.entry(region.chr.clone()).or_default(); + + // push interval to vector + chr_intervals.push(interval); + } + + // build the tree + for (chr, chr_intervals) in intervals.iter() { + let lapper: Lapper = Lapper::new(chr_intervals.to_owned()); + tree.insert(chr.to_string(), lapper); + } + + tree +} diff --git a/gtars/src/tokenizers/mod.rs b/gtars/src/tokenizers/mod.rs index 4e865ad..59be33f 100644 --- a/gtars/src/tokenizers/mod.rs +++ b/gtars/src/tokenizers/mod.rs @@ -21,3 +21,58 @@ pub use config::TokenizerConfig; pub use fragment_tokenizer::FragmentTokenizer; pub use traits::{SingleCellTokenizer, Tokenizer}; pub use tree_tokenizer::TreeTokenizer; + +#[cfg(test)] +mod tests { + + use crate::common::models::RegionSet; + use std::path::Path; + + use super::*; + use pretty_assertions::assert_eq; + use rstest::*; + + #[fixture] + fn path_to_bed_file() -> &'static str { + "tests/data/peaks.bed" + } + + #[fixture] + fn path_to_config_file() -> &'static str { + "tests/data/tokenizer.toml" + } + + #[fixture] + fn path_to_tokenize_bed_file() -> &'static str { + "tests/data/to_tokenize.bed" + } + + #[rstest] + fn test_create_tokenizer_from_bed(path_to_bed_file: &str) { + let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap(); + assert_eq!(tokenizer.vocab_size(), 32); // 25 regions + 7 special tokens + } + + #[rstest] + fn test_create_tokenizer_from_config(path_to_config_file: &str) { + let tokenizer = TreeTokenizer::try_from(Path::new(path_to_config_file)).unwrap(); + assert_eq!(tokenizer.vocab_size(), 56); // 25 regions in main universe + 24 in hierarchical + 7 special tokens + } + + #[rstest] + fn test_tokenize_bed_file(path_to_bed_file: &str, path_to_tokenize_bed_file: &str) { + let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap(); + let rs = RegionSet::try_from(Path::new(path_to_tokenize_bed_file)).unwrap(); + let tokenized_regions = tokenizer.tokenize_region_set(&rs); + + println!("{}", tokenized_regions.len()); + assert_eq!(tokenized_regions.len(), 4); + + // last should be the unknown token + let unknown_token = tokenizer + .universe + .convert_id_to_region(tokenized_regions[3]) + .unwrap(); + assert!(unknown_token.chr == "chrUNK"); + } +} diff --git a/gtars/src/tokenizers/tree_tokenizer.rs b/gtars/src/tokenizers/tree_tokenizer.rs index 73220db..3a8b49f 100644 --- 
a/gtars/src/tokenizers/tree_tokenizer.rs +++ b/gtars/src/tokenizers/tree_tokenizer.rs @@ -3,11 +3,11 @@ use std::fs::read_to_string; use std::path::Path; use anyhow::Result; -use rust_lapper::{Interval, Lapper}; +use rust_lapper::Lapper; use crate::common::consts::special_tokens::*; use crate::common::models::{Region, RegionSet, TokenizedRegionSet, Universe}; -use crate::common::utils::extract_regions_from_bed_file; +use crate::common::utils::{create_interval_tree_from_universe, extract_regions_from_bed_file}; use crate::tokenizers::config::TokenizerConfig; use crate::tokenizers::traits::{Pad, SpecialTokens, Tokenizer}; @@ -15,112 +15,92 @@ pub struct TreeTokenizer { pub universe: Universe, tree: HashMap>, secondary_trees: Option>>>, - exclude_ranges: Option>>>>, + exclude_ranges: Option>>, } impl TryFrom<&Path> for TreeTokenizer { type Error = anyhow::Error; /// /// # Arguments - /// - `value` - the path to the bed file + /// - `value` - the path to the tokenizer config file (a TOML) or bed file /// /// # Returns /// A new TreeTokenizer fn try_from(value: &Path) -> Result { - // read in yaml from file - let yaml_str = read_to_string(value)?; - let config: TokenizerConfig = toml::from_str(&yaml_str)?; + // detect file type... if ends in toml assume toml otherwise assume its a bed file + // and just build the universe + tree from that and move on. + // + // This maintains backwards compatibility with the old way of creating tokenizers from bed files + // and allows for the new way of creating tokenizers from toml files + let file_extension = value.extension().unwrap().to_str().unwrap(); - // create initial universe from the *required* universe field - let mut universe = Universe::try_from(Path::new(&config.universe))?; + let (mut universe, tree, secondary_trees, exclude_ranges) = match file_extension { + // parse config file + "toml" => { + let toml_str = read_to_string(value)?; + let config: TokenizerConfig = toml::from_str(&toml_str)?; - let mut tree: HashMap> = HashMap::new(); - let mut intervals: HashMap>> = HashMap::new(); + // universe path is relative to the config file + let universe_path = value.parent().unwrap().join(&config.universe); - for region in universe.regions.iter() { - // create interval - let interval = Interval { - start: region.start, - stop: region.end, - val: universe.convert_region_to_id(region).unwrap(), - }; + // create initial universe from the *required* universe field + let mut universe = Universe::try_from(Path::new(&universe_path))?; - // use chr to get the vector of intervals - let chr_intervals = intervals.entry(region.chr.to_owned()).or_default(); + let tree = create_interval_tree_from_universe(&universe); - // push interval to vector - chr_intervals.push(interval); - } + // create secondary trees if they exist + let secondary_trees = match config.hierarchical_universes { + Some(hierarchical_universes) => { + let mut secondary_trees = Vec::new(); + for hierarchical_universe in hierarchical_universes { + let hierarchical_universe_path = + value.parent().unwrap().join(&hierarchical_universe); - for (chr, chr_intervals) in intervals.iter() { - let lapper: Lapper = Lapper::new(chr_intervals.to_owned()); - tree.insert(chr.to_string(), lapper); - } + let hierarchical_universe_regions = + extract_regions_from_bed_file(&hierarchical_universe_path)?; - // create secondary trees if they exist - let mut secondary_trees: Vec>> = Vec::new(); - if let Some(hierarchical_universes) = config.hierarchical_universes { - for universe_path in hierarchical_universes { - let 
tree: HashMap> = HashMap::new(); - - // extract regions from the bed file - let path_to_bed = Path::new(&universe_path); - let regions = extract_regions_from_bed_file(path_to_bed)?; - - // insert these into the universe so they can get id's assigned - for region in regions { - universe.insert_token(®ion); - - // create interval - let interval = Interval { - start: region.start, - stop: region.end, - val: universe.convert_region_to_id(®ion).unwrap(), - }; + for region in hierarchical_universe_regions { + universe.insert_token(®ion); + } - // use chr to get the vector of intervals - let chr_intervals = intervals.entry(region.chr.to_owned()).or_default(); + let hierarchical_tree = create_interval_tree_from_universe(&universe); - // push interval to vector - chr_intervals.push(interval); - } + secondary_trees.push(hierarchical_tree); + } - let mut s_tree: HashMap> = HashMap::new(); - for (chr, chr_intervals) in intervals.iter() { - let lapper: Lapper = Lapper::new(chr_intervals.to_owned()); - s_tree.insert(chr.to_string(), lapper); - } + Some(secondary_trees) + } + None => None, + }; - secondary_trees.push(tree); - } - }; + // create exclude ranges if they exist + let exclude_ranges = match config.exclude_ranges { + Some(exclude_ranges) => { + let exclude_ranges_path = value.parent().unwrap().join(exclude_ranges); - // create exclude ranges if they exist (no need to increment universe, since these are completely ignored) - let mut exclude_ranges: Vec>>> = Vec::new(); + // universe gets discarded since its not conasidered a part of the tokenizers universe + let exclude_ranges_universe = + Universe::try_from(exclude_ranges_path.as_path())?; - if let Some(exclude_ranges_path) = config.exclude_ranges { - let path_to_bed = Path::new(&exclude_ranges_path); - let regions = extract_regions_from_bed_file(path_to_bed)?; + let exclude_ranges_map = + create_interval_tree_from_universe(&exclude_ranges_universe); - let mut exclude_intervals: HashMap>> = HashMap::new(); + Some(exclude_ranges_map) + } - for region in regions { - // create interval - let interval = Interval { - start: region.start, - stop: region.end, - val: 0, + None => None, }; - // use chr to get the vector of intervals - let chr_intervals = exclude_intervals.entry(region.chr.to_owned()).or_default(); - - // push interval to vector - chr_intervals.push(interval); + (universe, tree, secondary_trees, exclude_ranges) } - - exclude_ranges.push(exclude_intervals); - } + // else assume its a bed file + _ => { + let regions = extract_regions_from_bed_file(value)?; + let universe = Universe::from(regions); + let tree = create_interval_tree_from_universe(&universe); + (universe, tree, None, None) + } + }; // add special tokens to the universe // unk @@ -175,16 +155,8 @@ impl TryFrom<&Path> for TreeTokenizer { Ok(TreeTokenizer { universe, tree, - secondary_trees: if !secondary_trees.is_empty() { - Some(secondary_trees) - } else { - None - }, - exclude_ranges: if !exclude_ranges.is_empty() { - Some(exclude_ranges) - } else { - None - }, + secondary_trees, + exclude_ranges, }) } } diff --git a/gtars/tests/README.md b/gtars/tests/README.md new file mode 100644 index 0000000..2e713ec --- /dev/null +++ b/gtars/tests/README.md @@ -0,0 +1 @@ +To run tests, run `cargo test` in the root directory of the project. 
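Beyond running the test suite, the following is a minimal sketch of how the config-driven TreeTokenizer introduced above can be exercised from a downstream crate. It mirrors the fixtures under gtars/tests/data and the usage shown in the crate's tests; the main() wrapper and the relative paths are illustrative assumptions, not part of this patch series.

use std::path::Path;

use gtars::common::models::RegionSet;
use gtars::tokenizers::{Tokenizer, TreeTokenizer};

fn main() {
    // A TOML config (like tests/data/tokenizer.toml) or a plain BED file both
    // work here; the constructor dispatches on the file extension.
    let tokenizer = TreeTokenizer::try_from(Path::new("tests/data/tokenizer.toml")).unwrap();

    // Tokenize a BED file of query regions into universe ids.
    let rs = RegionSet::try_from(Path::new("tests/data/to_tokenize.bed")).unwrap();
    let tokenized = tokenizer.tokenize_region_set(&rs);

    // Map an id back to its universe region, as the tests do.
    let first = tokenizer.universe.convert_id_to_region(tokenized[0]).unwrap();
    println!(
        "{} tokens; first token maps to {}:{}-{}",
        tokenized.len(),
        first.chr,
        first.start,
        first.end
    );
}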
diff --git a/gtars/tests/data/chroms.bed b/gtars/tests/data/chroms.bed index e88deb7..b6d08bd 100644 --- a/gtars/tests/data/chroms.bed +++ b/gtars/tests/data/chroms.bed @@ -1,24 +1,24 @@ -ch1 0 248956422 -ch2 0 242193529 -ch3 0 198295559 -ch4 0 190214555 -ch5 0 181538259 -ch6 0 170805979 -ch7 0 159345973 -ch8 0 145138636 -ch9 0 138394717 -ch10 0 133797422 -ch11 0 135086622 -ch12 0 133275309 -ch13 0 114364328 -ch14 0 107043718 -ch15 0 101991189 -ch16 0 90338345 -ch17 0 83257441 -ch18 0 80373285 -ch19 0 58617616 -ch20 0 64444167 -ch21 0 46709983 -ch22 0 50818468 -chX 0 156040895 -chY 0 57227415 \ No newline at end of file +ch1 0 248956422 +ch2 0 242193529 +ch3 0 198295559 +ch4 0 190214555 +ch5 0 181538259 +ch6 0 170805979 +ch7 0 159345973 +ch8 0 145138636 +ch9 0 138394717 +ch10 0 133797422 +ch11 0 135086622 +ch12 0 133275309 +ch13 0 114364328 +ch14 0 107043718 +ch15 0 101991189 +ch16 0 90338345 +ch17 0 83257441 +ch18 0 80373285 +ch19 0 58617616 +ch20 0 64444167 +ch21 0 46709983 +ch22 0 50818468 +chX 0 156040895 +chY 0 57227415 \ No newline at end of file diff --git a/gtars/tests/data/tokenizer.toml b/gtars/tests/data/tokenizer.toml index 9573b0b..eb969b6 100644 --- a/gtars/tests/data/tokenizer.toml +++ b/gtars/tests/data/tokenizer.toml @@ -1,6 +1,3 @@ universe = "peaks.bed.gz" - -[[hierarchical_universe]] -file = "chroms.bed" - -exclude_ranges = "excluderanges.bed.gz" \ No newline at end of file +exclude_ranges = "excluderanges.bed.gz" +hierarchical_universes = ["chroms.bed"] \ No newline at end of file diff --git a/gtars/tests/test.rs b/gtars/tests/test.rs deleted file mode 100644 index 5e01ad5..0000000 --- a/gtars/tests/test.rs +++ /dev/null @@ -1,221 +0,0 @@ -use std::path::Path; - -use rstest::*; -use tempfile::NamedTempFile; - -use gtars::common::models::{Region, RegionSet}; -use gtars::io::{append_tokens_to_gtok_file, init_gtok_file, read_tokens_from_gtok}; -use gtars::tokenizers::{Tokenizer, TreeTokenizer}; - -#[fixture] -fn path_to_data() -> &'static str { - "tests/data" -} - -#[fixture] -fn path_to_bed_file() -> &'static str { - "tests/data/peaks.bed" -} - -#[fixture] -fn path_to_bed_file_gzipped() -> &'static str { - "tests/data/peaks.bed.gz" -} - -#[fixture] -fn path_to_tokenize_bed_file() -> &'static str { - "tests/data/to_tokenize.bed" -} - -#[fixture] -fn path_to_anndata_file() -> &'static str { - "tests/data/pbmc_hg38.h5ad" -} - -#[fixture] -fn path_to_r2v_repo() -> &'static str { - "databio/r2v-luecken2021-hg38-v2" -} - -#[fixture] -fn bb_bed_id() -> &'static str { - "fa09672b962809b408b356728d81640e" -} - -#[fixture] -fn path_to_gtok_file() -> &'static str { - "tests/data/out/tokens.gtok" -} - -mod tests { - use std::io::Read; - - use gtars::common::utils::extract_regions_from_bed_file; - - use super::*; - - #[rstest] - fn test_region() { - let region = Region { - chr: "chr1".to_string(), - start: 100, - end: 200, - }; - - assert_eq!(region.chr, "chr1"); - assert_eq!(region.start, 100); - assert_eq!(region.end, 200); - } - - #[rstest] - fn test_extract_regions_from_bed_file(path_to_bed_file: &str) { - let path = Path::new(path_to_bed_file); - let regions = extract_regions_from_bed_file(path); - assert!(regions.is_ok(), "Failed to extract regions from BED file"); - let regions = regions.unwrap(); - assert!(regions.len() == 25); - } - - #[rstest] - fn test_extract_regions_from_bed_file_gzipped(path_to_bed_file_gzipped: &str) { - let path = Path::new(path_to_bed_file_gzipped); - let regions = extract_regions_from_bed_file(path); - assert!(regions.is_ok(), "Failed to extract 
regions from BED file"); - let regions = regions.unwrap(); - assert!(regions.len() == 25); - } - - #[rstest] - fn test_region_set_from_bed(path_to_bed_file: &str) { - let path = Path::new(path_to_bed_file); - let rs = RegionSet::try_from(path).unwrap(); - - assert!(rs.len() == 25); - } - - #[rstest] - fn test_region_set_from_bytes(path_to_bed_file: &str) { - let path = Path::new(path_to_bed_file); - let rs = RegionSet::try_from(path).unwrap(); - - let mut bytes: Vec = Vec::new(); - - std::fs::File::open(path) - .unwrap() - .read_to_end(&mut bytes) - .unwrap(); - - let rs2 = RegionSet::from(bytes.as_slice()); - - assert!(rs2.len() == rs.len()); - } - - #[rstest] - fn test_region_set_to_bed(path_to_bed_file: &str) { - let path = Path::new(path_to_bed_file); - let rs = RegionSet::try_from(path).unwrap(); - - // create a temporary file - let tmp_file = NamedTempFile::new().unwrap(); - let tmp_path = tmp_file.into_temp_path(); - let tmp_path = Path::new(tmp_path.to_str().unwrap()); - - // write the region set to the temporary file - rs.to_bed(tmp_path).unwrap(); - - // read the temporary file back in as a region set - let rs2 = RegionSet::try_from(tmp_path).unwrap(); - - assert!(rs2.len() == 25); - } - - #[rstest] - fn test_create_tokenizer(path_to_bed_file: &str) { - let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap(); - assert!(tokenizer.vocab_size() == 32); // 25 regions + 7 special tokens - } - - #[rstest] - fn test_create_anndata_tokenizer(path_to_bed_file: &str) { - let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap(); - assert!(tokenizer.vocab_size() == 116497); - } - - // #[rstest] - // fn test_create_tokenizer_from_bedbase(bb_bed_id: &str) { - // let tokenizer = TreeTokenizer::from_bedbase(bb_bed_id).unwrap(); - // assert!(tokenizer.vocab_size() == 25214); - // } - - #[rstest] - fn test_tokenize_bed_file(path_to_bed_file: &str, path_to_tokenize_bed_file: &str) { - let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap(); - let rs = RegionSet::try_from(Path::new(path_to_tokenize_bed_file)).unwrap(); - let tokenized_regions = tokenizer.tokenize_region_set(&rs); - - println!("{}", tokenized_regions.len()); - assert!(tokenized_regions.len() == 4); - - // last should be the unknown token - let unknown_token = tokenizer - .universe - .convert_id_to_region(tokenized_regions[3]) - .unwrap(); - assert!(unknown_token.chr == "chrUNK"); - } - - #[rstest] - fn test_init_gtok_file(path_to_gtok_file: &str) { - let res = init_gtok_file(path_to_gtok_file); - assert!(res.is_ok()); - - // check that the file was created - let path = Path::new(path_to_gtok_file); - assert!(path.exists()); - - // delete the file - std::fs::remove_file(path).expect("Failed to delete the gtok file."); - } - - #[rstest] - fn test_append_to_gtok(path_to_gtok_file: &str) { - let res = init_gtok_file(path_to_gtok_file); - assert!(res.is_ok()); - - let tokens = vec![1, 2, 3, 4, 5]; - let res = append_tokens_to_gtok_file(path_to_gtok_file, &tokens); - assert!(res.is_ok()); - - let tokens = read_tokens_from_gtok(path_to_gtok_file); - assert!(tokens.is_ok()); - let tokens = tokens.unwrap(); - assert!(tokens.len() == 5); - - // delete the file - let path = Path::new(path_to_gtok_file); - std::fs::remove_file(path).expect("Failed to delete the gtok file."); - } - - // - // Cant get these to run because the polars CsvReader isnt working for gzipped files right now. 
- // - // #[rstest] - // fn test_pretokenization_folder(path_to_data: &str, path_to_bed_file: &str) { - // let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap(); - // let path_to_data = Path::new(path_to_data); - // let outdir = "tests/data/out"; - - // let res = gtars::tools::pre_tokenize_data(path_to_data, outdir, &tokenizer); - // assert!(res.is_ok()); - // } - - // #[rstest] - // fn test_pretokenization_file(path_to_tokenize_bed_file: &str, path_to_bed_file: &str) { - // let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap(); - // let path_to_data = Path::new(path_to_tokenize_bed_file); - // let outdir = "tests/data/out"; - - // let res = gtars::tools::pre_tokenize_data(path_to_data, outdir, &tokenizer); - // assert!(res.is_ok()); - // } -} From 78ee6a0684fcc2f624d79802527f23f4009c1c03 Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Thu, 13 Jun 2024 14:54:57 -0400 Subject: [PATCH 03/28] work on hc universes --- gtars/src/tokenizers/fragment_tokenizer.rs | 4 +- gtars/src/tokenizers/mod.rs | 46 +++++++- gtars/src/tokenizers/tree_tokenizer.rs | 119 +++++++++++++++------ gtars/tests/data/chroms.bed | 48 ++++----- 4 files changed, 159 insertions(+), 58 deletions(-) diff --git a/gtars/src/tokenizers/fragment_tokenizer.rs b/gtars/src/tokenizers/fragment_tokenizer.rs index 5511cc9..e11780c 100644 --- a/gtars/src/tokenizers/fragment_tokenizer.rs +++ b/gtars/src/tokenizers/fragment_tokenizer.rs @@ -291,7 +291,7 @@ where // get actual tokens let tokens = self.tokenizer.tokenize_region(&r); - let barcode_tokens = barcode_ids_map.entry(barcode).or_insert(vec![]); + let barcode_tokens = barcode_ids_map.entry(barcode).or_default(); barcode_tokens.extend(tokens.ids); } @@ -338,7 +338,7 @@ where // get actual tokens let tokens = self.tokenizer.tokenize_region(&r); - let barcode_tokens = barcode_ids_map.entry(barcode).or_insert(vec![]); + let barcode_tokens = barcode_ids_map.entry(barcode).or_default(); barcode_tokens.extend(tokens.ids); } diff --git a/gtars/src/tokenizers/mod.rs b/gtars/src/tokenizers/mod.rs index 59be33f..079c720 100644 --- a/gtars/src/tokenizers/mod.rs +++ b/gtars/src/tokenizers/mod.rs @@ -25,7 +25,7 @@ pub use tree_tokenizer::TreeTokenizer; #[cfg(test)] mod tests { - use crate::common::models::RegionSet; + use crate::common::models::{RegionSet, Region}; use std::path::Path; use super::*; @@ -75,4 +75,48 @@ mod tests { .unwrap(); assert!(unknown_token.chr == "chrUNK"); } + + #[rstest] + fn test_hierarchical_universe_hit(path_to_config_file: &str) { + let tokenizer = TreeTokenizer::try_from(Path::new(path_to_config_file)).unwrap(); + let res = tokenizer.tokenize_region(&Region { + chr: "chr1".to_string(), + start: 100, + end: 200, + }); + assert_eq!(res.len(), 1); + + // check the id, it should be len(primary_universe) + 1 (since its chr1) + assert_eq!(res.ids, vec![25]); + + let res = res.into_region_vec(); + let region = &res[0]; + + assert_eq!(region.chr, "chr1"); + assert_eq!(region.start, 0); + assert_eq!(region.end, 248_956_422); + + } + + #[rstest] + fn test_hierarchical_universe_no_hit(path_to_config_file: &str) { + let tokenizer = TreeTokenizer::try_from(Path::new(path_to_config_file)).unwrap(); + let res = tokenizer.tokenize_region(&Region { + chr: "chrFOO".to_string(), + start: 100, + end: 200, + }); + assert_eq!(res.len(), 1); + + // check the id, it should be the id of the UNK token + assert_eq!(res.ids, vec![49]); + + let res = res.into_region_vec(); + let region = &res[0]; + + assert_eq!(region.chr, "chrUNK"); + 
assert_eq!(region.start, 0); + assert_eq!(region.end, 0); + + } } diff --git a/gtars/src/tokenizers/tree_tokenizer.rs b/gtars/src/tokenizers/tree_tokenizer.rs index 3a8b49f..63c0603 100644 --- a/gtars/src/tokenizers/tree_tokenizer.rs +++ b/gtars/src/tokenizers/tree_tokenizer.rs @@ -3,7 +3,7 @@ use std::fs::read_to_string; use std::path::Path; use anyhow::Result; -use rust_lapper::Lapper; +use rust_lapper::{Lapper, Interval}; use crate::common::consts::special_tokens::*; use crate::common::models::{Region, RegionSet, TokenizedRegionSet, Universe}; @@ -53,18 +53,36 @@ impl TryFrom<&Path> for TreeTokenizer { Some(hierarchical_universes) => { let mut secondary_trees = Vec::new(); for hierarchical_universe in hierarchical_universes { + + let mut hierarchical_tree: HashMap> = HashMap::new(); + let hierarchical_universe_path = value.parent().unwrap().join(&hierarchical_universe); let hierarchical_universe_regions = extract_regions_from_bed_file(&hierarchical_universe_path)?; + let mut intervals: HashMap>> = HashMap::new(); for region in hierarchical_universe_regions { universe.insert_token(®ion); + let interval = Interval { + start: region.start, + stop: region.end, + val: universe.convert_region_to_id(®ion).unwrap(), + }; + + intervals + .entry(region.chr.clone()) + .or_default() + .push(interval); } - let hierarchical_tree = create_interval_tree_from_universe(&universe); + for (chr, chr_intervals) in intervals.iter() { + let lapper: Lapper = Lapper::new(chr_intervals.to_owned()); + hierarchical_tree.insert(chr.to_string(), lapper); + } + secondary_trees.push(hierarchical_tree); } @@ -163,29 +181,85 @@ impl TryFrom<&Path> for TreeTokenizer { impl Tokenizer for TreeTokenizer { fn tokenize_region(&self, region: &Region) -> TokenizedRegionSet { + let lapper = self.tree.get(®ion.chr); + match lapper { Some(lapper) => { let intervals = lapper.find(region.start, region.end); - let ids: Vec = intervals.map(|interval| interval.val).collect(); + let mut ids: Vec = intervals.map(|interval| interval.val).collect(); + // tokenized to nothing... 
check secondary trees if ids.is_empty() { - let ids = vec![self.unknown_token_id()]; - return TokenizedRegionSet { - ids, - universe: &self.universe, - }; + // oh, we have no secondary trees, just return the unknown token + if self.secondary_trees.is_none() { + ids = vec![self.unknown_token_id()]; + // iterate over secondary trees and check if the region is in any of them + } else { + for s_tree in self.secondary_trees.as_ref().unwrap() { + // default to unknown token + ids = vec![self.unknown_token_id()]; + + let s_lapper = s_tree.get(®ion.chr); + if s_lapper.is_none() { + continue; + } + // get overlapped intervals -- map to regions + let intervals = s_lapper.unwrap().find(region.start, region.end); + let regions: Vec = intervals.map(|interval| interval.val).collect(); + + // a hit + if !regions.is_empty() { + ids = regions; + break; + } + } + } + } + + TokenizedRegionSet { + ids, + universe: &self.universe, + } + } + // primary universe didnt have that chromosome/contig/seqname + // so, check secondary trees + None => { + let mut ids = Vec::new(); + // oh, we have no secondary trees, just return the unknown token + if self.secondary_trees.is_none() { + ids = vec![self.unknown_token_id()]; + // iterate over secondary trees and check if the region is in any of them + } else { + for s_tree in self.secondary_trees.as_ref().unwrap() { + // default to unknown token + ids = vec![self.unknown_token_id()]; + + let s_lapper = s_tree.get(®ion.chr); + if s_lapper.is_none() { + continue; + } + + // get overlapped intervals -- map to regions + let intervals = s_lapper.unwrap().find(region.start, region.end); + let regions: Vec = intervals.map(|interval| interval.val).collect(); + + // a hit + if !regions.is_empty() { + ids = regions; + break; + } else { + ids = vec![self.unknown_token_id()]; + } + } } TokenizedRegionSet { ids, universe: &self.universe, } + } - None => TokenizedRegionSet { - ids: vec![self.unknown_token_id()], - universe: &self.universe, - }, } } @@ -193,25 +267,8 @@ impl Tokenizer for TreeTokenizer { let mut tokenized_regions: Vec = Vec::new(); for region in region_set { - let lapper = self.tree.get(®ion.chr); - - match lapper { - Some(tree) => { - let intervals = tree.find(region.start, region.end); - - let regions: Vec = intervals.map(|interval| interval.val).collect(); - - if regions.is_empty() { - tokenized_regions.push(self.unknown_token_id()); - continue; - } - - tokenized_regions.extend(regions); - } - None => { - tokenized_regions.push(self.unknown_token_id()); - } - } + let tokenized_region = self.tokenize_region(region); + tokenized_regions.extend(tokenized_region.ids); } TokenizedRegionSet { diff --git a/gtars/tests/data/chroms.bed b/gtars/tests/data/chroms.bed index b6d08bd..efdeff3 100644 --- a/gtars/tests/data/chroms.bed +++ b/gtars/tests/data/chroms.bed @@ -1,24 +1,24 @@ -ch1 0 248956422 -ch2 0 242193529 -ch3 0 198295559 -ch4 0 190214555 -ch5 0 181538259 -ch6 0 170805979 -ch7 0 159345973 -ch8 0 145138636 -ch9 0 138394717 -ch10 0 133797422 -ch11 0 135086622 -ch12 0 133275309 -ch13 0 114364328 -ch14 0 107043718 -ch15 0 101991189 -ch16 0 90338345 -ch17 0 83257441 -ch18 0 80373285 -ch19 0 58617616 -ch20 0 64444167 -ch21 0 46709983 -ch22 0 50818468 -chX 0 156040895 -chY 0 57227415 \ No newline at end of file +chr1 0 248956422 +chr2 0 242193529 +chr3 0 198295559 +chr4 0 190214555 +chr5 0 181538259 +chr6 0 170805979 +chr7 0 159345973 +chr8 0 145138636 +chr9 0 138394717 +chr10 0 133797422 +chr11 0 135086622 +chr12 0 133275309 +chr13 0 114364328 +chr14 0 107043718 +chr15 0 
101991189 +chr16 0 90338345 +chr17 0 83257441 +chr18 0 80373285 +chr19 0 58617616 +chr20 0 64444167 +chr21 0 46709983 +chr22 0 50818468 +chrX 0 156040895 +chrY 0 57227415 \ No newline at end of file From 6500c7754f61e482eb535088a4dcbf049a8a5ff2 Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Thu, 13 Jun 2024 15:00:48 -0400 Subject: [PATCH 04/28] add tests githu action --- .github/workflows/tests.yml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 .github/workflows/tests.yml diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..2d9cb00 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,22 @@ +name: Run tests + +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + +env: + CARGO_TERM_COLOR: always + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Build + run: cargo build --verbose + working-directory: ./gtars + - name: Run tests + run: cargo test --verbose + working-directory: ./gtars From 983075afa45dafed40b8c95d8e59547a6f83d77c Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Thu, 13 Jun 2024 15:30:21 -0400 Subject: [PATCH 05/28] work on README and codecov --- .github/workflows/codecov.yml | 29 ++++++++++++++++++++++++++ README.md | 3 +++ gtars/src/tokenizers/mod.rs | 4 +--- gtars/src/tokenizers/tree_tokenizer.rs | 26 +++++++++++------------ 4 files changed, 46 insertions(+), 16 deletions(-) create mode 100644 .github/workflows/codecov.yml diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml new file mode 100644 index 0000000..aad0f23 --- /dev/null +++ b/.github/workflows/codecov.yml @@ -0,0 +1,29 @@ +name: Compute coverage + +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + +env: + CARGO_TERM_COLOR: always + +jobs: + coverage: + runs-on: ubuntu-latest + env: + CARGO_TERM_COLOR: always + steps: + - uses: actions/checkout@v4 + - name: Install Rust + run: rustup update stable + - name: Install cargo-llvm-cov + uses: taiki-e/install-action@cargo-llvm-cov + - name: Generate code coverage + run: cargo llvm-cov --all-features --workspace --lcov --output-path lcov.info + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v3 + with: + files: lcov.info + fail_ci_if_error: true \ No newline at end of file diff --git a/README.md b/README.md index be138c6..1d0ace3 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,6 @@ +[![codecov](https://codecov.io/gh/databio/gtars/branch/master/graph/badge.svg)](https://codecov.io/gh/databio/gtars) +[![crates.io](https://img.shields.io/crates/v/gtars?&logo=rust)](https://crates.io/crates/gtars) +

gtars logo

diff --git a/gtars/src/tokenizers/mod.rs b/gtars/src/tokenizers/mod.rs index 079c720..3cb67ab 100644 --- a/gtars/src/tokenizers/mod.rs +++ b/gtars/src/tokenizers/mod.rs @@ -25,7 +25,7 @@ pub use tree_tokenizer::TreeTokenizer; #[cfg(test)] mod tests { - use crate::common::models::{RegionSet, Region}; + use crate::common::models::{Region, RegionSet}; use std::path::Path; use super::*; @@ -95,7 +95,6 @@ mod tests { assert_eq!(region.chr, "chr1"); assert_eq!(region.start, 0); assert_eq!(region.end, 248_956_422); - } #[rstest] @@ -117,6 +116,5 @@ mod tests { assert_eq!(region.chr, "chrUNK"); assert_eq!(region.start, 0); assert_eq!(region.end, 0); - } } diff --git a/gtars/src/tokenizers/tree_tokenizer.rs b/gtars/src/tokenizers/tree_tokenizer.rs index 63c0603..25a0d60 100644 --- a/gtars/src/tokenizers/tree_tokenizer.rs +++ b/gtars/src/tokenizers/tree_tokenizer.rs @@ -3,7 +3,7 @@ use std::fs::read_to_string; use std::path::Path; use anyhow::Result; -use rust_lapper::{Lapper, Interval}; +use rust_lapper::{Interval, Lapper}; use crate::common::consts::special_tokens::*; use crate::common::models::{Region, RegionSet, TokenizedRegionSet, Universe}; @@ -53,8 +53,8 @@ impl TryFrom<&Path> for TreeTokenizer { Some(hierarchical_universes) => { let mut secondary_trees = Vec::new(); for hierarchical_universe in hierarchical_universes { - - let mut hierarchical_tree: HashMap> = HashMap::new(); + let mut hierarchical_tree: HashMap> = + HashMap::new(); let hierarchical_universe_path = value.parent().unwrap().join(&hierarchical_universe); @@ -62,7 +62,8 @@ impl TryFrom<&Path> for TreeTokenizer { let hierarchical_universe_regions = extract_regions_from_bed_file(&hierarchical_universe_path)?; - let mut intervals: HashMap>> = HashMap::new(); + let mut intervals: HashMap>> = + HashMap::new(); for region in hierarchical_universe_regions { universe.insert_token(®ion); let interval = Interval { @@ -77,12 +78,12 @@ impl TryFrom<&Path> for TreeTokenizer { .push(interval); } - for (chr, chr_intervals) in intervals.iter() { - let lapper: Lapper = Lapper::new(chr_intervals.to_owned()); + let lapper: Lapper = + Lapper::new(chr_intervals.to_owned()); hierarchical_tree.insert(chr.to_string(), lapper); } - + secondary_trees.push(hierarchical_tree); } @@ -181,7 +182,6 @@ impl TryFrom<&Path> for TreeTokenizer { impl Tokenizer for TreeTokenizer { fn tokenize_region(&self, region: &Region) -> TokenizedRegionSet { - let lapper = self.tree.get(®ion.chr); match lapper { @@ -206,8 +206,9 @@ impl Tokenizer for TreeTokenizer { } // get overlapped intervals -- map to regions let intervals = s_lapper.unwrap().find(region.start, region.end); - let regions: Vec = intervals.map(|interval| interval.val).collect(); - + let regions: Vec = + intervals.map(|interval| interval.val).collect(); + // a hit if !regions.is_empty() { ids = regions; @@ -239,11 +240,11 @@ impl Tokenizer for TreeTokenizer { if s_lapper.is_none() { continue; } - + // get overlapped intervals -- map to regions let intervals = s_lapper.unwrap().find(region.start, region.end); let regions: Vec = intervals.map(|interval| interval.val).collect(); - + // a hit if !regions.is_empty() { ids = regions; @@ -258,7 +259,6 @@ impl Tokenizer for TreeTokenizer { ids, universe: &self.universe, } - } } } From 7f0b6545d6cb2f079c3dbfee80c887d3773abd78 Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Thu, 13 Jun 2024 15:31:52 -0400 Subject: [PATCH 06/28] switch working directory --- .github/workflows/codecov.yml | 4 +++- .github/workflows/tests.yml | 2 +- 2 files changed, 4 insertions(+), 2 
deletions(-) diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml index aad0f23..5c3367f 100644 --- a/.github/workflows/codecov.yml +++ b/.github/workflows/codecov.yml @@ -22,8 +22,10 @@ jobs: uses: taiki-e/install-action@cargo-llvm-cov - name: Generate code coverage run: cargo llvm-cov --all-features --workspace --lcov --output-path lcov.info + working-directory: ./gtars - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 with: files: lcov.info - fail_ci_if_error: true \ No newline at end of file + fail_ci_if_error: true + working-directory: ./gtars \ No newline at end of file diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 2d9cb00..29fbda2 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -10,7 +10,7 @@ env: CARGO_TERM_COLOR: always jobs: - build: + run: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 From b912538b4fe3702d8a0c8596c9e4078f7ec788ba Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Fri, 14 Jun 2024 15:27:48 -0400 Subject: [PATCH 07/28] update tokenizer config --- gtars/src/tokenizers/config.rs | 21 ++++++++++++++-- gtars/src/tokenizers/meta_tokenizer.rs | 10 ++++++++ gtars/src/tokenizers/mod.rs | 34 ++++++++++++++++++++++++++ gtars/src/tokenizers/tree_tokenizer.rs | 29 ++++++++++++++-------- gtars/tests/data/peaks.meta.bed | 25 +++++++++++++++++++ gtars/tests/data/tokenizer.toml | 5 ++-- gtars/tests/data/tokenizer_bad.toml | 3 +++ 7 files changed, 112 insertions(+), 15 deletions(-) create mode 100644 gtars/src/tokenizers/meta_tokenizer.rs create mode 100755 gtars/tests/data/peaks.meta.bed create mode 100644 gtars/tests/data/tokenizer_bad.toml diff --git a/gtars/src/tokenizers/config.rs b/gtars/src/tokenizers/config.rs index 5f2b54a..935605b 100644 --- a/gtars/src/tokenizers/config.rs +++ b/gtars/src/tokenizers/config.rs @@ -1,8 +1,25 @@ +use std::fs::read_to_string; +use std::path::Path; + +use anyhow::Result; use serde::{Deserialize, Serialize}; #[derive(Deserialize, Serialize, Debug, PartialEq)] pub struct TokenizerConfig { - pub universe: String, - pub hierarchical_universes: Option>, + pub universes: Vec, pub exclude_ranges: Option, } + +impl TokenizerConfig { + /// + /// Create a new tokenizer config. + /// + /// # Arguments + /// - path: Path to the config file (a .toml) file. + pub fn new(path: &Path) -> Result { + let toml_str = read_to_string(path)?; + let config: TokenizerConfig = toml::from_str(&toml_str)?; + + Ok(config) + } +} diff --git a/gtars/src/tokenizers/meta_tokenizer.rs b/gtars/src/tokenizers/meta_tokenizer.rs new file mode 100644 index 0000000..e0f3429 --- /dev/null +++ b/gtars/src/tokenizers/meta_tokenizer.rs @@ -0,0 +1,10 @@ +use std::collections::HashMap; + +use rust_lapper::{Lapper, Interval}; + +use crate::common::models::Universe; + +pub struct MetaTokenizer { + pub universe: Universe, + tree: HashMap> +} \ No newline at end of file diff --git a/gtars/src/tokenizers/mod.rs b/gtars/src/tokenizers/mod.rs index 3cb67ab..331b25a 100644 --- a/gtars/src/tokenizers/mod.rs +++ b/gtars/src/tokenizers/mod.rs @@ -8,6 +8,7 @@ pub mod soft_tokenizer; pub mod special_tokens; pub mod traits; pub mod tree_tokenizer; +pub mod meta_tokenizer; /// constants for the tokenizer module. 
pub mod consts { @@ -26,6 +27,7 @@ pub use tree_tokenizer::TreeTokenizer; mod tests { use crate::common::models::{Region, RegionSet}; + use crate::tokenizers::traits::SpecialTokens; use std::path::Path; use super::*; @@ -42,6 +44,11 @@ mod tests { "tests/data/tokenizer.toml" } + #[fixture] + fn path_to_bad_config_file() -> &'static str { + "tests/data/tokenizer_bad.toml" + } + #[fixture] fn path_to_tokenize_bed_file() -> &'static str { "tests/data/to_tokenize.bed" @@ -59,6 +66,33 @@ mod tests { assert_eq!(tokenizer.vocab_size(), 56); // 25 regions in main universe + 24 in hierarchical + 7 special tokens } + #[rstest] + #[should_panic] + fn test_bad_config_file(path_to_bad_config_file: &str) { + let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bad_config_file)); + let _tokenizer = tokenizer.unwrap(); + } + + #[rstest] + fn test_get_special_token_ids(path_to_bed_file: &str) { + let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap(); + let unk_id = tokenizer.unknown_token_id(); + let pad_id = tokenizer.padding_token_id(); + let mask_id = tokenizer.mask_token_id(); + let eos_id = tokenizer.eos_token_id(); + let bos_id = tokenizer.bos_token_id(); + let cls_id = tokenizer.cls_token_id(); + let sep_id = tokenizer.sep_token_id(); + + assert_eq!(unk_id, 25); + assert_eq!(pad_id, 26); + assert_eq!(mask_id, 27); + assert_eq!(eos_id, 28); + assert_eq!(bos_id, 29); + assert_eq!(cls_id, 30); + assert_eq!(sep_id, 31); + } + #[rstest] fn test_tokenize_bed_file(path_to_bed_file: &str, path_to_tokenize_bed_file: &str) { let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap(); diff --git a/gtars/src/tokenizers/tree_tokenizer.rs b/gtars/src/tokenizers/tree_tokenizer.rs index 25a0d60..4f2fc8f 100644 --- a/gtars/src/tokenizers/tree_tokenizer.rs +++ b/gtars/src/tokenizers/tree_tokenizer.rs @@ -1,8 +1,7 @@ use std::collections::HashMap; -use std::fs::read_to_string; use std::path::Path; -use anyhow::Result; +use anyhow::{Context, Result}; use rust_lapper::{Interval, Lapper}; use crate::common::consts::special_tokens::*; @@ -15,7 +14,6 @@ pub struct TreeTokenizer { pub universe: Universe, tree: HashMap>, secondary_trees: Option>>>, - exclude_ranges: Option>>, } impl TryFrom<&Path> for TreeTokenizer { @@ -34,14 +32,26 @@ impl TryFrom<&Path> for TreeTokenizer { // and allows for the new way of creating tokenizers from toml files let file_extension = value.extension().unwrap().to_str().unwrap(); - let (mut universe, tree, secondary_trees, exclude_ranges) = match file_extension { + let (mut universe, tree, secondary_trees, _exclude_ranges) = match file_extension { // parse config file "toml" => { - let toml_str = read_to_string(value)?; - let config: TokenizerConfig = toml::from_str(&toml_str)?; + let config = TokenizerConfig::new(value) + .with_context(|| { + format!("Invalid tokenizer configuration found for file: {}", value.to_str().unwrap()) + })?; + + if config.universes.is_empty() { + anyhow::bail!("You must have at least one universe in your universe list. 
Found zero.") + } + + let primary_universe = &config.universes[0]; + let other_universes = match config.universes.len() { + 1 => None, + _ => Some(&config.universes[1..]) + }; // universe path is relative to the config file - let universe_path = value.parent().unwrap().join(&config.universe); + let universe_path = value.parent().unwrap().join(primary_universe); // create initial universe from the *required* universe field let mut universe = Universe::try_from(Path::new(&universe_path))?; @@ -49,7 +59,7 @@ impl TryFrom<&Path> for TreeTokenizer { let tree = create_interval_tree_from_universe(&universe); // create secondary trees if they exist - let secondary_trees = match config.hierarchical_universes { + let secondary_trees = match other_universes { Some(hierarchical_universes) => { let mut secondary_trees = Vec::new(); for hierarchical_universe in hierarchical_universes { @@ -57,7 +67,7 @@ impl TryFrom<&Path> for TreeTokenizer { HashMap::new(); let hierarchical_universe_path = - value.parent().unwrap().join(&hierarchical_universe); + value.parent().unwrap().join(hierarchical_universe); let hierarchical_universe_regions = extract_regions_from_bed_file(&hierarchical_universe_path)?; @@ -175,7 +185,6 @@ impl TryFrom<&Path> for TreeTokenizer { universe, tree, secondary_trees, - exclude_ranges, }) } } diff --git a/gtars/tests/data/peaks.meta.bed b/gtars/tests/data/peaks.meta.bed new file mode 100755 index 0000000..7303b6c --- /dev/null +++ b/gtars/tests/data/peaks.meta.bed @@ -0,0 +1,25 @@ +chr17 7915738 7915777 0 +chr6 157381091 157381200 0 +chr2 168247745 168247800 0 +chr4 16270164 16270220 1 +chr6 7313181 7313245 1 +chr10 70576200 70576231 2 +chr1 151399431 151399527 2 +chr2 203871200 203871375 2 +chr2 203871387 203871588 2 +chr12 54220192 54220409 2 +chr9 3526071 3526165 3 +chr9 3526183 3526269 3 +chr7 1044556 1044591 3 +chr8 65841729 65841752 4 +chr8 65841823 65841921 4 +chr2 206713923 206713976 5 +chr19 48260083 48260280 5 +chr15 28095897 28095963 5 +chr17 78759156 78759193 5 +chr17 78759222 78759311 5 +chr12 121129062 121129088 6 +chr1 110202920 110203109 6 +chr13 74550022 74550411 6 +chr15 49155856 49155887 7 +chr15 49155935 49156182 8 \ No newline at end of file diff --git a/gtars/tests/data/tokenizer.toml b/gtars/tests/data/tokenizer.toml index eb969b6..648deaa 100644 --- a/gtars/tests/data/tokenizer.toml +++ b/gtars/tests/data/tokenizer.toml @@ -1,3 +1,2 @@ -universe = "peaks.bed.gz" -exclude_ranges = "excluderanges.bed.gz" -hierarchical_universes = ["chroms.bed"] \ No newline at end of file +universes = ["peaks.bed.gz", "chroms.bed"] +exclude_ranges = "excluderanges.bed.gz" \ No newline at end of file diff --git a/gtars/tests/data/tokenizer_bad.toml b/gtars/tests/data/tokenizer_bad.toml new file mode 100644 index 0000000..f59134a --- /dev/null +++ b/gtars/tests/data/tokenizer_bad.toml @@ -0,0 +1,3 @@ +universes = "peaks.bed.gz" +hieracrhical_universes = ["chroms.bed"] +exclude_ranges = "excluderanges.bed.gz" \ No newline at end of file From bf39b5cb52f1338cc70f7156c0c4cfe068b2ef35 Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Fri, 14 Jun 2024 16:39:00 -0400 Subject: [PATCH 08/28] realy update documentation --- gtars/src/ailist/core.rs | 182 ++++++++++++++++++++++ gtars/src/ailist/mod.rs | 183 ++-------------------- gtars/src/io/gtok.rs | 190 ++++++++++++++++++++++ gtars/src/io/mod.rs | 208 ++++--------------------- gtars/src/lib.rs | 27 ++++ gtars/src/tokenizers/meta_tokenizer.rs | 3 +- gtars/src/tokenizers/mod.rs | 22 ++- gtars/src/tokenizers/tree_tokenizer.rs | 4 + 
gtars/src/uniwig/mod.rs | 13 ++ 9 files changed, 476 insertions(+), 356 deletions(-) create mode 100644 gtars/src/ailist/core.rs create mode 100644 gtars/src/io/gtok.rs diff --git a/gtars/src/ailist/core.rs b/gtars/src/ailist/core.rs new file mode 100644 index 0000000..29697a1 --- /dev/null +++ b/gtars/src/ailist/core.rs @@ -0,0 +1,182 @@ +use std::fmt; + +pub struct Interval { + pub start: u32, + pub end: u32, +} + +impl fmt::Display for Interval { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "({}, {})", self.start, self.end) + } +} + +/// +/// The Augmented Interval List (AIList), enumerates intersections between a query interval q and an interval set R. +/// +pub struct AIList { + starts: Vec, + ends: Vec, + max_ends: Vec, + header_list: Vec, +} + +impl AIList { + /// + /// Create a new AIList struct + /// + /// # Arguments + /// - intervals: list of intervals to create from + /// + /// # Returns + /// - AIList struct + pub fn new(intervals: &mut Vec, minimum_coverage_length: usize) -> AIList { + // in the future, clone and sort... + intervals.sort_by_key(|key| key.start); + + let mut starts: Vec = Vec::new(); + let mut ends: Vec = Vec::new(); + let mut max_ends: Vec = Vec::new(); + let mut header_list: Vec = vec![0]; + + loop { + let mut results = Self::decompose(intervals, minimum_coverage_length); + + starts.append(&mut results.0); + ends.append(&mut results.1); + max_ends.append(&mut results.2); + + *intervals = results.3; + + if intervals.is_empty() { + break; + } else { + header_list.push(starts.len()); + } + } + + AIList { + starts, + ends, + max_ends, + header_list, + } + } + + fn decompose( + intervals: &mut [Interval], + minimum_coverage_length: usize, + ) -> (Vec, Vec, Vec, Vec) { + // look at the next minL*2 intervals + let mut starts: Vec = Vec::new(); + let mut ends: Vec = Vec::new(); + let mut max_ends: Vec = Vec::new(); + let mut l2: Vec = Vec::new(); + + for (index, interval) in intervals.iter().enumerate() { + let mut count = 0; + for i in 1..(minimum_coverage_length * 2) { + match intervals.get(index + i) { + Some(interval2) => { + if interval.end > interval2.end { + count += 1; + } + } + None => break, + } + } + if count >= minimum_coverage_length { + l2.push(Interval { + start: interval.start, + end: interval.end, + }); + } else { + starts.push(interval.start); + ends.push(interval.end) + } + } + + let mut max: u32 = 0; + + for end in ends.iter() { + max = if max > *end { max } else { *end }; + max_ends.push(max); + } + + (starts, ends, max_ends, l2) + } + + fn query_slice( + interval: &Interval, + starts: &[u32], + ends: &[u32], + max_ends: &[u32], + ) -> Vec { + let mut results_list: Vec = Vec::new(); + let mut i = starts.partition_point(|&x| x < interval.end); + + while i > 0 { + i -= 1; + if interval.start > ends[i] { + //this means that there is no intersection + if interval.start > max_ends[i] { + //there is no further intersection + return results_list; + } + } else { + results_list.push(Interval { + start: starts[i], + end: ends[i], + }) + } + } + results_list + } + + pub fn query(&self, interval: &Interval) -> Vec { + let mut results_list: Vec = Vec::new(); + + for i in 0..(self.header_list.len() - 1) { + results_list.append(&mut Self::query_slice( + interval, + &self.starts[self.header_list[i]..self.header_list[i + 1]], + &self.ends[self.header_list[i]..self.header_list[i + 1]], + &self.max_ends[self.header_list[i]..self.header_list[i + 1]], + )); + } + // now do the last decomposed ailist + let i = self.header_list.len() - 1; + 
results_list.extend(Self::query_slice( + interval, + &self.starts[self.header_list[i]..], + &self.ends[self.header_list[i]..], + &self.max_ends[self.header_list[i]..], + )); + + results_list + } +} + +impl fmt::Display for AIList { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut string = String::new(); + string.push('\n'); + for element in self.starts.iter() { + string.push_str(format!("{element}").as_str()); + } + string.push('\n'); + for element in self.ends.iter() { + string.push_str(format!("{element}").as_str()); + } + string.push('\n'); + for element in self.max_ends.iter() { + string.push_str(format!("{element}").as_str()); + } + string.push('\n'); + for element in self.header_list.iter() { + string.push_str(format!("{element}").as_str()); + } + string.push('\n'); + write!(f, "{string}") + } +} diff --git a/gtars/src/ailist/mod.rs b/gtars/src/ailist/mod.rs index 2449502..3d1c86c 100644 --- a/gtars/src/ailist/mod.rs +++ b/gtars/src/ailist/mod.rs @@ -1,171 +1,12 @@ -use std::fmt; - -pub struct Interval { - pub start: u32, - pub end: u32, -} - -impl fmt::Display for Interval { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "({}, {})", self.start, self.end) - } -} - -pub struct AIList { - starts: Vec, - ends: Vec, - max_ends: Vec, - header_list: Vec, -} - -impl AIList { - pub fn new(intervals: &mut Vec, minimum_coverage_length: usize) -> AIList { - // in the future, clone and sort... - intervals.sort_by_key(|key| key.start); - - let mut starts: Vec = Vec::new(); - let mut ends: Vec = Vec::new(); - let mut max_ends: Vec = Vec::new(); - let mut header_list: Vec = vec![0]; - - loop { - let mut results = Self::decompose(intervals, minimum_coverage_length); - - starts.append(&mut results.0); - ends.append(&mut results.1); - max_ends.append(&mut results.2); - - *intervals = results.3; - - if intervals.is_empty() { - break; - } else { - header_list.push(starts.len()); - } - } - - AIList { - starts, - ends, - max_ends, - header_list, - } - } - - fn decompose( - intervals: &mut [Interval], - minimum_coverage_length: usize, - ) -> (Vec, Vec, Vec, Vec) { - // look at the next minL*2 intervals - let mut starts: Vec = Vec::new(); - let mut ends: Vec = Vec::new(); - let mut max_ends: Vec = Vec::new(); - let mut l2: Vec = Vec::new(); - - for (index, interval) in intervals.iter().enumerate() { - let mut count = 0; - for i in 1..(minimum_coverage_length * 2) { - match intervals.get(index + i) { - Some(interval2) => { - if interval.end > interval2.end { - count += 1; - } - } - None => break, - } - } - if count >= minimum_coverage_length { - l2.push(Interval { - start: interval.start, - end: interval.end, - }); - } else { - starts.push(interval.start); - ends.push(interval.end) - } - } - - let mut max: u32 = 0; - - for end in ends.iter() { - max = if max > *end { max } else { *end }; - max_ends.push(max); - } - - (starts, ends, max_ends, l2) - } - - fn query_slice( - interval: &Interval, - starts: &[u32], - ends: &[u32], - max_ends: &[u32], - ) -> Vec { - let mut results_list: Vec = Vec::new(); - let mut i = starts.partition_point(|&x| x < interval.end); - - while i > 0 { - i -= 1; - if interval.start > ends[i] { - //this means that there is no intersection - if interval.start > max_ends[i] { - //there is no further intersection - return results_list; - } - } else { - results_list.push(Interval { - start: starts[i], - end: ends[i], - }) - } - } - results_list - } - - pub fn query(&self, interval: &Interval) -> Vec { - let mut results_list: Vec = Vec::new(); 
- - for i in 0..(self.header_list.len() - 1) { - results_list.append(&mut Self::query_slice( - interval, - &self.starts[self.header_list[i]..self.header_list[i + 1]], - &self.ends[self.header_list[i]..self.header_list[i + 1]], - &self.max_ends[self.header_list[i]..self.header_list[i + 1]], - )); - } - // now do the last decomposed ailist - let i = self.header_list.len() - 1; - results_list.extend(Self::query_slice( - interval, - &self.starts[self.header_list[i]..], - &self.ends[self.header_list[i]..], - &self.max_ends[self.header_list[i]..], - )); - - results_list - } -} - -impl fmt::Display for AIList { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let mut string = String::new(); - string.push('\n'); - for element in self.starts.iter() { - string.push_str(format!("{element}").as_str()); - } - string.push('\n'); - for element in self.ends.iter() { - string.push_str(format!("{element}").as_str()); - } - string.push('\n'); - for element in self.max_ends.iter() { - string.push_str(format!("{element}").as_str()); - } - string.push('\n'); - for element in self.header_list.iter() { - string.push_str(format!("{element}").as_str()); - } - string.push('\n'); - write!(f, "{string}") - } -} +//! +//! # Augmented Interval List: a novel data structure for efficient genomic interval search +//! This is a rust implementation of the Augmented Interval List (AIList): [https://academic.oup.com/bioinformatics/article/35/23/4907/5509521](https://academic.oup.com/bioinformatics/article/35/23/4907/5509521). +//! +//! The Augmented Interval List (AIList), enumerates intersections between a query interval q and an interval set R. +//! +//! It should be complete, but has not been rigorously tested. +//! +pub mod core; + +// re-expose models +pub use core::{AIList, Interval}; \ No newline at end of file diff --git a/gtars/src/io/gtok.rs b/gtars/src/io/gtok.rs new file mode 100644 index 0000000..690cb88 --- /dev/null +++ b/gtars/src/io/gtok.rs @@ -0,0 +1,190 @@ +use std::fs::File; +use std::fs::OpenOptions; +use std::io::{BufReader, BufWriter, Read, Write}; + +use anyhow::{Context, Result}; + +use super::consts::{GTOK_HEADER, GTOK_U16_FLAG, GTOK_U32_FLAG}; + +/// +/// Writes a vector of tokens to a file in the `.gtok` format. +/// # Arguments +/// - filename: the file to save the tokens to +/// - tokens: tokens to save +/// +pub fn write_tokens_to_gtok(filename: &str, tokens: &[u32]) -> Result<()> { + // make sure the path exists + let path = std::path::Path::new(filename); + + if let Some(parent) = path.parent() { + std::fs::create_dir_all(parent)?; + } else { + anyhow::bail!("Failed to create parent directories for gtok file!") + } + + let file = File::create(filename).with_context(|| "Failed to create gtok file!")?; + let mut writer = BufWriter::new(file); + + // write the header + writer + .write_all(GTOK_HEADER) + .with_context(|| "Failed to write GTOK header to file!")?; + + // determine size of tokens + let is_small = tokens.iter().all(|&x| x <= u16::MAX as u32); + let flag = if is_small { + GTOK_U16_FLAG + } else { + GTOK_U32_FLAG + }; + writer + .write_all(&flag.to_le_bytes()) + .with_context(|| "Failed to write GTOK size flag to file!")?; + + for &token in tokens { + if is_small { + writer + .write_all(&(token as u16).to_le_bytes()) + .with_context(|| "Failed to write bytes to file!")?; + continue; + } + writer + .write_all(&token.to_le_bytes()) + .with_context(|| "Failed to write bytes to file!")?; + } + + Ok(()) +} + +/// +/// Read in a vector of tokens from a file in the `.gtok` format. 
+/// # Arguments +/// - filename: filename to read the tokens from +/// +/// # Returns +/// - vector of tokens in u32 format +pub fn read_tokens_from_gtok(filename: &str) -> Result> { + let file = File::open(filename)?; + let mut reader = BufReader::new(file); + + // check the header + let mut header = [0; 4]; + reader.read_exact(&mut header)?; + + if &header != GTOK_HEADER { + anyhow::bail!("File doesn't appear to be a valid .gtok file.") + } + + let mut size_flag = [0; 1]; + reader.read_exact(&mut size_flag)?; + + let mut tokens = Vec::new(); + + match size_flag { + [GTOK_U16_FLAG] => { + let mut buffer = [0; 2]; + while let Ok(()) = reader.read_exact(&mut buffer) { + tokens.push(u16::from_le_bytes(buffer) as u32); + } + } + [GTOK_U32_FLAG] => { + let mut buffer = [0; 4]; + while let Ok(()) = reader.read_exact(&mut buffer) { + tokens.push(u32::from_le_bytes(buffer)); + } + } + _ => { + anyhow::bail!("Invalid data format flag found in gtok file") + } + } + + Ok(tokens) +} + +/// +/// Initialize a `.gtok` file with a header and size flag. +/// # Arguments +/// - filename: the file to initialize +/// +/// # Returns +/// - Result<(), anyhow::Error> +pub fn init_gtok_file(filename: &str) -> Result<()> { + // make sure the path exists + let path = std::path::Path::new(filename); + + if let Some(parent) = path.parent() { + std::fs::create_dir_all(parent)?; + } else { + anyhow::bail!("Failed to create parent directories for gtok file!") + } + + let file = File::create(filename).with_context(|| "Failed to create gtok file!")?; + let mut writer = BufWriter::new(file); + + writer + .write_all(GTOK_HEADER) + .with_context(|| "Failed to write GTOK header to file!")?; + + // assume large and write u32 flag + writer + .write_all(>OK_U32_FLAG.to_le_bytes()) + .with_context(|| "Failed to write GTOK size flag to file!")?; + + Ok(()) +} + +/// +/// Add tokens to the end of an existing `.gtok` file. +/// +/// # Arguments +/// - filename: the file to append to +/// +/// # Returns +/// - Result<(), anyhow::Error> +pub fn append_tokens_to_gtok_file(filename: &str, tokens: &[u32]) -> Result<()> { + let file = File::open(filename).with_context(|| "Failed to open gtok file!")?; + + let mut reader = BufReader::new(file); + + // check the header + let mut header = [0; 4]; + reader.read_exact(&mut header)?; + + if &header != GTOK_HEADER { + anyhow::bail!("File doesn't appear to be a valid .gtok file.") + } + + // detect the size flag + let mut size_flag = [0; 1]; + reader.read_exact(&mut size_flag)?; + + // start appending to the open file + // must reopen because `Bufreader` takes ownership of `file`. + let file = OpenOptions::new() + .append(true) + .open(filename) + .with_context(|| "Failed to open gtok file for appending")?; + let mut writer = BufWriter::new(file); + + match size_flag { + [GTOK_U16_FLAG] => { + for token in tokens { + writer + .write_all(&(*token as u16).to_le_bytes()) + .with_context(|| "Failed to write bytes to file!")?; + } + } + [GTOK_U32_FLAG] => { + for token in tokens { + writer + .write_all(&token.to_le_bytes()) + .with_context(|| "Failed to write bytes to file!")?; + } + } + _ => { + anyhow::bail!("Invalid data format flag found in gtok file") + } + } + + Ok(()) +} diff --git a/gtars/src/io/mod.rs b/gtars/src/io/mod.rs index 22cec93..1a0d40f 100644 --- a/gtars/src/io/mod.rs +++ b/gtars/src/io/mod.rs @@ -1,184 +1,28 @@ -use std::fs::File; -use std::fs::OpenOptions; -use std::io::{BufReader, BufWriter, Read, Write}; - -use anyhow::{Context, Result}; - +//! 
# Input/Output utilities for genomic data. +//! +//! This small module provides some small, but convenient utility functions for writing and reading +//! genomic data to and from disk. Most importantly, it contains functions for saving and reading +//! `.gtok` files to disk - special files that store pre-tokenized genomic data for use in machine +//! learning pipelines. +//! +//! ## Examples +//! ### Save tokens to disk +//! ```rust +//! use gtars::io::write_tokens_to_gtok; +//! +//! let ids = vec![42, 101, 999]; +//! write_tokens_to_gtok("tokens.gtok".as_str(), &ids); +//! ``` +//! ### Read tokens from disk +//! ```rust +//! use gtars::io::read_tokens_from_gtok; +//! let ids = read_tokens_from_gtoK("tokens.gtok".to_str()); +//! +//! println!(ids); // [42, 101, 999] +//! ``` +pub mod gtok; pub mod consts; -use consts::{GTOK_HEADER, GTOK_U16_FLAG, GTOK_U32_FLAG}; - -/// -/// Writes a vector of tokens to a file in the `.gtok` format. -/// # Arguments -/// - filename: the file to save the tokens to -/// - tokens: tokens to save -/// -pub fn write_tokens_to_gtok(filename: &str, tokens: &[u32]) -> Result<()> { - // make sure the path exists - let path = std::path::Path::new(filename); - - if let Some(parent) = path.parent() { - std::fs::create_dir_all(parent)?; - } else { - anyhow::bail!("Failed to create parent directories for gtok file!") - } - - let file = File::create(filename).with_context(|| "Failed to create gtok file!")?; - let mut writer = BufWriter::new(file); - - // write the header - writer - .write_all(GTOK_HEADER) - .with_context(|| "Failed to write GTOK header to file!")?; - - // determine size of tokens - let is_small = tokens.iter().all(|&x| x <= u16::MAX as u32); - let flag = if is_small { - GTOK_U16_FLAG - } else { - GTOK_U32_FLAG - }; - writer - .write_all(&flag.to_le_bytes()) - .with_context(|| "Failed to write GTOK size flag to file!")?; - - for &token in tokens { - if is_small { - writer - .write_all(&(token as u16).to_le_bytes()) - .with_context(|| "Failed to write bytes to file!")?; - continue; - } - writer - .write_all(&token.to_le_bytes()) - .with_context(|| "Failed to write bytes to file!")?; - } - - Ok(()) -} - -/// -/// Read in a vector of tokens from a file in the `.gtok` format. -/// # Arguments -/// - filename: filename to read the tokens from -/// -/// # Returns -/// - vector of tokens in u32 format -pub fn read_tokens_from_gtok(filename: &str) -> Result> { - let file = File::open(filename)?; - let mut reader = BufReader::new(file); - - // check the header - let mut header = [0; 4]; - reader.read_exact(&mut header)?; - - if &header != GTOK_HEADER { - anyhow::bail!("File doesn't appear to be a valid .gtok file.") - } - - let mut size_flag = [0; 1]; - reader.read_exact(&mut size_flag)?; - - let mut tokens = Vec::new(); - - match size_flag { - [GTOK_U16_FLAG] => { - let mut buffer = [0; 2]; - while let Ok(()) = reader.read_exact(&mut buffer) { - tokens.push(u16::from_le_bytes(buffer) as u32); - } - } - [GTOK_U32_FLAG] => { - let mut buffer = [0; 4]; - while let Ok(()) = reader.read_exact(&mut buffer) { - tokens.push(u32::from_le_bytes(buffer)); - } - } - _ => { - anyhow::bail!("Invalid data format flag found in gtok file") - } - } - - Ok(tokens) -} - -/// -/// Initialize a `.gtok` file with a header and size flag. 
-/// # Arguments -/// - filename: the file to initialize -/// -/// # Returns -/// - Result<(), anyhow::Error> -pub fn init_gtok_file(filename: &str) -> Result<()> { - // make sure the path exists - let path = std::path::Path::new(filename); - - if let Some(parent) = path.parent() { - std::fs::create_dir_all(parent)?; - } else { - anyhow::bail!("Failed to create parent directories for gtok file!") - } - - let file = File::create(filename).with_context(|| "Failed to create gtok file!")?; - let mut writer = BufWriter::new(file); - - writer - .write_all(GTOK_HEADER) - .with_context(|| "Failed to write GTOK header to file!")?; - - // assume large and write u32 flag - writer - .write_all(>OK_U32_FLAG.to_le_bytes()) - .with_context(|| "Failed to write GTOK size flag to file!")?; - - Ok(()) -} - -pub fn append_tokens_to_gtok_file(filename: &str, tokens: &[u32]) -> Result<()> { - let file = File::open(filename).with_context(|| "Failed to open gtok file!")?; - - let mut reader = BufReader::new(file); - - // check the header - let mut header = [0; 4]; - reader.read_exact(&mut header)?; - - if &header != GTOK_HEADER { - anyhow::bail!("File doesn't appear to be a valid .gtok file.") - } - - // detect the size flag - let mut size_flag = [0; 1]; - reader.read_exact(&mut size_flag)?; - - // start appending to the open file - // must reopen beacause `Bufreader` takes ownership of `file`. - let file = OpenOptions::new() - .append(true) - .open(filename) - .with_context(|| "Failed to open gtok file for appending")?; - let mut writer = BufWriter::new(file); - - match size_flag { - [GTOK_U16_FLAG] => { - for token in tokens { - writer - .write_all(&(*token as u16).to_le_bytes()) - .with_context(|| "Failed to write bytes to file!")?; - } - } - [GTOK_U32_FLAG] => { - for token in tokens { - writer - .write_all(&token.to_le_bytes()) - .with_context(|| "Failed to write bytes to file!")?; - } - } - _ => { - anyhow::bail!("Invalid data format flag found in gtok file") - } - } - - Ok(()) -} +// re-expose core functions +pub use gtok::*; +pub use consts::*; \ No newline at end of file diff --git a/gtars/src/lib.rs b/gtars/src/lib.rs index 170f60e..cb8b762 100644 --- a/gtars/src/lib.rs +++ b/gtars/src/lib.rs @@ -4,6 +4,33 @@ //! processors for our python package, [`geniml`](https:github.com/databio/geniml), a library for machine learning on genomic intervals. //! However, it can be used as a standalone library for working with genomic intervals as well. //! +//! There are several modules in this crate. The most comprehensive is the [tokenizers] modules which houses genomic region tokenizers +//! for use as pre-processors to machine learning pipelines. +//! +//! ## Examples +//! ### Create a tokenizer and tokenize a bed file +//! ```rust +//! use gtars::tokenizers::TreeTokenizer; +//! use gtars::common::models::RegionSet; +//! +//! let path_to_bed_file = "path/to/screen.bed"; +//! let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap(); +//! +//! let path_to_tokenize_bed_fil = "path/to/peaks.bed"; +//! let rs = RegionSet::try_from(Path::new(path_to_tokenize_bed_file)).unwrap(); +//! +//! let tokenized_regions = tokenizer.tokenize_region_set(&rs); +//! println!(tokenized_regions.ids); +//! ``` +//! +//! You can save the result of this tokenization to a file for later use in machine learning model training: +//! ### Write tokens to `gtok` file for later use: +//! ```rust +//! use gtars::io::write_tokens_to_gtok; +//! +//! let ids = vec![42, 101, 999]; +//! 
write_tokens_to_gtok("tokens.gtok".as_str(), &ids); +//! ``` pub mod ailist; pub mod common; pub mod io; diff --git a/gtars/src/tokenizers/meta_tokenizer.rs b/gtars/src/tokenizers/meta_tokenizer.rs index e0f3429..2a49a69 100644 --- a/gtars/src/tokenizers/meta_tokenizer.rs +++ b/gtars/src/tokenizers/meta_tokenizer.rs @@ -6,5 +6,6 @@ use crate::common::models::Universe; pub struct MetaTokenizer { pub universe: Universe, - tree: HashMap> + tree: HashMap>, + secondary_trees: Option>>>, } \ No newline at end of file diff --git a/gtars/src/tokenizers/mod.rs b/gtars/src/tokenizers/mod.rs index 331b25a..56ad4af 100644 --- a/gtars/src/tokenizers/mod.rs +++ b/gtars/src/tokenizers/mod.rs @@ -1,6 +1,24 @@ -//! # Tokenizers - tokenize new genomic intervals into a known universe for machine-learning pipelines +//! # Genomic data tokenizers and pre-processors to prepare interval data for machine learning pipelines. //! -//! There is currently only one tokenizer - the `TreeTokenizer` +//! The tokenizers module is the most comprehensive module in `gtars`. It houses all tokenizers that implement +//! tokenization of genomic data into a known vocabulary. This is especially useful for genomic data machine +//! learning models that are based on NLP-models like tranformers. +//! +//! ## Example +//! ### Create a tokenizer and tokenize a bed file +//! ```rust +//! use gtars::tokenizers::TreeTokenizer; +//! use gtars::common::models::RegionSet; +//! +//! let path_to_bed_file = "path/to/screen.bed"; +//! let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap(); +//! +//! let path_to_tokenize_bed_fil = "path/to/peaks.bed"; +//! let let rs = RegionSet::try_from(Path::new(path_to_tokenize_bed_file)).unwrap(); +//! +//! let tokenized_regions = tokenizer.tokenize_region_set(&rs); +//! println!(tokenized_regions.ids); +//! ``` pub mod cli; pub mod config; pub mod fragment_tokenizer; diff --git a/gtars/src/tokenizers/tree_tokenizer.rs b/gtars/src/tokenizers/tree_tokenizer.rs index 4f2fc8f..c74b9fd 100644 --- a/gtars/src/tokenizers/tree_tokenizer.rs +++ b/gtars/src/tokenizers/tree_tokenizer.rs @@ -10,6 +10,10 @@ use crate::common::utils::{create_interval_tree_from_universe, extract_regions_f use crate::tokenizers::config::TokenizerConfig; use crate::tokenizers::traits::{Pad, SpecialTokens, Tokenizer}; +/// +/// The TreeTokenizer is a basic tokenizer that can "tokenize" genomic regions +/// into a known universe (or vocabulary). This is especially useful as a +/// pre-processor for machine learning pipelines pub struct TreeTokenizer { pub universe: Universe, tree: HashMap>, diff --git a/gtars/src/uniwig/mod.rs b/gtars/src/uniwig/mod.rs index dc27aa4..10cae1d 100644 --- a/gtars/src/uniwig/mod.rs +++ b/gtars/src/uniwig/mod.rs @@ -1,3 +1,16 @@ +//! +//! # Uniwig BigWig generator + +//! The `uniwig` module is responsible for generating three BigWig output files based on a set of bed files. The generated files include a track for the start coordinates, a track for the end coordinates, and a track for the core coordinates. +//! +//! ## Under Construction +//! +//! This module is currently under construction. Stay tuned for more updates. +//! +//! ![Construction Sign](https://www.signoutfitters.com/images/products/detail/Workers_Symbol_Construction_Sign.png) +//! +//! 
+ pub fn run_uniwig() { println!("Im running.") } From ae37dabdb37edc4736fae91448b88e6fa1ab7e9c Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Fri, 14 Jun 2024 16:49:25 -0400 Subject: [PATCH 09/28] fix doc tests --- gtars/src/io/mod.rs | 6 +++--- gtars/src/lib.rs | 12 +++++++----- gtars/src/tokenizers/mod.rs | 12 +++++++----- gtars/tokens.gtok | Bin 0 -> 11 bytes 4 files changed, 17 insertions(+), 13 deletions(-) create mode 100644 gtars/tokens.gtok diff --git a/gtars/src/io/mod.rs b/gtars/src/io/mod.rs index 1a0d40f..4f10087 100644 --- a/gtars/src/io/mod.rs +++ b/gtars/src/io/mod.rs @@ -11,14 +11,14 @@ //! use gtars::io::write_tokens_to_gtok; //! //! let ids = vec![42, 101, 999]; -//! write_tokens_to_gtok("tokens.gtok".as_str(), &ids); +//! write_tokens_to_gtok("tokens.gtok", &ids); //! ``` //! ### Read tokens from disk //! ```rust //! use gtars::io::read_tokens_from_gtok; -//! let ids = read_tokens_from_gtoK("tokens.gtok".to_str()); +//! let ids = read_tokens_from_gtok("tokens.gtok").unwrap(); //! -//! println!(ids); // [42, 101, 999] +//! println!("{:?}", ids); // [42, 101, 999] //! ``` pub mod gtok; pub mod consts; diff --git a/gtars/src/lib.rs b/gtars/src/lib.rs index cb8b762..ed7e8ea 100644 --- a/gtars/src/lib.rs +++ b/gtars/src/lib.rs @@ -10,17 +10,19 @@ //! ## Examples //! ### Create a tokenizer and tokenize a bed file //! ```rust -//! use gtars::tokenizers::TreeTokenizer; +//! use std::path::Path; +//! +//! use gtars::tokenizers::{Tokenizer, TreeTokenizer}; //! use gtars::common::models::RegionSet; //! -//! let path_to_bed_file = "path/to/screen.bed"; +//! let path_to_bed_file = "tests/data/peaks.bed"; //! let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap(); //! -//! let path_to_tokenize_bed_fil = "path/to/peaks.bed"; +//! let path_to_tokenize_bed_file = "tests/data/to_tokenize.bed"; //! let rs = RegionSet::try_from(Path::new(path_to_tokenize_bed_file)).unwrap(); //! //! let tokenized_regions = tokenizer.tokenize_region_set(&rs); -//! println!(tokenized_regions.ids); +//! println!("{:?}", tokenized_regions.ids); //! ``` //! //! You can save the result of this tokenization to a file for later use in machine learning model training: @@ -29,7 +31,7 @@ //! use gtars::io::write_tokens_to_gtok; //! //! let ids = vec![42, 101, 999]; -//! write_tokens_to_gtok("tokens.gtok".as_str(), &ids); +//! write_tokens_to_gtok("tokens.gtok", &ids); //! ``` pub mod ailist; pub mod common; diff --git a/gtars/src/tokenizers/mod.rs b/gtars/src/tokenizers/mod.rs index 56ad4af..87ef51d 100644 --- a/gtars/src/tokenizers/mod.rs +++ b/gtars/src/tokenizers/mod.rs @@ -7,17 +7,19 @@ //! ## Example //! ### Create a tokenizer and tokenize a bed file //! ```rust -//! use gtars::tokenizers::TreeTokenizer; +//! use std::path::Path; +//! +//! use gtars::tokenizers::{Tokenizer, TreeTokenizer}; //! use gtars::common::models::RegionSet; //! -//! let path_to_bed_file = "path/to/screen.bed"; +//! let path_to_bed_file = "tests/data/peaks.bed.gz"; //! let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap(); //! -//! let path_to_tokenize_bed_fil = "path/to/peaks.bed"; -//! let let rs = RegionSet::try_from(Path::new(path_to_tokenize_bed_file)).unwrap(); +//! let path_to_tokenize_bed_file = "tests/data/to_tokenize.bed"; +//! let rs = RegionSet::try_from(Path::new(path_to_tokenize_bed_file)).unwrap(); //! //! let tokenized_regions = tokenizer.tokenize_region_set(&rs); -//! println!(tokenized_regions.ids); +//! println!("{:?}", tokenized_regions.ids); //! 
``` pub mod cli; pub mod config; diff --git a/gtars/tokens.gtok b/gtars/tokens.gtok new file mode 100644 index 0000000000000000000000000000000000000000..4856944c538f14d9a70e4fdd6746ced858a8d3bf GIT binary patch literal 11 ScmZ<{@%LubVn}6p&I|w#=K`<* literal 0 HcmV?d00001 From 9f04a083ab0902ce09c1b4f462554afe3dfe8ef5 Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Fri, 14 Jun 2024 17:02:13 -0400 Subject: [PATCH 10/28] docs for common/utils --- gtars/src/common/mod.rs | 18 ++++++++++++++++++ gtars/src/common/utils.rs | 29 +++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/gtars/src/common/mod.rs b/gtars/src/common/mod.rs index 09fe2e0..85982ed 100644 --- a/gtars/src/common/mod.rs +++ b/gtars/src/common/mod.rs @@ -1,3 +1,21 @@ +//! +//! # Common, core utilities for `gtars` +//! This module contains core utilities across the `gtars` crate. While possible, it's usually not interfaced with directly +//! unless interacting with any of the [models]. +//! +//! ## Examples +//! ### Create region set +//! ```rust +//! use std::path::Path; +//! use gtars::common::models::RegionSet; +//! +//! let path_to_tokenize_bed_file = "tests/data/to_tokenize.bed"; +//! let rs = RegionSet::try_from(Path::new(path_to_tokenize_bed_file)).unwrap(); +//! +//! println!("{:?}", rs.regions); +//! ``` +//! + pub mod consts; pub mod models; pub mod utils; diff --git a/gtars/src/common/utils.rs b/gtars/src/common/utils.rs index 8d2df25..a285df1 100644 --- a/gtars/src/common/utils.rs +++ b/gtars/src/common/utils.rs @@ -12,6 +12,13 @@ use rust_lapper::{Interval, Lapper}; use crate::common::models::region::Region; use crate::common::models::universe::Universe; + +/// +/// Function to return a reader for either a gzip'd or non-gzip'd file. +/// +/// # Arguments +/// - path: path to the file to read +/// pub fn get_dynamic_reader(path: &Path) -> Result>> { let is_gzipped = path.extension() == Some(OsStr::new("gz")); let file = File::open(path).with_context(|| "Failed to open bed file.")?; @@ -26,6 +33,11 @@ pub fn get_dynamic_reader(path: &Path) -> Result>> { Ok(reader) } +/// +/// Create a region-to-id hash-map from a list of regions +/// +/// # Arguments: +/// - regions: vec![] of [Region] structs pub fn generate_region_to_id_map(regions: &[Region]) -> HashMap { let mut current_id = 0; let mut region_to_id: HashMap = HashMap::new(); @@ -40,6 +52,11 @@ pub fn generate_region_to_id_map(regions: &[Region]) -> HashMap { region_to_id } +/// +/// Generate an id-to-region hash-map from a list of regions +/// +/// # Arguments: +/// - regions: vec![] of [Region] structs pub fn generate_id_to_region_map(regions: &[Region]) -> HashMap { let mut current_id = 0; let mut id_to_region: HashMap = HashMap::new(); @@ -54,6 +71,12 @@ pub fn generate_id_to_region_map(regions: &[Region]) -> HashMap { id_to_region } +/// +/// Read in a bed file into a vector of [Region] structs. It handles detecting +/// the file-type, verifying each line, and error handling. +/// +/// # Arguments: +/// - path: path to the bed file to read in. pub fn extract_regions_from_bed_file(path: &Path) -> Result> { let reader = get_dynamic_reader(path)?; @@ -88,6 +111,12 @@ pub fn extract_regions_from_bed_file(path: &Path) -> Result> { Ok(regions) } +/// +/// Simple wrapper function that will create a [Lapper] object (an interval tree) +/// from a [Universe] struct. +/// +/// # Arguments: +/// - universe: the universe to create the interval tree for. 
pub fn create_interval_tree_from_universe( universe: &Universe, ) -> HashMap> { From c1121b554dc8fd3e72e055a88a275d0e65897e6c Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Fri, 14 Jun 2024 18:28:16 -0400 Subject: [PATCH 11/28] work on documentation --- gtars/src/ailist/core.rs | 6 +++--- gtars/src/ailist/mod.rs | 8 ++++---- gtars/src/common/mod.rs | 8 ++++---- gtars/src/common/models/region_set.rs | 10 ++++++++++ gtars/src/common/models/tokenized_regionset.rs | 4 ++++ gtars/src/common/utils.rs | 11 +++++------ gtars/src/io/gtok.rs | 4 ++-- gtars/src/io/mod.rs | 12 ++++++------ gtars/src/lib.rs | 14 +++++++------- gtars/src/tokenizers/meta_tokenizer.rs | 6 +++--- gtars/src/tokenizers/mod.rs | 12 ++++++------ gtars/src/tokenizers/tree_tokenizer.rs | 14 +++++++++----- gtars/src/uniwig/mod.rs | 2 +- 13 files changed, 64 insertions(+), 47 deletions(-) diff --git a/gtars/src/ailist/core.rs b/gtars/src/ailist/core.rs index 29697a1..30266c5 100644 --- a/gtars/src/ailist/core.rs +++ b/gtars/src/ailist/core.rs @@ -13,7 +13,7 @@ impl fmt::Display for Interval { /// /// The Augmented Interval List (AIList), enumerates intersections between a query interval q and an interval set R. -/// +/// pub struct AIList { starts: Vec, ends: Vec, @@ -24,10 +24,10 @@ pub struct AIList { impl AIList { /// /// Create a new AIList struct - /// + /// /// # Arguments /// - intervals: list of intervals to create from - /// + /// /// # Returns /// - AIList struct pub fn new(intervals: &mut Vec, minimum_coverage_length: usize) -> AIList { diff --git a/gtars/src/ailist/mod.rs b/gtars/src/ailist/mod.rs index 3d1c86c..2f690d5 100644 --- a/gtars/src/ailist/mod.rs +++ b/gtars/src/ailist/mod.rs @@ -1,12 +1,12 @@ -//! +//! //! # Augmented Interval List: a novel data structure for efficient genomic interval search //! This is a rust implementation of the Augmented Interval List (AIList): [https://academic.oup.com/bioinformatics/article/35/23/4907/5509521](https://academic.oup.com/bioinformatics/article/35/23/4907/5509521). -//! +//! //! The Augmented Interval List (AIList), enumerates intersections between a query interval q and an interval set R. //! //! It should be complete, but has not been rigorously tested. -//! +//! pub mod core; // re-expose models -pub use core::{AIList, Interval}; \ No newline at end of file +pub use core::{AIList, Interval}; diff --git a/gtars/src/common/mod.rs b/gtars/src/common/mod.rs index 85982ed..e37c7f1 100644 --- a/gtars/src/common/mod.rs +++ b/gtars/src/common/mod.rs @@ -2,19 +2,19 @@ //! # Common, core utilities for `gtars` //! This module contains core utilities across the `gtars` crate. While possible, it's usually not interfaced with directly //! unless interacting with any of the [models]. -//! +//! //! ## Examples //! ### Create region set //! ```rust //! use std::path::Path; //! use gtars::common::models::RegionSet; -//! +//! //! let path_to_tokenize_bed_file = "tests/data/to_tokenize.bed"; //! let rs = RegionSet::try_from(Path::new(path_to_tokenize_bed_file)).unwrap(); -//! +//! //! println!("{:?}", rs.regions); //! ``` -//! +//! pub mod consts; pub mod models; diff --git a/gtars/src/common/models/region_set.rs b/gtars/src/common/models/region_set.rs index eb654b3..19d25aa 100644 --- a/gtars/src/common/models/region_set.rs +++ b/gtars/src/common/models/region_set.rs @@ -19,6 +19,11 @@ pub struct RegionSetIterator<'a> { impl TryFrom<&Path> for RegionSet { type Error = anyhow::Error; + /// + /// Create a new [RegionSet] from a bed file. 
+ /// + /// # Arguments: + /// - value: path to bed file on disk. fn try_from(value: &Path) -> Result { let regions = extract_regions_from_bed_file(value)?; Ok(RegionSet { regions }) @@ -78,6 +83,11 @@ impl<'a> IntoIterator for &'a RegionSet { } impl RegionSet { + /// + /// Dump a regionset to disk + /// + /// # Arguments + /// - path: the path to the file to dump to pub fn to_bed(&self, path: &Path) -> Result<()> { let mut file = File::create(path)?; // is there a better way to do this? diff --git a/gtars/src/common/models/tokenized_regionset.rs b/gtars/src/common/models/tokenized_regionset.rs index 1959c13..5fd8ac3 100644 --- a/gtars/src/common/models/tokenized_regionset.rs +++ b/gtars/src/common/models/tokenized_regionset.rs @@ -12,6 +12,10 @@ use crate::io::write_tokens_to_gtok; use super::RegionSet; +/// +/// A tokenized region set is a specific representation of a region set. It is +/// two things: 1) a list of ids, and 2) a pointer to a Universe. The ids correspond +/// to the regions in that universe this [TokenizedRegionSet] represents. pub struct TokenizedRegionSet<'a> { pub ids: Vec, pub universe: &'a Universe, diff --git a/gtars/src/common/utils.rs b/gtars/src/common/utils.rs index a285df1..91e5baa 100644 --- a/gtars/src/common/utils.rs +++ b/gtars/src/common/utils.rs @@ -12,10 +12,9 @@ use rust_lapper::{Interval, Lapper}; use crate::common::models::region::Region; use crate::common::models::universe::Universe; - /// /// Function to return a reader for either a gzip'd or non-gzip'd file. -/// +/// /// # Arguments /// - path: path to the file to read /// @@ -35,7 +34,7 @@ pub fn get_dynamic_reader(path: &Path) -> Result>> { /// /// Create a region-to-id hash-map from a list of regions -/// +/// /// # Arguments: /// - regions: vec![] of [Region] structs pub fn generate_region_to_id_map(regions: &[Region]) -> HashMap { @@ -54,7 +53,7 @@ pub fn generate_region_to_id_map(regions: &[Region]) -> HashMap { /// /// Generate an id-to-region hash-map from a list of regions -/// +/// /// # Arguments: /// - regions: vec![] of [Region] structs pub fn generate_id_to_region_map(regions: &[Region]) -> HashMap { @@ -74,7 +73,7 @@ pub fn generate_id_to_region_map(regions: &[Region]) -> HashMap { /// /// Read in a bed file into a vector of [Region] structs. It handles detecting /// the file-type, verifying each line, and error handling. -/// +/// /// # Arguments: /// - path: path to the bed file to read in. pub fn extract_regions_from_bed_file(path: &Path) -> Result> { @@ -114,7 +113,7 @@ pub fn extract_regions_from_bed_file(path: &Path) -> Result> { /// /// Simple wrapper function that will create a [Lapper] object (an interval tree) /// from a [Universe] struct. -/// +/// /// # Arguments: /// - universe: the universe to create the interval tree for. pub fn create_interval_tree_from_universe( diff --git a/gtars/src/io/gtok.rs b/gtars/src/io/gtok.rs index 690cb88..a658ae6 100644 --- a/gtars/src/io/gtok.rs +++ b/gtars/src/io/gtok.rs @@ -135,10 +135,10 @@ pub fn init_gtok_file(filename: &str) -> Result<()> { /// /// Add tokens to the end of an existing `.gtok` file. -/// +/// /// # Arguments /// - filename: the file to append to -/// +/// /// # Returns /// - Result<(), anyhow::Error> pub fn append_tokens_to_gtok_file(filename: &str, tokens: &[u32]) -> Result<()> { diff --git a/gtars/src/io/mod.rs b/gtars/src/io/mod.rs index 4f10087..9b7693b 100644 --- a/gtars/src/io/mod.rs +++ b/gtars/src/io/mod.rs @@ -1,15 +1,15 @@ //! # Input/Output utilities for genomic data. -//! +//! //! 
This small module provides some small, but convenient utility functions for writing and reading //! genomic data to and from disk. Most importantly, it contains functions for saving and reading //! `.gtok` files to disk - special files that store pre-tokenized genomic data for use in machine //! learning pipelines. -//! +//! //! ## Examples //! ### Save tokens to disk //! ```rust //! use gtars::io::write_tokens_to_gtok; -//! +//! //! let ids = vec![42, 101, 999]; //! write_tokens_to_gtok("tokens.gtok", &ids); //! ``` @@ -17,12 +17,12 @@ //! ```rust //! use gtars::io::read_tokens_from_gtok; //! let ids = read_tokens_from_gtok("tokens.gtok").unwrap(); -//! +//! //! println!("{:?}", ids); // [42, 101, 999] //! ``` -pub mod gtok; pub mod consts; +pub mod gtok; // re-expose core functions +pub use consts::*; pub use gtok::*; -pub use consts::*; \ No newline at end of file diff --git a/gtars/src/lib.rs b/gtars/src/lib.rs index ed7e8ea..abcdc30 100644 --- a/gtars/src/lib.rs +++ b/gtars/src/lib.rs @@ -6,30 +6,30 @@ //! //! There are several modules in this crate. The most comprehensive is the [tokenizers] modules which houses genomic region tokenizers //! for use as pre-processors to machine learning pipelines. -//! +//! //! ## Examples //! ### Create a tokenizer and tokenize a bed file //! ```rust //! use std::path::Path; -//! +//! //! use gtars::tokenizers::{Tokenizer, TreeTokenizer}; //! use gtars::common::models::RegionSet; -//! +//! //! let path_to_bed_file = "tests/data/peaks.bed"; //! let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap(); -//! +//! //! let path_to_tokenize_bed_file = "tests/data/to_tokenize.bed"; //! let rs = RegionSet::try_from(Path::new(path_to_tokenize_bed_file)).unwrap(); -//! +//! //! let tokenized_regions = tokenizer.tokenize_region_set(&rs); //! println!("{:?}", tokenized_regions.ids); //! ``` -//! +//! //! You can save the result of this tokenization to a file for later use in machine learning model training: //! ### Write tokens to `gtok` file for later use: //! ```rust //! use gtars::io::write_tokens_to_gtok; -//! +//! //! let ids = vec![42, 101, 999]; //! write_tokens_to_gtok("tokens.gtok", &ids); //! ``` diff --git a/gtars/src/tokenizers/meta_tokenizer.rs b/gtars/src/tokenizers/meta_tokenizer.rs index 2a49a69..5192aa4 100644 --- a/gtars/src/tokenizers/meta_tokenizer.rs +++ b/gtars/src/tokenizers/meta_tokenizer.rs @@ -1,11 +1,11 @@ use std::collections::HashMap; -use rust_lapper::{Lapper, Interval}; +use rust_lapper::{Interval, Lapper}; use crate::common::models::Universe; pub struct MetaTokenizer { pub universe: Universe, - tree: HashMap>, + tree: HashMap>, secondary_trees: Option>>>, -} \ No newline at end of file +} diff --git a/gtars/src/tokenizers/mod.rs b/gtars/src/tokenizers/mod.rs index 87ef51d..cbb2f80 100644 --- a/gtars/src/tokenizers/mod.rs +++ b/gtars/src/tokenizers/mod.rs @@ -3,32 +3,32 @@ //! The tokenizers module is the most comprehensive module in `gtars`. It houses all tokenizers that implement //! tokenization of genomic data into a known vocabulary. This is especially useful for genomic data machine //! learning models that are based on NLP-models like tranformers. -//! +//! //! ## Example //! ### Create a tokenizer and tokenize a bed file //! ```rust //! use std::path::Path; -//! +//! //! use gtars::tokenizers::{Tokenizer, TreeTokenizer}; //! use gtars::common::models::RegionSet; -//! +//! //! let path_to_bed_file = "tests/data/peaks.bed.gz"; //! 
let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap(); -//! +//! //! let path_to_tokenize_bed_file = "tests/data/to_tokenize.bed"; //! let rs = RegionSet::try_from(Path::new(path_to_tokenize_bed_file)).unwrap(); -//! +//! //! let tokenized_regions = tokenizer.tokenize_region_set(&rs); //! println!("{:?}", tokenized_regions.ids); //! ``` pub mod cli; pub mod config; pub mod fragment_tokenizer; +pub mod meta_tokenizer; pub mod soft_tokenizer; pub mod special_tokens; pub mod traits; pub mod tree_tokenizer; -pub mod meta_tokenizer; /// constants for the tokenizer module. pub mod consts { diff --git a/gtars/src/tokenizers/tree_tokenizer.rs b/gtars/src/tokenizers/tree_tokenizer.rs index c74b9fd..096d960 100644 --- a/gtars/src/tokenizers/tree_tokenizer.rs +++ b/gtars/src/tokenizers/tree_tokenizer.rs @@ -39,19 +39,23 @@ impl TryFrom<&Path> for TreeTokenizer { let (mut universe, tree, secondary_trees, _exclude_ranges) = match file_extension { // parse config file "toml" => { - let config = TokenizerConfig::new(value) - .with_context(|| { - format!("Invalid tokenizer configuration found for file: {}", value.to_str().unwrap()) + let config = TokenizerConfig::new(value).with_context(|| { + format!( + "Invalid tokenizer configuration found for file: {}", + value.to_str().unwrap() + ) })?; if config.universes.is_empty() { - anyhow::bail!("You must have at least one universe in your universe list. Found zero.") + anyhow::bail!( + "You must have at least one universe in your universe list. Found zero." + ) } let primary_universe = &config.universes[0]; let other_universes = match config.universes.len() { 1 => None, - _ => Some(&config.universes[1..]) + _ => Some(&config.universes[1..]), }; // universe path is relative to the config file diff --git a/gtars/src/uniwig/mod.rs b/gtars/src/uniwig/mod.rs index 10cae1d..f74b837 100644 --- a/gtars/src/uniwig/mod.rs +++ b/gtars/src/uniwig/mod.rs @@ -9,7 +9,7 @@ //! //! ![Construction Sign](https://www.signoutfitters.com/images/products/detail/Workers_Symbol_Construction_Sign.png) //! -//! +//! 
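Tying together the `TreeTokenizer` config handling earlier in this patch: the tokenizer can also be built from a TOML config whose `universes` list names bed files relative to the config file, with the first entry treated as the primary universe and any further entries as hierarchical universes. A minimal sketch against the repo's test fixture (the vocabulary count mirrors the existing tests):

```rust
use std::path::Path;

use gtars::tokenizers::{Tokenizer, TreeTokenizer};

// tests/data/tokenizer.toml lists a primary universe plus one hierarchical
// universe; paths inside it are resolved relative to the config file itself
let tokenizer = TreeTokenizer::try_from(Path::new("tests/data/tokenizer.toml")).unwrap();

// 25 primary regions + 24 hierarchical regions + 7 special tokens
assert_eq!(tokenizer.vocab_size(), 56);
```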
pub fn run_uniwig() { println!("Im running.") From a2af7c1c77641ec681a09a6fd4a65d0a27bd5107 Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Fri, 14 Jun 2024 21:05:01 -0400 Subject: [PATCH 12/28] work on meta-token tokenizer --- gtars/src/common/models/universe.rs | 3 +- gtars/src/common/utils.rs | 4 +- gtars/src/tokenizers/meta_tokenizer.rs | 202 ++++++++++++++++++++++++- 3 files changed, 205 insertions(+), 4 deletions(-) diff --git a/gtars/src/common/models/universe.rs b/gtars/src/common/models/universe.rs index 9fe10f6..9c4c7e5 100644 --- a/gtars/src/common/models/universe.rs +++ b/gtars/src/common/models/universe.rs @@ -8,7 +8,7 @@ use crate::common::utils::{ extract_regions_from_bed_file, generate_id_to_region_map, generate_region_to_id_map, }; -#[derive(Clone, Eq, PartialEq)] +#[derive(Clone, Eq, PartialEq, Default)] pub struct Universe { pub regions: Vec, pub region_to_id: HashMap, @@ -20,6 +20,7 @@ impl Universe { let new_id = self.region_to_id.len(); self.region_to_id.insert(region.to_owned(), new_id as u32); self.id_to_region.insert(new_id as u32, region.to_owned()); + self.regions.push(region.to_owned()); } pub fn convert_region_to_id(&self, region: &Region) -> Option { diff --git a/gtars/src/common/utils.rs b/gtars/src/common/utils.rs index 91e5baa..93b8c83 100644 --- a/gtars/src/common/utils.rs +++ b/gtars/src/common/utils.rs @@ -139,8 +139,8 @@ pub fn create_interval_tree_from_universe( } // build the tree - for (chr, chr_intervals) in intervals.iter() { - let lapper: Lapper = Lapper::new(chr_intervals.to_owned()); + for (chr, chr_intervals) in intervals.into_iter() { + let lapper: Lapper = Lapper::new(chr_intervals); tree.insert(chr.to_string(), lapper); } diff --git a/gtars/src/tokenizers/meta_tokenizer.rs b/gtars/src/tokenizers/meta_tokenizer.rs index 5192aa4..8affc77 100644 --- a/gtars/src/tokenizers/meta_tokenizer.rs +++ b/gtars/src/tokenizers/meta_tokenizer.rs @@ -1,11 +1,211 @@ use std::collections::HashMap; +use std::io::BufRead; +use std::path::Path; +use anyhow::{Context, Result}; use rust_lapper::{Interval, Lapper}; -use crate::common::models::Universe; +use crate::common::models::{Region, Universe}; +use crate::common::utils::get_dynamic_reader; +use crate::tokenizers::TokenizerConfig; +/// +/// The MetaTokenizer is a TreeTokenizer that implements the concept of meta-tokens. Meta +/// tokens are a way to reduce the size of the vocabulary for genomic interval-based +/// machine learning models. +/// +/// In brief, meta-tokens are tokens that represent *clusters* of genomic intervals. pub struct MetaTokenizer { pub universe: Universe, + region_to_metatoken: HashMap, tree: HashMap>, secondary_trees: Option>>>, } + +impl TryFrom<&Path> for MetaTokenizer { + type Error = anyhow::Error; + + /// + /// # Arguments + /// - `value` - the path to the tokenizer config file (a TOML) or bed file + /// + /// # Returns + /// A new TreeTokenizer + fn try_from(value: &Path) -> Result { + let config = TokenizerConfig::new(value).with_context(|| { + format!( + "Invalid tokenizer configuration found for file: {}", + value.to_str().unwrap() + ) + })?; + + // verify the config is good + if config.universes.is_empty() { + anyhow::bail!("You must have at least one universe in your universe list. 
Found zero.") + } + + // get priority list of universes + let primary_universe = &config.universes[0]; + let other_universes = match config.universes.len() { + 1 => None, + _ => Some(&config.universes[1..]), + }; + + // parse first universe + let reader = get_dynamic_reader(Path::new(primary_universe))?; + let mut universe = Universe::default(); + let mut intervals: HashMap>> = HashMap::new(); + let mut region_to_metatoken: HashMap = HashMap::new(); + + for line in reader.lines() { + let line = line?; + + let fields: Vec<&str> = line.split('\t').collect(); + + // check length of fields + if fields.len() < 4 { + anyhow::bail!("BED file line does not have at least 4 fields: {}", line); + } + + // parse the fields + let chr = fields[0]; + let start = fields[1].parse::().with_context(|| { + format!("Failed to parse start position in BED file line: {}", line) + })?; + + let end = fields[2].parse::().with_context(|| { + format!("Failed to parse end position in BED file line: {}", line) + })?; + + let meta_id = fields[3] + .parse::() + .with_context(|| format!("Failed to parse meta ID in BED file line: {}", line))?; + + // construct the actual region + let region = Region { + chr: chr.to_string(), + start, + end, + }; + + // construct the mapped meta token + let meta_region = Region { + chr: meta_id.to_string(), + start: 0, + end: 0, + }; + + // update the universe with the metatoken + universe.insert_token(&meta_region); + + // insert a region into the appropriate list + let ilist = intervals.entry(region.chr.clone()).or_default(); + ilist.push(Interval { + start: region.start, + stop: region.end, + val: universe.convert_region_to_id(&meta_region).unwrap(), + }); + + // insert the region into the meta token map + region_to_metatoken.insert(region, meta_region); + } + + let mut tree: HashMap> = HashMap::new(); + + for (chr, chr_intervals) in intervals.into_iter() { + let lapper: Lapper = Lapper::new(chr_intervals); + tree.insert(chr, lapper); + } + + let secondary_trees = match other_universes { + None => None, + Some(other_universes) => { + let mut secondary_trees = Vec::new(); + + for other_universe in other_universes { + let reader = get_dynamic_reader(Path::new(other_universe))?; + let mut intervals: HashMap>> = HashMap::new(); + + for line in reader.lines() { + let line = line?; + + let fields: Vec<&str> = line.split('\t').collect(); + + // check length of fields + if fields.len() < 4 { + anyhow::bail!( + "BED file line does not have at least 4 fields: {}", + line + ); + } + + // parse the fields + let chr = fields[0]; + let start = fields[1].parse::().with_context(|| { + format!("Failed to parse start position in BED file line: {}", line) + })?; + + let end = fields[2].parse::().with_context(|| { + format!("Failed to parse end position in BED file line: {}", line) + })?; + + let meta_id = fields[3].parse::().with_context(|| { + format!("Failed to parse meta ID in BED file line: {}", line) + })?; + + // construct the actual region + let region = Region { + chr: chr.to_string(), + start, + end, + }; + + // TODO: this is actually not right... secondary universes + // shouldnt have to be aware of others. So, this might + // be the wrong meta token id. + // we need to keep track of meta tokens that + // already exist and increment from there. 
+ // construct the mapped meta token + let meta_region = Region { + chr: meta_id.to_string(), + start: 0, + end: 0, + }; + + // update the universe with the metatoken + universe.insert_token(&meta_region); + + // insert a region into the appropriate list + let ilist = intervals.entry(region.chr.clone()).or_default(); + ilist.push(Interval { + start: region.start, + stop: region.end, + val: universe.convert_region_to_id(&meta_region).unwrap(), + }); + + // insert the region into the meta token map + region_to_metatoken.insert(region, meta_region); + } + + let mut tree: HashMap> = HashMap::new(); + + for (chr, chr_intervals) in intervals.into_iter() { + let lapper: Lapper = Lapper::new(chr_intervals); + tree.insert(chr, lapper); + } + + secondary_trees.push(tree); + } + + Some(secondary_trees) + } + }; + + Ok(MetaTokenizer { + universe, + region_to_metatoken, + tree, + secondary_trees, + }) + } +} From 3b5238890c99b6c5fabf3ffc779256809c237c5d Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Sat, 15 Jun 2024 21:37:03 -0400 Subject: [PATCH 13/28] finish making meta tokenizer... hopefully --- gtars/src/tokenizers/meta_tokenizer.rs | 100 +++++++++++++++++++++---- 1 file changed, 84 insertions(+), 16 deletions(-) diff --git a/gtars/src/tokenizers/meta_tokenizer.rs b/gtars/src/tokenizers/meta_tokenizer.rs index 8affc77..b27e2ae 100644 --- a/gtars/src/tokenizers/meta_tokenizer.rs +++ b/gtars/src/tokenizers/meta_tokenizer.rs @@ -5,6 +5,7 @@ use std::path::Path; use anyhow::{Context, Result}; use rust_lapper::{Interval, Lapper}; +use crate::common::consts::special_tokens::*; use crate::common::models::{Region, Universe}; use crate::common::utils::get_dynamic_reader; use crate::tokenizers::TokenizerConfig; @@ -57,6 +58,8 @@ impl TryFrom<&Path> for MetaTokenizer { let mut intervals: HashMap>> = HashMap::new(); let mut region_to_metatoken: HashMap = HashMap::new(); + let mut seen_metatokens: HashMap = HashMap::new(); + for line in reader.lines() { let line = line?; @@ -76,10 +79,22 @@ impl TryFrom<&Path> for MetaTokenizer { let end = fields[2].parse::().with_context(|| { format!("Failed to parse end position in BED file line: {}", line) })?; - - let meta_id = fields[3] - .parse::() - .with_context(|| format!("Failed to parse meta ID in BED file line: {}", line))?; + + // why is primary_ being prepended to the metatoken id? 
+ // - this is a way to ensure that the metatoken id is unique, + // imagine a secondary universe that has the same metatoken id + let meta_id = format!("primary_{}", fields[3]); + + // get the id for the metatoken if we've seen it before + // else create a new id and insert it into the hashmap + let meta_id = match seen_metatokens.get(&meta_id) { + Some(id) => *id, + None => { + let id = seen_metatokens.len() as u32; + seen_metatokens.insert(meta_id, id); + id + } + }; // construct the actual region let region = Region { @@ -90,7 +105,7 @@ impl TryFrom<&Path> for MetaTokenizer { // construct the mapped meta token let meta_region = Region { - chr: meta_id.to_string(), + chr: format!("chrM{}", meta_id), start: 0, end: 0, }; @@ -122,7 +137,8 @@ impl TryFrom<&Path> for MetaTokenizer { Some(other_universes) => { let mut secondary_trees = Vec::new(); - for other_universe in other_universes { + for (u_num, other_universe) in other_universes.iter().enumerate() { + let reader = get_dynamic_reader(Path::new(other_universe))?; let mut intervals: HashMap>> = HashMap::new(); @@ -149,9 +165,16 @@ impl TryFrom<&Path> for MetaTokenizer { format!("Failed to parse end position in BED file line: {}", line) })?; - let meta_id = fields[3].parse::().with_context(|| { - format!("Failed to parse meta ID in BED file line: {}", line) - })?; + let meta_id = format!("secondary_{}_{}", u_num, fields[3]); + + let meta_id = match seen_metatokens.get(&meta_id) { + Some(id) => *id, + None => { + let id = seen_metatokens.len() as u32; + seen_metatokens.insert(meta_id, id); + id + } + }; // construct the actual region let region = Region { @@ -160,14 +183,9 @@ impl TryFrom<&Path> for MetaTokenizer { end, }; - // TODO: this is actually not right... secondary universes - // shouldnt have to be aware of others. So, this might - // be the wrong meta token id. - // we need to keep track of meta tokens that - // already exist and increment from there. 
- // construct the mapped meta token + // extract meta region id let meta_region = Region { - chr: meta_id.to_string(), + chr: format!("chrM{}", meta_id), start: 0, end: 0, }; @@ -201,6 +219,56 @@ impl TryFrom<&Path> for MetaTokenizer { } }; + // now we can insert the special tokens + // unk + universe.insert_token(&Region { + chr: UNKNOWN_CHR.to_string(), + start: UNKNOWN_START as u32, + end: UNKNOWN_END as u32, + }); + + // pad + universe.insert_token(&Region { + chr: PAD_CHR.to_string(), + start: PAD_START as u32, + end: PAD_END as u32, + }); + + // mask + universe.insert_token(&Region { + chr: MASK_CHR.to_string(), + start: MASK_START as u32, + end: MASK_END as u32, + }); + + // eos + universe.insert_token(&Region { + chr: EOS_CHR.to_string(), + start: EOS_START as u32, + end: EOS_END as u32, + }); + + // bos + universe.insert_token(&Region { + chr: BOS_CHR.to_string(), + start: BOS_START as u32, + end: BOS_END as u32, + }); + + // cls + universe.insert_token(&Region { + chr: CLS_CHR.to_string(), + start: CLS_START as u32, + end: CLS_END as u32, + }); + + // sep + universe.insert_token(&Region { + chr: SEP_CHR.to_string(), + start: SEP_START as u32, + end: SEP_END as u32, + }); + Ok(MetaTokenizer { universe, region_to_metatoken, From 74518efb782c56f243230e5a383c6c337fa07de7 Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Sat, 15 Jun 2024 21:50:57 -0400 Subject: [PATCH 14/28] basic implementation of the meta tokenizer --- gtars/src/tokenizers/meta_tokenizer.rs | 34 ++++++- gtars/src/tokenizers/mod.rs | 132 +------------------------ gtars/src/tokenizers/tree_tokenizer.rs | 130 ++++++++++++++++++++++++ gtars/tests/data/chroms.meta.bed | 24 +++++ gtars/tests/data/tokenizer.meta.toml | 1 + 5 files changed, 188 insertions(+), 133 deletions(-) create mode 100644 gtars/tests/data/chroms.meta.bed create mode 100644 gtars/tests/data/tokenizer.meta.toml diff --git a/gtars/src/tokenizers/meta_tokenizer.rs b/gtars/src/tokenizers/meta_tokenizer.rs index b27e2ae..517d779 100644 --- a/gtars/src/tokenizers/meta_tokenizer.rs +++ b/gtars/src/tokenizers/meta_tokenizer.rs @@ -52,8 +52,10 @@ impl TryFrom<&Path> for MetaTokenizer { _ => Some(&config.universes[1..]), }; + let primary_universe = value.parent().unwrap().join(primary_universe); + // parse first universe - let reader = get_dynamic_reader(Path::new(primary_universe))?; + let reader = get_dynamic_reader(Path::new(&primary_universe))?; let mut universe = Universe::default(); let mut intervals: HashMap>> = HashMap::new(); let mut region_to_metatoken: HashMap = HashMap::new(); @@ -139,7 +141,9 @@ impl TryFrom<&Path> for MetaTokenizer { for (u_num, other_universe) in other_universes.iter().enumerate() { - let reader = get_dynamic_reader(Path::new(other_universe))?; + let other_universe = value.parent().unwrap().join(other_universe); + + let reader = get_dynamic_reader(Path::new(&other_universe))?; let mut intervals: HashMap>> = HashMap::new(); for line in reader.lines() { @@ -277,3 +281,29 @@ impl TryFrom<&Path> for MetaTokenizer { }) } } + + +// tests +#[cfg(test)] +mod tests { + + use super::*; + use pretty_assertions::assert_eq; + use rstest::*; + + #[fixture] + fn path_to_config_file() -> &'static str { + "tests/data/tokenizer.meta.toml" + } + + #[fixture] + fn path_to_tokenize_bed_file() -> &'static str { + "tests/data/to_tokenize.bed" + } + + #[rstest] + fn test_create_tokenizer(path_to_config_file: &str) { + let tokenizer = MetaTokenizer::try_from(Path::new(path_to_config_file)).unwrap(); + assert_eq!(tokenizer.universe.len(), 27); + } +} \ No 
newline at end of file diff --git a/gtars/src/tokenizers/mod.rs b/gtars/src/tokenizers/mod.rs index cbb2f80..19b94ad 100644 --- a/gtars/src/tokenizers/mod.rs +++ b/gtars/src/tokenizers/mod.rs @@ -41,134 +41,4 @@ pub mod consts { pub use config::TokenizerConfig; pub use fragment_tokenizer::FragmentTokenizer; pub use traits::{SingleCellTokenizer, Tokenizer}; -pub use tree_tokenizer::TreeTokenizer; - -#[cfg(test)] -mod tests { - - use crate::common::models::{Region, RegionSet}; - use crate::tokenizers::traits::SpecialTokens; - use std::path::Path; - - use super::*; - use pretty_assertions::assert_eq; - use rstest::*; - - #[fixture] - fn path_to_bed_file() -> &'static str { - "tests/data/peaks.bed" - } - - #[fixture] - fn path_to_config_file() -> &'static str { - "tests/data/tokenizer.toml" - } - - #[fixture] - fn path_to_bad_config_file() -> &'static str { - "tests/data/tokenizer_bad.toml" - } - - #[fixture] - fn path_to_tokenize_bed_file() -> &'static str { - "tests/data/to_tokenize.bed" - } - - #[rstest] - fn test_create_tokenizer_from_bed(path_to_bed_file: &str) { - let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap(); - assert_eq!(tokenizer.vocab_size(), 32); // 25 regions + 7 special tokens - } - - #[rstest] - fn test_create_tokenizer_from_config(path_to_config_file: &str) { - let tokenizer = TreeTokenizer::try_from(Path::new(path_to_config_file)).unwrap(); - assert_eq!(tokenizer.vocab_size(), 56); // 25 regions in main universe + 24 in hierarchical + 7 special tokens - } - - #[rstest] - #[should_panic] - fn test_bad_config_file(path_to_bad_config_file: &str) { - let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bad_config_file)); - let _tokenizer = tokenizer.unwrap(); - } - - #[rstest] - fn test_get_special_token_ids(path_to_bed_file: &str) { - let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap(); - let unk_id = tokenizer.unknown_token_id(); - let pad_id = tokenizer.padding_token_id(); - let mask_id = tokenizer.mask_token_id(); - let eos_id = tokenizer.eos_token_id(); - let bos_id = tokenizer.bos_token_id(); - let cls_id = tokenizer.cls_token_id(); - let sep_id = tokenizer.sep_token_id(); - - assert_eq!(unk_id, 25); - assert_eq!(pad_id, 26); - assert_eq!(mask_id, 27); - assert_eq!(eos_id, 28); - assert_eq!(bos_id, 29); - assert_eq!(cls_id, 30); - assert_eq!(sep_id, 31); - } - - #[rstest] - fn test_tokenize_bed_file(path_to_bed_file: &str, path_to_tokenize_bed_file: &str) { - let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap(); - let rs = RegionSet::try_from(Path::new(path_to_tokenize_bed_file)).unwrap(); - let tokenized_regions = tokenizer.tokenize_region_set(&rs); - - println!("{}", tokenized_regions.len()); - assert_eq!(tokenized_regions.len(), 4); - - // last should be the unknown token - let unknown_token = tokenizer - .universe - .convert_id_to_region(tokenized_regions[3]) - .unwrap(); - assert!(unknown_token.chr == "chrUNK"); - } - - #[rstest] - fn test_hierarchical_universe_hit(path_to_config_file: &str) { - let tokenizer = TreeTokenizer::try_from(Path::new(path_to_config_file)).unwrap(); - let res = tokenizer.tokenize_region(&Region { - chr: "chr1".to_string(), - start: 100, - end: 200, - }); - assert_eq!(res.len(), 1); - - // check the id, it should be len(primary_universe) + 1 (since its chr1) - assert_eq!(res.ids, vec![25]); - - let res = res.into_region_vec(); - let region = &res[0]; - - assert_eq!(region.chr, "chr1"); - assert_eq!(region.start, 0); - assert_eq!(region.end, 248_956_422); - } 
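Alongside this test relocation, the meta-universe fixtures added later in this patch use a fourth bed column as the cluster label, and the `MetaTokenizer` collapses every interval that shares a label onto one meta-token. Roughly, the mapping it builds looks like the sketch below (the label, coordinates, and id are illustrative; ids are assigned in the order labels are first seen, with the primary universe processed first):

```rust
use std::collections::HashMap;

use gtars::common::models::Region;

// one line of a primary meta-universe bed file, e.g. `chr1  10  100  cluster_A`,
// is split into the real interval that gets indexed in the interval tree ...
let region = Region { chr: "chr1".to_string(), start: 10, end: 100 };

// ... and a synthetic meta-region whose "chromosome" encodes the cluster id
// ("cluster_A" would be the first label seen here, so it gets id 0 -> "chrM0")
let meta_token = Region { chr: "chrM0".to_string(), start: 0, end: 0 };

// every interval sharing that label points at the same meta-region, so the
// whole cluster occupies a single entry in the tokenizer's vocabulary
let mut region_to_metatoken: HashMap<Region, Region> = HashMap::new();
region_to_metatoken.insert(region, meta_token);
```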
- - #[rstest] - fn test_hierarchical_universe_no_hit(path_to_config_file: &str) { - let tokenizer = TreeTokenizer::try_from(Path::new(path_to_config_file)).unwrap(); - let res = tokenizer.tokenize_region(&Region { - chr: "chrFOO".to_string(), - start: 100, - end: 200, - }); - assert_eq!(res.len(), 1); - - // check the id, it should be the id of the UNK token - assert_eq!(res.ids, vec![49]); - - let res = res.into_region_vec(); - let region = &res[0]; - - assert_eq!(region.chr, "chrUNK"); - assert_eq!(region.start, 0); - assert_eq!(region.end, 0); - } -} +pub use tree_tokenizer::TreeTokenizer; \ No newline at end of file diff --git a/gtars/src/tokenizers/tree_tokenizer.rs b/gtars/src/tokenizers/tree_tokenizer.rs index 096d960..48c0931 100644 --- a/gtars/src/tokenizers/tree_tokenizer.rs +++ b/gtars/src/tokenizers/tree_tokenizer.rs @@ -414,3 +414,133 @@ impl TreeTokenizer { // use default implementation impl Pad for TreeTokenizer {} + +#[cfg(test)] +mod tests { + + use crate::common::models::{Region, RegionSet}; + use crate::tokenizers::traits::SpecialTokens; + use std::path::Path; + + use super::*; + use pretty_assertions::assert_eq; + use rstest::*; + + #[fixture] + fn path_to_bed_file() -> &'static str { + "tests/data/peaks.bed" + } + + #[fixture] + fn path_to_config_file() -> &'static str { + "tests/data/tokenizer.toml" + } + + #[fixture] + fn path_to_bad_config_file() -> &'static str { + "tests/data/tokenizer_bad.toml" + } + + #[fixture] + fn path_to_tokenize_bed_file() -> &'static str { + "tests/data/to_tokenize.bed" + } + + #[rstest] + fn test_create_tokenizer_from_bed(path_to_bed_file: &str) { + let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap(); + assert_eq!(tokenizer.vocab_size(), 32); // 25 regions + 7 special tokens + } + + #[rstest] + fn test_create_tokenizer_from_config(path_to_config_file: &str) { + let tokenizer = TreeTokenizer::try_from(Path::new(path_to_config_file)).unwrap(); + assert_eq!(tokenizer.vocab_size(), 56); // 25 regions in main universe + 24 in hierarchical + 7 special tokens + } + + #[rstest] + #[should_panic] + fn test_bad_config_file(path_to_bad_config_file: &str) { + let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bad_config_file)); + let _tokenizer = tokenizer.unwrap(); + } + + #[rstest] + fn test_get_special_token_ids(path_to_bed_file: &str) { + let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap(); + let unk_id = tokenizer.unknown_token_id(); + let pad_id = tokenizer.padding_token_id(); + let mask_id = tokenizer.mask_token_id(); + let eos_id = tokenizer.eos_token_id(); + let bos_id = tokenizer.bos_token_id(); + let cls_id = tokenizer.cls_token_id(); + let sep_id = tokenizer.sep_token_id(); + + assert_eq!(unk_id, 25); + assert_eq!(pad_id, 26); + assert_eq!(mask_id, 27); + assert_eq!(eos_id, 28); + assert_eq!(bos_id, 29); + assert_eq!(cls_id, 30); + assert_eq!(sep_id, 31); + } + + #[rstest] + fn test_tokenize_bed_file(path_to_bed_file: &str, path_to_tokenize_bed_file: &str) { + let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap(); + let rs = RegionSet::try_from(Path::new(path_to_tokenize_bed_file)).unwrap(); + let tokenized_regions = tokenizer.tokenize_region_set(&rs); + + println!("{}", tokenized_regions.len()); + assert_eq!(tokenized_regions.len(), 4); + + // last should be the unknown token + let unknown_token = tokenizer + .universe + .convert_id_to_region(tokenized_regions[3]) + .unwrap(); + assert!(unknown_token.chr == "chrUNK"); + } + + #[rstest] + fn 
test_hierarchical_universe_hit(path_to_config_file: &str) { + let tokenizer = TreeTokenizer::try_from(Path::new(path_to_config_file)).unwrap(); + let res = tokenizer.tokenize_region(&Region { + chr: "chr1".to_string(), + start: 100, + end: 200, + }); + assert_eq!(res.len(), 1); + + // check the id, it should be len(primary_universe) + 1 (since its chr1) + assert_eq!(res.ids, vec![25]); + + let res = res.into_region_vec(); + let region = &res[0]; + + assert_eq!(region.chr, "chr1"); + assert_eq!(region.start, 0); + assert_eq!(region.end, 248_956_422); + } + + #[rstest] + fn test_hierarchical_universe_no_hit(path_to_config_file: &str) { + let tokenizer = TreeTokenizer::try_from(Path::new(path_to_config_file)).unwrap(); + let res = tokenizer.tokenize_region(&Region { + chr: "chrFOO".to_string(), + start: 100, + end: 200, + }); + assert_eq!(res.len(), 1); + + // check the id, it should be the id of the UNK token + assert_eq!(res.ids, vec![49]); + + let res = res.into_region_vec(); + let region = &res[0]; + + assert_eq!(region.chr, "chrUNK"); + assert_eq!(region.start, 0); + assert_eq!(region.end, 0); + } +} diff --git a/gtars/tests/data/chroms.meta.bed b/gtars/tests/data/chroms.meta.bed new file mode 100644 index 0000000..f75c730 --- /dev/null +++ b/gtars/tests/data/chroms.meta.bed @@ -0,0 +1,24 @@ +chr1 0 248956422 1 +chr2 0 242193529 1 +chr3 0 198295559 2 +chr4 0 190214555 2 +chr5 0 181538259 3 +chr6 0 170805979 3 +chr7 0 159345973 4 +chr8 0 145138636 4 +chr9 0 138394717 5 +chr10 0 133797422 5 +chr11 0 135086622 6 +chr12 0 133275309 6 +chr13 0 114364328 6 +chr14 0 107043718 7 +chr15 0 101991189 7 +chr16 0 90338345 8 +chr17 0 83257441 8 +chr18 0 80373285 8 +chr19 0 58617616 8 +chr20 0 64444167 9 +chr21 0 46709983 9 +chr22 0 50818468 10 +chrX 0 156040895 11 +chrY 0 57227415 11 \ No newline at end of file diff --git a/gtars/tests/data/tokenizer.meta.toml b/gtars/tests/data/tokenizer.meta.toml new file mode 100644 index 0000000..219834f --- /dev/null +++ b/gtars/tests/data/tokenizer.meta.toml @@ -0,0 +1 @@ +universes = ["peaks.meta.bed", "chroms.meta.bed"] \ No newline at end of file From 8ba4294305991c1f4d8abae8384412ae860e0390 Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Sat, 15 Jun 2024 21:51:52 -0400 Subject: [PATCH 15/28] update test data --- gtars/tests/data/chroms.meta.bed | 48 ++++++++++++++++---------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/gtars/tests/data/chroms.meta.bed b/gtars/tests/data/chroms.meta.bed index f75c730..7ee71a8 100644 --- a/gtars/tests/data/chroms.meta.bed +++ b/gtars/tests/data/chroms.meta.bed @@ -1,24 +1,24 @@ -chr1 0 248956422 1 -chr2 0 242193529 1 -chr3 0 198295559 2 -chr4 0 190214555 2 -chr5 0 181538259 3 -chr6 0 170805979 3 -chr7 0 159345973 4 -chr8 0 145138636 4 -chr9 0 138394717 5 -chr10 0 133797422 5 -chr11 0 135086622 6 -chr12 0 133275309 6 -chr13 0 114364328 6 -chr14 0 107043718 7 -chr15 0 101991189 7 -chr16 0 90338345 8 -chr17 0 83257441 8 -chr18 0 80373285 8 -chr19 0 58617616 8 -chr20 0 64444167 9 -chr21 0 46709983 9 -chr22 0 50818468 10 -chrX 0 156040895 11 -chrY 0 57227415 11 \ No newline at end of file +chr1 0 248956422 cluster_1 +chr2 0 242193529 cluster_1 +chr3 0 198295559 cluster_2 +chr4 0 190214555 cluster_2 +chr5 0 181538259 cluster_3 +chr6 0 170805979 cluster_3 +chr7 0 159345973 cluster_4 +chr8 0 145138636 cluster_4 +chr9 0 138394717 cluster_5 +chr10 0 133797422 cluster_5 +chr11 0 135086622 cluster_6 +chr12 0 133275309 cluster_6 +chr13 0 114364328 cluster_6 +chr14 0 107043718 cluster_7 +chr15 0 101991189 
cluster_7 +chr16 0 90338345 cluster_8 +chr17 0 83257441 cluster_8 +chr18 0 80373285 cluster_8 +chr19 0 58617616 cluster_8 +chr20 0 64444167 cluster_9 +chr21 0 46709983 cluster_9 +chr22 0 50818468 cluster_10 +chrX 0 156040895 cluster_11 +chrY 0 57227415 cluster_11 \ No newline at end of file From 797b20431ea1ddb76c1f9b0114db23482ffbf32a Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Sun, 16 Jun 2024 19:06:06 -0400 Subject: [PATCH 16/28] flush out the meta tokenizer --- gtars/src/common/models/universe.rs | 4 + gtars/src/tokenizers/meta_tokenizer.rs | 279 ++++++++++++++++++++++++- gtars/tests/data/chroms.meta.bed | 3 +- 3 files changed, 280 insertions(+), 6 deletions(-) diff --git a/gtars/src/common/models/universe.rs b/gtars/src/common/models/universe.rs index 9c4c7e5..3e757bd 100644 --- a/gtars/src/common/models/universe.rs +++ b/gtars/src/common/models/universe.rs @@ -48,6 +48,10 @@ impl Universe { pub fn is_empty(&self) -> bool { self.region_to_id.len() == 0 } + + pub fn contains_region(&self, region: &Region) -> bool { + self.region_to_id.contains_key(region) + } } impl From> for Universe { diff --git a/gtars/src/tokenizers/meta_tokenizer.rs b/gtars/src/tokenizers/meta_tokenizer.rs index 517d779..3767679 100644 --- a/gtars/src/tokenizers/meta_tokenizer.rs +++ b/gtars/src/tokenizers/meta_tokenizer.rs @@ -6,9 +6,11 @@ use anyhow::{Context, Result}; use rust_lapper::{Interval, Lapper}; use crate::common::consts::special_tokens::*; -use crate::common::models::{Region, Universe}; +use crate::common::models::{Region, RegionSet, Universe, TokenizedRegionSet}; use crate::common::utils::get_dynamic_reader; -use crate::tokenizers::TokenizerConfig; +use crate::tokenizers::{TokenizerConfig, Tokenizer}; + +use super::traits::SpecialTokens; /// /// The MetaTokenizer is a TreeTokenizer that implements the concept of meta-tokens. 
Meta @@ -113,7 +115,9 @@ impl TryFrom<&Path> for MetaTokenizer { }; // update the universe with the metatoken - universe.insert_token(&meta_region); + if !universe.contains_region(&meta_region) { + universe.insert_token(&meta_region); + } // insert a region into the appropriate list let ilist = intervals.entry(region.chr.clone()).or_default(); @@ -195,7 +199,9 @@ impl TryFrom<&Path> for MetaTokenizer { }; // update the universe with the metatoken - universe.insert_token(&meta_region); + if !universe.contains_region(&meta_region) { + universe.insert_token(&meta_region); + } // insert a region into the appropriate list let ilist = intervals.entry(region.chr.clone()).or_default(); @@ -282,11 +288,219 @@ impl TryFrom<&Path> for MetaTokenizer { } } +impl SpecialTokens for MetaTokenizer { + fn unknown_token(&self) -> Region { + Region { + chr: UNKNOWN_CHR.to_string(), + start: UNKNOWN_START as u32, + end: UNKNOWN_END as u32, + } + } + + fn padding_token(&self) -> Region { + Region { + chr: PAD_CHR.to_string(), + start: PAD_START as u32, + end: PAD_END as u32, + } + } + + fn mask_token(&self) -> Region { + Region { + chr: MASK_CHR.to_string(), + start: MASK_START as u32, + end: MASK_END as u32, + } + } + + fn cls_token(&self) -> Region { + Region { + chr: CLS_CHR.to_string(), + start: CLS_START as u32, + end: CLS_END as u32, + } + } + + fn bos_token(&self) -> Region { + Region { + chr: BOS_CHR.to_string(), + start: BOS_START as u32, + end: BOS_END as u32, + } + } + + fn eos_token(&self) -> Region { + Region { + chr: EOS_CHR.to_string(), + start: EOS_START as u32, + end: EOS_END as u32, + } + } + + fn sep_token(&self) -> Region { + Region { + chr: SEP_CHR.to_string(), + start: SEP_START as u32, + end: SEP_END as u32, + } + } + + fn unknown_token_id(&self) -> u32 { + self.universe + .convert_region_to_id(&self.unknown_token()) + .unwrap() + } + + fn padding_token_id(&self) -> u32 { + self.universe + .convert_region_to_id(&self.padding_token()) + .unwrap() + } + + fn mask_token_id(&self) -> u32 { + self.universe + .convert_region_to_id(&self.mask_token()) + .unwrap() + } + + fn cls_token_id(&self) -> u32 { + self.universe + .convert_region_to_id(&self.cls_token()) + .unwrap() + } + + fn bos_token_id(&self) -> u32 { + self.universe + .convert_region_to_id(&self.bos_token()) + .unwrap() + } + + fn eos_token_id(&self) -> u32 { + self.universe + .convert_region_to_id(&self.eos_token()) + .unwrap() + } + + fn sep_token_id(&self) -> u32 { + self.universe + .convert_region_to_id(&self.sep_token()) + .unwrap() + } +} + +impl Tokenizer for MetaTokenizer { + + fn vocab_size(&self) -> usize { + self.universe.len() + } + + fn get_universe(&self) -> &Universe { + &self.universe + } + + fn tokenize_region(&self, region: &Region) -> TokenizedRegionSet { + let lapper = self.tree.get(®ion.chr); + + match lapper { + Some(lapper) => { + let intervals = lapper.find(region.start, region.end); + let mut ids: Vec = intervals.map(|interval| interval.val).collect(); + + // tokenized to nothing... 
check secondary trees + if ids.is_empty() { + // oh, we have no secondary trees, just return the unknown token + if self.secondary_trees.is_none() { + ids = vec![self.unknown_token_id()]; + // iterate over secondary trees and check if the region is in any of them + } else { + for s_tree in self.secondary_trees.as_ref().unwrap() { + // default to unknown token + ids = vec![self.unknown_token_id()]; + + let s_lapper = s_tree.get(®ion.chr); + if s_lapper.is_none() { + continue; + } + // get overlapped intervals -- map to regions + let intervals = s_lapper.unwrap().find(region.start, region.end); + let regions: Vec = + intervals.map(|interval| interval.val).collect(); + + // a hit + if !regions.is_empty() { + ids = regions; + break; + } + } + } + } + + TokenizedRegionSet { + ids, + universe: &self.universe, + } + } + // primary universe didnt have that chromosome/contig/seqname + // so, check secondary trees + None => { + let mut ids = Vec::new(); + // oh, we have no secondary trees, just return the unknown token + if self.secondary_trees.is_none() { + ids = vec![self.unknown_token_id()]; + // iterate over secondary trees and check if the region is in any of them + } else { + for s_tree in self.secondary_trees.as_ref().unwrap() { + // default to unknown token + ids = vec![self.unknown_token_id()]; + + let s_lapper = s_tree.get(®ion.chr); + if s_lapper.is_none() { + continue; + } + + // get overlapped intervals -- map to regions + let intervals = s_lapper.unwrap().find(region.start, region.end); + let regions: Vec = intervals.map(|interval| interval.val).collect(); + + // a hit + if !regions.is_empty() { + ids = regions; + break; + } else { + ids = vec![self.unknown_token_id()]; + } + } + } + + TokenizedRegionSet { + ids, + universe: &self.universe, + } + } + } + } + + fn tokenize_region_set(&self, region_set: &RegionSet) -> TokenizedRegionSet { + let mut tokenized_regions: Vec = Vec::new(); + + for region in region_set { + let tokenized_region = self.tokenize_region(region); + tokenized_regions.extend(tokenized_region.ids); + } + + TokenizedRegionSet { + ids: tokenized_regions, + universe: &self.universe, + } + } +} // tests #[cfg(test)] mod tests { + use crate::common::models::RegionSet; + use super::*; use pretty_assertions::assert_eq; use rstest::*; @@ -306,4 +520,61 @@ mod tests { let tokenizer = MetaTokenizer::try_from(Path::new(path_to_config_file)).unwrap(); assert_eq!(tokenizer.universe.len(), 27); } + + #[rstest] + fn test_does_tokenize(path_to_config_file: &str, path_to_tokenize_bed_file: &str) { + let tokenizer = MetaTokenizer::try_from(Path::new(path_to_config_file)).unwrap(); + let region_set = RegionSet::try_from(Path::new(path_to_tokenize_bed_file)).unwrap(); + + let tokens = tokenizer.tokenize_region_set(®ion_set); + + assert_eq!(tokens.len(), 4); + } + + #[rstest] + fn test_tokenize_to_first_second_unk(path_to_config_file: &str) { + let tokenizer = MetaTokenizer::try_from(Path::new(path_to_config_file)).unwrap(); + + let r1 = Region { // tokenize to id 1 + chr: "chr4".to_string(), + start: 16270184, + end: 16270240 + }; + + let r2 = Region { // drops through to the secondary and tokenizes to id 13 + chr: "chr10".to_string(), + start: 705762, + end: 705762 + }; + + let r3 = Region { // unknown token, so should be id 20 + chr: "chrY".to_string(), + start: 1000000, + end: 1000000 + }; + + assert_eq!(tokenizer.tokenize_region(&r1).ids, vec![1]); + assert_eq!(tokenizer.tokenize_region(&r2).ids, vec![13]); + assert_eq!(tokenizer.tokenize_region(&r3).ids, vec![20]); + } + + #[rstest] 
+ fn test_multiple_regions_to_one_meta_id(path_to_config_file: &str) { + let tokenizer = MetaTokenizer::try_from(Path::new(path_to_config_file)).unwrap(); + + let r1 = Region { // tokenize to 2 + chr: "chr10".to_string(), + start: 70576220, + end: 70576251 + }; + + let r2 = Region { // tokenize to id 2 + chr: "chr2".to_string(), + start: 203871487, + end: 203871688 + }; + + assert_eq!(tokenizer.tokenize_region(&r1).ids, vec![2]); + assert_eq!(tokenizer.tokenize_region(&r2).ids, vec![2]); + } } \ No newline at end of file diff --git a/gtars/tests/data/chroms.meta.bed b/gtars/tests/data/chroms.meta.bed index 7ee71a8..76b9bac 100644 --- a/gtars/tests/data/chroms.meta.bed +++ b/gtars/tests/data/chroms.meta.bed @@ -20,5 +20,4 @@ chr19 0 58617616 cluster_8 chr20 0 64444167 cluster_9 chr21 0 46709983 cluster_9 chr22 0 50818468 cluster_10 -chrX 0 156040895 cluster_11 -chrY 0 57227415 cluster_11 \ No newline at end of file +chrX 0 156040895 cluster_11 \ No newline at end of file From 8342c2e32dcd4adfd4e9a2b4a7c1c9e0c394f849 Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Tue, 25 Jun 2024 11:13:46 -0400 Subject: [PATCH 17/28] add python bindings to the meta tokenizer --- bindings/gtars/tokenizers/__init__.pyi | 153 ++++++++- bindings/src/models/region_set.rs | 2 +- bindings/src/tokenizers/meta_tokenizer.rs | 199 +++++++++++ bindings/src/tokenizers/mod.rs | 3 + gtars/docs/universes.svg | 311 ++++++++++++++++-- .../src/common/models/tokenized_regionset.rs | 2 +- gtars/src/tokenizers/mod.rs | 3 +- 7 files changed, 647 insertions(+), 26 deletions(-) create mode 100644 bindings/src/tokenizers/meta_tokenizer.rs diff --git a/bindings/gtars/tokenizers/__init__.pyi b/bindings/gtars/tokenizers/__init__.pyi index 946cd40..945d644 100644 --- a/bindings/gtars/tokenizers/__init__.pyi +++ b/bindings/gtars/tokenizers/__init__.pyi @@ -220,7 +220,7 @@ class TokenizedRegionSet: class TreeTokenizer: def __new__(cls, path: str) -> TreeTokenizer: """ - Construct a new TreeTokenize from a universe file. + Construct a new TreeTokenizer from a universe file. :param path: The path to the universe file. This should be a BED file. """ @@ -383,4 +383,155 @@ class FragmentTokenizer: :param file_path: The path to the file containing fragments. :param out_path: The path to the output file. If None, the output is written to the standard output. :param filter: A list of chromosomes to filter. If None, all chromosomes are included. + """ + +class MetaTokenizer: + def __new__(cls, path: str) -> MetaTokenizer: + """ + Construct a new MetaTokenizer from a universe file. + + :param path: The path to the universe file. This should be a BED file. + """ + + def unknown_token(self) -> Region: + """ + Get the unknown token. + """ + + def padding_token(self) -> Region: + """ + Get the padding token. + """ + + def mask_token(self) -> Region: + """ + Get the mask token. + """ + + def cls_token(self) -> Region: + """ + Get the CLS token. + """ + + def bos_token(self) -> Region: + """ + Get the BOS token. + """ + + def eos_token(self) -> Region: + """ + Get the EOS token. + """ + + def sep_token(self) -> Region: + """ + Get the SEP token. + """ + + def unknown_token_id(self) -> int: + """ + Get the ID of the unknown token. + """ + + def padding_token_id(self) -> int: + """ + Get the ID of the padding token. + """ + + def mask_token_id(self) -> int: + """ + Get the ID of the mask token. + """ + + def cls_token_id(self) -> int: + """ + Get the ID of the CLS token. + """ + + def bos_token_id(self) -> int: + """ + Get the ID of the BOS token. 
+ """ + + def eos_token_id(self) -> int: + """ + Get the ID of the EOS token. + """ + + def sep_token_id(self) -> int: + """ + Get the ID of the SEP token. + """ + + def vocab_size(self) -> int: + """ + Get the vocabulary size. + """ + + def tokenize(self, regions: List[Region]) -> List[Region]: + """ + Tokenize a list of regions. This will only return the tokenized regions. + + :param regions: The regions to tokenize. + + :return: The tokenized regions as a list. + """ + + def tokenize_bed_file(self, path: str) -> List[Region]: + """ + Tokenize a BED file directly. + + :param path: The path to the BED file. + + :return: The tokenized regions as a list. + """ + + def encode(self, regions: List[Region]) -> List[int]: + """ + Encode a list of regions. This will return the integer representation of the tokenized regions. + + :param regions: The regions to encode. + + :return: The integer representation of the tokenized regions. + """ + + def decode(self, ids: List[int]) -> List[Region]: + """ + Decode a list of integer representations of the tokenized regions. + + :param ids: The integer representations of the tokenized regions. + + :return: The decoded regions. + """ + + def vocab(self) -> List[Tuple[Region, int]]: + """ + Get the vocabulary. + + :return: The vocabulary as a list of tuples. + """ + + @property + def universe(self) -> Universe: + """ + The universe object. + """ + + def __call__(self, regions: List[Region]) -> TokenizedRegionSet: + """ + Tokenize a list of regions. + + :param regions: The regions to tokenize. + + :return: A TokenizedRegionSet object. + """ + + def __len__(self) -> int: + """ + Get the vocabulary size. + """ + + def __repr__(self) -> str: + """ + Get a string representation of the tokenizer. """ \ No newline at end of file diff --git a/bindings/src/models/region_set.rs b/bindings/src/models/region_set.rs index 271a8e4..044e7cf 100644 --- a/bindings/src/models/region_set.rs +++ b/bindings/src/models/region_set.rs @@ -123,7 +123,7 @@ impl PyTokenizedRegionSet { Ok(self .ids .iter() - .map(|id| self.universe.borrow(py).id_to_region[&id].clone()) + .map(|id| self.universe.borrow(py).id_to_region[id].clone()) .collect()) }) } diff --git a/bindings/src/tokenizers/meta_tokenizer.rs b/bindings/src/tokenizers/meta_tokenizer.rs new file mode 100644 index 0000000..941a6d4 --- /dev/null +++ b/bindings/src/tokenizers/meta_tokenizer.rs @@ -0,0 +1,199 @@ +use gtars::tokenizers::traits::SpecialTokens; +use pyo3::prelude::*; +use pyo3::types::PyAny; + +use anyhow::Result; + +use std::path::Path; + +use gtars::common::models::RegionSet; +use gtars::tokenizers::{Tokenizer, MetaTokenizer}; + +use crate::models::{PyRegion, PyTokenizedRegionSet, PyUniverse}; +use crate::utils::extract_regions_from_py_any; + +#[pyclass(name = "MetaTokenizer")] +pub struct PyMetaTokenizer { + pub tokenizer: MetaTokenizer, + pub universe: Py, // this is a Py-wrapped version self.tokenizer.universe for performance reasons +} + +#[pymethods] +impl PyMetaTokenizer { + #[new] + pub fn new(path: String) -> Result { + Python::with_gil(|py| { + let path = Path::new(&path); + let tokenizer = MetaTokenizer::try_from(path)?; + let py_universe: PyUniverse = tokenizer.universe.to_owned().into(); + let py_universe_bound = Py::new(py, py_universe)?; + + Ok(PyMetaTokenizer { + tokenizer, + universe: py_universe_bound, + }) + }) + } + + #[getter] + pub fn unknown_token(&self) -> Result { + Ok(self.tokenizer.unknown_token().into()) + } + + #[getter] + pub fn padding_token(&self) -> Result { + 
Ok(self.tokenizer.padding_token().into()) + } + + #[getter] + pub fn mask_token(&self) -> Result { + Ok(self.tokenizer.mask_token().into()) + } + + #[getter] + pub fn cls_token(&self) -> Result { + Ok(self.tokenizer.cls_token().into()) + } + + #[getter] + pub fn bos_token(&self) -> Result { + Ok(self.tokenizer.bos_token().into()) + } + + #[getter] + pub fn eos_token(&self) -> Result { + Ok(self.tokenizer.eos_token().into()) + } + + #[getter] + pub fn sep_token(&self) -> Result { + Ok(self.tokenizer.sep_token().into()) + } + + #[getter] + pub fn padding_token_id(&self) -> u32 { + self.tokenizer.padding_token_id() + } + + #[getter] + pub fn mask_token_id(&self) -> u32 { + self.tokenizer.mask_token_id() + } + + #[getter] + pub fn cls_token_id(&self) -> u32 { + self.tokenizer.cls_token_id() + } + + #[getter] + pub fn bos_token_id(&self) -> u32 { + self.tokenizer.bos_token_id() + } + + #[getter] + pub fn eos_token_id(&self) -> u32 { + self.tokenizer.eos_token_id() + } + + #[getter] + pub fn sep_token_id(&self) -> u32 { + self.tokenizer.sep_token_id() + } + + #[getter] + pub fn unknown_token_id(&self) -> u32 { + self.tokenizer.unknown_token_id() + } + + #[getter] + pub fn vocab_size(&self) -> usize { + self.tokenizer.vocab_size() + } + + #[getter] + pub fn universe(&self) -> PyUniverse { + self.tokenizer.universe.clone().into() + } + + // tokenize just returns a list of regions + pub fn tokenize(&self, regions: &Bound<'_, PyAny>) -> Result> { + let rs = extract_regions_from_py_any(regions)?; + + // tokenize the RegionSet + let tokenized = self.tokenizer.tokenize_region_set(&rs); + + let regions = tokenized.into_region_vec(); + + Ok(regions.into_iter().map(|r| r.into()).collect()) + } + + pub fn tokenize_bed_file(&self, path: String) -> Result> { + let path = Path::new(&path); + let regions = RegionSet::try_from(path)?; + + let tokenized = self.tokenizer.tokenize_region_set(®ions); + + let regions = tokenized.into_region_vec(); + + Ok(regions.into_iter().map(|r| r.into()).collect()) + } + + // __call__ returns a TokenizedRegionSet + pub fn __call__(&self, regions: &Bound<'_, PyAny>) -> Result { + // attempt to map the list to a vector of regions + let rs = extract_regions_from_py_any(regions)?; + + // tokenize the RegionSet + let tokenized = self.tokenizer.tokenize_region_set(&rs); + + Python::with_gil(|py| { + let py_tokenized_region_set = PyTokenizedRegionSet { + ids: tokenized.ids, + curr: 0, + universe: self.universe.clone_ref(py), + }; + + Ok(py_tokenized_region_set) + }) + } + + // encode returns a list of ids + pub fn encode(&self, regions: &Bound<'_, PyAny>) -> Result> { + // attempt to map the list to a vector of regions + let rs = extract_regions_from_py_any(regions)?; + + // tokenize the RegionSet + let tokenized = self.tokenizer.tokenize_region_set(&rs); + + Ok(tokenized.ids) + } + + pub fn decode(&self, ids: Vec) -> Result> { + let regions = ids + .iter() + .map(|id| self.tokenizer.universe.id_to_region[id].clone().into()) + .collect(); + + Ok(regions) + } + + pub fn vocab(&self) -> Vec<(PyRegion, u32)> { + self.tokenizer + .universe + .regions + .iter() + .map(|r| (r.clone().into(), self.tokenizer.universe.region_to_id[r])) + .collect() + } + + pub fn __len__(&self) -> usize { + self.tokenizer.universe.len() + } + + pub fn __repr__(&self) -> String { + format!( + "MetaTokenizer({} total regions)", + self.tokenizer.universe.len() + ) + } +} diff --git a/bindings/src/tokenizers/mod.rs b/bindings/src/tokenizers/mod.rs index 0bf0728..5ff0c37 100644 --- a/bindings/src/tokenizers/mod.rs 
+++ b/bindings/src/tokenizers/mod.rs @@ -1,10 +1,12 @@ mod fragments_tokenizer; mod tree_tokenizer; +mod meta_tokenizer; use pyo3::prelude::*; pub use self::fragments_tokenizer::PyFragmentTokenizer; pub use self::tree_tokenizer::PyTreeTokenizer; +pub use self::meta_tokenizer::PyMetaTokenizer; pub use crate::models::{ PyRegion, PyRegionSet, PyTokenizedRegion, PyTokenizedRegionSet, PyUniverse, }; @@ -12,6 +14,7 @@ pub use crate::models::{ #[pymodule] pub fn tokenizers(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/gtars/docs/universes.svg b/gtars/docs/universes.svg index 8d16ba7..80bea7d 100644 --- a/gtars/docs/universes.svg +++ b/gtars/docs/universes.svg @@ -23,33 +23,18 @@ inkscape:pagecheckerboard="0" inkscape:deskcolor="#d1d1d1" inkscape:document-units="mm" - inkscape:zoom="1.8537713" - inkscape:cx="296.69248" - inkscape:cy="193.11983" + inkscape:zoom="2.7866297" + inkscape:cx="500.78415" + inkscape:cy="251.55836" inkscape:window-width="2560" inkscape:window-height="1387" inkscape:window-x="0" inkscape:window-y="899" inkscape:window-maximized="0" - inkscape:current-layer="layer1" />C + y="77.150002">CQueryU1U2TokensMetatokens?C diff --git a/gtars/src/common/models/tokenized_regionset.rs b/gtars/src/common/models/tokenized_regionset.rs index 5fd8ac3..03d07a4 100644 --- a/gtars/src/common/models/tokenized_regionset.rs +++ b/gtars/src/common/models/tokenized_regionset.rs @@ -50,7 +50,7 @@ impl From> for Vec { value .ids .iter() - .map(|id| value.universe.id_to_region[&id].to_owned()) + .map(|id| value.universe.id_to_region[id].to_owned()) .collect() } } diff --git a/gtars/src/tokenizers/mod.rs b/gtars/src/tokenizers/mod.rs index 19b94ad..602f91f 100644 --- a/gtars/src/tokenizers/mod.rs +++ b/gtars/src/tokenizers/mod.rs @@ -41,4 +41,5 @@ pub mod consts { pub use config::TokenizerConfig; pub use fragment_tokenizer::FragmentTokenizer; pub use traits::{SingleCellTokenizer, Tokenizer}; -pub use tree_tokenizer::TreeTokenizer; \ No newline at end of file +pub use tree_tokenizer::TreeTokenizer; +pub use meta_tokenizer::MetaTokenizer; \ No newline at end of file From b9cb071e19a0d3dd2149297a53bd06d0c98ca4f1 Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Tue, 25 Jun 2024 13:17:33 -0400 Subject: [PATCH 18/28] remove gtokens --- gtars/tokens.gtok | Bin 11 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 gtars/tokens.gtok diff --git a/gtars/tokens.gtok b/gtars/tokens.gtok deleted file mode 100644 index 4856944c538f14d9a70e4fdd6746ced858a8d3bf..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 11 ScmZ<{@%LubVn}6p&I|w#=K`<* From a0bc942e1666ef9607e4c8741b5d091c2257394f Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Tue, 25 Jun 2024 13:28:27 -0400 Subject: [PATCH 19/28] add token param --- .github/workflows/codecov.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml index 5c3367f..a34866a 100644 --- a/.github/workflows/codecov.yml +++ b/.github/workflows/codecov.yml @@ -26,6 +26,7 @@ jobs: - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 with: + token: ${{ secrets.CODECOV_TOKEN }} files: lcov.info fail_ci_if_error: true working-directory: ./gtars \ No newline at end of file From 0c9889b798e632ceaeb5429b11341e68f2ecc299 Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Tue, 25 Jun 2024 18:40:59 -0400 Subject: [PATCH 20/28] add dynamic 
tokenizer builder --- gtars/src/tokenizers/builder.rs | 66 +++++++++++++++++++++++++++++++++ gtars/src/tokenizers/config.rs | 1 + gtars/src/tokenizers/mod.rs | 1 + 3 files changed, 68 insertions(+) create mode 100644 gtars/src/tokenizers/builder.rs diff --git a/gtars/src/tokenizers/builder.rs b/gtars/src/tokenizers/builder.rs new file mode 100644 index 0000000..5ab39c4 --- /dev/null +++ b/gtars/src/tokenizers/builder.rs @@ -0,0 +1,66 @@ +use std::path::Path; + +use anyhow::{Error, Result}; + +use super::{ + Tokenizer, + MetaTokenizer, + TreeTokenizer, + FragmentTokenizer, + TokenizerConfig, +}; + + +pub struct TokenizerBuilder; + +impl TokenizerBuilder { + pub fn from_toml(path: &Path) -> Result> { + let config = TokenizerConfig::new(path)?; + if let Some(tokenizer_type) = config.tokenizer_type { + match tokenizer_type.as_str() { + "tree" => { + Ok(Box::new(TreeTokenizer::try_from(path)?)) + }, + "meta" => { + Ok(Box::new(MetaTokenizer::try_from(path)?)) + }, + _ => { + Err(Error::msg("Tokenizer type not supported")) + } + } + } else { + println!("No tokenizer type found in config file. Instantiating a default TreeTokenizer. Note that this may lead to unexpected behavior."); + Ok(Box::new(TreeTokenizer::try_from(path)?)) + } + + } +} + +#[cfg(test)] +mod tests { + + use crate::common::models::{Region, RegionSet}; + use crate::tokenizers::traits::SpecialTokens; + use std::path::Path; + + use super::*; + use pretty_assertions::assert_eq; + use rstest::*; + + #[fixture] + fn path_to_bed_file() -> &'static str { + "tests/data/peaks.bed" + } + + #[fixture] + fn path_to_config_file() -> &'static str { + "tests/data/tokenizer.toml" + } + + #[rstest] + fn test_from_toml(path_to_config_file: &str) { + let path = Path::new(path_to_config_file); + let tokenizer = TokenizerBuilder::from_toml(path).unwrap(); + assert_eq!(tokenizer.vocab_size(), 56); + } +} \ No newline at end of file diff --git a/gtars/src/tokenizers/config.rs b/gtars/src/tokenizers/config.rs index 935605b..7343cb7 100644 --- a/gtars/src/tokenizers/config.rs +++ b/gtars/src/tokenizers/config.rs @@ -6,6 +6,7 @@ use serde::{Deserialize, Serialize}; #[derive(Deserialize, Serialize, Debug, PartialEq)] pub struct TokenizerConfig { + pub tokenizer_type: Option, pub universes: Vec, pub exclude_ranges: Option, } diff --git a/gtars/src/tokenizers/mod.rs b/gtars/src/tokenizers/mod.rs index 602f91f..c65540b 100644 --- a/gtars/src/tokenizers/mod.rs +++ b/gtars/src/tokenizers/mod.rs @@ -22,6 +22,7 @@ //! println!("{:?}", tokenized_regions.ids); //! ``` pub mod cli; +pub mod builder; pub mod config; pub mod fragment_tokenizer; pub mod meta_tokenizer; From cac526174ed3f6cca4523e723d5b964cda94357f Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Tue, 25 Jun 2024 18:49:35 -0400 Subject: [PATCH 21/28] update tests for saving tokens.gtok --- gtars/src/io/mod.rs | 4 ++-- gtars/src/lib.rs | 2 +- gtars/tests/data/out/tokens.gtok | Bin 0 -> 11 bytes 3 files changed, 3 insertions(+), 3 deletions(-) create mode 100644 gtars/tests/data/out/tokens.gtok diff --git a/gtars/src/io/mod.rs b/gtars/src/io/mod.rs index 9b7693b..9904602 100644 --- a/gtars/src/io/mod.rs +++ b/gtars/src/io/mod.rs @@ -11,12 +11,12 @@ //! use gtars::io::write_tokens_to_gtok; //! //! let ids = vec![42, 101, 999]; -//! write_tokens_to_gtok("tokens.gtok", &ids); +//! write_tokens_to_gtok("tests/data/out/tokens.gtok", &ids); //! ``` //! ### Read tokens from disk //! ```rust //! use gtars::io::read_tokens_from_gtok; -//! let ids = read_tokens_from_gtok("tokens.gtok").unwrap(); +//! 
let ids = read_tokens_from_gtok("tests/data/out/tokens.gtok").unwrap(); //! //! println!("{:?}", ids); // [42, 101, 999] //! ``` diff --git a/gtars/src/lib.rs b/gtars/src/lib.rs index abcdc30..67b014a 100644 --- a/gtars/src/lib.rs +++ b/gtars/src/lib.rs @@ -31,7 +31,7 @@ //! use gtars::io::write_tokens_to_gtok; //! //! let ids = vec![42, 101, 999]; -//! write_tokens_to_gtok("tokens.gtok", &ids); +//! write_tokens_to_gtok("tests/data/out/tokens.gtok", &ids); //! ``` pub mod ailist; pub mod common; diff --git a/gtars/tests/data/out/tokens.gtok b/gtars/tests/data/out/tokens.gtok new file mode 100644 index 0000000000000000000000000000000000000000..4856944c538f14d9a70e4fdd6746ced858a8d3bf GIT binary patch literal 11 ScmZ<{@%LubVn}6p&I|w#=K`<* literal 0 HcmV?d00001 From df7df8646071004593f77d920d1dd708d53c886c Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Wed, 26 Jun 2024 13:25:04 -0400 Subject: [PATCH 22/28] WIP TokenizerBuilder --- bindings/src/tokenizers/builder.rs | 48 ++++++++++++++++++++++++++++++ bindings/src/tokenizers/mod.rs | 1 + gtars/src/tokenizers/builder.rs | 6 ++-- gtars/src/tokenizers/mod.rs | 3 +- 4 files changed, 53 insertions(+), 5 deletions(-) create mode 100644 bindings/src/tokenizers/builder.rs diff --git a/bindings/src/tokenizers/builder.rs b/bindings/src/tokenizers/builder.rs new file mode 100644 index 0000000..d96f552 --- /dev/null +++ b/bindings/src/tokenizers/builder.rs @@ -0,0 +1,48 @@ +// TODO: stil a work in progress +use pyo3::prelude::*; + +use anyhow::Result; + +use std::path::Path; + + +use gtars::tokenizers::TokenizerConfig; + +use super::{ + PyMetaTokenizer, + PyTreeTokenizer +}; + +#[pyclass(name="TokenizerBuilder")] +pub struct PyTokenizerBuilder; + +#[pymethods] +impl PyTokenizerBuilder { + + #[classmethod] + pub fn from_toml(path: String) -> Result { + let config = TokenizerConfig::new(Path::new(&path))?; + + match config.tokenizer_type { + Some(tokenizer_type) => { + match tokenizer_type.as_str() { + "tree" => { + let t = PyTreeTokenizer::new(path)?; + t.to_object() + }, + "meta" => { + PyMetaTokenizer::new(path) + }, + _ => { + anyhow::bail!("Tokenizer type {} not supported", tokenizer_type) + } + } + }, + None => { + println!("No tokenizer type found in config file. Instantiating a default TreeTokenizer. 
Note that this may lead to unexpected behavior."); + PyTreeTokenizer::new(path) + } + }; + + } +} \ No newline at end of file diff --git a/bindings/src/tokenizers/mod.rs b/bindings/src/tokenizers/mod.rs index 5ff0c37..54bad4a 100644 --- a/bindings/src/tokenizers/mod.rs +++ b/bindings/src/tokenizers/mod.rs @@ -1,6 +1,7 @@ mod fragments_tokenizer; mod tree_tokenizer; mod meta_tokenizer; +// mod builder; use pyo3::prelude::*; diff --git a/gtars/src/tokenizers/builder.rs b/gtars/src/tokenizers/builder.rs index 5ab39c4..04046cd 100644 --- a/gtars/src/tokenizers/builder.rs +++ b/gtars/src/tokenizers/builder.rs @@ -6,7 +6,7 @@ use super::{ Tokenizer, MetaTokenizer, TreeTokenizer, - FragmentTokenizer, + // FragmentTokenizer, TokenizerConfig, }; @@ -23,7 +23,7 @@ impl TokenizerBuilder { }, "meta" => { Ok(Box::new(MetaTokenizer::try_from(path)?)) - }, + }, _ => { Err(Error::msg("Tokenizer type not supported")) } @@ -39,8 +39,6 @@ impl TokenizerBuilder { #[cfg(test)] mod tests { - use crate::common::models::{Region, RegionSet}; - use crate::tokenizers::traits::SpecialTokens; use std::path::Path; use super::*; diff --git a/gtars/src/tokenizers/mod.rs b/gtars/src/tokenizers/mod.rs index c65540b..50fbfc0 100644 --- a/gtars/src/tokenizers/mod.rs +++ b/gtars/src/tokenizers/mod.rs @@ -43,4 +43,5 @@ pub use config::TokenizerConfig; pub use fragment_tokenizer::FragmentTokenizer; pub use traits::{SingleCellTokenizer, Tokenizer}; pub use tree_tokenizer::TreeTokenizer; -pub use meta_tokenizer::MetaTokenizer; \ No newline at end of file +pub use meta_tokenizer::MetaTokenizer; +pub use builder::TokenizerBuilder; \ No newline at end of file From 429b69e820f9e79a7b3704baa6d5f35ece9213d4 Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Wed, 26 Jun 2024 13:36:51 -0400 Subject: [PATCH 23/28] meta tokenizer updates --- bindings/src/tokenizers/meta_tokenizer.rs | 2 +- bindings/src/tokenizers/mod.rs | 4 +-- gtars/src/tokenizers/builder.rs | 22 +++++--------- gtars/src/tokenizers/meta_tokenizer.rs | 37 ++++++++++++----------- gtars/src/tokenizers/mod.rs | 6 ++-- 5 files changed, 33 insertions(+), 38 deletions(-) diff --git a/bindings/src/tokenizers/meta_tokenizer.rs b/bindings/src/tokenizers/meta_tokenizer.rs index 941a6d4..863e317 100644 --- a/bindings/src/tokenizers/meta_tokenizer.rs +++ b/bindings/src/tokenizers/meta_tokenizer.rs @@ -7,7 +7,7 @@ use anyhow::Result; use std::path::Path; use gtars::common::models::RegionSet; -use gtars::tokenizers::{Tokenizer, MetaTokenizer}; +use gtars::tokenizers::{MetaTokenizer, Tokenizer}; use crate::models::{PyRegion, PyTokenizedRegionSet, PyUniverse}; use crate::utils::extract_regions_from_py_any; diff --git a/bindings/src/tokenizers/mod.rs b/bindings/src/tokenizers/mod.rs index 54bad4a..14c5e08 100644 --- a/bindings/src/tokenizers/mod.rs +++ b/bindings/src/tokenizers/mod.rs @@ -1,13 +1,13 @@ mod fragments_tokenizer; -mod tree_tokenizer; mod meta_tokenizer; +mod tree_tokenizer; // mod builder; use pyo3::prelude::*; pub use self::fragments_tokenizer::PyFragmentTokenizer; -pub use self::tree_tokenizer::PyTreeTokenizer; pub use self::meta_tokenizer::PyMetaTokenizer; +pub use self::tree_tokenizer::PyTreeTokenizer; pub use crate::models::{ PyRegion, PyRegionSet, PyTokenizedRegion, PyTokenizedRegionSet, PyUniverse, }; diff --git a/gtars/src/tokenizers/builder.rs b/gtars/src/tokenizers/builder.rs index 04046cd..143dc79 100644 --- a/gtars/src/tokenizers/builder.rs +++ b/gtars/src/tokenizers/builder.rs @@ -3,14 +3,13 @@ use std::path::Path; use anyhow::{Error, Result}; use super::{ - 
Tokenizer, MetaTokenizer, - TreeTokenizer, + Tokenizer, // FragmentTokenizer, TokenizerConfig, + TreeTokenizer, }; - pub struct TokenizerBuilder; impl TokenizerBuilder { @@ -18,21 +17,14 @@ impl TokenizerBuilder { let config = TokenizerConfig::new(path)?; if let Some(tokenizer_type) = config.tokenizer_type { match tokenizer_type.as_str() { - "tree" => { - Ok(Box::new(TreeTokenizer::try_from(path)?)) - }, - "meta" => { - Ok(Box::new(MetaTokenizer::try_from(path)?)) - }, - _ => { - Err(Error::msg("Tokenizer type not supported")) - } + "tree" => Ok(Box::new(TreeTokenizer::try_from(path)?)), + "meta" => Ok(Box::new(MetaTokenizer::try_from(path)?)), + _ => Err(Error::msg("Tokenizer type not supported")), } } else { - println!("No tokenizer type found in config file. Instantiating a default TreeTokenizer. Note that this may lead to unexpected behavior."); + println!("No tokenizer type found in config file. Instantiating a default TreeTokenizer. Note that this may lead to unexpected behavior."); Ok(Box::new(TreeTokenizer::try_from(path)?)) } - } } @@ -61,4 +53,4 @@ mod tests { let tokenizer = TokenizerBuilder::from_toml(path).unwrap(); assert_eq!(tokenizer.vocab_size(), 56); } -} \ No newline at end of file +} diff --git a/gtars/src/tokenizers/meta_tokenizer.rs b/gtars/src/tokenizers/meta_tokenizer.rs index 3767679..eca0a2a 100644 --- a/gtars/src/tokenizers/meta_tokenizer.rs +++ b/gtars/src/tokenizers/meta_tokenizer.rs @@ -6,9 +6,9 @@ use anyhow::{Context, Result}; use rust_lapper::{Interval, Lapper}; use crate::common::consts::special_tokens::*; -use crate::common::models::{Region, RegionSet, Universe, TokenizedRegionSet}; +use crate::common::models::{Region, RegionSet, TokenizedRegionSet, Universe}; use crate::common::utils::get_dynamic_reader; -use crate::tokenizers::{TokenizerConfig, Tokenizer}; +use crate::tokenizers::{Tokenizer, TokenizerConfig}; use super::traits::SpecialTokens; @@ -83,7 +83,7 @@ impl TryFrom<&Path> for MetaTokenizer { let end = fields[2].parse::().with_context(|| { format!("Failed to parse end position in BED file line: {}", line) })?; - + // why is primary_ being prepended to the metatoken id? 
// - this is a way to ensure that the metatoken id is unique, // imagine a secondary universe that has the same metatoken id @@ -98,7 +98,7 @@ impl TryFrom<&Path> for MetaTokenizer { seen_metatokens.insert(meta_id, id); id } - }; + }; // construct the actual region let region = Region { @@ -144,7 +144,6 @@ impl TryFrom<&Path> for MetaTokenizer { let mut secondary_trees = Vec::new(); for (u_num, other_universe) in other_universes.iter().enumerate() { - let other_universe = value.parent().unwrap().join(other_universe); let reader = get_dynamic_reader(Path::new(&other_universe))?; @@ -389,7 +388,6 @@ impl SpecialTokens for MetaTokenizer { } impl Tokenizer for MetaTokenizer { - fn vocab_size(&self) -> usize { self.universe.len() } @@ -535,22 +533,25 @@ mod tests { fn test_tokenize_to_first_second_unk(path_to_config_file: &str) { let tokenizer = MetaTokenizer::try_from(Path::new(path_to_config_file)).unwrap(); - let r1 = Region { // tokenize to id 1 + let r1 = Region { + // tokenize to id 1 chr: "chr4".to_string(), start: 16270184, - end: 16270240 + end: 16270240, }; - let r2 = Region { // drops through to the secondary and tokenizes to id 13 + let r2 = Region { + // drops through to the secondary and tokenizes to id 13 chr: "chr10".to_string(), start: 705762, - end: 705762 + end: 705762, }; - let r3 = Region { // unknown token, so should be id 20 + let r3 = Region { + // unknown token, so should be id 20 chr: "chrY".to_string(), start: 1000000, - end: 1000000 + end: 1000000, }; assert_eq!(tokenizer.tokenize_region(&r1).ids, vec![1]); @@ -562,19 +563,21 @@ mod tests { fn test_multiple_regions_to_one_meta_id(path_to_config_file: &str) { let tokenizer = MetaTokenizer::try_from(Path::new(path_to_config_file)).unwrap(); - let r1 = Region { // tokenize to 2 + let r1 = Region { + // tokenize to 2 chr: "chr10".to_string(), start: 70576220, - end: 70576251 + end: 70576251, }; - let r2 = Region { // tokenize to id 2 + let r2 = Region { + // tokenize to id 2 chr: "chr2".to_string(), start: 203871487, - end: 203871688 + end: 203871688, }; assert_eq!(tokenizer.tokenize_region(&r1).ids, vec![2]); assert_eq!(tokenizer.tokenize_region(&r2).ids, vec![2]); } -} \ No newline at end of file +} diff --git a/gtars/src/tokenizers/mod.rs b/gtars/src/tokenizers/mod.rs index 50fbfc0..e3ece44 100644 --- a/gtars/src/tokenizers/mod.rs +++ b/gtars/src/tokenizers/mod.rs @@ -21,8 +21,8 @@ //! let tokenized_regions = tokenizer.tokenize_region_set(&rs); //! println!("{:?}", tokenized_regions.ids); //! 
``` -pub mod cli; pub mod builder; +pub mod cli; pub mod config; pub mod fragment_tokenizer; pub mod meta_tokenizer; @@ -39,9 +39,9 @@ pub mod consts { } // expose the TreeTokenizer struct to users of this crate +pub use builder::TokenizerBuilder; pub use config::TokenizerConfig; pub use fragment_tokenizer::FragmentTokenizer; +pub use meta_tokenizer::MetaTokenizer; pub use traits::{SingleCellTokenizer, Tokenizer}; pub use tree_tokenizer::TreeTokenizer; -pub use meta_tokenizer::MetaTokenizer; -pub use builder::TokenizerBuilder; \ No newline at end of file From 4e831c293f7183602d8c6d5a553c5acb1dcd1829 Mon Sep 17 00:00:00 2001 From: Nathan LeRoy Date: Thu, 27 Jun 2024 14:44:09 -0400 Subject: [PATCH 24/28] add export functionality to tokenizers --- bindings/gtars/tokenizers/__init__.pyi | 14 +++++++++++++ bindings/src/tokenizers/meta_tokenizer.rs | 5 +++++ bindings/src/tokenizers/tree_tokenizer.rs | 5 +++++ gtars/src/tokenizers/builder.rs | 2 +- gtars/src/tokenizers/config.rs | 10 +++++++++- gtars/src/tokenizers/meta_tokenizer.rs | 10 +++++++++- gtars/src/tokenizers/traits.rs | 20 +++++++++++++++++++ gtars/src/tokenizers/tree_tokenizer.rs | 24 ++++++++++++++++++----- 8 files changed, 82 insertions(+), 8 deletions(-) diff --git a/bindings/gtars/tokenizers/__init__.pyi b/bindings/gtars/tokenizers/__init__.pyi index 945d644..0e3479a 100644 --- a/bindings/gtars/tokenizers/__init__.pyi +++ b/bindings/gtars/tokenizers/__init__.pyi @@ -348,6 +348,13 @@ class TreeTokenizer: """ The universe object. """ + + def export(self, path: str): + """ + Export the tokenizer configuration to a file. + + :param path: The path to the output file. + """ def __call__(self, regions: List[Region]) -> TokenizedRegionSet: """ @@ -516,6 +523,13 @@ class MetaTokenizer: """ The universe object. """ + + def export(self, path: str): + """ + Export the tokenizer configuration to a file. + + :param path: The path to the output file. 
+ """ def __call__(self, regions: List[Region]) -> TokenizedRegionSet: """ diff --git a/bindings/src/tokenizers/meta_tokenizer.rs b/bindings/src/tokenizers/meta_tokenizer.rs index 863e317..96dd4eb 100644 --- a/bindings/src/tokenizers/meta_tokenizer.rs +++ b/bindings/src/tokenizers/meta_tokenizer.rs @@ -138,6 +138,11 @@ impl PyMetaTokenizer { Ok(regions.into_iter().map(|r| r.into()).collect()) } + pub fn export(&self, path: String) -> Result<()> { + let path = Path::new(&path); + self.tokenizer.export(path) + } + // __call__ returns a TokenizedRegionSet pub fn __call__(&self, regions: &Bound<'_, PyAny>) -> Result { // attempt to map the list to a vector of regions diff --git a/bindings/src/tokenizers/tree_tokenizer.rs b/bindings/src/tokenizers/tree_tokenizer.rs index 078bf6d..a696744 100644 --- a/bindings/src/tokenizers/tree_tokenizer.rs +++ b/bindings/src/tokenizers/tree_tokenizer.rs @@ -137,6 +137,11 @@ impl PyTreeTokenizer { Ok(regions.into_iter().map(|r| r.into()).collect()) } + + pub fn export(&self, path: String) -> Result<()> { + let path = Path::new(&path); + self.tokenizer.export(path) + } // __call__ returns a TokenizedRegionSet pub fn __call__(&self, regions: &Bound<'_, PyAny>) -> Result { diff --git a/gtars/src/tokenizers/builder.rs b/gtars/src/tokenizers/builder.rs index 143dc79..ee5d46e 100644 --- a/gtars/src/tokenizers/builder.rs +++ b/gtars/src/tokenizers/builder.rs @@ -14,7 +14,7 @@ pub struct TokenizerBuilder; impl TokenizerBuilder { pub fn from_toml(path: &Path) -> Result> { - let config = TokenizerConfig::new(path)?; + let config = TokenizerConfig::try_from(path)?; if let Some(tokenizer_type) = config.tokenizer_type { match tokenizer_type.as_str() { "tree" => Ok(Box::new(TreeTokenizer::try_from(path)?)), diff --git a/gtars/src/tokenizers/config.rs b/gtars/src/tokenizers/config.rs index 7343cb7..6e8a009 100644 --- a/gtars/src/tokenizers/config.rs +++ b/gtars/src/tokenizers/config.rs @@ -17,10 +17,18 @@ impl TokenizerConfig { /// /// # Arguments /// - path: Path to the config file (a .toml) file. - pub fn new(path: &Path) -> Result { + pub fn try_from(path: &Path) -> Result { let toml_str = read_to_string(path)?; let config: TokenizerConfig = toml::from_str(&toml_str)?; Ok(config) } + + pub fn new(tokenizer_type: Option, universes: Vec, exclude_ranges: Option) -> TokenizerConfig { + TokenizerConfig { + tokenizer_type, + universes, + exclude_ranges, + } + } } diff --git a/gtars/src/tokenizers/meta_tokenizer.rs b/gtars/src/tokenizers/meta_tokenizer.rs index eca0a2a..010e21a 100644 --- a/gtars/src/tokenizers/meta_tokenizer.rs +++ b/gtars/src/tokenizers/meta_tokenizer.rs @@ -20,6 +20,7 @@ use super::traits::SpecialTokens; /// In brief, meta-tokens are tokens that represent *clusters* of genomic intervals. 
pub struct MetaTokenizer { pub universe: Universe, + config: TokenizerConfig, region_to_metatoken: HashMap, tree: HashMap>, secondary_trees: Option>>>, @@ -35,7 +36,7 @@ impl TryFrom<&Path> for MetaTokenizer { /// # Returns /// A new TreeTokenizer fn try_from(value: &Path) -> Result { - let config = TokenizerConfig::new(value).with_context(|| { + let config = TokenizerConfig::try_from(value).with_context(|| { format!( "Invalid tokenizer configuration found for file: {}", value.to_str().unwrap() @@ -279,6 +280,7 @@ impl TryFrom<&Path> for MetaTokenizer { }); Ok(MetaTokenizer { + config, universe, region_to_metatoken, tree, @@ -396,6 +398,12 @@ impl Tokenizer for MetaTokenizer { &self.universe } + fn export(&self, path: &Path) -> Result<()> { + let toml_str = toml::to_string(&self.config)?; + std::fs::write(path, toml_str)?; + Ok(()) + } + fn tokenize_region(&self, region: &Region) -> TokenizedRegionSet { let lapper = self.tree.get(®ion.chr); diff --git a/gtars/src/tokenizers/traits.rs b/gtars/src/tokenizers/traits.rs index 48c4bd6..769884e 100644 --- a/gtars/src/tokenizers/traits.rs +++ b/gtars/src/tokenizers/traits.rs @@ -30,9 +30,29 @@ pub trait Tokenizer { /// fn tokenize_region_set(&self, region_set: &RegionSet) -> TokenizedRegionSet; + /// + /// Get the vocabulary size of the tokenizer + /// + /// # Returns + /// The size of the vocabulary as usize fn vocab_size(&self) -> usize; + /// + /// Get the universe of the tokenizer + /// + /// # Returns + /// A reference to the universe of the tokenizer fn get_universe(&self) -> &Universe; + + /// + /// Export the tokenizer to a toml file + /// + /// # Arguments + /// - `path` - the path to the toml file + /// + /// # Returns + /// A Result + fn export(&self, path: &Path) -> Result<()>; } pub trait SingleCellTokenizer { diff --git a/gtars/src/tokenizers/tree_tokenizer.rs b/gtars/src/tokenizers/tree_tokenizer.rs index 48c0931..b0a14f1 100644 --- a/gtars/src/tokenizers/tree_tokenizer.rs +++ b/gtars/src/tokenizers/tree_tokenizer.rs @@ -16,6 +16,7 @@ use crate::tokenizers::traits::{Pad, SpecialTokens, Tokenizer}; /// pre-processor for machine learning pipelines pub struct TreeTokenizer { pub universe: Universe, + config: TokenizerConfig, tree: HashMap>, secondary_trees: Option>>>, } @@ -36,10 +37,10 @@ impl TryFrom<&Path> for TreeTokenizer { // and allows for the new way of creating tokenizers from toml files let file_extension = value.extension().unwrap().to_str().unwrap(); - let (mut universe, tree, secondary_trees, _exclude_ranges) = match file_extension { + let (config, mut universe, tree, secondary_trees, _exclude_ranges) = match file_extension { // parse config file "toml" => { - let config = TokenizerConfig::new(value).with_context(|| { + let config = TokenizerConfig::try_from(value).with_context(|| { format!( "Invalid tokenizer configuration found for file: {}", value.to_str().unwrap() @@ -111,7 +112,7 @@ impl TryFrom<&Path> for TreeTokenizer { }; // create exclude ranges if they exist - let exclude_ranges = match config.exclude_ranges { + let exclude_ranges = match &config.exclude_ranges { Some(exclude_ranges) => { let exclude_ranges_path = value.parent().unwrap().join(exclude_ranges); @@ -128,14 +129,20 @@ impl TryFrom<&Path> for TreeTokenizer { None => None, }; - (universe, tree, secondary_trees, exclude_ranges) + (config, universe, tree, secondary_trees, exclude_ranges) } // else assume its a bed file _ => { let regions = extract_regions_from_bed_file(value)?; let universe = Universe::from(regions); let tree = 
create_interval_tree_from_universe(&universe);
-            (universe, tree, None, None)
+            let config = TokenizerConfig::new(
+                Some("tree".to_string()),
+                vec![value.to_str().unwrap().to_string()],
+                None
+
+            );
+            (config, universe, tree, None, None)
         }
     };
@@ -190,6 +197,7 @@ impl TryFrom<&Path> for TreeTokenizer {
         });
 
         Ok(TreeTokenizer {
+            config,
             universe,
             tree,
             secondary_trees,
@@ -301,6 +309,12 @@ impl Tokenizer for TreeTokenizer {
     fn get_universe(&self) -> &Universe {
         &self.universe
     }
+
+    fn export(&self, path: &Path) -> Result<()> {
+        let toml_str = toml::to_string(&self.config)?;
+        std::fs::write(path, toml_str)?;
+        Ok(())
+    }
 }
 
 impl SpecialTokens for TreeTokenizer {

From e70cf131b96cc8e2e0958edcad96cb0c61f0f1ff Mon Sep 17 00:00:00 2001
From: Nathan LeRoy
Date: Mon, 22 Jul 2024 11:06:48 -0400
Subject: [PATCH 25/28] small tweaks

---
 gtars/src/tokenizers/config.rs | 6 +++++-
 gtars/src/tokenizers/traits.rs | 10 +++++-----
 gtars/src/tokenizers/tree_tokenizer.rs | 10 +++++-----
 3 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/gtars/src/tokenizers/config.rs b/gtars/src/tokenizers/config.rs
index 6e8a009..b23977a 100644
--- a/gtars/src/tokenizers/config.rs
+++ b/gtars/src/tokenizers/config.rs
@@ -24,7 +24,11 @@ impl TokenizerConfig {
         Ok(config)
     }
 
-    pub fn new(tokenizer_type: Option<String>, universes: Vec<String>, exclude_ranges: Option<String>) -> TokenizerConfig {
+    pub fn new(
+        tokenizer_type: Option<String>,
+        universes: Vec<String>,
+        exclude_ranges: Option<String>,
+    ) -> TokenizerConfig {
         TokenizerConfig {
             tokenizer_type,
             universes,
diff --git a/gtars/src/tokenizers/traits.rs b/gtars/src/tokenizers/traits.rs
index 769884e..4bbc571 100644
--- a/gtars/src/tokenizers/traits.rs
+++ b/gtars/src/tokenizers/traits.rs
@@ -30,26 +30,26 @@ pub trait Tokenizer {
     ///
     fn tokenize_region_set(&self, region_set: &RegionSet) -> TokenizedRegionSet;
 
-    /// 
+    ///
     /// Get the vocabulary size of the tokenizer
-    /// 
+    ///
     /// # Returns
     /// The size of the vocabulary as usize
     fn vocab_size(&self) -> usize;
 
     ///
     /// Get the universe of the tokenizer
-    /// 
+    ///
    /// # Returns
     /// A reference to the universe of the tokenizer
     fn get_universe(&self) -> &Universe;
 
     ///
     /// Export the tokenizer to a toml file
-    /// 
+    ///
     /// # Arguments
     /// - `path` - the path to the toml file
-    /// 
+    ///
     /// # Returns
     /// A Result
     fn export(&self, path: &Path) -> Result<()>;
diff --git a/gtars/src/tokenizers/tree_tokenizer.rs b/gtars/src/tokenizers/tree_tokenizer.rs
index b0a14f1..c1f52c1 100644
--- a/gtars/src/tokenizers/tree_tokenizer.rs
+++ b/gtars/src/tokenizers/tree_tokenizer.rs
@@ -136,12 +136,12 @@ impl TryFrom<&Path> for TreeTokenizer {
             let regions = extract_regions_from_bed_file(value)?;
             let universe = Universe::from(regions);
             let tree = create_interval_tree_from_universe(&universe);
-            let config = TokenizerConfig::new(
-                Some("tree".to_string()),
-                vec![value.to_str().unwrap().to_string()],
-                None
 
-            );
+            let universe_as_path = Path::new(value).file_name().unwrap();
+            let universe_as_path = universe_as_path.to_string_lossy().to_string();
+
+            let config =
+                TokenizerConfig::new(Some("tree".to_string()), vec![universe_as_path], None);
             (config, universe, tree, None, None)
         }
     };

From c8abb9b4af900fb115f2ee16738a354ea9d91e29 Mon Sep 17 00:00:00 2001
From: Nathan LeRoy
Date: Mon, 22 Jul 2024 11:07:02 -0400
Subject: [PATCH 26/28] bindings

---
 bindings/src/tokenizers/tree_tokenizer.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bindings/src/tokenizers/tree_tokenizer.rs b/bindings/src/tokenizers/tree_tokenizer.rs
index a696744..c8d480f 100644
--- a/bindings/src/tokenizers/tree_tokenizer.rs
+++ b/bindings/src/tokenizers/tree_tokenizer.rs
@@ -137,7 +137,7 @@ impl PyTreeTokenizer {
 
         Ok(regions.into_iter().map(|r| r.into()).collect())
     }
-    
+
     pub fn export(&self, path: String) -> Result<()> {
         let path = Path::new(&path);
         self.tokenizer.export(path)

From 27fe07e8865defddb8123003be27e7e6a4edc179 Mon Sep 17 00:00:00 2001
From: Nathan LeRoy
Date: Mon, 29 Jul 2024 11:38:12 -0400
Subject: [PATCH 27/28] add module paths to classes

---
 bindings/src/ailist/mod.rs | 2 +-
 bindings/src/models/interval.rs | 2 +-
 bindings/src/models/region.rs | 4 ++--
 bindings/src/models/region_set.rs | 4 ++--
 bindings/src/models/universe.rs | 2 +-
 bindings/src/tokenizers/fragments_tokenizer.rs | 2 +-
 bindings/src/tokenizers/meta_tokenizer.rs | 2 +-
 bindings/src/tokenizers/tree_tokenizer.rs | 2 +-
 8 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/bindings/src/ailist/mod.rs b/bindings/src/ailist/mod.rs
index 78c6def..d21f980 100644
--- a/bindings/src/ailist/mod.rs
+++ b/bindings/src/ailist/mod.rs
@@ -3,7 +3,7 @@ use pyo3::{prelude::*, pyclass};
 
 use crate::models::PyInterval;
 
-#[pyclass(name = "AIList")]
+#[pyclass(name = "AIList", module="gtars.ailist")]
 struct PyAIList {
     ailist: AIList,
 }
diff --git a/bindings/src/models/interval.rs b/bindings/src/models/interval.rs
index e4cceb5..758daed 100644
--- a/bindings/src/models/interval.rs
+++ b/bindings/src/models/interval.rs
@@ -1,6 +1,6 @@
 use pyo3::prelude::*;
 
-#[pyclass(name = "Interval")]
+#[pyclass(name = "Interval", module="gtars.models")]
 pub struct PyInterval {
     #[pyo3(get, set)]
     pub start: u32,
diff --git a/bindings/src/models/region.rs b/bindings/src/models/region.rs
index fa40312..83bd52b 100644
--- a/bindings/src/models/region.rs
+++ b/bindings/src/models/region.rs
@@ -9,7 +9,7 @@ use gtars::common::models::region::Region;
 
 use crate::models::PyUniverse;
 
-#[pyclass(name = "Region")]
+#[pyclass(name = "Region", module="gtars.models")]
 #[derive(Clone, Debug, Hash, Eq, PartialEq)]
 pub struct PyRegion {
     pub chr: String,
@@ -75,7 +75,7 @@ impl PyRegion {
     }
 }
 
-#[pyclass(name = "TokenizedRegion")]
+#[pyclass(name = "TokenizedRegion", module="gtars.models")]
 #[derive(Clone, Debug)]
 pub struct PyTokenizedRegion {
     pub id: u32,
diff --git a/bindings/src/models/region_set.rs b/bindings/src/models/region_set.rs
index 044e7cf..e6348ff 100644
--- a/bindings/src/models/region_set.rs
+++ b/bindings/src/models/region_set.rs
@@ -10,7 +10,7 @@ use gtars::common::utils::extract_regions_from_bed_file;
 
 use crate::models::{PyRegion, PyTokenizedRegion, PyUniverse};
 
-#[pyclass(name = "RegionSet")]
+#[pyclass(name = "RegionSet", module="gtars.models")]
 #[derive(Clone, Debug)]
 pub struct PyRegionSet {
     pub regions: Vec<PyRegion>,
@@ -85,7 +85,7 @@ impl PyRegionSet {
     }
 }
 
-#[pyclass(name = "TokenizedRegionSet")]
+#[pyclass(name = "TokenizedRegionSet", module="gtars.models")]
 #[derive(Clone, Debug)]
 pub struct PyTokenizedRegionSet {
     pub ids: Vec<u32>,
diff --git a/bindings/src/models/universe.rs b/bindings/src/models/universe.rs
index 350e2d4..87d6726 100644
--- a/bindings/src/models/universe.rs
+++ b/bindings/src/models/universe.rs
@@ -7,7 +7,7 @@ use anyhow::Result;
 use crate::models::PyRegion;
 use gtars::common::models::Universe;
 
-#[pyclass(name = "Universe")]
+#[pyclass(name = "Universe", module="gtars.models")]
 #[derive(Clone, Debug)]
 pub struct PyUniverse {
     pub regions: Vec<PyRegion>,
diff --git a/bindings/src/tokenizers/fragments_tokenizer.rs b/bindings/src/tokenizers/fragments_tokenizer.rs
index 32baeb8..5aa283b 100644
--- a/bindings/src/tokenizers/fragments_tokenizer.rs
+++ b/bindings/src/tokenizers/fragments_tokenizer.rs
@@ -5,7 +5,7 @@ use pyo3::prelude::*;
 use super::PyTokenizedRegionSet;
 use super::PyUniverse;
 
-#[pyclass(name = "FragmentTokenizer")]
+#[pyclass(name = "FragmentTokenizer", module="gtars.tokenizers")]
 pub struct PyFragmentTokenizer {
     pub tokenizer: gtars::tokenizers::FragmentTokenizer,
     pub universe: Py<PyUniverse>, // this is a Py-wrapped version self.tokenizer.universe for performance reasons
diff --git a/bindings/src/tokenizers/meta_tokenizer.rs b/bindings/src/tokenizers/meta_tokenizer.rs
index 96dd4eb..40704c5 100644
--- a/bindings/src/tokenizers/meta_tokenizer.rs
+++ b/bindings/src/tokenizers/meta_tokenizer.rs
@@ -12,7 +12,7 @@ use gtars::tokenizers::{MetaTokenizer, Tokenizer};
 use crate::models::{PyRegion, PyTokenizedRegionSet, PyUniverse};
 use crate::utils::extract_regions_from_py_any;
 
-#[pyclass(name = "MetaTokenizer")]
+#[pyclass(name = "MetaTokenizer", module="gtars.tokenizers")]
 pub struct PyMetaTokenizer {
     pub tokenizer: MetaTokenizer,
     pub universe: Py<PyUniverse>, // this is a Py-wrapped version self.tokenizer.universe for performance reasons
diff --git a/bindings/src/tokenizers/tree_tokenizer.rs b/bindings/src/tokenizers/tree_tokenizer.rs
index c8d480f..2fdece5 100644
--- a/bindings/src/tokenizers/tree_tokenizer.rs
+++ b/bindings/src/tokenizers/tree_tokenizer.rs
@@ -12,7 +12,7 @@ use gtars::tokenizers::{Tokenizer, TreeTokenizer};
 use crate::models::{PyRegion, PyTokenizedRegionSet, PyUniverse};
 use crate::utils::extract_regions_from_py_any;
 
-#[pyclass(name = "TreeTokenizer")]
+#[pyclass(name = "TreeTokenizer", module="gtars.tokenizers")]
 pub struct PyTreeTokenizer {
     pub tokenizer: TreeTokenizer,
     pub universe: Py<PyUniverse>, // this is a Py-wrapped version self.tokenizer.universe for performance reasons

From f6367803b62b037a33b89030e0b58a2404b8009d Mon Sep 17 00:00:00 2001
From: Nathan LeRoy
Date: Mon, 29 Jul 2024 11:41:03 -0400
Subject: [PATCH 28/28] bump version and changelog

---
 bindings/Cargo.toml | 2 +-
 gtars/Cargo.toml | 2 +-
 gtars/docs/changelog.md | 4 ++++
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/bindings/Cargo.toml b/bindings/Cargo.toml
index 023be53..53f7c38 100644
--- a/bindings/Cargo.toml
+++ b/bindings/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "gtars-py"
-version = "0.0.14"
+version = "0.0.15"
 edition = "2021"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
diff --git a/gtars/Cargo.toml b/gtars/Cargo.toml
index 0dfe4cc..4aa1868 100644
--- a/gtars/Cargo.toml
+++ b/gtars/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "gtars"
-version = "0.0.14"
+version = "0.0.15"
 edition = "2021"
 description = "Performance-critical tools to manipulate, analyze, and process genomic interval data. Primarily focused on building tools for geniml - our genomic machine learning python package."
 license = "MIT"
diff --git a/gtars/docs/changelog.md b/gtars/docs/changelog.md
index b70d58e..b157bd2 100644
--- a/gtars/docs/changelog.md
+++ b/gtars/docs/changelog.md
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.0.15]
+- added meta tokenization tools and a new `MetaTokenizer` struct that can be used to tokenize regions using the meta-token strategy.
+- added some annotations to the `pyo3` `#[pyclass]` and `#[pymethods]` attributes to make the python bindings more readable.
+
 ## [0.0.14]
 
 - renamed repository to `gtars` to better reflect the project's goals.
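
[Editor's note - not part of the patch series above] The export path added in these patches serializes the in-memory TokenizerConfig back to TOML via `toml::to_string` and writes it to disk. The sketch below is a minimal, hypothetical usage example, not code from the repository: it assumes the crate layout shown in the diffs (`gtars::tokenizers::{Tokenizer, TreeTokenizer}` and the `anyhow` error type), and the BED path and output file name are illustrative only.

use std::path::Path;

use gtars::tokenizers::{Tokenizer, TreeTokenizer};

fn main() -> anyhow::Result<()> {
    // Constructing from a bare BED file takes the branch shown in patch 25/28:
    // the universe is built from the BED regions and the config records the file
    // name under `universes`, with the tokenizer type set to "tree".
    // "tests/data/peaks.bed" is an assumed, illustrative path.
    let tokenizer = TreeTokenizer::try_from(Path::new("tests/data/peaks.bed"))?;

    // `Tokenizer::export` (trait documented in patch 25/28) serializes the
    // TokenizerConfig to a TOML string and writes it to the given path.
    tokenizer.export(Path::new("tokenizer.toml"))?;

    Ok(())
}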