Merge pull request #27 from databio/dev

Hierarchical tokenizers
databio · Jul 29, 2024 · 8c0811a · 8c0811a
2 parents 506dabe + f636780
commit 8c0811a
Show file tree

Hide file tree

Showing 46 changed files with 3,496 additions and 671 deletions.
diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml
@@ -0,0 +1,32 @@
+name: Compute coverage
+
+on:
+  push:
+    branches: [ "master" ]
+  pull_request:
+    branches: [ "master" ]
+
+env:
+  CARGO_TERM_COLOR: always
+
+jobs:
+  coverage:
+    runs-on: ubuntu-latest
+    env:
+      CARGO_TERM_COLOR: always
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install Rust
+        run: rustup update stable
+      - name: Install cargo-llvm-cov
+        uses: taiki-e/install-action@cargo-llvm-cov
+      - name: Generate code coverage
+        run: cargo llvm-cov --all-features --workspace --lcov --output-path lcov.info
+        working-directory: ./gtars
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v3
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          files: lcov.info
+          fail_ci_if_error: true
+          working-directory: ./gtars
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -0,0 +1,22 @@
+name: Run tests
+
+on:
+  push:
+    branches: [ "master" ]
+  pull_request:
+    branches: [ "master" ]
+
+env:
+  CARGO_TERM_COLOR: always
+
+jobs:
+  run:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: Build
+      run: cargo build --verbose
+      working-directory: ./gtars
+    - name: Run tests
+      run: cargo test --verbose
+      working-directory: ./gtars
diff --git a/README.md b/README.md
@@ -1,3 +1,6 @@
+[![codecov](https://codecov.io/gh/databio/gtars/branch/master/graph/badge.svg)](https://codecov.io/gh/databio/gtars)
+[![crates.io](https://img.shields.io/crates/v/gtars?&logo=rust)](https://crates.io/crates/gtars)
+
 <h1 align="center">
 <img src="gtars/docs/logo.svg" alt="gtars logo" height="100px">
 </h1>

diff --git a/bindings/Cargo.toml b/bindings/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "gtars-py"
-version = "0.0.14"
+version = "0.0.15"
 edition = "2021"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

diff --git a/bindings/gtars/tokenizers/__init__.pyi b/bindings/gtars/tokenizers/__init__.pyi
@@ -220,7 +220,7 @@ class TokenizedRegionSet:
 class TreeTokenizer:
     def __new__(cls, path: str) -> TreeTokenizer:
         """
-        Construct a new TreeTokenize from a universe file.
+        Construct a new TreeTokenizer from a universe file.
 
         :param path: The path to the universe file. This should be a BED file.
         """
@@ -348,6 +348,13 @@ class TreeTokenizer:
         """
         The universe object.
         """
+
+    def export(self, path: str):
+        """
+        Export the tokenizer configuration to a file.
+
+        :param path: The path to the output file.
+        """
 
     def __call__(self, regions: List[Region]) -> TokenizedRegionSet:
         """
@@ -383,4 +390,162 @@ class FragmentTokenizer:
         :param file_path: The path to the file containing fragments.
         :param out_path: The path to the output file. If None, the output is written to the standard output.
         :param filter: A list of chromosomes to filter. If None, all chromosomes are included.
+        """
+
+class MetaTokenizer:
+    def __new__(cls, path: str) -> MetaTokenizer:
+        """
+        Construct a new MetaTokenizer from a universe file.
+
+        :param path: The path to the universe file. This should be a BED file.
+        """
+
+    def unknown_token(self) -> Region:
+        """
+        Get the unknown token.
+        """
+
+    def padding_token(self) -> Region:
+        """
+        Get the padding token.
+        """
+
+    def mask_token(self) -> Region:
+        """
+        Get the mask token.
+        """
+
+    def cls_token(self) -> Region:
+        """
+        Get the CLS token.
+        """
+
+    def bos_token(self) -> Region:
+        """
+        Get the BOS token.
+        """
+
+    def eos_token(self) -> Region:
+        """
+        Get the EOS token.
+        """
+
+    def sep_token(self) -> Region:
+        """
+        Get the SEP token.
+        """
+
+    def unknown_token_id(self) -> int:
+        """
+        Get the ID of the unknown token.
+        """
+
+    def padding_token_id(self) -> int:
+        """
+        Get the ID of the padding token.
+        """
+
+    def mask_token_id(self) -> int:
+        """
+        Get the ID of the mask token.
+        """
+
+    def cls_token_id(self) -> int:
+        """
+        Get the ID of the CLS token.
+        """
+
+    def bos_token_id(self) -> int:
+        """
+        Get the ID of the BOS token.
+        """
+
+    def eos_token_id(self) -> int:
+        """
+        Get the ID of the EOS token.
+        """
+
+    def sep_token_id(self) -> int:
+        """
+        Get the ID of the SEP token.
+        """
+
+    def vocab_size(self) -> int:
+        """
+        Get the vocabulary size.
+        """
+
+    def tokenize(self, regions: List[Region]) -> List[Region]:
+        """
+        Tokenize a list of regions. This will only return the tokenized regions.
+
+        :param regions: The regions to tokenize.
+
+        :return: The tokenized regions as a list.
+        """
+
+    def tokenize_bed_file(self, path: str) -> List[Region]:
+        """
+        Tokenize a BED file directly.
+
+        :param path: The path to the BED file.
+
+        :return: The tokenized regions as a list.
+        """
+
+    def encode(self, regions: List[Region]) -> List[int]:
+        """
+        Encode a list of regions. This will return the integer representation of the tokenized regions.
+
+        :param regions: The regions to encode.
+
+        :return: The integer representation of the tokenized regions.
+        """
+
+    def decode(self, ids: List[int]) -> List[Region]:
+        """
+        Decode a list of integer representations of the tokenized regions.
+
+        :param ids: The integer representations of the tokenized regions.
+
+        :return: The decoded regions.
+        """
+
+    def vocab(self) -> List[Tuple[Region, int]]:
+        """
+        Get the vocabulary.
+
+        :return: The vocabulary as a list of tuples.
+        """
+
+    @property
+    def universe(self) -> Universe:
+        """
+        The universe object.
+        """
+
+    def export(self, path: str):
+        """
+        Export the tokenizer configuration to a file.
+
+        :param path: The path to the output file.
+        """
+
+    def __call__(self, regions: List[Region]) -> TokenizedRegionSet:
+        """
+        Tokenize a list of regions.
+
+        :param regions: The regions to tokenize.
+
+        :return: A TokenizedRegionSet object.
+        """
+
+    def __len__(self) -> int:
+        """
+        Get the vocabulary size.
+        """
+
+    def __repr__(self) -> str:
+        """
+        Get a string representation of the tokenizer.
         """
diff --git a/bindings/src/ailist/mod.rs b/bindings/src/ailist/mod.rs
@@ -3,7 +3,7 @@ use pyo3::{prelude::*, pyclass};
 
 use crate::models::PyInterval;
 
-#[pyclass(name = "AIList")]
+#[pyclass(name = "AIList", module="gtars.ailist")]
 struct PyAIList {
     ailist: AIList,
 }

diff --git a/bindings/src/models/interval.rs b/bindings/src/models/interval.rs
@@ -1,6 +1,6 @@
 use pyo3::prelude::*;
 
-#[pyclass(name = "Interval")]
+#[pyclass(name = "Interval", module="gtars.models")]
 pub struct PyInterval {
     #[pyo3(get, set)]
     pub start: u32,

diff --git a/bindings/src/models/region.rs b/bindings/src/models/region.rs
@@ -9,7 +9,7 @@ use gtars::common::models::region::Region;
 
 use crate::models::PyUniverse;
 
-#[pyclass(name = "Region")]
+#[pyclass(name = "Region", module="gtars.models")]
 #[derive(Clone, Debug, Hash, Eq, PartialEq)]
 pub struct PyRegion {
     pub chr: String,
@@ -75,7 +75,7 @@ impl PyRegion {
     }
 }
 
-#[pyclass(name = "TokenizedRegion")]
+#[pyclass(name = "TokenizedRegion", module="gtars.models")]
 #[derive(Clone, Debug)]
 pub struct PyTokenizedRegion {
     pub id: u32,

diff --git a/bindings/src/models/region_set.rs b/bindings/src/models/region_set.rs
@@ -10,7 +10,7 @@ use gtars::common::utils::extract_regions_from_bed_file;
 
 use crate::models::{PyRegion, PyTokenizedRegion, PyUniverse};
 
-#[pyclass(name = "RegionSet")]
+#[pyclass(name = "RegionSet", module="gtars.models")]
 #[derive(Clone, Debug)]
 pub struct PyRegionSet {
     pub regions: Vec<PyRegion>,
@@ -85,7 +85,7 @@ impl PyRegionSet {
     }
 }
 
-#[pyclass(name = "TokenizedRegionSet")]
+#[pyclass(name = "TokenizedRegionSet", module="gtars.models")]
 #[derive(Clone, Debug)]
 pub struct PyTokenizedRegionSet {
     pub ids: Vec<u32>,
@@ -123,7 +123,7 @@ impl PyTokenizedRegionSet {
             Ok(self
                 .ids
                 .iter()
-                .map(|id| self.universe.borrow(py).id_to_region[&id].clone())
+                .map(|id| self.universe.borrow(py).id_to_region[id].clone())
                 .collect())
         })
     }

diff --git a/bindings/src/models/universe.rs b/bindings/src/models/universe.rs
@@ -7,7 +7,7 @@ use anyhow::Result;
 use crate::models::PyRegion;
 use gtars::common::models::Universe;
 
-#[pyclass(name = "Universe")]
+#[pyclass(name = "Universe", module="gtars.models")]
 #[derive(Clone, Debug)]
 pub struct PyUniverse {
     pub regions: Vec<PyRegion>,

diff --git a/bindings/src/tokenizers/builder.rs b/bindings/src/tokenizers/builder.rs
@@ -0,0 +1,48 @@
+// TODO: still a work in progress
+use pyo3::prelude::*;
+
+use anyhow::Result;
+
+use std::path::Path;
+
+
+use gtars::tokenizers::TokenizerConfig;
+
+use super::{
+    PyMetaTokenizer,
+    PyTreeTokenizer
+};
+
+/// Stateless factory exposed to Python as `TokenizerBuilder`.
+#[pyclass(name = "TokenizerBuilder", module="gtars.tokenizers")]
+pub struct PyTokenizerBuilder;
+
+#[pymethods]
+impl PyTokenizerBuilder {
+    /// Build the tokenizer selected by `tokenizer_type` in the TOML config at `path`.
+    ///
+    /// `"tree"` yields a `PyTreeTokenizer` and `"meta"` a `PyMetaTokenizer`. When the
+    /// field is absent a `PyTreeTokenizer` is built and a notice is printed to stdout.
+    ///
+    /// # Errors
+    ///
+    /// Fails if `TokenizerConfig::new` fails, if the chosen tokenizer cannot be
+    /// constructed from `path`, or if `tokenizer_type` names an unsupported kind.
+    // `#[staticmethod]`, not `#[classmethod]`: pyo3 class methods need a `cls` receiver.
+    #[staticmethod]
+    pub fn from_toml(py: Python<'_>, path: String) -> Result<PyObject> {
+        let config = TokenizerConfig::new(Path::new(&path))?;
+
+        // Each arm converts to `PyObject` so the match has one type and can be
+        // the function's tail expression.
+        match config.tokenizer_type.as_deref() {
+            Some("tree") => Ok(PyTreeTokenizer::new(path)?.into_py(py)),
+            Some("meta") => Ok(PyMetaTokenizer::new(path)?.into_py(py)),
+            Some(other) => anyhow::bail!("Tokenizer type {} not supported", other),
+            None => {
+                println!("No tokenizer type found in config file. Instantiating a default TreeTokenizer. Note that this may lead to unexpected behavior.");
+                Ok(PyTreeTokenizer::new(path)?.into_py(py))
+            }
+        }
+    }
+}
diff --git a/bindings/src/tokenizers/fragments_tokenizer.rs b/bindings/src/tokenizers/fragments_tokenizer.rs
@@ -5,7 +5,7 @@ use pyo3::prelude::*;
 use super::PyTokenizedRegionSet;
 use super::PyUniverse;
 
-#[pyclass(name = "FragmentTokenizer")]
+#[pyclass(name = "FragmentTokenizer", module="gtars.tokenizers")]
 pub struct PyFragmentTokenizer {
     pub tokenizer: gtars::tokenizers::FragmentTokenizer<TreeTokenizer>,
     pub universe: Py<PyUniverse>, // this is a Py-wrapped version self.tokenizer.universe for performance reasons