diff --git a/bbconf/_version.py b/bbconf/_version.py index 777f190..3e2f46a 100644 --- a/bbconf/_version.py +++ b/bbconf/_version.py @@ -1 +1 @@ -__version__ = "0.8.0" +__version__ = "0.9.0" diff --git a/bbconf/bbagent.py b/bbconf/bbagent.py index 029a645..1b0baab 100644 --- a/bbconf/bbagent.py +++ b/bbconf/bbagent.py @@ -62,6 +62,17 @@ def get_stats(self) -> StatsReturn: genomes_number=number_of_genomes, ) + def get_list_genomes(self) -> List[str]: + """ + Get list of genomes from the database + + :return: list of genomes + """ + statement = select(distinct(Bed.genome_alias)) + with Session(self.config.db_engine.engine) as session: + genomes = session.execute(statement).all() + return [result[0] for result in genomes] + @cached_property def list_of_licenses(self) -> List[str]: """ diff --git a/bbconf/db_utils.py b/bbconf/db_utils.py index 054520b..d929ec8 100644 --- a/bbconf/db_utils.py +++ b/bbconf/db_utils.py @@ -300,6 +300,9 @@ class BedSets(Base): files: Mapped[List["Files"]] = relationship("Files", back_populates="bedset") universe: Mapped["Universes"] = relationship("Universes", back_populates="bedset") + author: Mapped[str] = mapped_column(nullable=True, comment="Author of the bedset") + source: Mapped[str] = mapped_column(nullable=True, comment="Source of the bedset") + class Universes(Base): __tablename__ = "universes" @@ -347,7 +350,9 @@ class TokenizedBed(Base): bed: Mapped["Bed"] = relationship("Bed", back_populates="tokenized") universe: Mapped["Universes"] = relationship( - "Universes", back_populates="tokenized" + "Universes", + back_populates="tokenized", + passive_deletes=True, ) diff --git a/bbconf/models/bed_models.py b/bbconf/models/bed_models.py index 442ea3e..3edf0c9 100644 
--- a/bbconf/models/bed_models.py +++ b/bbconf/models/bed_models.py @@ -17,6 +17,7 @@ class BedPlots(BaseModel): widths_histogram: FileModel = None neighbor_distances: FileModel = None open_chromatin: FileModel = None + tss_distance: FileModel = None model_config = ConfigDict(extra="ignore") diff --git a/bbconf/models/bedset_models.py b/bbconf/models/bedset_models.py index 9102814..73bcb9a 100644 --- a/bbconf/models/bedset_models.py +++ b/bbconf/models/bedset_models.py @@ -1,3 +1,4 @@ +import datetime from typing import List, Union from pydantic import BaseModel, ConfigDict, model_validator @@ -21,10 +22,14 @@ class BedSetMetadata(BaseModel): id: str name: str md5sum: str + submission_date: datetime.datetime = None + last_update_date: datetime.datetime = None statistics: Union[BedSetStats, None] = None plots: Union[BedSetPlots, None] = None description: str = None bed_ids: List[str] = None + author: Union[str, None] = None + source: Union[str, None] = None class BedSetListResult(BaseModel): diff --git a/bbconf/modules/bedfiles.py b/bbconf/modules/bedfiles.py index bad4f4d..f9a60a6 100644 --- a/bbconf/modules/bedfiles.py +++ b/bbconf/modules/bedfiles.py @@ -1,6 +1,6 @@ import os from logging import getLogger -from typing import Dict, Union +from typing import Dict, List, Union import numpy as np from geniml.bbclient import BBClient @@ -10,7 +10,7 @@ from pydantic import BaseModel from qdrant_client.models import Distance, PointIdsList, VectorParams from sqlalchemy import and_, delete, func, select -from sqlalchemy.orm import Session +from sqlalchemy.orm import Session, aliased from tqdm import tqdm from bbconf.config_parser.bedbaseconfig import BedBaseConfig @@ -385,7 +385,12 @@ def get_ids_list( count = session.execute(count_statement).one() for result in bed_ids: - result_list.append(BedMetadataBasic(**result.__dict__)) + annotation = StandardMeta( + **result.annotations.__dict__ if result.annotations else {} + ) + result_list.append( + 
BedMetadataBasic(**result.__dict__, annotation=annotation) + ) return BedListResult( count=count[0], @@ -865,34 +870,28 @@ def reindex_qdrant(self) -> None: """ bb_client = BBClient() - statement = select(Bed.id).where(and_(Bed.genome_alias == QDRANT_GENOME)) - - with Session(self._db_engine.engine) as session: - bed_ids = session.execute(statement).all() + annotation_result = self.get_ids_list(limit=100000, genome=QDRANT_GENOME) - bed_ids = [bed_result[0] for bed_result in bed_ids] + if not annotation_result.results: + _LOGGER.error("No bed files found.") + return None + results = annotation_result.results - with tqdm(total=len(bed_ids), position=0, leave=True) as pbar: - for record_id in bed_ids: + with tqdm(total=len(results), position=0, leave=True) as pbar: + for record in results: try: - bed_region_set_obj = GRegionSet(bb_client.seek(record_id)) + bed_region_set_obj = GRegionSet(bb_client.seek(record.id)) except FileNotFoundError: - bed_region_set_obj = bb_client.load_bed(record_id) - - pbar.set_description(f"Processing file: {record_id}") - metadata = self._config.phc.sample.get( - namespace=self._config.config.phc.namespace, - name=self._config.config.phc.name, - tag=self._config.config.phc.tag, - sample_name=record_id, - ) + bed_region_set_obj = bb_client.load_bed(record.id) + + pbar.set_description(f"Processing file: {record.id}") self.upload_file_qdrant( - bed_id=record_id, + bed_id=record.id, bed_file=bed_region_set_obj, - payload=BedPEPHubRestrict(**metadata).model_dump(), + payload=record.annotation.model_dump() if record.annotation else {}, ) - pbar.write(f"File: {record_id} uploaded to qdrant successfully.") + pbar.write(f"File: {record.id} uploaded to qdrant successfully.") pbar.update(1) return None @@ -1180,3 +1179,41 @@ def get_tokenized_link( bed_id=bed_id, universe_id=universe_id, ) + + def get_missing_plots( + self, plot_name: str, limit: int = 1000, offset: int = 0 + ) -> List[str]: + """ + Get list of bed files that are missing plot + + 
:param plot_name: plot name + :param limit: number of results to return + :param offset: offset to start from + + :return: list of bed file identifiers + """ + if plot_name not in list(BedPlots.model_fields.keys()): + raise BedBaseConfError( + f"Plot name: {plot_name} is not valid. Valid names: {list(BedPlots.model_fields.keys())}" + ) + + with Session(self._sa_engine) as session: + # Alias for subquery + t2_alias = aliased(Files) + + # Define the subquery + subquery = select(t2_alias).where(t2_alias.name == plot_name).subquery() + + query = ( + select(Bed.id) + .outerjoin(subquery, Bed.id == subquery.c.bedfile_id) + .where(subquery.c.bedfile_id.is_(None)) + .limit(limit) + .offset(offset) + ) + + results = session.scalars(query) + + results = [result for result in results] + + return results diff --git a/bbconf/modules/bedsets.py b/bbconf/modules/bedsets.py index 3ea050e..3f46bb9 100644 --- a/bbconf/modules/bedsets.py +++ b/bbconf/modules/bedsets.py @@ -9,7 +9,7 @@ from bbconf.const import PKG_NAME from bbconf.db_utils import Bed, BedFileBedSetRelation, BedSets, BedStats, Files from bbconf.exceptions import BedSetExistsError, BedSetNotFoundError -from bbconf.models.bed_models import BedStatsModel +from bbconf.models.bed_models import BedStatsModel, StandardMeta from bbconf.models.bedset_models import ( BedMetadataBasic, BedSetBedFiles, @@ -77,6 +77,10 @@ def get(self, identifier: str, full: bool = False) -> BedSetMetadata: statistics=stats, plots=plots, bed_ids=list_of_bedfiles, + submission_date=bedset_obj.submission_date, + last_update_date=bedset_obj.last_update_date, + author=bedset_obj.author, + source=bedset_obj.source, ) return bedset_metadata @@ -200,6 +204,8 @@ def get_bedset_pep(self, identifier: str) -> dict: "name": bedset.id, "description": bedset.description, "md5sum": bedset.md5sum, + "author": bedset.author, + "source": bedset.source, } return { @@ -215,6 +221,7 @@ def create( bedid_list: List[str], description: str = None, statistics: bool = 
False, + annotation: dict = None, plots: dict = None, upload_pephub: bool = False, upload_s3: bool = False, @@ -230,6 +237,7 @@ def create( :param description: bedset description :param bedid_list: list of bed file identifiers :param statistics: calculate statistics for bedset + :param annotation: bedset annotation (author, source) :param plots: dictionary with plots :param upload_pephub: upload bedset to pephub (create view in pephub) :param upload_s3: upload bedset to s3 @@ -249,6 +257,9 @@ def create( raise BedSetExistsError(identifier) self.delete(identifier) + if not isinstance(annotation, dict): + annotation = {} + if upload_pephub: try: self._create_pephub_view(identifier, description, bedid_list, no_fail) @@ -264,6 +275,8 @@ def create( bedset_means=stats.mean.model_dump() if stats else None, bedset_standard_deviation=stats.sd.model_dump() if stats else None, md5sum=compute_md5sum_bedset(bedid_list), + author=annotation.get("author"), + source=annotation.get("source"), ) if upload_s3: @@ -434,7 +447,16 @@ def get_bedset_bedfiles(self, identifier: str) -> BedSetBedFiles: with Session(self._db_engine.engine) as session: bedfiles_list = session.scalars(statement) results = [ - BedMetadataBasic(**bedfile_obj.__dict__) + BedMetadataBasic( + **bedfile_obj.__dict__, + annotation=StandardMeta( + **( + bedfile_obj.annotations.__dict__ + if bedfile_obj.annotations + else {} + ) + ), + ) for bedfile_obj in bedfiles_list ] diff --git a/docs/changelog.md b/docs/changelog.md index bcf871c..1eba8d4 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -2,6 +2,15 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format. 
+# [0.9.0] - 2024-11-06 +## Changed +- Fixed bug with uploading tss dist plot + +## Added +- Added annotations to bedsets (author, source) +- Added get_list_genomes method to the agent, which lists all available genomes +- Added method that lists all missing plots for bedfiles (get_missing_plots) + # [0.8.0] - 2024-10-23 ## Changed - Updated text to bed search (now using bivec) diff --git a/manual_testing.py b/manual_testing.py index 876b22c..0386ada 100644 --- a/manual_testing.py +++ b/manual_testing.py @@ -175,9 +175,20 @@ def get_pep(): prj +def get_id_plots_missing(): + from bbconf import BedBaseAgent + + agent = BedBaseAgent(config="/home/bnt4me/virginia/repos/bedhost/config.yaml") + + results = agent.bed.get_missing_plots("gccontent", limit=5000) + print(results) + print(agent.get_list_genomes()) + + if __name__ == "__main__": # zarr_s3() # add_s3() # get_from_s3() # biocframe() - get_pep() + # get_pep() + get_id_plots_missing()