Skip to content

Commit

Permalink
Merge pull request #63 from databio/dev
Browse files Browse the repository at this point in the history
Release v0.9.0
  • Loading branch information
khoroshevskyi authored Nov 7, 2024
2 parents 25beff9 + 75b84d2 commit 5138c2a
Show file tree
Hide file tree
Showing 9 changed files with 130 additions and 29 deletions.
2 changes: 1 addition & 1 deletion bbconf/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.8.0"
__version__ = "0.9.0"
11 changes: 11 additions & 0 deletions bbconf/bbagent.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,17 @@ def get_stats(self) -> StatsReturn:
genomes_number=number_of_genomes,
)

def get_list_genomes(self) -> List[str]:
    """
    Collect the distinct genome aliases stored for bed files in the database

    :return: list of genome aliases
    """
    distinct_genomes_query = select(distinct(Bed.genome_alias))
    with Session(self.config.db_engine.engine) as session:
        # scalars() yields the first (and only) selected column of each row
        genome_aliases = list(session.scalars(distinct_genomes_query))
    return genome_aliases

@cached_property
def list_of_licenses(self) -> List[str]:
"""
Expand Down
9 changes: 7 additions & 2 deletions bbconf/db_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,9 @@ class BedSets(Base):
files: Mapped[List["Files"]] = relationship("Files", back_populates="bedset")
universe: Mapped["Universes"] = relationship("Universes", back_populates="bedset")

author: Mapped[str] = mapped_column(nullable=True, comment="Author of the bedset")
source: Mapped[str] = mapped_column(nullable=True, comment="Source of the bedset")


class Universes(Base):
__tablename__ = "universes"
Expand Down Expand Up @@ -336,7 +339,7 @@ class TokenizedBed(Base):
nullable=False,
)
universe_id: Mapped[str] = mapped_column(
ForeignKey("universes.id", ondelete="CASCADE"),
ForeignKey("universes.id", ondelete="CASCADE", passive_deletes=True),
primary_key=True,
index=True,
nullable=False,
Expand All @@ -347,7 +350,9 @@ class TokenizedBed(Base):

bed: Mapped["Bed"] = relationship("Bed", back_populates="tokenized")
universe: Mapped["Universes"] = relationship(
"Universes", back_populates="tokenized"
"Universes",
back_populates="tokenized",
passive_deletes=True,
)


Expand Down
1 change: 1 addition & 0 deletions bbconf/models/bed_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ class BedPlots(BaseModel):
widths_histogram: FileModel = None
neighbor_distances: FileModel = None
open_chromatin: FileModel = None
tss_distance: FileModel = None

model_config = ConfigDict(extra="ignore")

Expand Down
5 changes: 5 additions & 0 deletions bbconf/models/bedset_models.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import datetime
from typing import List, Union

from pydantic import BaseModel, ConfigDict, model_validator
Expand All @@ -21,10 +22,14 @@ class BedSetMetadata(BaseModel):
id: str
name: str
md5sum: str
submission_date: datetime.datetime = None
last_update_date: datetime.datetime = None
statistics: Union[BedSetStats, None] = None
plots: Union[BedSetPlots, None] = None
description: str = None
bed_ids: List[str] = None
author: Union[str, None] = None
source: Union[str, None] = None


class BedSetListResult(BaseModel):
Expand Down
83 changes: 60 additions & 23 deletions bbconf/modules/bedfiles.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os
from logging import getLogger
from typing import Dict, Union
from typing import Dict, List, Union

import numpy as np
from geniml.bbclient import BBClient
Expand All @@ -10,7 +10,7 @@
from pydantic import BaseModel
from qdrant_client.models import Distance, PointIdsList, VectorParams
from sqlalchemy import and_, delete, func, select
from sqlalchemy.orm import Session
from sqlalchemy.orm import Session, aliased
from tqdm import tqdm

from bbconf.config_parser.bedbaseconfig import BedBaseConfig
Expand Down Expand Up @@ -385,7 +385,12 @@ def get_ids_list(
count = session.execute(count_statement).one()

for result in bed_ids:
result_list.append(BedMetadataBasic(**result.__dict__))
annotation = StandardMeta(
**result.annotations.__dict__ if result.annotations else {}
)
result_list.append(
BedMetadataBasic(**result.__dict__, annotation=annotation)
)

return BedListResult(
count=count[0],
Expand Down Expand Up @@ -865,34 +870,28 @@ def reindex_qdrant(self) -> None:
"""
bb_client = BBClient()

statement = select(Bed.id).where(and_(Bed.genome_alias == QDRANT_GENOME))

with Session(self._db_engine.engine) as session:
bed_ids = session.execute(statement).all()
annotation_result = self.get_ids_list(limit=100000, genome=QDRANT_GENOME)

bed_ids = [bed_result[0] for bed_result in bed_ids]
if not annotation_result.results:
_LOGGER.error("No bed files found.")
return None
results = annotation_result.results

with tqdm(total=len(bed_ids), position=0, leave=True) as pbar:
for record_id in bed_ids:
with tqdm(total=len(results), position=0, leave=True) as pbar:
for record in results:
try:
bed_region_set_obj = GRegionSet(bb_client.seek(record_id))
bed_region_set_obj = GRegionSet(bb_client.seek(record.id))
except FileNotFoundError:
bed_region_set_obj = bb_client.load_bed(record_id)

pbar.set_description(f"Processing file: {record_id}")
metadata = self._config.phc.sample.get(
namespace=self._config.config.phc.namespace,
name=self._config.config.phc.name,
tag=self._config.config.phc.tag,
sample_name=record_id,
)
bed_region_set_obj = bb_client.load_bed(record.id)

pbar.set_description(f"Processing file: {record.id}")

self.upload_file_qdrant(
bed_id=record_id,
bed_id=record.id,
bed_file=bed_region_set_obj,
payload=BedPEPHubRestrict(**metadata).model_dump(),
payload=record.annotation.model_dump() if record.annotation else {},
)
pbar.write(f"File: {record_id} uploaded to qdrant successfully.")
pbar.write(f"File: {record.id} uploaded to qdrant successfully.")
pbar.update(1)

return None
Expand Down Expand Up @@ -1180,3 +1179,41 @@ def get_tokenized_link(
bed_id=bed_id,
universe_id=universe_id,
)

def get_missing_plots(
    self, plot_name: str, limit: int = 1000, offset: int = 0
) -> List[str]:
    """
    Get list of bed files that are missing a given plot

    Results are ordered by bed identifier, so consecutive limit/offset
    pages are stable and do not overlap or skip records.

    :param plot_name: plot name; must be one of the BedPlots model fields
    :param limit: number of results to return
    :param offset: offset to start from
    :return: list of bed file identifiers
    :raises BedBaseConfError: if plot_name is not a field of BedPlots
    """
    valid_plot_names = list(BedPlots.model_fields.keys())
    if plot_name not in valid_plot_names:
        raise BedBaseConfError(
            f"Plot name: {plot_name} is not valid. Valid names: {valid_plot_names}"
        )

    with Session(self._sa_engine) as session:
        # Alias for subquery
        t2_alias = aliased(Files)

        # Subquery: only the file records whose name is the requested plot
        subquery = select(t2_alias).where(t2_alias.name == plot_name).subquery()

        # Anti-join: keep bed records that have no matching plot file row
        query = (
            select(Bed.id)
            .outerjoin(subquery, Bed.id == subquery.c.bedfile_id)
            .where(subquery.c.bedfile_id.is_(None))
            # Without an explicit ordering, LIMIT/OFFSET pages are not
            # guaranteed to be consistent between calls.
            .order_by(Bed.id)
            .limit(limit)
            .offset(offset)
        )

        # Materialize while the session is still open
        results = list(session.scalars(query))

    return results
26 changes: 24 additions & 2 deletions bbconf/modules/bedsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from bbconf.const import PKG_NAME
from bbconf.db_utils import Bed, BedFileBedSetRelation, BedSets, BedStats, Files
from bbconf.exceptions import BedSetExistsError, BedSetNotFoundError
from bbconf.models.bed_models import BedStatsModel
from bbconf.models.bed_models import BedStatsModel, StandardMeta
from bbconf.models.bedset_models import (
BedMetadataBasic,
BedSetBedFiles,
Expand Down Expand Up @@ -77,6 +77,10 @@ def get(self, identifier: str, full: bool = False) -> BedSetMetadata:
statistics=stats,
plots=plots,
bed_ids=list_of_bedfiles,
submission_date=bedset_obj.submission_date,
last_update_date=bedset_obj.last_update_date,
author=bedset_obj.author,
source=bedset_obj.source,
)

return bedset_metadata
Expand Down Expand Up @@ -200,6 +204,8 @@ def get_bedset_pep(self, identifier: str) -> dict:
"name": bedset.id,
"description": bedset.description,
"md5sum": bedset.md5sum,
"author": bedset.author,
"source": bedset.source,
}

return {
Expand All @@ -215,6 +221,7 @@ def create(
bedid_list: List[str],
description: str = None,
statistics: bool = False,
annotation: dict = None,
plots: dict = None,
upload_pephub: bool = False,
upload_s3: bool = False,
Expand All @@ -230,6 +237,7 @@ def create(
:param description: bedset description
:param bedid_list: list of bed file identifiers
:param statistics: calculate statistics for bedset
:param annotation: bedset annotation (author, source)
:param plots: dictionary with plots
:param upload_pephub: upload bedset to pephub (create view in pephub)
:param upload_s3: upload bedset to s3
Expand All @@ -249,6 +257,9 @@ def create(
raise BedSetExistsError(identifier)
self.delete(identifier)

if not isinstance(annotation, dict):
annotation = {}

if upload_pephub:
try:
self._create_pephub_view(identifier, description, bedid_list, no_fail)
Expand All @@ -264,6 +275,8 @@ def create(
bedset_means=stats.mean.model_dump() if stats else None,
bedset_standard_deviation=stats.sd.model_dump() if stats else None,
md5sum=compute_md5sum_bedset(bedid_list),
author=annotation.get("author"),
source=annotation.get("source"),
)

if upload_s3:
Expand Down Expand Up @@ -434,7 +447,16 @@ def get_bedset_bedfiles(self, identifier: str) -> BedSetBedFiles:
with Session(self._db_engine.engine) as session:
bedfiles_list = session.scalars(statement)
results = [
BedMetadataBasic(**bedfile_obj.__dict__)
BedMetadataBasic(
**bedfile_obj.__dict__,
annotation=StandardMeta(
**(
bedfile_obj.annotations.__dict__
if bedfile_obj.annotations
else {}
)
),
)
for bedfile_obj in bedfiles_list
]

Expand Down
9 changes: 9 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,15 @@

This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format.

# [0.9.0] - 2024-11-06
## Changed
- Fixed bug with uploading the TSS distance plot

## Added
- Added annotations to bedsets (author, source)
- Added `get_list_genomes` method to the agent, which lists all available genomes
- Added method that lists all missing plots for bedfiles (get_missing_plots)

# [0.8.0] - 2024-10-23
## Changed
- Updated text to bed search (now using bivec)
Expand Down
13 changes: 12 additions & 1 deletion manual_testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,9 +175,20 @@ def get_pep():
prj


def get_id_plots_missing():
    """
    Manual check: print the bed ids missing the "gccontent" plot, then the list of genomes
    """
    from bbconf import BedBaseAgent

    bbagent = BedBaseAgent(config="/home/bnt4me/virginia/repos/bedhost/config.yaml")

    missing_plot_ids = bbagent.bed.get_missing_plots("gccontent", limit=5000)
    print(missing_plot_ids)

    genomes = bbagent.get_list_genomes()
    print(genomes)


# Manual-test entry point: when this script is run directly, only the
# uncommented calls below are executed.
if __name__ == "__main__":
# zarr_s3()
# add_s3()
# get_from_s3()
# biocframe()
get_pep()
# get_pep()
get_id_plots_missing()

0 comments on commit 5138c2a

Please sign in to comment.