From 65091e6cd9e727952c33dd34d1f3f9f0ab07f8ed Mon Sep 17 00:00:00 2001 From: nsheff Date: Mon, 19 Feb 2024 21:04:49 -0500 Subject: [PATCH] clean up tutorials, geniml docs, etc --- autodoc.py | 102 ++++ docs/README.md | 9 +- .../code/config.md} | 45 +- .../bbconf_demo.md => bbconf/code/demo.md} | 79 +-- docs/bbconf/{ => notebooks}/config.ipynb | 0 docs/bbconf/{ => notebooks}/demo.ipynb | 0 .../code/bedbase-tutorial.md} | 0 .../code/bedmaker-tutorial.md} | 0 .../code/bedqc-tutorial.md} | 0 .../code/bedstat-tutorial.md} | 0 .../code/tutorial-all.md} | 0 .../bedboss/notebooks/bedbase-tutorial.ipynb | 0 .../bedboss/notebooks/bedmaker-tutorial.ipynb | 0 .../bedboss/notebooks/bedqc-tutorial.ipynb | 0 .../bedboss/notebooks/bedstat-tutorial.ipynb | 0 .../bedboss/notebooks/tutorial-all.ipynb | 0 docs/bedhost/README.md | 4 +- docs/bedhost/about.md | 6 - docs/citations.md | 4 + docs/geniml/README.md | 19 +- docs/geniml/contributing.md | 2 +- docs/geniml/img/geniml_logo.svg | 507 ++++++++++++++-- docs/geniml/img/geniml_logo1.svg | 63 ++ docs/geniml/img/geniml_logo_horizontal.svg | 255 ++++++++ docs/geniml/tutorials/assess-universe.md | 2 +- docs/geniml/tutorials/bedspace.md | 21 +- docs/stylesheets/extra.css | 9 + docs_jupyter/bbconf_config.ipynb | 149 ----- docs_jupyter/bbconf_demo.ipynb | 552 ------------------ docs_jupyter/build/.gitignore | 2 - mkdocs.yml | 41 +- 31 files changed, 1003 insertions(+), 868 deletions(-) create mode 100644 autodoc.py rename docs/{autodoc_build/bbconf_config.md => bbconf/code/config.md} (72%) rename docs/{autodoc_build/bbconf_demo.md => bbconf/code/demo.md} (85%) rename docs/bbconf/{ => notebooks}/config.ipynb (100%) rename docs/bbconf/{ => notebooks}/demo.ipynb (100%) rename docs/{autodoc_build/bedbase_tutorial.md => bedboss/code/bedbase-tutorial.md} (100%) rename docs/{autodoc_build/bedmaker_tutorial.md => bedboss/code/bedmaker-tutorial.md} (100%) rename docs/{autodoc_build/bedqc_tutorial.md => bedboss/code/bedqc-tutorial.md} (100%) rename 
docs/{autodoc_build/bedstat_tutorial.md => bedboss/code/bedstat-tutorial.md} (100%) rename docs/{autodoc_build/tutorial_all.md => bedboss/code/tutorial-all.md} (100%) rename docs_jupyter/bedbase_tutorial.ipynb => docs/bedboss/notebooks/bedbase-tutorial.ipynb (100%) rename docs_jupyter/bedmaker_tutorial.ipynb => docs/bedboss/notebooks/bedmaker-tutorial.ipynb (100%) rename docs_jupyter/bedqc_tutorial.ipynb => docs/bedboss/notebooks/bedqc-tutorial.ipynb (100%) rename docs_jupyter/bedstat_tutorial.ipynb => docs/bedboss/notebooks/bedstat-tutorial.ipynb (100%) rename docs_jupyter/tutorial_all.ipynb => docs/bedboss/notebooks/tutorial-all.ipynb (100%) delete mode 100644 docs/bedhost/about.md create mode 100644 docs/geniml/img/geniml_logo1.svg create mode 100644 docs/geniml/img/geniml_logo_horizontal.svg delete mode 100644 docs_jupyter/bbconf_config.ipynb delete mode 100644 docs_jupyter/bbconf_demo.ipynb delete mode 100644 docs_jupyter/build/.gitignore diff --git a/autodoc.py b/autodoc.py new file mode 100644 index 0000000..e5c7aa1 --- /dev/null +++ b/autodoc.py @@ -0,0 +1,102 @@ +# This script will auto-generate documentation for Python code, CLI usage, and Jupyter notebooks +# It is intended to be run as a pre-build step in a MkDocs project +# It will read the mkdocs.yml file for configuration +# It will use the lucidoc package to auto-generate documentation for Python code +# It will use the subprocess package to run CLI commands and capture the output +# It will use the nbconvert package to convert Jupyter notebooks to markdown + +import lucidoc +import yaml +import subprocess +import glob +import nbconvert +import os +from pathlib import Path + +import argparse + +parser = argparse.ArgumentParser(description="Description of your program") +parser.add_argument( + "--x-usage", + help="Exclude usage", + required=False, + default=False, + action="store_true", +) +parser.add_argument( + "--x-lucidoc", + help="Exclude lucidoc", + required=False, + default=False, + 
action="store_true", +) +parser.add_argument( + "--x-jupyter", + help="Exclude jupyter", + required=False, + default=False, + action="store_true", +) + +args = vars(parser.parse_args()) + +print(args) + +# Read the mkdocs config +with open("mkdocs.yml") as stream: + cfg = yaml.safe_load(stream) + + +if "autodoc" not in cfg: + print("No autodoc configuration found in mkdocs.yml") + exit(1) +else: + cfg = cfg["autodoc"] + +# Process auto-documented Python code +if args["x_lucidoc"] is False and "lucidoc" in cfg: + for bundle in cfg["lucidoc"]: + print(f"Documenting lucidoc '{bundle['pkg']}' at {bundle['outfile']}") + lucidoc.run_lucidoc(parse_style="rst", **bundle) +else: + print("Skipping lucidoc") + + +usage_tpl = """ +\n`{cmd}` +\n +```console +{usage} +``` +""" + +# Process CLI usage +if args["x_usage"] is False and "cli_usage" in cfg: + for item in cfg["cli_usage"]: + result = "" + with open(item["template"], "r") as file: + result = file.read() + for cmd in item["commands"]: + print(f"Documenting command '{cmd}' to '{item['outfile']}'") + usage = subprocess.check_output(cmd, shell=True).decode("utf-8") + content = usage_tpl.format(cmd=cmd, usage=usage) + result += content + with open(item["outfile"], "w") as file: + file.write(result) +else: + print("Skipping usage documentation") + +# Render Jupyter notebooks to markdown +if args["x_jupyter"] is False and "jupyter" in cfg: + for item in cfg["jupyter"]: + files = glob.glob(f"docs/{item['in']}/*.ipynb") + for nb in files: + bn, _ = os.path.splitext(os.path.basename(nb)) + out = f"docs/{item['out']}/{bn}.md" + print(f"Converting '{nb}' to '{out}'") + md_result = nbconvert.exporters.export(nbconvert.MarkdownExporter(), nb)[0] + Path(os.path.dirname(out)).mkdir(parents=True, exist_ok=True) + with open(out, "w") as stream: + stream.write(md_result) +else: + print("Skipping jupyter notebooks") \ No newline at end of file diff --git a/docs/README.md b/docs/README.md index 842ba3f..bf5ef5d 100644 --- a/docs/README.md 
+++ b/docs/README.md @@ -1,5 +1,12 @@ +--- +hide: + - navigation + - toc + - navigation.footer +--- + # Welcome to -This site hosts developer and user documentation for components of BEDbase and related tools. Use the tab navigation above to find the project of interest. +This site hosts developer and user documentation for components of BEDbase and related tools, notably including [geniml](geniml/README.md), our package for machine learning on genomic intervals. Use the tab navigation above to find the project of interest. You can access the main BEDbase interface at . \ No newline at end of file diff --git a/docs/autodoc_build/bbconf_config.md b/docs/bbconf/code/config.md similarity index 72% rename from docs/autodoc_build/bbconf_config.md rename to docs/bbconf/code/config.md index 1890e4c..ea8e269 100644 --- a/docs/autodoc_build/bbconf_config.md +++ b/docs/bbconf/code/config.md @@ -1,4 +1,3 @@ -jupyter:True # Bedbase configuration file documentation In order to start working with the `BedBaseConf` object, it has to be initialized first. The constuctor requires one argument, which is a path to the configuration file (in YAML format). @@ -18,14 +17,12 @@ Here's an example of a minimal bedbase configuration file: !cat ../tests/data/config_min.yaml ``` -```.output -# min config example. Refer to bbconf/const.py for key names and default values - -path: - pipeline_output_path: $HOME/bedbase - bedstat_dir: bedstat_output - bedbuncher_dir: bedbuncher_output -``` + # min config example. 
Refer to bbconf/const.py for key names and default values + + path: + pipeline_output_path: $HOME/bedbase + bedstat_dir: bedstat_output + bedbuncher_dir: bedbuncher_output ## Example config file @@ -38,22 +35,20 @@ Here's an example of a complete bedbase configuration file: !cat ../tests/data/config.yaml ``` -```.output -database: - name: pipestat-test - user: postgres - password: pipestat-password - host: localhost -# port: 5432; intentionally commented out to test the defaults setting system -path: - pipeline_output_path: $BEDBASE_DATA_PATH/outputs - bedstat_dir: bedstat_output - bedbuncher_dir: bedbuncher_output - remote_url_base: null -server: - host: 0.0.0.0 - port: 8000 -``` + database: + name: pipestat-test + user: postgres + password: pipestat-password + host: localhost + # port: 5432; intentionally commented out to test the defaults setting system + path: + pipeline_output_path: $BEDBASE_DATA_PATH/outputs + bedstat_dir: bedstat_output + bedbuncher_dir: bedbuncher_output + remote_url_base: null + server: + host: 0.0.0.0 + port: 8000 ## Default values diff --git a/docs/autodoc_build/bbconf_demo.md b/docs/bbconf/code/demo.md similarity index 85% rename from docs/autodoc_build/bbconf_demo.md rename to docs/bbconf/code/demo.md index 48ab2bf..c4dcfff 100644 --- a/docs/autodoc_build/bbconf_demo.md +++ b/docs/bbconf/code/demo.md @@ -1,4 +1,3 @@ -jupyter:True # `BedBaseConf` object usage demonstration `bbconf` standardizes reporting of [bedstat](https://github.com/databio/bedstat) and [bedbuncher](https://github.com/databio/bedsbuncher) results. 
It formalizes a way for these pipelines and downstream tools communicate -- the produced results can easily and reliably become an @@ -16,10 +15,8 @@ from bbconf import * bbc = BedBaseConf(config_path="../tests/data/config.yaml") ``` -```.output -DEBU 10:09:08 | bbconf:est:266 > Configured logger 'bbconf' using logmuse v0.2.6 + DEBU 10:09:08 | bbconf:est:266 > Configured logger 'bbconf' using logmuse v0.2.6 -``` As you can see above, missing entries are populated with default values. @@ -36,14 +33,12 @@ The `PipestatManager` instances for bedfiles and bedsets can be accessed via the print(bbc.bed) ``` -```.output -PipestatManager (bedfiles) -Backend: PostgreSQL -Results schema source: /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/bbconf/schemas/bedfiles_schema.yaml -Status schema source: /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pipestat/schemas/status_schema.yaml -Records count: 11 + PipestatManager (bedfiles) + Backend: PostgreSQL + Results schema source: /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/bbconf/schemas/bedfiles_schema.yaml + Status schema source: /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pipestat/schemas/status_schema.yaml + Records count: 11 -``` ### `BedBaseConf.bedset`: @@ -52,14 +47,12 @@ Records count: 11 print(bbc.bedset) ``` -```.output -PipestatManager (bedsets) -Backend: PostgreSQL -Results schema source: /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/bbconf/schemas/bedsets_schema.yaml -Status schema source: /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pipestat/schemas/status_schema.yaml -Records count: 3 + PipestatManager (bedsets) + Backend: PostgreSQL + Results schema source: /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/bbconf/schemas/bedsets_schema.yaml + Status schema source: 
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pipestat/schemas/status_schema.yaml + Records count: 3 -``` ### `BedBaseConf.config`: @@ -70,23 +63,21 @@ Additionally, there's a `BedBaseConf.config` property, that can be used to retri print(bbc.config) ``` -```.output -database: - name: pipestat-test - user: postgres - password: pipestat-password - host: localhost - port: 5432 -path: - pipeline_output_path: $BEDBASE_DATA_PATH/outputs - bedstat_dir: bedstat_output - bedbuncher_dir: bedbuncher_output - remote_url_base: null -server: - host: 0.0.0.0 - port: 8000 + database: + name: pipestat-test + user: postgres + password: pipestat-password + host: localhost + port: 5432 + path: + pipeline_output_path: $BEDBASE_DATA_PATH/outputs + bedstat_dir: bedstat_output + bedbuncher_dir: bedbuncher_output + remote_url_base: null + server: + host: 0.0.0.0 + port: 8000 -``` ## Running a database @@ -108,11 +99,9 @@ print(bbc.bed.schema["name"]) print(bbc.bed.schema["widths_histogram"]) ``` -```.output -{'type': 'string', 'description': 'BED file name'} -{'type': 'image', 'description': 'Quantile-trimmed histogram of widths'} + {'type': 'string', 'description': 'BED file name'} + {'type': 'image', 'description': 'Quantile-trimmed histogram of widths'} -``` A result of type `image` is in fact a mapping with three required elements: `path`, `thumbnail_path` and `title`. 
The actual jsonschema schemas can be accessed as `result_schemas` property for both tables: @@ -158,10 +147,8 @@ bbc.bed.record_count bbc.bed.report(record_identifier="78c0e4753d04b238fc07e4ebe5a02984", values={"name": "some_name"}) ``` -```.output -These results exist for '78c0e4753d04b238fc07e4ebe5a02984': ['name'] + These results exist for '78c0e4753d04b238fc07e4ebe5a02984': ['name'] -``` @@ -216,11 +203,9 @@ Let's try reporting a new bedfile then: bbc.bed.report(record_identifier="78c1e4111d04b238fc11e4ebe5a02984", values={"name": "some_name"}) ``` -```.output -Reported records for '78c1e4111d04b238fc11e4ebe5a02984' in 'bedfiles' namespace: - - name: some_name + Reported records for '78c1e4111d04b238fc11e4ebe5a02984' in 'bedfiles' namespace: + - name: some_name -``` @@ -266,10 +251,8 @@ Naturally, a record can be removed: bbc.bed.remove(record_identifier="78c1e4111d04b238fc11e4ebe5a02984") ``` -```.output -Removing '78c1e4111d04b238fc11e4ebe5a02984' record + Removing '78c1e4111d04b238fc11e4ebe5a02984' record -``` diff --git a/docs/bbconf/config.ipynb b/docs/bbconf/notebooks/config.ipynb similarity index 100% rename from docs/bbconf/config.ipynb rename to docs/bbconf/notebooks/config.ipynb diff --git a/docs/bbconf/demo.ipynb b/docs/bbconf/notebooks/demo.ipynb similarity index 100% rename from docs/bbconf/demo.ipynb rename to docs/bbconf/notebooks/demo.ipynb diff --git a/docs/autodoc_build/bedbase_tutorial.md b/docs/bedboss/code/bedbase-tutorial.md similarity index 100% rename from docs/autodoc_build/bedbase_tutorial.md rename to docs/bedboss/code/bedbase-tutorial.md diff --git a/docs/autodoc_build/bedmaker_tutorial.md b/docs/bedboss/code/bedmaker-tutorial.md similarity index 100% rename from docs/autodoc_build/bedmaker_tutorial.md rename to docs/bedboss/code/bedmaker-tutorial.md diff --git a/docs/autodoc_build/bedqc_tutorial.md b/docs/bedboss/code/bedqc-tutorial.md similarity index 100% rename from docs/autodoc_build/bedqc_tutorial.md rename to 
docs/bedboss/code/bedqc-tutorial.md diff --git a/docs/autodoc_build/bedstat_tutorial.md b/docs/bedboss/code/bedstat-tutorial.md similarity index 100% rename from docs/autodoc_build/bedstat_tutorial.md rename to docs/bedboss/code/bedstat-tutorial.md diff --git a/docs/autodoc_build/tutorial_all.md b/docs/bedboss/code/tutorial-all.md similarity index 100% rename from docs/autodoc_build/tutorial_all.md rename to docs/bedboss/code/tutorial-all.md diff --git a/docs_jupyter/bedbase_tutorial.ipynb b/docs/bedboss/notebooks/bedbase-tutorial.ipynb similarity index 100% rename from docs_jupyter/bedbase_tutorial.ipynb rename to docs/bedboss/notebooks/bedbase-tutorial.ipynb diff --git a/docs_jupyter/bedmaker_tutorial.ipynb b/docs/bedboss/notebooks/bedmaker-tutorial.ipynb similarity index 100% rename from docs_jupyter/bedmaker_tutorial.ipynb rename to docs/bedboss/notebooks/bedmaker-tutorial.ipynb diff --git a/docs_jupyter/bedqc_tutorial.ipynb b/docs/bedboss/notebooks/bedqc-tutorial.ipynb similarity index 100% rename from docs_jupyter/bedqc_tutorial.ipynb rename to docs/bedboss/notebooks/bedqc-tutorial.ipynb diff --git a/docs_jupyter/bedstat_tutorial.ipynb b/docs/bedboss/notebooks/bedstat-tutorial.ipynb similarity index 100% rename from docs_jupyter/bedstat_tutorial.ipynb rename to docs/bedboss/notebooks/bedstat-tutorial.ipynb diff --git a/docs_jupyter/tutorial_all.ipynb b/docs/bedboss/notebooks/tutorial-all.ipynb similarity index 100% rename from docs_jupyter/tutorial_all.ipynb rename to docs/bedboss/notebooks/tutorial-all.ipynb diff --git a/docs/bedhost/README.md b/docs/bedhost/README.md index 0f24129..a6a53bf 100644 --- a/docs/bedhost/README.md +++ b/docs/bedhost/README.md @@ -31,10 +31,10 @@ Object IDs take the form `..`. 
An exa So, you can get information about this object like this: -`GET` [/objects/bed.eaf9ee97241f300f1c7e76e1f945141f.bedfile](/objects/bed.eaf9ee97241f300f1c7e76e1f945141f.bedfile) +`GET` [https://api.bedbase.org/objects/bed.eaf9ee97241f300f1c7e76e1f945141f.bedfile](https://api.bedbase.org/objects/bed.eaf9ee97241f300f1c7e76e1f945141f.bedfile) Or, you can get a URL to download the actual file with: -`GET` [/objects/bed.eaf9ee97241f300f1c7e76e1f945141f.bedfile/access/http](/objects/bed.eaf9ee97241f300f1c7e76e1f945141f.bedfile/access/http) +`GET` [https://api.bedbase.org/objects/bed.eaf9ee97241f300f1c7e76e1f945141f.bedfile/access/http](https://api.bedbase.org/objects/bed.eaf9ee97241f300f1c7e76e1f945141f.bedfile/access/http) diff --git a/docs/bedhost/about.md b/docs/bedhost/about.md deleted file mode 100644 index 529dfcf..0000000 --- a/docs/bedhost/about.md +++ /dev/null @@ -1,6 +0,0 @@ -# about - -testing about - -[here is a link](/docs) - diff --git a/docs/citations.md b/docs/citations.md index 122be5b..937674f 100644 --- a/docs/citations.md +++ b/docs/citations.md @@ -13,6 +13,10 @@ Thanks for citing us! If you use BEDbase, geniml, or their components in your re |---------------|-----------------| | `geniml` region set evaluations | Zheng et al. (2023) *bioRxiv* | | `region2vec` embeddings | Gharavi et al. (2021) *Bioinformatics* | +| `bedspace` search and embeddings | Gharavi et al. (2023) *bioRxiv* | +| `geniml hmm` module | Rymuza et al. (2023) *bioRxiv* | +| `bedbase` database | Unpublished | +| `scEmbed` single-cell embedding framework | LeRoy et al. (2023) *bioRxiv* | diff --git a/docs/geniml/README.md b/docs/geniml/README.md index 8ff1d8d..d38773c 100644 --- a/docs/geniml/README.md +++ b/docs/geniml/README.md @@ -1,19 +1,24 @@ -# Genomic interval toolkit +# + +

+ + +

+ + ## Introduction -Geniml is a python package for building machine learning models of genomic interval data (BED files). It also includes ancillary functions to support other types of analyses of genomic interval data. +Geniml is a *genomic interval machine learning toolkit*, a Python package for building machine learning models of genomic interval data (BED files). It also includes ancillary functions to support other types of analyses of genomic interval data. + +As of February 2024, this package and its documentation are undergoing rapid development, leading to some tutorials getting outdated. Please raise [github issues](https://github.com/databio/geniml) if you find outdated or unclear directions, so we know where to focus effort that will benefit users. ## Install ``` -pip install --user --upgrade . +pip install --user --upgrade geniml ``` ## Modules `geniml` is organized into modules. The next section is an overview of each module. You can also proceed to the how-to guides for recipes on how to do specfic tasks. - -## Citing - -If you find `geniml` useful for your research, please cite us! It helps us convince others that our work is useful. You can find a [published papers describing geniml components](manuscripts.md). \ No newline at end of file diff --git a/docs/geniml/contributing.md b/docs/geniml/contributing.md index 2ae9044..dfee014 100644 --- a/docs/geniml/contributing.md +++ b/docs/geniml/contributing.md @@ -56,5 +56,5 @@ geniml.hmm.function() ### Shared code -Any variables, functions, or other code that is shared across modules should be placed in the parent module, which is held in the [geniml](geniml) folder. +Any variables, functions, or other code that is shared across modules should be placed in the parent module, which is held in the geniml folder. 
diff --git a/docs/geniml/img/geniml_logo.svg b/docs/geniml/img/geniml_logo.svg index 7aeb1b5..376af38 100644 --- a/docs/geniml/img/geniml_logo.svg +++ b/docs/geniml/img/geniml_logo.svg @@ -1,63 +1,474 @@ + + + style="stroke-width:0.0306422" /> diff --git a/docs/geniml/img/geniml_logo1.svg b/docs/geniml/img/geniml_logo1.svg new file mode 100644 index 0000000..7aeb1b5 --- /dev/null +++ b/docs/geniml/img/geniml_logo1.svg @@ -0,0 +1,63 @@ + + diff --git a/docs/geniml/img/geniml_logo_horizontal.svg b/docs/geniml/img/geniml_logo_horizontal.svg new file mode 100644 index 0000000..3381c23 --- /dev/null +++ b/docs/geniml/img/geniml_logo_horizontal.svg @@ -0,0 +1,255 @@ + + + + diff --git a/docs/geniml/tutorials/assess-universe.md b/docs/geniml/tutorials/assess-universe.md index 2cd6045..ac50fb7 100644 --- a/docs/geniml/tutorials/assess-universe.md +++ b/docs/geniml/tutorials/assess-universe.md @@ -92,7 +92,7 @@ closeness_score = get_closeness_score("tests/consensus/raw", ## Universe likelihood We can also calculate the likelihood of universe given collection of file. For that we -will need [likelihood model](consensus-peaks.md#making-likelihood-model-). We can do it +will need [likelihood model](create-consensus-peaks.md#making-likelihood-model-). We can do it either for hard universe: ``` diff --git a/docs/geniml/tutorials/bedspace.md b/docs/geniml/tutorials/bedspace.md index be78bb1..de58daf 100644 --- a/docs/geniml/tutorials/bedspace.md +++ b/docs/geniml/tutorials/bedspace.md @@ -2,14 +2,25 @@ ## Introduction -To ensure that everything is working correctly, run: `python -c "from geniml import bedspace"`. There are four main commands in `bedspace`: +BEDspace is an application of the StarSpace model to genomic interval data, described in [Gharavi et al. 2023](../../citations.md). 
It allows us to train numerical embeddings for a collection of region sets simultaneously with their metadata labels, capturing similarity between region sets and their metadata in a low-dimensional space. Using these learned co-embeddings, BEDspace solves three related information retrieval tasks using embedding distance computations: retrieving region sets related to a user query string; suggesting new labels for database region sets; and retrieving database region sets similar to a query region set. + +## Installation + +The `bedspace` module is installed with `geniml`. To ensure that everything is working correctly, run: `python -c "from geniml import bedspace"`. + +## BEDspace operations + +There are four main commands in `bedspace`: 1. `bedspace preprocess`: preprocesses a set of genomic interval regions and their associated metadata into a format that can be used by `bedspace train`. 2. `bedspace train`: trains a StarSpace model on the preprocessed data. 3. `bedspace distances`: computes distances between region sets in the trained model and metadata labels. 4. `bedspace search`: searches for the most similar region sets and metadata labels to a given query. Three scenarios for this command are described in the details. +These commands are accessed via the command line with `geniml bedspace <command>`. + ### `bedspace preprocess` + The `preprocess` command will prepare a set of region sets and metadata labels for training. This includes things like adding the `__label__` prefix to metadata labels, and converting the region sets into a format that can be used by StarSpace. The command takes in a set of region sets and metadata labels, and outputs a set of preprocessed region sets and metadata labels. The command can be run as follows: ```console @@ -30,6 +41,7 @@ Input Description: ### `bedspace train` + The `train` command will train a StarSpace model on the preprocessed region sets and metadata labels. 
It requires that you have run the `preprocess` command first. The `train` command takes in a set of preprocessed region sets and metadata labels, and outputs a trained StarSpace model. The command can be run as follows: ```console @@ -51,12 +63,8 @@ Input Description: `--epochs`: Specifies the number of epochs to train the StartSpace model. `--lr`: Sets the learning rate for the training process. - - - - - ### `bedspace distances` + The `distances` command will compute the distances between all of the region sets and metadata labels in the trained model. It requires that you have ran the `train` command first. The `distances` command takes in a trained StarSpace model, and outputs a set of distances between all of the region sets and metadata labels in the model. The command can be run as follows: ```console @@ -117,7 +125,6 @@ geniml bedspace search \ path/to/regions.bed ``` - Input Description: `-t`: Specifies the search type. diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css index 4ee90a5..2649237 100644 --- a/docs/stylesheets/extra.css +++ b/docs/stylesheets/extra.css @@ -35,6 +35,15 @@ text-align: center; } +.img-header { + text-align: center; + display: block; + margin-left: auto; + margin-right: auto; + width: 50%; +} + + .jumbotron { padding-top: 30px; padding-bottom: 30px; diff --git a/docs_jupyter/bbconf_config.ipynb b/docs_jupyter/bbconf_config.ipynb deleted file mode 100644 index 7e285d3..0000000 --- a/docs_jupyter/bbconf_config.ipynb +++ /dev/null @@ -1,149 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Bedbase configuration file documentation\n", - "\n", - "In order to start working with the `BedBaseConf` object, it has to be initialized first. 
The constuctor requires one argument, which is a path to the configuration file (in YAML format).\n", - "\n", - "## Minimal config file\n", - "\n", - "The minimal configuration must define the `path` section with 3 keys:\n", - "\n", - "- `pipeline_output_path`: path to the desired output directory for the pipelines\n", - "- `bedstat_dir`: name of the [bedstat](https://github.com/databio/bedstat) pipeline output directory\n", - "- `bedbuncher_dir`: name of the [bedbuncher](https://github.com/databio/bedbuncher) pipeline output directory\n", - "\n", - "Here's an example of a minimal bedbase configuration file:" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "# min config example. Refer to bbconf/const.py for key names and default values\r\n", - "\r\n", - "path:\r\n", - " pipeline_output_path: $HOME/bedbase\r\n", - " bedstat_dir: bedstat_output\r\n", - " bedbuncher_dir: bedbuncher_output" - ] - } - ], - "source": [ - "!cat ../tests/data/config_min.yaml" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Example config file\n", - "\n", - "Apart from the required `path` section, there are 2 other sections that can be used to configure the PostgreSQL database, used to store the metadata about the bedfiles and bedsets (`database` section) and to configure the bedhost server that displays the pipeline results and provides an API to query them (`server` section).\n", - "\n", - "Here's an example of a complete bedbase configuration file:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "database:\r\n", - " name: pipestat-test\r\n", - " user: postgres\r\n", - " password: pipestat-password\r\n", - " host: localhost\r\n", - "# port: 5432; intentionally commented out to test the defaults setting system\r\n", - "path:\r\n", - " 
pipeline_output_path: $BEDBASE_DATA_PATH/outputs\r\n", - " bedstat_dir: bedstat_output\r\n", - " bedbuncher_dir: bedbuncher_output\r\n", - " remote_url_base: null\r\n", - "server:\r\n", - " host: 0.0.0.0\r\n", - " port: 8000" - ] - } - ], - "source": [ - "!cat ../tests/data/config.yaml" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Default values\n", - "\n", - "In case any of the values shown below is not provided in the configuration file, it will be set to a default value" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "AttMap\n", - "path:\n", - " remote_url_base: null\n", - "database:\n", - " user: postgres\n", - " password: bedbasepassword\n", - " name: postgres\n", - " port: 5432\n", - " host: localhost\n", - "server:\n", - " host: 0.0.0.0\n", - " port: 80" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from bbconf.const import DEFAULT_SECTION_VALUES\n", - "from attmap import AttMap\n", - "AttMap(DEFAULT_SECTION_VALUES)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs_jupyter/bbconf_demo.ipynb b/docs_jupyter/bbconf_demo.ipynb deleted file mode 100644 index 634cc11..0000000 --- a/docs_jupyter/bbconf_demo.ipynb +++ /dev/null @@ -1,552 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# `BedBaseConf` object usage demonstration\n", - "\n", - "`bbconf` standardizes reporting of [bedstat](https://github.com/databio/bedstat) and 
[bedbuncher](https://github.com/databio/bedsbuncher) results. It formalizes a way for these pipelines and downstream tools communicate -- the produced results can easily and reliably become an\n", - "input for the server ([bedhost](https://github.com/databio/bedhost)). The object exposes API for interacting with the results and is backed by a [PostgreSQL](https://www.postgresql.org/) database.\n", - "\n", - "\n", - "`bbconf` provides a way to easily determine a path to the required configuration file. The file can be pointed to by the `$BEDBASE` environment variable. `get_bedbase_cfg` function returns a path which can be either excplicitly provided as an argument or read from the environment variable." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "DEBU 10:09:08 | bbconf:est:266 > Configured logger 'bbconf' using logmuse v0.2.6 \n" - ] - } - ], - "source": [ - "import logmuse\n", - "logmuse.init_logger(\"bbconf\", \"DEBUG\")\n", - "from bbconf import *\n", - "\n", - "bbc = BedBaseConf(config_path=\"../tests/data/config.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As you can see above, missing entries are populated with default values.\n", - "\n", - "## Object contents" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`BedBaseConf` objects consist of two [`PipestatManager`](http://pipestat.databio.org/) instances. These objects are responsible for bedfiles and bedsets metadata management. Additionally, `BedBaseConf` maintains a \"relationship table\" that stores the information regarding the bedfile-bedset relationsips, i.e. 
which bedfile is a part of which bedset.\n", - "\n", - "The `PipestatManager` instances for bedfiles and bedsets can be accessed via the object properties: `BedBaseConf.bed` and `BedBaseConf.bedset`, respectively:\n", - "\n", - "### `BedBaseConf.bed`:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "PipestatManager (bedfiles)\n", - "Backend: PostgreSQL\n", - "Results schema source: /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/bbconf/schemas/bedfiles_schema.yaml\n", - "Status schema source: /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pipestat/schemas/status_schema.yaml\n", - "Records count: 11\n" - ] - } - ], - "source": [ - "print(bbc.bed)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### `BedBaseConf.bedset`:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "PipestatManager (bedsets)\n", - "Backend: PostgreSQL\n", - "Results schema source: /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/bbconf/schemas/bedsets_schema.yaml\n", - "Status schema source: /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pipestat/schemas/status_schema.yaml\n", - "Records count: 3\n" - ] - } - ], - "source": [ - "print(bbc.bedset)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### `BedBaseConf.config`:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Additionally, there's a `BedBaseConf.config` property, that can be used to retrieve the bedbase project configuration values, which include both ones declared in the configuration file and default ones:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": 
"stdout", - "output_type": "stream", - "text": [ - "database:\n", - " name: pipestat-test\n", - " user: postgres\n", - " password: pipestat-password\n", - " host: localhost\n", - " port: 5432\n", - "path:\n", - " pipeline_output_path: $BEDBASE_DATA_PATH/outputs\n", - " bedstat_dir: bedstat_output\n", - " bedbuncher_dir: bedbuncher_output\n", - " remote_url_base: null\n", - "server:\n", - " host: 0.0.0.0\n", - " port: 8000\n" - ] - } - ], - "source": [ - "print(bbc.config)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Running a database" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Before we start interacting with the database, we need to establish the connection. The required database information is sourced from the object itself. Obviously, the PostgreSQL database instance has to be launched before and running in the background. For example, to run the database in a Docker container, execute these two lines:\n", - "\n", - "```\n", - "docker volume create postgres-data\n", - "docker run -d --name bedbase-postgres -p 5432:5432 -e POSTGRES_PASSWORD=bedbasepassword -e POSTGRES_USER=postgres -e POSTGRES_DB=postgres -v postgres-data:/var/lib/postgresql/data postgres\n", - "```\n", - "The environment variables passed to the container need to match the settings in `BedBaseConf` object." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Standardized metadata specification\n", - "\n", - "`bbconf` package comes with a predefined schemas, that describe the required bed and bedset metadata including the identifiers and types. 
For example, name of the bedfile, that will be stored in the column `\"name\"` has to be a string, whereas columns `\"widths_histogram\"` expects an image:" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'type': 'string', 'description': 'BED file name'}\n", - "{'type': 'image', 'description': 'Quantile-trimmed histogram of widths'}\n" - ] - } - ], - "source": [ - "print(bbc.bed.schema[\"name\"])\n", - "print(bbc.bed.schema[\"widths_histogram\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A result of type `image` is in fact a mapping with three required elements: `path`, `thumbnail_path` and `title`. The actual jsonschema schemas can be accessed as `result_schemas` property for both tables:" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'type': 'object',\n", - " 'description': 'Quantile-trimmed histogram of widths',\n", - " 'properties': {'path': {'type': 'string'},\n", - " 'thumbnail_path': {'type': 'string'},\n", - " 'title': {'type': 'string'}},\n", - " 'required': ['path', 'thumbnail_path', 'title']}" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bbc.bed.result_schemas[\"widths_histogram\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Convenient metadata management and exploration\n", - "\n", - "Building on `PipestatManager`s `BedBaseConf` offers multiple methods for bedfile and bedset metadata management and exploration. 
Here are some examples:\n", - "\n", - "### Get the number of reported bedfiles" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "11" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bbc.bed.record_count" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Report metadata for a bedfile or bedset" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "These results exist for '78c0e4753d04b238fc07e4ebe5a02984': ['name']\n" - ] - }, - { - "data": { - "text/plain": [ - "False" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bbc.bed.report(record_identifier=\"78c0e4753d04b238fc07e4ebe5a02984\", values={\"name\": \"some_name\"})" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Oops, `name` for this bedfile has been reported already. `BedBaseConf`, does not allow reporting results overwriting, unless it's explicitly forced with `force_overwrite=True`.\n", - "\n", - "Let's try reporting a different value:" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "ename": "AssertionError", - "evalue": "'test' is not a known result. 
Results defined in the schema are: ['name', 'md5sum', 'bedfile', 'bigbedfile', 'regions_no', 'gc_content', 'mean_absolute_tss_dist', 'mean_region_width', 'exon_frequency', 'intron_frequency', 'promoterprox_frequency', 'intergenic_frequency', 'promotercore_frequency', 'fiveutr_frequency', 'threeutr_frequency', 'fiveutr_percentage', 'threeutr_percentage', 'promoterprox_percentage', 'exon_percentage', 'intron_percentage', 'intergenic_percentage', 'promotercore_percentage', 'tssdist', 'chrombins', 'gccontent', 'paritions', 'expected_partitions', 'cumulative_partitions', 'widths_histogram', 'neighbor_distances', 'open_chromatin', 'other'].", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mbbc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbed\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreport\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrecord_identifier\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"78c0e4753d04b238fc07e4ebe5a02984\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m\"test\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m\"some_value\"\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pipestat/pipestat.py\u001b[0m in \u001b[0;36mreport\u001b[0;34m(self, values, record_identifier, force_overwrite, strict_type, return_id)\u001b[0m\n\u001b[1;32m 764\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mSchemaNotFoundError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"report results\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 765\u001b[0m \u001b[0mresult_identifiers\u001b[0m 
\u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkeys\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 766\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0massert_results_defined\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresults\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mresult_identifiers\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 767\u001b[0m existing = self._check_which_results_exist(\n\u001b[1;32m 768\u001b[0m rid=record_identifier, results=result_identifiers)\n", - "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/pipestat/pipestat.py\u001b[0m in \u001b[0;36massert_results_defined\u001b[0;34m(self, results)\u001b[0m\n\u001b[1;32m 1029\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mr\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mresults\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1030\u001b[0m assert r in known_results, SchemaError(\n\u001b[0;32m-> 1031\u001b[0;31m \u001b[0;34mf\"'{r}' is not a known result. Results defined in the \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1032\u001b[0m f\"schema are: {list(known_results)}.\")\n\u001b[1;32m 1033\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mAssertionError\u001b[0m: 'test' is not a known result. 
Results defined in the schema are: ['name', 'md5sum', 'bedfile', 'bigbedfile', 'regions_no', 'gc_content', 'mean_absolute_tss_dist', 'mean_region_width', 'exon_frequency', 'intron_frequency', 'promoterprox_frequency', 'intergenic_frequency', 'promotercore_frequency', 'fiveutr_frequency', 'threeutr_frequency', 'fiveutr_percentage', 'threeutr_percentage', 'promoterprox_percentage', 'exon_percentage', 'intron_percentage', 'intergenic_percentage', 'promotercore_percentage', 'tssdist', 'chrombins', 'gccontent', 'paritions', 'expected_partitions', 'cumulative_partitions', 'widths_histogram', 'neighbor_distances', 'open_chromatin', 'other']." - ] - } - ], - "source": [ - "bbc.bed.report(record_identifier=\"78c0e4753d04b238fc07e4ebe5a02984\", values={\"test\": \"some_value\"})" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Oops, the result `test` is not allowed, since it hasn't been specified in the schema. Results that are allowed are prinded in the error message above.\n", - "\n", - "Let's try reporting a new bedfile then:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Reported records for '78c1e4111d04b238fc11e4ebe5a02984' in 'bedfiles' namespace:\n", - " - name: some_name\n" - ] - }, - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bbc.bed.report(record_identifier=\"78c1e4111d04b238fc11e4ebe5a02984\", values={\"name\": \"some_name\"})" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Success, the name for the bedfile identified by `78c1e4111d04b238fc11e4ebe5a02984` has been reported.\n", - "\n", - "Therefore, we can retrieve this result:" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'some_name'" - ] - }, - 
"execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bbc.bed.retrieve(record_identifier=\"78c1e4111d04b238fc11e4ebe5a02984\", result_identifier=\"name\") " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Or all the reported results:" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'name': 'some_name'}" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bbc.bed.retrieve(record_identifier=\"78c1e4111d04b238fc11e4ebe5a02984\") " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Naturally, a record can be removed:" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Removing '78c1e4111d04b238fc11e4ebe5a02984' record\n" - ] - }, - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bbc.bed.remove(record_identifier=\"78c1e4111d04b238fc11e4ebe5a02984\") " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Report bedfile-bedset relationships" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Another useful feature of `BedBaseConf` is conveninent many to many bedfile-bedset relationships handling. To report one use `BedBaseConf.report_relationship` method:" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "bbc.report_relationship(bedfile_id=3, bedset_id=2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we can select bedfiles that are part of a bedsets with name \"bedsetOver1kRegions\". Therefore they need to match the following query: `name='bedsetOver1kRegions'`. 
With `bedfile_col` argument we select the bedfile table columns we're interested in:" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[[1, 'GSE105587_ENCFF018NNF_conservative_idr_thresholded_peaks_GRCh38'],\n", - " [2, 'GSE105977_ENCFF617QGK_optimal_idr_thresholded_peaks_GRCh38'],\n", - " [3, 'GSE105977_ENCFF793SZW_conservative_idr_thresholded_peaks_GRCh38'],\n", - " [4, 'GSE105977_ENCFF937CGY_peaks_GRCh38'],\n", - " [5, 'GSE91663_ENCFF316ASR_peaks_GRCh38'],\n", - " [6, 'GSE91663_ENCFF319TPR_conservative_idr_thresholded_peaks_GRCh38'],\n", - " [7, 'GSE91663_ENCFF553KIK_optimal_idr_thresholded_peaks_GRCh38'],\n", - " [8, 'GSM2423312_ENCFF155HVK_peaks_GRCh38'],\n", - " [9, 'GSM2423313_ENCFF722AOG_peaks_GRCh38'],\n", - " [10, 'GSM2827349_ENCFF196DNQ_peaks_GRCh38'],\n", - " [11, 'GSM2827350_ENCFF928JXU_peaks_GRCh38']]" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bbc.select_bedfiles_for_bedset(condition=\"name=%s\", condition_val=[\"bedsetOver1kRegions\"], bedfile_col=[\"id\", \"name\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The unwanted relationships can be removed with `BedBaseConf.remove_relationship` method:" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "bbc.remove_relationship(bedfile_ids=[3], bedset_id=2)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs_jupyter/build/.gitignore b/docs_jupyter/build/.gitignore deleted file 
mode 100644 index d6b7ef3..0000000 --- a/docs_jupyter/build/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore diff --git a/mkdocs.yml b/mkdocs.yml index e635b92..6194fdb 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -25,7 +25,7 @@ markdown_extensions: custom_fences: - name: mermaid class: mermaid - format: !!python/name:pymdownx.superfences.fence_code_format + format: "!!python/name:pymdownx.superfences.fence_code_format" extra_css: - stylesheets/extra.css @@ -39,6 +39,7 @@ copyright: > nav: + - Home: README.md - BEDbase: - BEDbase: bedbase/README.md - UI guide: @@ -49,15 +50,24 @@ nav: - Changelog: bedhost/changelog.md - BBConf: - BBConf: bbconf/README.md + - Configuring: bbconf/code/config.md + - Demo: bbconf/code/demo.md - Guide: bbconf/bbc_api.md - Changelog: bbconf/changelog.md - Reference: + - How to cite: citations.md - Usage: bedboss_usage.md - Support: https://github.com/bedbase/bedboss/issues - Contributing: contributing.md - Changelog: changelog.md - BEDboss: - BEDBoss: bedboss/README.md + - Tutorials: + - BEDbase tutorial: bedboss/code/bedbase-tutorial.md + - BEDmaker tutorial: bedboss/code/bedmaker-tutorial.md + - BEDqc tutorial: bedboss/code/bedqc-tutorial.md + - BEDstat tutorial: bedboss/code/bedstat-tutorial.md + - Everything tutorial: bedboss/code/tutorial-all.md - How to guides: - Configure bedboss: bedboss/how-to-configure.md - Run from Python: bedboss/how-to-run-from-python.md @@ -65,26 +75,28 @@ nav: - Create BEDbase database: bedboss/how-to-create-database.md - BEDboss insert: bedboss/bedboss-insert.md - Reference: + - How to cite: citations.md - Usage: bedboss/usage.md - Geniml: - Geniml: geniml/README.md - Getting Started: - Module overviews: geniml/modules.md - How-to guides: - - Assess universe fit: geniml/tutorials/assess-universe.md - - Search intervals with BEDSpace: geniml/tutorials/bedspace.md - - Evaluate embeddings: geniml/tutorials/evaluation.md - Train region2vec embeddings: geniml/tutorials/region2vec.md - Train single-cell 
embeddings: geniml/tutorials/train-scembed-model.md - - Load vector database with embeddings: tutorials/load-qdrant-with-cell-embeddings.md - - Cell-type prediction using KNN: tutorials/cell-type-annotation-with-knn.md - Tokenization: geniml/tutorials/tokenization.md - Tokenize a BED file on the command line: geniml/tutorials/cli-tokenization.md + - Evaluate embeddings: geniml/tutorials/evaluation.md + - Search intervals with BEDSpace: geniml/tutorials/bedspace.md + - Load vector database with embeddings: tutorials/load-qdrant-with-cell-embeddings.md + - Cell-type prediction using KNN: tutorials/cell-type-annotation-with-knn.md - Create consensus peaks: geniml/tutorials/create-consensus-peaks.md + - Assess universe fit: geniml/tutorials/assess-universe.md - Fine-tune embeddings: geniml/tutorials/fine-tune-region2vec-model.md - Randomize bed files: geniml/tutorials/bedshift.md - Create evaluation dataset with bedshift: geniml/tutorials/bedshift-evaluation-guide.md - Reference: + - How to cite: citations.md - API: geniml/autodoc_build/geniml.md - Support: geniml/support.md - Contributing: geniml/contributing.md @@ -93,16 +105,7 @@ nav: - Genimtools: genimtools/README.md - How to cite: citations.md -jupyter: - - in: eido/notebooks - out: eido/code - - in: geofetch/notebooks - out: geofetch/code - - in: looper/notebooks - out: looper/code - - in: peppy/notebooks - out: peppy/code - - in: pipestat/notebooks - out: pipestat/code - - in: pypiper/notebooks - out: pypiper/code +autodoc: + jupyter: + - in: bbconf/notebooks + out: bbconf/code