From 6ca50222d0af66c52dc6d367a61f1ec90303942a Mon Sep 17 00:00:00 2001 From: nsheff Date: Wed, 7 Feb 2024 17:34:47 -0500 Subject: [PATCH] remove docs --- docs/README.md | 94 - docs/autodoc_build/.gitignore | 2 - docs/changelog.md | 123 - docs/contributing.md | 13 - docs/faq.md | 13 - docs/file-specification.md | 20 - docs/gse_finder.md | 81 - docs/howto-location.md | 19 - docs/img/arguments_outputs.svg | 8186 ------ docs/img/geofetch_bug.svg | 81 - docs/img/geofetch_bug_dark.svg | 81 - docs/img/geofetch_logo.svg | 141 - docs/img/geofetch_logo_dark.svg | 129 - docs/img/meta_integration.svg | 21668 ---------------- docs/img/pipeline.svg | 7893 ------ docs/install.md | 37 - docs/metadata_output.md | 46 - docs/sra_convert.md | 18 - docs/usage.md | 127 - docs/usage_template.md | 5 - .../build/processed-data-downloading.md | 371 - docs_jupyter/build/python-usage.md | 360 - docs_jupyter/build/raw-data-downloading.md | 403 - .../how_to_convert_fastq_from_sra.ipynb | 736 - docs_jupyter/processed-data-downloading.ipynb | 522 - docs_jupyter/python-usage.ipynb | 718 - docs_jupyter/raw-data-downloading.ipynb | 519 - update_usage_docs.sh | 27 - 28 files changed, 42433 deletions(-) delete mode 100644 docs/README.md delete mode 100644 docs/autodoc_build/.gitignore delete mode 100644 docs/changelog.md delete mode 100644 docs/contributing.md delete mode 100644 docs/faq.md delete mode 100644 docs/file-specification.md delete mode 100644 docs/gse_finder.md delete mode 100644 docs/howto-location.md delete mode 100644 docs/img/arguments_outputs.svg delete mode 100644 docs/img/geofetch_bug.svg delete mode 100644 docs/img/geofetch_bug_dark.svg delete mode 100644 docs/img/geofetch_logo.svg delete mode 100644 docs/img/geofetch_logo_dark.svg delete mode 100644 docs/img/meta_integration.svg delete mode 100644 docs/img/pipeline.svg delete mode 100644 docs/install.md delete mode 100644 docs/metadata_output.md delete mode 100644 docs/sra_convert.md delete mode 100644 docs/usage.md delete mode 
100644 docs/usage_template.md delete mode 100644 docs_jupyter/build/processed-data-downloading.md delete mode 100644 docs_jupyter/build/python-usage.md delete mode 100644 docs_jupyter/build/raw-data-downloading.md delete mode 100644 docs_jupyter/how_to_convert_fastq_from_sra.ipynb delete mode 100644 docs_jupyter/processed-data-downloading.ipynb delete mode 100644 docs_jupyter/python-usage.ipynb delete mode 100644 docs_jupyter/raw-data-downloading.ipynb delete mode 100755 update_usage_docs.sh diff --git a/docs/README.md b/docs/README.md deleted file mode 100644 index 3d00313..0000000 --- a/docs/README.md +++ /dev/null @@ -1,94 +0,0 @@ -# - -[![PEP compatible](https://pepkit.github.io/img/PEP-compatible-green.svg)](https://pepkit.github.io) -![Run pytests](https://github.com/pepkit/geofetch/workflows/Run%20pytests/badge.svg) -[![docs-badge](https://readthedocs.org/projects/geofetch/badge/?version=latest)](https://geofetch.databio.org/en/latest/) -[![pypi-badge](https://img.shields.io/pypi/v/geofetch)](https://pypi.org/project/geofetch) -[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) - -`geofetch` is a command-line tool that downloads and organizes data and metadata from GEO and SRA. When given one or more GEO/SRA accessions, `geofetch` will: - - - Download either raw or processed data from either [SRA](https://www.ncbi.nlm.nih.gov/sra) or [GEO](https://www.ncbi.nlm.nih.gov/geo/) - - Produce a standardized [PEP](http://pepkit.github.io) sample table. This makes it really easy to run [looper](https://pepkit.github.io/docs/looper/)-compatible pipelines on public datasets by handling data acquisition and metadata formatting and standardization for you. - - Prepare a project to run with [sraconvert](sra_convert.md) to convert SRA files into FASTQ files. 
- -![](./img/pipeline.svg) - -## Key geofetch advantages: - -- Works with GEO and SRA metadata -- Combines samples from different projects -- ![](./img/meta_integration.svg) -- Standardizes output metadata -- Filters type and size of processed files (from GEO) before downloading them -- Easy to use -- Fast execution time -- Can search GEO to find relevant data -- Can be used either as a command-line tool or from within Python using an API - - - -## Quick example - -`geofetch` runs on the command line. This command will download the raw data and metadata for the given GSE number. - -```console -geofetch -i GSE95654 -``` - -You can add `--processed` if you want to download processed files from the given experiment. - - -```console -geofetch -i GSE95654 --processed -``` - - -You can add `--just-metadata` if you want to download metadata without the raw SRA files or processed GEO files. - -```console -geofetch -i GSE95654 --just-metadata -``` - -```console -geofetch -i GSE95654 --processed --just-metadata -``` - - -⁣**Note:** We ensure that GEOfetch is compatible with Unix, Linux, and Mac OS X. -However, due to dependencies, some features of GEOfetch may not be available on Windows. - -### Check out what exactly argument you want to use to download data: - -![](./img/arguments_outputs.svg) - ---- -### New features available in geofetch 0.11.0: -1) Now geofetch is available as Python API package. Geofetch can initialize [peppy](http://peppy.databio.org/) projects without downloading any soft files. 
Example: - -```python -from geofetch import Geofetcher - -# initiate Geofetcher with all necessary arguments: -geof = Geofetcher(processed=True, acc_anno=True, discard_soft=True) - -# get projects by providing as input GSE or file with GSEs -geof.get_projects("GSE160204") -``` - -2) Now to find GSEs and save them to file you can use `Finder` - GSE finder tool: - -```python -from geofetch import Finder - -# initiate Finder (use filters if necessary) -find_gse = Finder(filters='bed') - -# get all projects that were found: -gse_list = find_gse.get_gse_all() -``` -Find more information here: [GSE Finder](./gse_finder.md) - - -For more details, check out the [usage](usage.md) reference, [installation instructions](install.md), or head on over to the [tutorial for raw data](raw-data-downloading.md) and [tutorial for processed data](processed-data-downloading.md) for a detailed walkthrough. - diff --git a/docs/autodoc_build/.gitignore b/docs/autodoc_build/.gitignore deleted file mode 100644 index d6b7ef3..0000000 --- a/docs/autodoc_build/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore diff --git a/docs/changelog.md b/docs/changelog.md deleted file mode 100644 index e7af384..0000000 --- a/docs/changelog.md +++ /dev/null @@ -1,123 +0,0 @@ -# Changelog - -## [0.12.6] -- 2024-02-05 -- Updated support for Windows in Prefetch (Note: Some functionality may still be unavailable on Windows) - -## [0.12.5] -- 2023-11-29 -- Fixed bug, where description was not populated in PEP - -## [0.12.4] -- 2023-08-01 -- Fixed SRA convert -- Added how to convert SRA - -## [0.12.3] -- 2023-06-21 -- Fixed preserving order of project keys (#119) - -## [0.12.2] -- 2023-04-25 -- Added `max-prefetch-size` argument. #113 -- Improved code and logger structure. 
- -## [0.12.0] -- 2023-03-27 -- Added functionality that saves gse metadata to config file -- Fixed description in initialization of pepy object - -## [0.11.2] -- 2022-12-25 -- Changed sample_name of PEP of processed files to file oriented -- Added `--max-soft-size` argument, that sets size limit of soft files -- - Added functionality that skips downloading GEO tables that are in soft files -- Fixed bug of creating unwanted empty folders -- Fixed problem with missing data - -## [0.11.1] -- 2022-11-28 -- Fixed requirements file -- Fixed bug in expanding metadata list -- Fixed bug in metadata links - -## [0.11.0] -- 2022-10-26 -- Added initialization of peppy Project without saving any files (from within Python using an API) -- Added Finder (searching GSE tool) -- Added progress bar -- Switched way of saving soft files to request library -- Improved documentation -- Refactored code -- Added `--add-convert-modifier` flag -- fixed looper amendments in the config file -- Fixed special character bug in the config file -- Fixed None issue in config file -- Fixed saving raw peps bug - -## [0.10.1] -- 2022-08-04 -- Updated metadata fetching requests from SRA database - -## [0.10.0] -- 2022-07-07 -- Fixed subprocesses continuing to run during program interrupt. -- Fixed issues with compatibility with NCBI API - -## [0.9.0] -- 2022-06-20 -- Updated `--pipeline-interface` argument that adds it in for looper. `--pipeline-interface` argument was divided into: -`--pipeline-samples` and `--pipeline-project`. -- Fixed empty sample_name error while creating PEP. -- Added `--discard-soft` argument. -- Added `--const-limit-project` argument. -- Added `--const-limit-discard` argument. -- Added `--attr-limit-truncate` argument. -- Added `"--add-dotfile"` argument. -- Disabled creating combined pep when flag `--acc-anno` is set. -- Improved finding and separating metadata keys and genome assembly information. 
-- Added standardization of column names by replacing characters to lowercase and spaces by underscore. - - -## [0.8.0] -- 2022-03-10 -- Added `--filter-size` argument. -- Added `--data-source` argument. -- Removed `--tar_re` argument. -- Added PEP for processed data. -- Updated regex filter (case-insensitive update). -- Changed way of downloading processed data (downloading each file separately). -- Fixed code errors. -- Separated sample and experiment processed data. - - -## [0.7.0] -- 2020-05-21 -- Fixed user interface for bam conversions -- Added regex filter for processed data filenames, which will also auto-extract from tar archives -- Updated output to PEP 2.0 -- Added `--skip` argument -- Added more control over where to store results. -- Integrate `sraconvert` into geofetch package. - - -## [0.6.0] -- 2019-06-20 -- Fixed a bug with specifying a processed data output folder -- Added a pre-check and warning message for `prefetch` command - - -## [0.5.0] -- 2019-05-09 - -- `geofetch` will now re-try a failed prefetch 3 times and warn if unsuccessful. -- Fixed a bug that prevented writing metadata in python3. -- More robust SOFT line parsing -- Use [`logmuse`](http://logmuse.databio.org/en/latest/) for messaging -- Improve modularity to facilitate non-CLI use if desired -- Better documentation - -## [0.4.0] -- (2019-03-13) - -- Fixed a bug with default generic config template -- Added `--version` option -- Improved python 3 compatibility - -## [0.2.0] -- (2019-02-28) - -- Fixed bugs that prevented install from pypi - -## [0.1.0] -- (2019-02-27) - -- First official release -- Enabled command-line usage -- Packaged `geofetch` for release on PyPI - - -## [0.0.0] -- (2017-10-24) - - - Legacy, unversioned development initiated diff --git a/docs/contributing.md b/docs/contributing.md deleted file mode 100644 index 4dd5843..0000000 --- a/docs/contributing.md +++ /dev/null @@ -1,13 +0,0 @@ -## Contributing - -Pull requests or issues are welcome. 
- -- After adding tests in `tests` for a new feature or a bug fix, please run the test suite. -- To do so, the only additional dependencies needed beyond those for the package can be -installed with: - - ```pip install -r requirements/requirements-dev.txt``` - -- Once those are installed, the tests can be run with `pytest`. Alternatively, -`python setup.py test` can be used. - diff --git a/docs/faq.md b/docs/faq.md deleted file mode 100644 index de63b57..0000000 --- a/docs/faq.md +++ /dev/null @@ -1,13 +0,0 @@ -# FAQ - - -## I get an error: `geofetch: command not found` after installing. Why isn't the `geofetch` executable in my path? - - -By default, Python packages are installed to `~/.local/bin`. You can add this location to your path by appending it: - -``` -export PATH=$PATH:~/.local/bin -``` - -Add this line to your `.bashrc` or `.profile` to make it permanent. diff --git a/docs/file-specification.md b/docs/file-specification.md deleted file mode 100644 index 7b1449c..0000000 --- a/docs/file-specification.md +++ /dev/null @@ -1,20 +0,0 @@ -# How to specify samples to download - -The command-line interface provides a way to give GSE or SRA accession IDs. By default, `geofetch` will download all the samples it can find in the accession you give it. What if you want to restrict the download to just a few samples? Or what if you want to combine samples from multiple accessions? If you want more control, either because you have multiple accessions or you want to specify a subset of samples, then you can use the *file-based sample specification*, in which you provide `geofetch` with a file listing your GSE/GSM accessions. - -## The file-based sample specification - - -Create a file with 3 columns that correspond to `GSE`, `GSM`, and `Sample_name. You may mix 1, 2, and 3 column lines in the file. 
An example input file could look like this: - -```console -GSE123 GSM#### Sample1 -GSE123 GSM#### Sample2 -GSE123 GSM#### -GSE456 -``` - -By default, `geofetch` will download all the samples in every included accession, but you can limit this by adding a second column with **GSM accessions** (which specify individual samples with a **GSE dataset**). If the second column is included, a third column may also be included and will be used -as the sample_name; otherwise, the sample will be named according to the GEO Sample_title field. Any columns after the third will be ignored. - -This will download 3 particular GSM experiments from GSE123, and everything from GSE456. It will name the first two samples Sample1 and Sample2, and the third, plus any from GSE456, will have names according to GEO metadata. diff --git a/docs/gse_finder.md b/docs/gse_finder.md deleted file mode 100644 index 5daa083..0000000 --- a/docs/gse_finder.md +++ /dev/null @@ -1,81 +0,0 @@ -is a geofetch class that provides functions to find and retrieve a list of GSE ([GEO](https://www.ncbi.nlm.nih.gov/geo/) accession number) by using NCBI searching tool. - - -### The main features of the geofetch Finder are: -- Find GEO accession numbers (GSE) of the project that were uploaded or updated in certain period of time. -- Use the same filter query as [GEO DataSets Advanced Search Builder](https://www.ncbi.nlm.nih.gov/gds/advanced) is using -- Save list of the GSEs to file (This file with geo can be used later in **[geofetch](http://geofetch.databio.org/en/latest/)**) -- Easier and faster to get GSEs using NCBI filter and certain period of time. - - -___ -## Tutorial - -0) Initiale Finder object. 
-```python -from geofetch import Finder -gse_obj = Finder() - -# Optionally: provide filter string and max number of retrieve elements -gse_obj = Finder(filters="((bed) OR narrow peak) AND Homo sapiens[Organism]", retmax=10) -``` - -1) Get list of all GSE in GEO -```python - -gse_list = gse_obj.get_gse_all() - -``` - -2) Get list of GSE that were uploaded and updated last week -```python - -gse_list = gse_obj.get_gse_last_week() - -``` - -3) Get list of GSE that were uploaded and updated last 3 month -```python - -gse_list = gse_obj.get_gse_last_3_month() - -``` - -4) Get list of GSE that were uploaded and updated in las *number of days* -```python - -# project that were uploaded in last 5 days: -gse_list = gse_obj.get_gse_by_day_count(5) - -``` - -5) Get list of GSE that were uploaded in certain period of time -```python - -gse_list = gse_obj.get_gse_by_date(start_date="2015/05/05", end_date="2020/05/05") - -``` - -6) Save last searched list of items to the file -```python - -gse_obj.generate_file("path/to/the/file") - -# if you want to save different list of files you can provide it to the funciton -gse_obj.generate_file("path/to/the/file", gse_list=["123", "124"]) - -``` - -7) Compare two lists: -```python - -new_gse_list = gse_obj.find_differences(list1, list2) - -``` - ----- - -More information about gse and queries and id: -- https://www.ncbi.nlm.nih.gov/geo/info/geo_paccess.html -- https://newarkcaptain.com/how-to-retrieve-ncbi-geo-information-using-apis-part1/ -- https://www.ncbi.nlm.nih.gov/books/NBK3837/#EntrezHelp.Using_the_Advanced_Search_Pag \ No newline at end of file diff --git a/docs/howto-location.md b/docs/howto-location.md deleted file mode 100644 index 1204624..0000000 --- a/docs/howto-location.md +++ /dev/null @@ -1,19 +0,0 @@ - -## Setting data download location with `sratools` - -`geofetch` is using the [sratoolkit](https://trace.ncbi.nlm.nih.gov/Traces/sra/?view=toolkit_doc&f=std) to download raw data from SRA -- which means it's stuck with 
the [default path for downloading SRA data](http://databio.org/posts/downloading_sra_data.html), which I've written about. So before you run `geofetch`, make sure you have set up your download location to the correct place. In our group, we use a shared group environment variable called `${SRARAW}`, which points to a shared folder (`${DATA}/sra`) where the whole group has access to downloaded SRA data. You can point the `sratoolkit` (and therefore `geofetch`) to use that location with this one-time configuration code: - -``` -# Set your $DATA environment variable -export DATA="/path/to/data/" -``` - -``` -echo "/repository/user/main/public/root = \"$DATA\"" > ${HOME}/.ncbi/user-settings.mkfg -``` - -Now `sratoolkit` will download data into an `/sra` folder in `${DATA}`, which is what `${SRARAW}` points to. - -If you are getting an error that the `.ncbi` folder does not exist in your home directory, you can just make a folder `.ncbi` with an empty file `user-settings.mkfg` and follow the same command above. 
- - diff --git a/docs/img/arguments_outputs.svg b/docs/img/arguments_outputs.svg deleted file mode 100644 index 89cb3c5..0000000 --- a/docs/img/arguments_outputs.svg +++ /dev/null @@ -1,8186 +0,0 @@ - - - ---processed --just-metadata --data-source samples--processed--data-source samples--processed --just-metadata--data-source series--processed--data-source series--processed--data-source all--processed --just-metadata --data-source allArguments#12345678--just-metadataOutputData SourceSamplesSamplesSeriesSeriesallallSamplesSamplesMetadataProcessedProcessedProcessedProcessedProcessedProcessedRawRawDataProcessedProcessedProcessedNoneNoneNoneRawNone diff --git a/docs/img/geofetch_bug.svg b/docs/img/geofetch_bug.svg deleted file mode 100644 index b98b3a6..0000000 --- a/docs/img/geofetch_bug.svg +++ /dev/null @@ -1,81 +0,0 @@ - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - diff --git a/docs/img/geofetch_bug_dark.svg b/docs/img/geofetch_bug_dark.svg deleted file mode 100644 index d2ae046..0000000 --- a/docs/img/geofetch_bug_dark.svg +++ /dev/null @@ -1,81 +0,0 @@ - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - diff --git a/docs/img/geofetch_logo.svg b/docs/img/geofetch_logo.svg deleted file mode 100644 index 372c82c..0000000 --- a/docs/img/geofetch_logo.svg +++ /dev/null @@ -1,141 +0,0 @@ - - - - - - - - - - image/svg+xml - - - - - - - - - - - - geofetch - - - - - - - - - - - - diff --git a/docs/img/geofetch_logo_dark.svg b/docs/img/geofetch_logo_dark.svg deleted file mode 100644 index ed90258..0000000 --- a/docs/img/geofetch_logo_dark.svg +++ /dev/null @@ -1,129 +0,0 @@ - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - - - - - - - - - - geofetch - - diff --git a/docs/img/meta_integration.svg b/docs/img/meta_integration.svg deleted file mode 100644 index 4bb0024..0000000 --- a/docs/img/meta_integration.svg +++ /dev/null @@ -1,21668 +0,0 @@ - - - -image/svg+xml Integration of multiple GEO projects GSE1GSE2GSE3GEO projects 
PortableEncapsulatedProject diff --git a/docs/img/pipeline.svg b/docs/img/pipeline.svg deleted file mode 100644 index e411359..0000000 --- a/docs/img/pipeline.svg +++ /dev/null @@ -1,7893 +0,0 @@ - - - -PortableEncapsulatedProjectpeppypeprGEO/SRAData diff --git a/docs/install.md b/docs/install.md deleted file mode 100644 index 01c9c12..0000000 --- a/docs/install.md +++ /dev/null @@ -1,37 +0,0 @@ -# Installing geofetch - -## Installing geofetch - -Releases are posted as [GitHub releases](https://github.com/pepkit/geofetch/releases), or you can install from PyPI using `pip`: - -```bash -pip install geofetch -``` - -Confirm it was successful by running it on the command line: - -```console -geofetch --help -``` - -If the executable in not in your $PATH, append this to your `.bashrc` or `.profile` (or `.bash_profile` on macOS): - -``` -export PATH=~/.local/bin:$PATH -``` - -## Prerequisites for SRA data downloading - -To download **raw data** You must have the [sratoolkit from NCBI](https://www.ncbi.nlm.nih.gov/books/NBK158900/) installed, with the tools in your PATH. Once it's installed, you should check to make sure you can run `prefetch`. Also, make sure it's configured to store SRA files where you want them. For more information, see how to change sratools download location. - -## Setting data download location for `sratools` - -`geofetch` is using the [sratoolkit](https://trace.ncbi.nlm.nih.gov/Traces/sra/?view=toolkit_doc&f=std) to download raw data from SRA -- which means it's stuck with the [default path for downloading SRA data](http://databio.org/posts/downloading_sra_data.html), which is in your home directory. So before you run `geofetch`, make sure you have set up your download location to the correct place. In our group, we use a shared group environment variable called `${SRARAW}`, which points to a shared folder (`${DATA}/sra`) where the whole group has access to downloaded SRA data. 
You can point the `sratoolkit` (and therefore `geofetch`) to use that location with this one-time configuration code: - -``` -echo "/repository/user/main/public/root = \"$DATA\"" > ${HOME}/.ncbi/user-settings.mkfg -``` - -Now `sratoolkit` will download data into an `/sra` folder in `${DATA}`, which is what `${SRARAW}` points to. - -If you are getting an error that the `.ncbi` folder does not exist in your home directory, you can just make a folder `.ncbi` with an empty file `user-settings.mkfg` and follow the same command above. \ No newline at end of file diff --git a/docs/metadata_output.md b/docs/metadata_output.md deleted file mode 100644 index 5763b2f..0000000 --- a/docs/metadata_output.md +++ /dev/null @@ -1,46 +0,0 @@ -# Metadata output - -Geofetch produces [PEPs](http://pep.databio.org/) for either processed or raw data (including metadata from SRA). -A project can be created either for a single combined (whole) input or for each project separately. -(if `--acc-anno` is set). "combined" means that it will have rows for every sample in every GSE included -in your input. So if you just gave a single GSE, then the combined file is the same as the GSE file. - -**For raw data**: a metadata file will be created including SRA and GSM annotation. - -**For processed data**: a metadata file will be created just for GSE and GSM annotation. User -can choose which data should he download. There are 3 downloading options for processed: samples, series and both. - -### Single PEP will contain: -- project_name.csv - all metadata for sample processed data -- project_name_subannotation.csv (*just for raw data*) - for *merged* samples -(samples for which there are multiple SRR Runs for a single SRX `Experiment`) -- project_name.yaml - project config file that stores all project information + common samples metadata - -Storing common metadata in project file is an efficient way to reduce project size and complexity of csv files. 
-To specify and manage common metadata (where and how it should be stored) you can use next arguments: -`--const-limit-project`, `--const-limit-discard`, `--attr-limit-truncate` - -### Saving actual data: -Actual data will be saved if `--just-metadata` argument is not set. User should specify path to the folder where this -data should be downloaded. - ----- -Additionally, for each GSE input accession (ACC), `geofetch` produces (if discard-soft is not set): - -- GSE_ACC####.soft a SOFT file (annotating the experiment itself) -- GSM_ACC####.soft a SOFT file (annotating the samples within the experiment) -- SRA_ACC####.soft a CSV file (annotating each SRA Run, retrieved from GSE->GSM->SRA) - -____ -# geofetch - Geofetcher using Python - -user can use geofetch in Python without saving any files. All the geofetch projects will be automatically downloaded -as peppy Project. It helps save time and processing work. - -THe output in this case will be dictionary of projects: -```python -{'key1': (some_project), - 'key2': (second_project)} -``` - -More information you can find in tutorial files. \ No newline at end of file diff --git a/docs/sra_convert.md b/docs/sra_convert.md deleted file mode 100644 index 14f0725..0000000 --- a/docs/sra_convert.md +++ /dev/null @@ -1,18 +0,0 @@ -# Sraconvert - -When you install geofetch, you also get a second utility called `sraconvert` that handles converting sra data into either `bam` or `fastq` format for downstream processing. Sraconvert is essentially a wrapper around NCBI's sra-tools that provides more convenient interface to converting pre-downloaded `sra` files. 
- -The basic advantages over just using prefetch are: - -- it provides the same interface to either download or delete sra files -- it uses the same interface to delete converted files, if desired -- it can automatically delete sra data that has been already converted -- it allows a more flexible specification of locations, using either environment variables or command-line arguments. - -This effectively makes it easier to interact with *project-level* management of sra and fastq data using [looper](http://looper.databio.org) and PEP-compatible projects. - - - -## Tutorial - -See the [how-to](how_to_convert_fastq_from_sra.md) for an example of how to use `sraconvert`. \ No newline at end of file diff --git a/docs/usage.md b/docs/usage.md deleted file mode 100644 index 63f1db7..0000000 --- a/docs/usage.md +++ /dev/null @@ -1,127 +0,0 @@ -# usage reference - -`geofetch` command-line usage instructions: - - - -`geofetch --help` -```{console} -usage: geofetch [] - -The example how to use geofetch (to download GSE573030 just metadata): - geofetch -i GSE67303 -m --just-metadata - -To download all processed data of GSE57303: - geofetch -i GSE67303 --processed --geo-folder -m - -Automatic GEO and SRA data downloader - -options: - -h, --help show this help message and exit - -V, --version show program's version number and exit - -i INPUT, --input INPUT - required: a GEO (GSE) accession, or a file with a list - of GSE numbers - -n NAME, --name NAME Specify a project name. Defaults to GSE number - -m METADATA_ROOT, --metadata-root METADATA_ROOT - Specify a parent folder location to store metadata. - The project name will be added as a subfolder - [Default: $SRAMETA:] - -u METADATA_FOLDER, --metadata-folder METADATA_FOLDER - Specify an absolute folder location to store metadata. - No subfolder will be added. Overrides value of - --metadata-root. 
- --just-metadata If set, don't actually run downloads, just create - metadata - -r, --refresh-metadata - If set, re-download metadata even if it exists. - --config-template CONFIG_TEMPLATE - Project config yaml file template. - --pipeline-samples PIPELINE_SAMPLES - Optional: Specify one or more filepaths to SAMPLES - pipeline interface yaml files. These will be added to - the project config file to make it immediately - compatible with looper. [Default: null] - --pipeline-project PIPELINE_PROJECT - Optional: Specify one or more filepaths to PROJECT - pipeline interface yaml files. These will be added to - the project config file to make it immediately - compatible with looper. [Default: null] - --disable-progressbar - Optional: Disable progressbar - -k SKIP, --skip SKIP Skip some accessions. [Default: no skip]. - --acc-anno Optional: Produce annotation sheets for each - accession. Project combined PEP for the whole project - won't be produced. - --discard-soft Optional: After creation of PEP files, all .soft files - will be deleted - --const-limit-project CONST_LIMIT_PROJECT - Optional: Limit of the number of the constant sample - characters that should not be in project yaml. - [Default: 50] - --const-limit-discard CONST_LIMIT_DISCARD - Optional: Limit of the number of the constant sample - characters that should not be discarded [Default: 250] - --attr-limit-truncate ATTR_LIMIT_TRUNCATE - Optional: Limit of the number of sample characters.Any - attribute with more than X characters will truncate to - the first X, where X is a number of characters - [Default: 500] - --add-dotfile Optional: Add .pep.yaml file that points .yaml PEP - file - --max-soft-size MAX_SOFT_SIZE - Optional: Max size of soft file. [Default: 1GB]. - Supported input formats : 12B, 12KB, 12MB, 12GB. 
- --max-prefetch-size MAX_PREFETCH_SIZE - Argument to pass to prefetch program's --max-size - option, if prefetch will be used in this run of - geofetch; for reference: https://github.com/ncbi/sra- - tools/wiki/08.-prefetch-and-fasterq-dump#check-the- - maximum-size-limit-of-the-prefetch-tool - --silent Silence logging. Overrides verbosity. - --verbosity V Set logging level (1-5 or logging module level name) - --logdev Expand content of logging message format. - -processed: - -p, --processed Download processed data [Default: download raw data]. - --data-source {all,samples,series} - Optional: Specifies the source of data on the GEO - record to retrieve processed data, which may be - attached to the collective series entity, or to - individual samples. Allowable values are: samples, - series or both (all). Ignored unless 'processed' flag - is set. [Default: samples] - --filter FILTER Optional: Filter regex for processed filenames - [Default: None].Ignored unless 'processed' flag is - set. - --filter-size FILTER_SIZE - Optional: Filter size for processed files that are - stored as sample repository [Default: None]. Works - only for sample data. Supported input formats : 12B, - 12KB, 12MB, 12GB. Ignored unless 'processed' flag is - set. - -g GEO_FOLDER, --geo-folder GEO_FOLDER - Optional: Specify a location to store processed GEO - files. Ignored unless 'processed' flag is - set.[Default: $GEODATA:] - -raw: - -x, --split-experiments - Split SRR runs into individual samples. By default, - SRX experiments with multiple SRR Runs will have a - single entry in the annotation table, with each run as - a separate row in the subannotation table. This - setting instead treats each run as a separate sample - -b BAM_FOLDER, --bam-folder BAM_FOLDER - Optional: Specify folder of bam files. Geofetch will - not download sra files when corresponding bam files - already exist. [Default: $SRABAM:] - -f FQ_FOLDER, --fq-folder FQ_FOLDER - Optional: Specify folder of fastq files. 
Geofetch will - not download sra files when corresponding fastq files - already exist. [Default: $SRAFQ:] - --use-key-subset Use just the keys defined in this module when writing - out metadata. - --add-convert-modifier - Add looper SRA convert modifier to config file. -``` diff --git a/docs/usage_template.md b/docs/usage_template.md deleted file mode 100644 index 81e6348..0000000 --- a/docs/usage_template.md +++ /dev/null @@ -1,5 +0,0 @@ -# usage reference - -`geofetch` command-line usage instructions: - - diff --git a/docs_jupyter/build/processed-data-downloading.md b/docs_jupyter/build/processed-data-downloading.md deleted file mode 100644 index b851a61..0000000 --- a/docs_jupyter/build/processed-data-downloading.md +++ /dev/null @@ -1,371 +0,0 @@ -jupyter:True -# geofetch tutorial for processed data - -The [GSE185701 data set](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE185701) has about 355 Mb of processed data that contains 57 Supplementary files, so it's a quick download for a test case. Let's take a quick peek at the geofetch version: - - -```bash -geofetch --version -``` - -```.output -geofetch 0.10.1 - -``` - -To see your CLI options, invoke `geofetch -h`: - -Calling geofetch will do 4 tasks: - -1. download all or filtered processed files from `GSE#####` into your geo folder. -2. download all metadata from GEO and store in your metadata folder. -2. produce a PEP-compatible sample table, `PROJECT_NAME_sample_processed.csv` and `PROJECT_NAME_series_processed.csv`, in your metadata folder. -3. produce a PEP-compatible project configuration file, `PROJECT_NAME_sample_processed.yaml` and `PROJECT_NAME_series_processed.yaml`, in your metadata folder. - -Complete details about geofetch outputs is cataloged in the [metadata outputs reference](metadata_output.md). 
- -from IPython.core.display import SVG -SVG(filename='logo.svg') - -![arguments_outputs.svg](attachment:arguments_outputs.svg) - -## Download the data - -First, create the metadata for processed data (by adding --processed and --just-metadata): - - -```bash -geofetch -i GSE185701 --processed -n bright_test --just-metadata -``` - -```.output -Metadata folder: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/bright_test -Trying GSE185701 (not a file) as accession... -Skipped 0 accessions. Starting now. -Processing accession 1 of 1: 'GSE185701' ---2022-07-08 12:34:57-- https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=gse&acc=GSE185701&form=text&view=full -Resolving www.ncbi.nlm.nih.gov (www.ncbi.nlm.nih.gov)... 2607:f220:41e:4290::110, 130.14.29.110 -Connecting to www.ncbi.nlm.nih.gov (www.ncbi.nlm.nih.gov)|2607:f220:41e:4290::110|:443... connected. -HTTP request sent, awaiting response... 200 OK -Length: unspecified [geo/text] -Saving to: ‘/home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/bright_test/GSE185701_GSE.soft’ - -/home/bnt4me/Virgin [ <=> ] 2.82K --.-KB/s in 0s - -2022-07-08 12:34:57 (973 MB/s) - ‘/home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/bright_test/GSE185701_GSE.soft’ saved [2885] - ---2022-07-08 12:34:57-- https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=gsm&acc=GSE185701&form=text&view=full -Resolving www.ncbi.nlm.nih.gov (www.ncbi.nlm.nih.gov)... 2607:f220:41e:4290::110, 130.14.29.110 -Connecting to www.ncbi.nlm.nih.gov (www.ncbi.nlm.nih.gov)|2607:f220:41e:4290::110|:443... connected. -HTTP request sent, awaiting response... 
200 OK -Length: unspecified [geo/text] -Saving to: ‘/home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/bright_test/GSE185701_GSM.soft’ - -/home/bnt4me/Virgin [ <=> ] 39.51K 132KB/s in 0.3s - -2022-07-08 12:34:58 (132 KB/s) - ‘/home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/bright_test/GSE185701_GSM.soft’ saved [40454] - - ---2022-07-08 12:34:58-- ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE185nnn/GSE185701/suppl/filelist.txt - => ‘/home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/bright_test/GSE185701_file_list.txt’ -Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 2607:f220:41e:250::10, 2607:f220:41e:250::7, 165.112.9.229, ... -Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|2607:f220:41e:250::10|:21... connected. -Logging in as anonymous ... Logged in! -==> SYST ... done. ==> PWD ... done. -==> TYPE I ... done. ==> CWD (1) /geo/series/GSE185nnn/GSE185701/suppl ... done. -==> SIZE filelist.txt ... 794 -==> EPSV ... done. ==> RETR filelist.txt ... done. -Length: 794 (unauthoritative) - -filelist.txt 100%[===================>] 794 --.-KB/s in 0s - -2022-07-08 12:34:58 (219 MB/s) - ‘/home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/bright_test/GSE185701_file_list.txt’ saved [794] - -0 - -Total number of processed SAMPLES files found is: 8 -Total number of processed SERIES files found is: 1 -Expanding metadata list... -Expanding metadata list... -Finished processing 1 accession(s) -Unifying and saving of metadata... 
-File /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/bright_test/PEP_samples/GSE185701_samples.csv has been saved successfully - Config file: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/bright_test/PEP_samples/GSE185701_samples.yaml - -``` - - -```bash -ls bright_test -``` - -```.output -GSE185701_file_list.txt GSE185701_GSE.soft GSE185701_GSM.soft PEP_samples - -``` - -The `.soft` files are the direct output from GEO, which contain all the metadata as stored by GEO, for both the experiment (`_GSE`) and for the individual samples (`_GSM`). Geofetch also produces a `csv` file with the SRA metadata. The filtered version (ending in `_filt`) would contain only the specified subset of the samples if we didn't request them all, but in this case, since we only gave an accession, it is identical to the complete file. Additionally, file_list.txt is downloaded, that contains information about size, type and creation date of all sample files. - -Finally, there are the 2 files that make up the PEP: the `_config.yaml` file and the `_annotation.csv` file (for samples and series). Let's see what's in these files now. - - -```bash -cat bright_test/PEP_samples/GSE185701_samples.yaml -``` - -```.output -# Autogenerated by geofetch - -pep_version: 2.1.0 -project_name: GSE185701 -sample_table: GSE185701_samples.csv - -sample_modifiers: - append: - output_file_path: FILES - sample_growth_protocol_ch1: Huh 7 was cultured in Dulbecco’s modified Eagle’s medium (DMEM) (Invitrogen, Carlsbad, CA, USA) containing 10% fetal bovine serum (FBS) (HyClone, Logan, UT, USA) and antibiotics (penicillin and streptomycin, Invitrogen) at 37 °C in 5% CO2. - - derive: - attributes: [output_file_path] - sources: - FILES: /{gse}/{file} - - - - -``` - -There are few important things to note in this file: - -* First, see in the PEP that `sample_table` points to the csv file produced by geofetch. -* Second: output_file_path is location of all the files. 
-* Third: the sample modifier sample_growth_protocol_ch1 is a constant sample attribute and is longer than 50 characters, so it is removed from the csv file. For large projects this can significantly reduce the size of the metadata. - -Now let's look at the first 100 characters of the csv file: - - -```bash -cut -c -100 bright_test/PEP_samples/GSE185701_samples.csv -``` - -```.output -sample_taxid_ch1,sample_geo_accession,sample_channel_count,sample_instrument_model,biosample,supplem -9606,GSM5621756,1,HiSeq X Ten,https://www.ncbi.nlm.nih.gov/biosample/SAMN22223730,wig files were gen -9606,GSM5621756,1,HiSeq X Ten,https://www.ncbi.nlm.nih.gov/biosample/SAMN22223730,wig files were gen -9606,GSM5621758,1,HiSeq X Ten,https://www.ncbi.nlm.nih.gov/biosample/SAMN22223732,wig files were gen -9606,GSM5621758,1,HiSeq X Ten,https://www.ncbi.nlm.nih.gov/biosample/SAMN22223732,wig files were gen -9606,GSM5621760,1,HiSeq X Ten,https://www.ncbi.nlm.nih.gov/biosample/SAMN22223728,wig files were gen -9606,GSM5621760,1,HiSeq X Ten,https://www.ncbi.nlm.nih.gov/biosample/SAMN22223728,wig files were gen -9606,GSM5621761,1,HiSeq X Ten,https://www.ncbi.nlm.nih.gov/biosample/SAMN22223729,wig files were gen -9606,GSM5621761,1,HiSeq X Ten,https://www.ncbi.nlm.nih.gov/biosample/SAMN22223729,wig files were gen - -``` - -Now let's download the actual data. This time we will be downloading data from the [GSE185701 data set](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE185701). 
- -Let's additionally add few arguments: - -* _geo-folder_ (required) - path to the location where processed files have to be saved -* _filter_ argument, to download only _bed_ files (--filter ".Bed.gz$") -* _data-source_ argument, to download files only from sample location (--data-source samples) - - -```bash -geofetch -i GSE185701 --processed -n bright_test --filter ".bed.gz$" --data-source samples \ ---geo-folder /home/bnt4me/Virginia/for_docs/geo -``` - -```.output -Metadata folder: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter -Trying GSE185701 (not a file) as accession... -Skipped 0 accessions. Starting now. -Processing accession 1 of 1: 'GSE185701' ---2022-07-08 12:36:16-- https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=gse&acc=GSE185701&form=text&view=full -Resolving www.ncbi.nlm.nih.gov (www.ncbi.nlm.nih.gov)... 2607:f220:41e:4290::110, 130.14.29.110 -Connecting to www.ncbi.nlm.nih.gov (www.ncbi.nlm.nih.gov)|2607:f220:41e:4290::110|:443... connected. -HTTP request sent, awaiting response... 200 OK -Length: unspecified [geo/text] -Saving to: ‘/home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/GSE185701_GSE.soft’ - -/home/bnt4me/Virgin [ <=> ] 2.82K --.-KB/s in 0s - -2022-07-08 12:36:16 (245 MB/s) - ‘/home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/GSE185701_GSE.soft’ saved [2885] - ---2022-07-08 12:36:16-- https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=gsm&acc=GSE185701&form=text&view=full -Resolving www.ncbi.nlm.nih.gov (www.ncbi.nlm.nih.gov)... 2607:f220:41e:4290::110, 130.14.29.110 -Connecting to www.ncbi.nlm.nih.gov (www.ncbi.nlm.nih.gov)|2607:f220:41e:4290::110|:443... connected. -HTTP request sent, awaiting response... 
200 OK -Length: unspecified [geo/text] -Saving to: ‘/home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/GSE185701_GSM.soft’ - -/home/bnt4me/Virgin [ <=> ] 39.51K --.-KB/s in 0.1s - -2022-07-08 12:36:16 (269 KB/s) - ‘/home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/GSE185701_GSM.soft’ saved [40454] - - ---2022-07-08 12:36:16-- ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE185nnn/GSE185701/suppl/filelist.txt - => ‘/home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/GSE185701_file_list.txt’ -Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 2607:f220:41e:250::12, 2607:f220:41e:250::13, 130.14.250.13, ... -Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|2607:f220:41e:250::12|:21... connected. -Logging in as anonymous ... Logged in! -==> SYST ... done. ==> PWD ... done. -==> TYPE I ... done. ==> CWD (1) /geo/series/GSE185nnn/GSE185701/suppl ... done. -==> SIZE filelist.txt ... 794 -==> EPSV ... done. ==> RETR filelist.txt ... done. -Length: 794 (unauthoritative) - -filelist.txt 100%[===================>] 794 --.-KB/s in 0s - -2022-07-08 12:36:17 (2.55 MB/s) - ‘/home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/GSE185701_file_list.txt’ saved [794] - -0 - -Total number of processed SAMPLES files found is: 8 -Total number of files after filter is: 4 -Total number of processed SERIES files found is: 1 -Total number of files after filter is: 0 -Expanding metadata list... -Expanding metadata list... - ---2022-07-08 12:36:17-- ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM5621nnn/GSM5621756/suppl/GSM5621756_ChIPseq_Huh7_siNC_H3K27ac_summits.bed.gz - => ‘/home/bnt4me/Virginia/for_docs/geo/GSE185701/GSM5621756_ChIPseq_Huh7_siNC_H3K27ac_summits.bed.gz’ -Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 2607:f220:41e:250::13, 2607:f220:41e:250::12, 165.112.9.229, ... -Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|2607:f220:41e:250::13|:21... connected. -Logging in as anonymous ... Logged in! -==> SYST ... done. ==> PWD ... done. 
-==> TYPE I ... done. ==> CWD (1) /geo/samples/GSM5621nnn/GSM5621756/suppl ... done. -==> SIZE GSM5621756_ChIPseq_Huh7_siNC_H3K27ac_summits.bed.gz ... 785486 -==> EPSV ... done. ==> RETR GSM5621756_ChIPseq_Huh7_siNC_H3K27ac_summits.bed.gz ... done. -Length: 785486 (767K) (unauthoritative) - -GSM5621756_ChIPseq_ 100%[===================>] 767.08K 1.64MB/s in 0.5s - -2022-07-08 12:36:19 (1.64 MB/s) - ‘/home/bnt4me/Virginia/for_docs/geo/GSE185701/GSM5621756_ChIPseq_Huh7_siNC_H3K27ac_summits.bed.gz’ saved [785486] - -0 - -File /home/bnt4me/Virginia/for_docs/geo/GSE185701/GSM5621756_ChIPseq_Huh7_siNC_H3K27ac_summits.bed.gz has been downloaded successfully - ---2022-07-08 12:36:19-- ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM5621nnn/GSM5621758/suppl/GSM5621758_ChIPseq_Huh7_siDHX37_H3K27ac_summits.bed.gz - => ‘/home/bnt4me/Virginia/for_docs/geo/GSE185701/GSM5621758_ChIPseq_Huh7_siDHX37_H3K27ac_summits.bed.gz’ -Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 2607:f220:41e:250::13, 2607:f220:41e:250::12, 165.112.9.229, ... -Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|2607:f220:41e:250::13|:21... connected. -Logging in as anonymous ... Logged in! -==> SYST ... done. ==> PWD ... done. -==> TYPE I ... done. ==> CWD (1) /geo/samples/GSM5621nnn/GSM5621758/suppl ... done. -==> SIZE GSM5621758_ChIPseq_Huh7_siDHX37_H3K27ac_summits.bed.gz ... 784432 -==> EPSV ... done. ==> RETR GSM5621758_ChIPseq_Huh7_siDHX37_H3K27ac_summits.bed.gz ... done. 
-Length: 784432 (766K) (unauthoritative) - -GSM5621758_ChIPseq_ 100%[===================>] 766.05K 1.03MB/s in 0.7s - -2022-07-08 12:36:20 (1.03 MB/s) - ‘/home/bnt4me/Virginia/for_docs/geo/GSE185701/GSM5621758_ChIPseq_Huh7_siDHX37_H3K27ac_summits.bed.gz’ saved [784432] - -0 - -File /home/bnt4me/Virginia/for_docs/geo/GSE185701/GSM5621758_ChIPseq_Huh7_siDHX37_H3K27ac_summits.bed.gz has been downloaded successfully - ---2022-07-08 12:36:21-- ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM5621nnn/GSM5621760/suppl/GSM5621760_CUTTag_Huh7_DHX37_summits.bed.gz - => ‘/home/bnt4me/Virginia/for_docs/geo/GSE185701/GSM5621760_CUTTag_Huh7_DHX37_summits.bed.gz’ -Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 2607:f220:41e:250::13, 2607:f220:41e:250::12, 165.112.9.229, ... -Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|2607:f220:41e:250::13|:21... connected. -Logging in as anonymous ... Logged in! -==> SYST ... done. ==> PWD ... done. -==> TYPE I ... done. ==> CWD (1) /geo/samples/GSM5621nnn/GSM5621760/suppl ... done. -==> SIZE GSM5621760_CUTTag_Huh7_DHX37_summits.bed.gz ... 163441 -==> EPSV ... done. ==> RETR GSM5621760_CUTTag_Huh7_DHX37_summits.bed.gz ... done. -Length: 163441 (160K) (unauthoritative) - -GSM5621760_CUTTag_H 100%[===================>] 159.61K 816KB/s in 0.2s - -2022-07-08 12:36:21 (816 KB/s) - ‘/home/bnt4me/Virginia/for_docs/geo/GSE185701/GSM5621760_CUTTag_Huh7_DHX37_summits.bed.gz’ saved [163441] - -0 - -File /home/bnt4me/Virginia/for_docs/geo/GSE185701/GSM5621760_CUTTag_Huh7_DHX37_summits.bed.gz has been downloaded successfully - ---2022-07-08 12:36:22-- ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM5621nnn/GSM5621761/suppl/GSM5621761_CUTTag_Huh7_PLRG1_summits.bed.gz - => ‘/home/bnt4me/Virginia/for_docs/geo/GSE185701/GSM5621761_CUTTag_Huh7_PLRG1_summits.bed.gz’ -Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 2607:f220:41e:250::13, 2607:f220:41e:250::12, 165.112.9.229, ... 
-Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|2607:f220:41e:250::13|:21... connected. -Logging in as anonymous ... Logged in! -==> SYST ... done. ==> PWD ... done. -==> TYPE I ... done. ==> CWD (1) /geo/samples/GSM5621nnn/GSM5621761/suppl ... done. -==> SIZE GSM5621761_CUTTag_Huh7_PLRG1_summits.bed.gz ... 117250 -==> EPSV ... done. ==> RETR GSM5621761_CUTTag_Huh7_PLRG1_summits.bed.gz ... done. -Length: 117250 (115K) (unauthoritative) - -GSM5621761_CUTTag_H 100%[===================>] 114.50K 318KB/s in 0.4s - -2022-07-08 12:36:23 (318 KB/s) - ‘/home/bnt4me/Virginia/for_docs/geo/GSE185701/GSM5621761_CUTTag_Huh7_PLRG1_summits.bed.gz’ saved [117250] - -0 - -File /home/bnt4me/Virginia/for_docs/geo/GSE185701/GSM5621761_CUTTag_Huh7_PLRG1_summits.bed.gz has been downloaded successfully -Finished processing 1 accession(s) -Unifying and saving of metadata... -File /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/PEP_samples/GSE185701_samples.csv has been saved successfully - Config file: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/PEP_samples/GSE185701_samples.yaml - -``` - -Now lets list the folder to see what data is there. And let's see what's in pep files now. 
- - -```bash -ls /home/bnt4me/Virginia/for_docs/geo/GSE185701 -``` - -```.output -GSM5621756_ChIPseq_Huh7_siNC_H3K27ac_summits.bed.gz -GSM5621758_ChIPseq_Huh7_siDHX37_H3K27ac_summits.bed.gz -GSM5621760_CUTTag_Huh7_DHX37_summits.bed.gz -GSM5621761_CUTTag_Huh7_PLRG1_summits.bed.gz - -``` - - -```bash -cut -c -100 cat PEP_samples/GSE185701_samples.csv -``` - -```.output -cut: cat: No such file or directory -sample_platform_id,sample_library_strategy,sample_contact_country,sample_contact_name,sample_contact -GPL20795,ChIP-Seq,China,"Xianghuo,,He",Shanghai,HCC,"transfected with siNC using Lipofectamine RNAiM -GPL20795,ChIP-Seq,China,"Xianghuo,,He",Shanghai,HCC,"transfected with siDHX37 using Lipofectamine RN -GPL20795,OTHER,China,"Xianghuo,,He",Shanghai,HCC,"transfected with Flag-DHX37 lentivirus, renew the -GPL20795,OTHER,China,"Xianghuo,,He",Shanghai,HCC,untreated,SRA,Huh 7,hg38,Homo sapiens,HiSeq X Ten,h - -``` - - - - -```bash -cat PEP_samples/GSE185701_samples.yaml -``` - -```.output -# Autogenerated by geofetch - -pep_version: 2.1.0 -project_name: GSE185701 -sample_table: GSE185701_samples.csv - -sample_modifiers: - append: - output_file_path: FILES - sample_growth_protocol_ch1: Huh 7 was cultured in Dulbecco’s modified Eagle’s medium (DMEM) (Invitrogen, Carlsbad, CA, USA) containing 10% fetal bovine serum (FBS) (HyClone, Logan, UT, USA) and antibiotics (penicillin and streptomycin, Invitrogen) at 37 °C in 5% CO2. 
- - derive: - attributes: [output_file_path] - sources: - FILES: /home/bnt4me/Virginia/for_docs/geo/{gse}/{file} - - - - -``` - -Now we have easy access to this data for further analysis by using the [peppy](http://peppy.databio.org/en/latest/) package in Python or [pepr](https://code.databio.org/pepr/) in R. diff --git a/docs_jupyter/build/python-usage.md b/docs_jupyter/build/python-usage.md deleted file mode 100644 index 13e98c6..0000000 --- a/docs_jupyter/build/python-usage.md +++ /dev/null @@ -1,360 +0,0 @@ -jupyter:True -# Tutorial: using geofetch as a Python package - -♪♫*•♪♪♫*•♪♪♫*•♪♪♫*•♪♪♫* - -Geofetch provides Python functions to fetch data and metadata from GEO and SRA. The `get_projects` function returns a dictionary of peppy projects that were found using the filters and input you specified. - peppy is a Python package that provides an API for handling standardized project and sample metadata. - -More information can be found here: - -http://peppy.databio.org/en/latest/ - -http://pep.databio.org/en/2.0.0/ - -### First let's import geofetch - - -```python -from geofetch import Geofetcher -``` - -### Initiate a Geofetcher object by specifying the parameters that you want to use for downloading metadata/data - -1) If you don't specify any parameters, default parameters will be used - - -```python -geof = Geofetcher() -``` - -```.output -Metadata folder: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/project_name - -``` - -2) To download processed data with samples and series, specify these two arguments: - - -```python -geof = Geofetcher(processed=True, data_source="all") -``` - -```.output -Metadata folder: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/project_name - -``` - -3) To tune how metadata is stored in the project, use the following parameters: - - -```python -geof = Geofetcher(processed=True, data_source="all", const_limit_project = 20, const_limit_discard = 500, attr_limit_truncate = 10000 ) -``` - -```.output -Metadata 
folder: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/project_name - -``` - -4) To add more filters or other options, see the documentation - -## Run Geofetch - -### By default: -1) No actual data will be downloaded (just_metadata=True) - -2) No soft files will be saved on disk (discard_soft=True) - - -```python -projects = geof.get_projects("GSE95654") -``` - -```.output -Trying GSE95654 (not a file) as accession... -Trying GSE95654 (not a file) as accession... - -``` - - - Output() - - -```.output -Skipped 0 accessions. Starting now. -Processing accession 1 of 1: 'GSE95654' - -Total number of processed SAMPLES files found is: 40 -Total number of processed SERIES files found is: 0 -Expanding metadata list... -Expanding metadata list... - -``` - - -

-
-
-
-
-
-
- - - -```.output -Finished processing 1 accession(s) -Cleaning soft files ... -Unifying and saving of metadata... - -``` - - - Output() - - - -

-
-
-
-
-
-
- - - - -
-
- - - - - Output() - - - -

-
-
-
-
-
-
- - - - -
-
- - - -```.output -No files found. No data to save. File /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/project_name/GSE95654_series/GSE95654_series.csv won't be created - -``` - -Check if projects were created by checking dict keys: - - -```python -projects.keys() -``` - - - - - dict_keys(['GSE95654_samples']) - - - -The project for samples was created! Now let's look into it. - -\* The values of the dictionary are peppy projects. More information about the peppy Project can be found in the documentation: http://peppy.databio.org/en/latest/ - - -```python -len(projects['GSE95654_samples'].samples) -``` - - - - - 40 - - - -We got 40 samples from the GSE95654 project. If you want to check that this is correct, go to: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE95654 - -Now let's look at the actual data: the first 15 samples and 5 columns: - - -```python -projects['GSE95654_samples'].sample_table.iloc[:15 , :5] -``` - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
sample_namesample_library_strategygenome_buildtissuesample_organism_ch1
sample_name
RRBS_on_CRC_patient_8RRBS_on_CRC_patient_8Bisulfite-Seqhg19primary tumorHomo sapiens
RRBS_on_adjacent_normal_colon_patient_8RRBS_on_adjacent_normal_colon_patient_8Bisulfite-Seqhg19adjacent normal colonHomo sapiens
RRBS_on_CRC_patient_32RRBS_on_CRC_patient_32Bisulfite-Seqhg19primary tumorHomo sapiens
RRBS_on_adjacent_normal_colon_patient_32RRBS_on_adjacent_normal_colon_patient_32Bisulfite-Seqhg19adjacent normal colonHomo sapiens
RRBS_on_CRC_patient_41RRBS_on_CRC_patient_41Bisulfite-Seqhg19primary tumorHomo sapiens
RRBS_on_adjacent_normal_colon_patient_41RRBS_on_adjacent_normal_colon_patient_41Bisulfite-Seqhg19adjacent normal colonHomo sapiens
RRBS_on_CRC_patient_42RRBS_on_CRC_patient_42Bisulfite-Seqhg19primary tumorHomo sapiens
RRBS_on_adjacent_normal_colon_patient_42RRBS_on_adjacent_normal_colon_patient_42Bisulfite-Seqhg19adjacent normal colonHomo sapiens
RRBS_on_ACF_patient_173RRBS_on_ACF_patient_173Bisulfite-Seqhg19aberrant crypt fociHomo sapiens
RRBS_on_ACF_patient_515RRBS_on_ACF_patient_515Bisulfite-Seqhg19aberrant crypt fociHomo sapiens
RRBS_on_normal_crypts_patient_139RRBS_on_normal_crypts_patient_139Bisulfite-Seqhg19normal colonic cryptHomo sapiens
RRBS_on_ACF_patient_143RRBS_on_ACF_patient_143Bisulfite-Seqhg19aberrant crypt fociHomo sapiens
RRBS_on_normal_crypts_patient_143RRBS_on_normal_crypts_patient_143Bisulfite-Seqhg19normal colonic cryptHomo sapiens
RRBS_on_normal_crypts_patient_165RRBS_on_normal_crypts_patient_165Bisulfite-Seqhg19normal colonic cryptHomo sapiens
RRBS_on_ACF_patient_165RRBS_on_ACF_patient_165Bisulfite-Seqhg19aberrant crypt fociHomo sapiens
-
- - diff --git a/docs_jupyter/build/raw-data-downloading.md b/docs_jupyter/build/raw-data-downloading.md deleted file mode 100644 index 54539e3..0000000 --- a/docs_jupyter/build/raw-data-downloading.md +++ /dev/null @@ -1,403 +0,0 @@ -jupyter:True -# geofetch tutorial for raw data - -The [GSE67303 data set](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE67303) has about 250 mb of data across 4 samples, so it's a quick download for a test case. Let's take a quick peek at the geofetch version: - - -```bash -geofetch --version -``` - -```.output -geofetch 0.10.1 - -``` - -To see your CLI options, invoke `geofetch -h`: - - -```bash -geofetch -h -``` - -```.output -usage: geofetch [-h] [-V] -i INPUT [-n NAME] [-m METADATA_ROOT] - [-u METADATA_FOLDER] [--just-metadata] [-r] - [--config-template CONFIG_TEMPLATE] - [--pipeline-samples PIPELINE_SAMPLES] - [--pipeline-project PIPELINE_PROJECT] [-k SKIP] [--acc-anno] - [--discard-soft] [--const-limit-project CONST_LIMIT_PROJECT] - [--const-limit-discard CONST_LIMIT_DISCARD] - [--attr-limit-truncate ATTR_LIMIT_TRUNCATE] [--add-dotfile] - [-p] [--data-source {all,samples,series}] [--filter FILTER] - [--filter-size FILTER_SIZE] [-g GEO_FOLDER] [-x] - [-b BAM_FOLDER] [-f FQ_FOLDER] [--use-key-subset] [--silent] - [--verbosity V] [--logdev] - -Automatic GEO and SRA data downloader - -optional arguments: - -h, --help show this help message and exit - -V, --version show program's version number and exit - -i INPUT, --input INPUT - required: a GEO (GSE) accession, or a file with a list - of GSE numbers - -n NAME, --name NAME Specify a project name. Defaults to GSE number - -m METADATA_ROOT, --metadata-root METADATA_ROOT - Specify a parent folder location to store metadata. - The project name will be added as a subfolder - [Default: $SRAMETA:] - -u METADATA_FOLDER, --metadata-folder METADATA_FOLDER - Specify an absolute folder location to store metadata. - No subfolder will be added. 
Overrides value of - --metadata-root [Default: Not used (--metadata-root is - used by default)] - --just-metadata If set, don't actually run downloads, just create - metadata - -r, --refresh-metadata - If set, re-download metadata even if it exists. - --config-template CONFIG_TEMPLATE - Project config yaml file template. - --pipeline-samples PIPELINE_SAMPLES - Optional: Specify one or more filepaths to SAMPLES - pipeline interface yaml files. These will be added to - the project config file to make it immediately - compatible with looper. [Default: null] - --pipeline-project PIPELINE_PROJECT - Optional: Specify one or more filepaths to PROJECT - pipeline interface yaml files. These will be added to - the project config file to make it immediately - compatible with looper. [Default: null] - -k SKIP, --skip SKIP Skip some accessions. [Default: no skip]. - --acc-anno Optional: Produce annotation sheets for each - accession. Project combined PEP for the whole project - won't be produced. - --discard-soft Optional: After creation of PEP files, all soft and - additional files will be deleted - --const-limit-project CONST_LIMIT_PROJECT - Optional: Limit of the number of the constant sample - characters that should not be in project yaml. - [Default: 50] - --const-limit-discard CONST_LIMIT_DISCARD - Optional: Limit of the number of the constant sample - characters that should not be discarded [Default: 250] - --attr-limit-truncate ATTR_LIMIT_TRUNCATE - Optional: Limit of the number of sample characters.Any - attribute with more than X characters will truncate to - the first X, where X is a number of characters - [Default: 500] - --add-dotfile Optional: Add .pep.yaml file that points .yaml PEP - file - --silent Silence logging. Overrides verbosity. - --verbosity V Set logging level (1-5 or logging module level name) - --logdev Expand content of logging message format. - -processed: - -p, --processed Download processed data [Default: download raw data]. 
- --data-source {all,samples,series} - Optional: Specifies the source of data on the GEO - record to retrieve processed data, which may be - attached to the collective series entity, or to - individual samples. Allowable values are: samples, - series or both (all). Ignored unless 'processed' flag - is set. [Default: samples] - --filter FILTER Optional: Filter regex for processed filenames - [Default: None].Ignored unless 'processed' flag is - set. - --filter-size FILTER_SIZE - Optional: Filter size for processed files that are - stored as sample repository [Default: None]. Works - only for sample data. Supported input formats : 12B, - 12KB, 12MB, 12GB. Ignored unless 'processed' flag is - set. - -g GEO_FOLDER, --geo-folder GEO_FOLDER - Optional: Specify a location to store processed GEO - files. Ignored unless 'processed' flag is - set.[Default: $GEODATA:] - -raw: - -x, --split-experiments - Split SRR runs into individual samples. By default, - SRX experiments with multiple SRR Runs will have a - single entry in the annotation table, with each run as - a separate row in the subannotation table. This - setting instead treats each run as a separate sample - -b BAM_FOLDER, --bam-folder BAM_FOLDER - Optional: Specify folder of bam files. Geofetch will - not download sra files when corresponding bam files - already exist. [Default: $SRABAM:] - -f FQ_FOLDER, --fq-folder FQ_FOLDER - Optional: Specify folder of fastq files. Geofetch will - not download sra files when corresponding fastq files - already exist. [Default: $SRAFQ:] - --use-key-subset Use just the keys defined in this module when writing - out metadata. - -``` - -Calling geofetch will do 4 tasks: - -1. download all `.sra` files from `GSE#####` into your SRA folder (wherever you have configured `sratools` to stick data). -2. download all metadata from GEO and SRA and store in your metadata folder. -2. produce a PEP-compatible sample table, `PROJECT_NAME_annotation.csv`, in your metadata folder. -3. 
produce a PEP-compatible project configuration file, `PROJECT_NAME_config.yaml`, in your metadata folder. - -Complete details about geofetch outputs is cataloged in the [metadata outputs reference](metadata_output.md). - -## Download the data - -First, create the metadata: - - -```bash -geofetch -i GSE67303 -n red_algae -m `pwd` --just-metadata -``` - -```.output -Metadata folder: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/red_algae -Trying GSE67303 (not a file) as accession... -Skipped 0 accessions. Starting now. -Processing accession 1 of 1: 'GSE67303' ---2022-07-08 12:39:24-- https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=gse&acc=GSE67303&form=text&view=full -Resolving www.ncbi.nlm.nih.gov (www.ncbi.nlm.nih.gov)... 2607:f220:41e:4290::110, 130.14.29.110 -Connecting to www.ncbi.nlm.nih.gov (www.ncbi.nlm.nih.gov)|2607:f220:41e:4290::110|:443... connected. -HTTP request sent, awaiting response... 200 OK -Length: unspecified [geo/text] -Saving to: ‘/home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/red_algae/GSE67303_GSE.soft’ - -/home/bnt4me/Virgin [ <=> ] 3.19K --.-KB/s in 0s - -2022-07-08 12:39:24 (134 MB/s) - ‘/home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/red_algae/GSE67303_GSE.soft’ saved [3266] - ---2022-07-08 12:39:24-- https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=gsm&acc=GSE67303&form=text&view=full -Resolving www.ncbi.nlm.nih.gov (www.ncbi.nlm.nih.gov)... 2607:f220:41e:4290::110, 130.14.29.110 -Connecting to www.ncbi.nlm.nih.gov (www.ncbi.nlm.nih.gov)|2607:f220:41e:4290::110|:443... connected. -HTTP request sent, awaiting response... 200 OK -Length: unspecified [geo/text] -Saving to: ‘/home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/red_algae/GSE67303_GSM.soft’ - -/home/bnt4me/Virgin [ <=> ] 10.70K --.-KB/s in 0.05s - -2022-07-08 12:39:24 (218 KB/s) - ‘/home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/red_algae/GSE67303_GSM.soft’ saved [10956] - -Processed 4 samples. 
-Found SRA Project accession: SRP056574 -Downloading SRP056574 sra metadata -Parsing SRA file to download SRR records -sample_name does not exist, creating new... -Getting SRR: SRR1930183 (SRX969073) -Dry run (no raw data will be download) -sample_name does not exist, creating new... -Getting SRR: SRR1930184 (SRX969074) -Dry run (no raw data will be download) -sample_name does not exist, creating new... -Getting SRR: SRR1930185 (SRX969075) -Dry run (no raw data will be download) -sample_name does not exist, creating new... -Getting SRR: SRR1930186 (SRX969076) -Dry run (no raw data will be download) -Finished processing 1 accession(s) -Creating complete project annotation sheets and config file... -Sample annotation sheet: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/red_algae/GSE67303_annotation.csv -Writing: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/red_algae/GSE67303_annotation.csv - Config file: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/red_algae/GSE67303_config.yaml - -``` - -The `-m` parameter specifies to use the current directory, storing the data according to the name (`-n`) parameter. So, we'll now have a `red_alga` subfolder, where the results will be saved. Inside that folder you'll see the output of the command: - - -```bash -ls red_algae -``` - -```.output -GSE67303_annotation.csv GSE67303_GSE.soft GSE67303_SRA.csv -GSE67303_config.yaml GSE67303_GSM.soft - -``` - -The `.soft` files are the direct output from GEO, which contain all the metadata as stored by GEO, for both the experiment (`_GSE`) and for the individual samples (`_GSM`). Geofetch also produces a `csv` file with the SRA metadata. The filtered version (ending in `_filt`) would contain only the specified subset of the samples if we didn't request them all, but in this case, since we only gave an accession, it is identical to the complete file. - -Finally, there are the 2 files that make up the PEP: the `_config.yaml` file and the `_annotation.csv` file. 
Let's see what's in these files now. - - -```bash -cat red_algae/GSE67303_config.yaml -``` - -```.output -# Autogenerated by geofetch - -name: GSE67303 -pep_version: 2.1.0 -sample_table: GSE67303_annotation.csv -subsample_table: null - -looper: - output_dir: GSE67303 - pipeline_interfaces: {pipeline_interfaces} - -sample_modifiers: - append: - Sample_growth_protocol_ch1: Cyanidioschyzon merolae cells were grown in 2xMA media - Sample_data_processing: Supplementary_files_format_and_content: Excel spreadsheet includes FPKM values for Darkness and Blue-Light exposed samples with p and q values of cuffdiff output. - Sample_extract_protocol_ch1: RNA libraries were prepared for sequencing using standard Illumina protocols - Sample_treatment_protocol_ch1: Cells were exposed to blue-light (15 µmole m-2s-1) for 30 minutes - SRR_files: SRA - - derive: - attributes: [read1, read2, SRR_files] - sources: - SRA: "${SRABAM}/{SRR}.bam" - FQ: "${SRAFQ}/{SRR}.fastq.gz" - FQ1: "${SRAFQ}/{SRR}_1.fastq.gz" - FQ2: "${SRAFQ}/{SRR}_2.fastq.gz" - imply: - - if: - organism: "Mus musculus" - then: - genome: mm10 - - if: - organism: "Homo sapiens" - then: - genome: hg38 - - if: - read_type: "PAIRED" - then: - read1: FQ1 - read2: FQ2 - - if: - read_type: "SINGLE" - then: - read1: FQ1 - -project_modifiers: - amend: - sra_convert: - looper: - results_subdir: sra_convert_results - sample_modifiers: - append: - SRR_files: SRA - pipeline_interfaces: ${CODE}/geofetch/pipeline_interface_convert.yaml - derive: - attributes: [read1, read2, SRR_files] - sources: - SRA: "${SRARAW}/{SRR}.sra" - FQ: "${SRAFQ}/{SRR}.fastq.gz" - FQ1: "${SRAFQ}/{SRR}_1.fastq.gz" - FQ2: "${SRAFQ}/{SRR}_2.fastq.gz" - - - - -``` - -There are two important things to note in his file: First, see in the PEP that `sample_table` points to the csv file produced by geofetch. Second, look at the amendment called `sra_convert`. 
This adds a pipeline interface to the sra conversion pipeline, and adds derived attributes for SRA files and fastq files that rely on environment variables called `$SRARAW` and `$SRAFQ`. These environment variables should point to folders where you store your raw .sra files and the converted fastq files. - -Now let's look at the first 100 characters of the csv file: - - -```bash -cut -c -100 red_algae/GSE67303_annotation.csv -``` - -```.output -sample_name,protocol,organism,read_type,data_source,SRR,SRX,Sample_title,Sample_geo_accession,Sample -Cm_BlueLight_Rep1,cDNA,Cyanidioschyzon merolae strain 10D,PAIRED,SRA,SRR1930183,SRX969073,Cm_BlueLig -Cm_BlueLight_Rep2,cDNA,Cyanidioschyzon merolae strain 10D,PAIRED,SRA,SRR1930184,SRX969074,Cm_BlueLig -Cm_Darkness_Rep1,cDNA,Cyanidioschyzon merolae strain 10D,PAIRED,SRA,SRR1930185,SRX969075,Cm_Darkness -Cm_Darkness_Rep2,cDNA,Cyanidioschyzon merolae strain 10D,PAIRED,SRA,SRR1930186,SRX969076,Cm_Darkness - -``` - -Now let's download the actual data. - - -```bash -geofetch -i GSE67303 -n red_algae -m `pwd` -``` - -```.output -Metadata folder: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/red_algae -Trying GSE67303 (not a file) as accession... -Skipped 0 accessions. Starting now. -Processing accession 1 of 1: 'GSE67303' -Found previous GSE file: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/red_algae/GSE67303_GSE.soft -Found previous GSM file: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/red_algae/GSE67303_GSM.soft -Processed 4 samples. -Found SRA Project accession: SRP056574 -Found SRA metadata, opening.. -Parsing SRA file to download SRR records -sample_name does not exist, creating new... -Getting SRR: SRR1930183 (SRX969073) - -2022-07-08T16:40:20 prefetch.2.11.2: Current preference is set to retrieve SRA Normalized Format files with full base quality scores. -2022-07-08T16:40:20 prefetch.2.11.2: 1) Downloading 'SRR1930183'... 
-2022-07-08T16:40:20 prefetch.2.11.2: SRA Normalized Format file is being retrieved, if this is different from your preference, it may be due to current file availability. -2022-07-08T16:40:20 prefetch.2.11.2: Downloading via HTTPS... -2022-07-08T16:41:28 prefetch.2.11.2: HTTPS download succeed -2022-07-08T16:41:28 prefetch.2.11.2: 'SRR1930183' is valid -2022-07-08T16:41:28 prefetch.2.11.2: 1) 'SRR1930183' was downloaded successfully -2022-07-08T16:41:28 prefetch.2.11.2: 'SRR1930183' has 0 unresolved dependencies -sample_name does not exist, creating new... -Getting SRR: SRR1930184 (SRX969074) - -2022-07-08T16:41:39 prefetch.2.11.2: Current preference is set to retrieve SRA Normalized Format files with full base quality scores. -2022-07-08T16:41:40 prefetch.2.11.2: 1) Downloading 'SRR1930184'... -2022-07-08T16:41:40 prefetch.2.11.2: SRA Normalized Format file is being retrieved, if this is different from your preference, it may be due to current file availability. -2022-07-08T16:41:40 prefetch.2.11.2: Downloading via HTTPS... -2022-07-08T16:42:43 prefetch.2.11.2: HTTPS download succeed -2022-07-08T16:42:43 prefetch.2.11.2: 'SRR1930184' is valid -2022-07-08T16:42:43 prefetch.2.11.2: 1) 'SRR1930184' was downloaded successfully -2022-07-08T16:42:43 prefetch.2.11.2: 'SRR1930184' has 0 unresolved dependencies -sample_name does not exist, creating new... -Getting SRR: SRR1930185 (SRX969075) - -2022-07-08T16:42:54 prefetch.2.11.2: Current preference is set to retrieve SRA Normalized Format files with full base quality scores. -2022-07-08T16:42:55 prefetch.2.11.2: 1) Downloading 'SRR1930185'... -2022-07-08T16:42:55 prefetch.2.11.2: SRA Normalized Format file is being retrieved, if this is different from your preference, it may be due to current file availability. -2022-07-08T16:42:55 prefetch.2.11.2: Downloading via HTTPS... 
-2022-07-08T16:45:00 prefetch.2.11.2: HTTPS download succeed -2022-07-08T16:45:00 prefetch.2.11.2: 'SRR1930185' is valid -2022-07-08T16:45:00 prefetch.2.11.2: 1) 'SRR1930185' was downloaded successfully -2022-07-08T16:45:00 prefetch.2.11.2: 'SRR1930185' has 0 unresolved dependencies -sample_name does not exist, creating new... -Getting SRR: SRR1930186 (SRX969076) - -2022-07-08T16:45:11 prefetch.2.11.2: Current preference is set to retrieve SRA Normalized Format files with full base quality scores. -2022-07-08T16:45:12 prefetch.2.11.2: 1) Downloading 'SRR1930186'... -2022-07-08T16:45:12 prefetch.2.11.2: SRA Normalized Format file is being retrieved, if this is different from your preference, it may be due to current file availability. -2022-07-08T16:45:12 prefetch.2.11.2: Downloading via HTTPS... -2022-07-08T16:46:49 prefetch.2.11.2: HTTPS download succeed -2022-07-08T16:46:49 prefetch.2.11.2: 'SRR1930186' is valid -2022-07-08T16:46:49 prefetch.2.11.2: 1) 'SRR1930186' was downloaded successfully -2022-07-08T16:46:49 prefetch.2.11.2: 'SRR1930186' has 0 unresolved dependencies -Finished processing 1 accession(s) -Creating complete project annotation sheets and config file... -Sample annotation sheet: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/red_algae/GSE67303_annotation.csv -Writing: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/red_algae/GSE67303_annotation.csv - Config file: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/red_algae/GSE67303_config.yaml - -``` - - -## Finalize the project config and sample annotation - -That's basically it! `geofetch` will have produced a general-purpose PEP for you, but you'll need to modify it for whatever purpose you have. For example, one common thing is to link to the pipeline you want to use by adding a `pipeline_interface` to the project config file. 
You may also need to adjust the `sample_annotation` file to make sure you have the right column names and attributes needed by the pipeline you're using. GEO submitters are notoriously bad at getting the metadata correct. - - -## Selecting samples to download. - -By default, `geofetch` downloads all the data for one accession of interest. If you need more fine-grained control, either because you have multiple accessions or you need a subset of samples within them, you can use the [file-based sample specification](file-specification.md). - - -## Tips - -* Set an environment variable for `$SRABAM` (where `.bam` files will live), and `geofetch` will check to see if you have an already-converted bamfile there before issuing the command to download the `sra` file. In this way, you can delete old `sra` files after conversion and not have to worry about re-downloading them. - -* The config template uses an environment variable `$SRARAW` for where `.sra` files will live. If you set this variable to the same place you instructed `sratoolkit` to download `sra` files, you won't have to tweak the config file. For more information refer to the [`sratools` page](howto-location.md). - -You can find a complete example of [using `geofetch` for RNA-seq data](https://github.com/databio/example-projects/tree/master/rna-seq). 
- diff --git a/docs_jupyter/how_to_convert_fastq_from_sra.ipynb b/docs_jupyter/how_to_convert_fastq_from_sra.ipynb deleted file mode 100644 index 86758e7..0000000 --- a/docs_jupyter/how_to_convert_fastq_from_sra.ipynb +++ /dev/null @@ -1,736 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "b5093d6d", - "metadata": {}, - "source": [ - "## How to extract fastq files from SRA" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "5d04aca7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "geofetch 0.12.4\n" - ] - } - ], - "source": [ - "geofetch --version" - ] - }, - { - "cell_type": "markdown", - "id": "51be28fa", - "metadata": {}, - "source": [ - "1) Download SRA files and PEP using GEOfetch\n", - "\n", - "Add flags: \n", - "a) `--add-convert-modifier` (To add looper configurations for conversion)\n", - "b) `--discard-soft` (To delete soft files. We don't need them :D)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "5d1d2a6a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Metadata folder: /home/bnt4me/virginia/repos/geofetch/docs_jupyter/red_algae\n", - "Trying GSE67303 (not a file) as accession...\n", - "Skipped 0 accessions. 
Starting now.\n", - "\u001B[38;5;200mProcessing accession 1 of 1: 'GSE67303'\u001B[0m\n", - "Processed 4 samples.\n", - "Expanding metadata list...\n", - "Found SRA Project accession: SRP056574\n", - "Downloading SRP056574 sra metadata\n", - "Parsing SRA file to download SRR records\n", - "Getting SRR: SRR1930183 in (GSE67303)\n", - "\n", - "2023-08-01T17:04:12 prefetch.2.11.3: Current preference is set to retrieve SRA Normalized Format files with full base quality scores.\n", - "2023-08-01T17:04:12 prefetch.2.11.3: 1) Downloading 'SRR1930183'...\n", - "2023-08-01T17:04:12 prefetch.2.11.3: SRA Normalized Format file is being retrieved, if this is different from your preference, it may be due to current file availability.\n", - "2023-08-01T17:04:12 prefetch.2.11.3: Downloading via HTTPS...\n", - "2023-08-01T17:04:14 prefetch.2.11.3: HTTPS download succeed\n", - "2023-08-01T17:04:15 prefetch.2.11.3: 'SRR1930183' is valid\n", - "2023-08-01T17:04:15 prefetch.2.11.3: 1) 'SRR1930183' was downloaded successfully\n", - "2023-08-01T17:04:15 prefetch.2.11.3: 'SRR1930183' has 0 unresolved dependencies\n", - "Getting SRR: SRR1930184 in (GSE67303)\n", - "\n", - "2023-08-01T17:04:15 prefetch.2.11.3: Current preference is set to retrieve SRA Normalized Format files with full base quality scores.\n", - "2023-08-01T17:04:16 prefetch.2.11.3: 1) Downloading 'SRR1930184'...\n", - "2023-08-01T17:04:16 prefetch.2.11.3: SRA Normalized Format file is being retrieved, if this is different from your preference, it may be due to current file availability.\n", - "2023-08-01T17:04:16 prefetch.2.11.3: Downloading via HTTPS...\n", - "2023-08-01T17:04:17 prefetch.2.11.3: HTTPS download succeed\n", - "2023-08-01T17:04:18 prefetch.2.11.3: 'SRR1930184' is valid\n", - "2023-08-01T17:04:18 prefetch.2.11.3: 1) 'SRR1930184' was downloaded successfully\n", - "2023-08-01T17:04:18 prefetch.2.11.3: 'SRR1930184' has 0 unresolved dependencies\n", - "Getting SRR: SRR1930185 in (GSE67303)\n", - "\n", - 
"2023-08-01T17:04:19 prefetch.2.11.3: Current preference is set to retrieve SRA Normalized Format files with full base quality scores.\n", - "2023-08-01T17:04:19 prefetch.2.11.3: 1) Downloading 'SRR1930185'...\n", - "2023-08-01T17:04:19 prefetch.2.11.3: SRA Normalized Format file is being retrieved, if this is different from your preference, it may be due to current file availability.\n", - "2023-08-01T17:04:19 prefetch.2.11.3: Downloading via HTTPS...\n", - "2023-08-01T17:04:22 prefetch.2.11.3: HTTPS download succeed\n", - "2023-08-01T17:04:22 prefetch.2.11.3: 'SRR1930185' is valid\n", - "2023-08-01T17:04:22 prefetch.2.11.3: 1) 'SRR1930185' was downloaded successfully\n", - "2023-08-01T17:04:22 prefetch.2.11.3: 'SRR1930185' has 0 unresolved dependencies\n", - "Getting SRR: SRR1930186 in (GSE67303)\n", - "\n", - "2023-08-01T17:04:22 prefetch.2.11.3: Current preference is set to retrieve SRA Normalized Format files with full base quality scores.\n", - "2023-08-01T17:04:23 prefetch.2.11.3: 1) Downloading 'SRR1930186'...\n", - "2023-08-01T17:04:23 prefetch.2.11.3: SRA Normalized Format file is being retrieved, if this is different from your preference, it may be due to current file availability.\n", - "2023-08-01T17:04:23 prefetch.2.11.3: Downloading via HTTPS...\n", - "2023-08-01T17:04:25 prefetch.2.11.3: HTTPS download succeed\n", - "2023-08-01T17:04:25 prefetch.2.11.3: 'SRR1930186' is valid\n", - "2023-08-01T17:04:25 prefetch.2.11.3: 1) 'SRR1930186' was downloaded successfully\n", - "2023-08-01T17:04:25 prefetch.2.11.3: 'SRR1930186' has 0 unresolved dependencies\n", - "Finished processing 1 accession(s)\n", - "Cleaning soft files ...\n", - "Creating complete project annotation sheets and config file...\n", - "\u001B[92mSample annotation sheet: /home/bnt4me/virginia/repos/geofetch/docs_jupyter/red_algae/GSE67303_PEP/GSE67303_PEP_raw.csv . 
Saved!\u001B[0m\n", - "\u001B[92mFile has been saved successfully\u001B[0m\n", - " Config file: /home/bnt4me/virginia/repos/geofetch/docs_jupyter/red_algae/GSE67303_PEP/GSE67303_PEP.yaml\n" - ] - } - ], - "source": [ - "geofetch -i GSE67303 -n red_algae -m `pwd` --add-convert-modifier --discard-soft" - ] - }, - { - "cell_type": "markdown", - "id": "a6b24693", - "metadata": {}, - "source": [ - "Let's see if files were downloaded:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "37def9a3", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001B[0m\u001B[01;34mbuild\u001B[0m python-usage.ipynb \u001B[01;34mSRR1930184\u001B[0m\n", - "\u001B[01;34mcode\u001B[0m raw-data-downloading.ipynb \u001B[01;34mSRR1930185\u001B[0m\n", - "how_to_fastq_from_sra.ipynb \u001B[01;34mred_algae\u001B[0m \u001B[01;34mSRR1930186\u001B[0m\n", - "processed-data-downloading.ipynb \u001B[01;34mSRR1930183\u001B[0m\n" - ] - } - ], - "source": [ - "ls" - ] - }, - { - "cell_type": "markdown", - "id": "6831883b", - "metadata": {}, - "source": [ - "now let's check how does our config file looks like:" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "c13991dd", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "# Autogenerated by geofetch\n", - "\n", - "name: GSE67303\n", - "pep_version: 2.1.0\n", - "sample_table: GSE67303_PEP_raw.csv\n", - "\n", - "\"experiment_metadata\":\n", - " \"series_contact_address\": \"930 N University Ave\"\n", - " \"series_contact_city\": \"Ann Arbor\"\n", - " \"series_contact_country\": \"USA\"\n", - " \"series_contact_department\": \"Chemistry\"\n", - " \"series_contact_email\": \"mtardu@umich.edu\"\n", - " \"series_contact_institute\": \"University of Michigan\"\n", - " \"series_contact_laboratory\": \"Koutmou Lab\"\n", - " \"series_contact_name\": \"mehmet,,tardu\"\n", - " \"series_contact_state\": \"MI\"\n", - " 
\"series_contact_zip_postal_code\": \"48109\"\n", - " \"series_contributor\": \"Mehmet,,Tardu + Ugur,M,Dikbas + Ibrahim,,Baris + Ibrahim,H,Kavakli\"\n", - " \"series_geo_accession\": \"GSE67303\"\n", - " \"series_last_update_date\": \"May 15 2019\"\n", - " \"series_overall_design\": \"Identification of blue light and red light regulated genes\\\n", - " \\ by deep sequencing in biological duplicates. qRT-PCR was performed to verify\\\n", - " \\ the RNA-seq results.\"\n", - " \"series_platform_id\": \"GPL19949\"\n", - " \"series_platform_organism\": \"Cyanidioschyzon merolae strain 10D\"\n", - " \"series_platform_taxid\": \"280699\"\n", - " \"series_pubmed_id\": \"27614431\"\n", - " \"series_relation\": \"BioProject: https://www.ncbi.nlm.nih.gov/bioproject/PRJNA279462\\\n", - " \\ + SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRP056574\"\n", - " \"series_sample_id\": \"GSM1644066 + GSM1644067 + GSM1644068 + GSM1644069\"\n", - " \"series_sample_organism\": \"Cyanidioschyzon merolae strain 10D\"\n", - " \"series_sample_taxid\": \"280699\"\n", - " \"series_status\": \"Public on Sep 01 2016\"\n", - " \"series_submission_date\": \"Mar 26 2015\"\n", - " \"series_summary\": \"Light is one of the main environmental cues that affects the\\\n", - " \\ physiology and behavior of many organisms. The effect of light on genome-wide\\\n", - " \\ transcriptional regulation has been well-studied in green algae and plants,\\\n", - " \\ but not in red algae. Cyanidioschyzon merolae is used as a model red algae,\\\n", - " \\ and is suitable for studies on transcriptomics because of its compact genome\\\n", - " \\ with a relatively small number of genes. In addition, complete genome sequences\\\n", - " \\ of the nucleus, mitochondrion, and chloroplast of this organism have been determined.\\\n", - " \\ Together, these attributes make C. merolae an ideal model organism to study\\\n", - " \\ the response to light stimuli at the transcriptional and the systems biology\\\n", - " \\ levels. 
Previous studies have shown that light significantly affects cell signaling\\\n", - " \\ in this organism, but there are no reports on its blue light- and red light-mediated\\\n", - " \\ transcriptional responses. We investigated the direct effects of blue and red\\\n", - " \\ light at the transcriptional level using RNA-seq. Blue and red light were found\\\n", - " \\ to regulate 35% of the total genes in C. merolae. Blue light affected the transcription\\\n", - " \\ of genes involved protein synthesis while red light specifically regulated the\\\n", - " \\ transcription of genes involved in photosynthesis and DNA repair. Blue or red\\\n", - " \\ light regulated genes involved in carbon metabolism and pigment biosynthesis.\\\n", - " \\ Overall, our data showed that red and blue light regulate the majority of the\\\n", - " \\ cellular, cell division, and repair processes in C. merolae.\"\n", - " \"series_supplementary_file\": \"ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE67nnn/GSE67303/suppl/GSE67303_DEG_cuffdiff.xlsx\"\n", - " \"series_title\": \"RNA-seq analysis of the transcriptional response to blue and red\\\n", - " \\ light in the extremophilic red alga, Cyanidioschyzon merolae\"\n", - " \"series_type\": \"Expression profiling by high throughput sequencing\"\n", - "\n", - "\n", - "\n", - "sample_modifiers:\n", - " append:\n", - " # Project metadata:\n", - " sample_treatment_protocol_ch1: \"Cells were exposed to blue-light (15 µmole m-2s-1) for 30 minutes\"\n", - " sample_growth_protocol_ch1: \"Cyanidioschyzon merolae cells were grown in 2xMA media\"\n", - " sample_extract_protocol_ch1: \"Dark kept and blue-light exposed C.merolae cells were removed and RNA was harvested using Trizol reagent. 
Illumina TruSeq RNA Sample Prep Kit (Cat#RS-122-2001) was used with 1 ug of total RNA for the construction of sequencing libraries., RNA libraries were prepared for sequencing using standard Illumina protocols\"\n", - " sample_data_processing: \"The purified cDNA library was sequenced on Illumina''s MiSeq sequencing platform following vendor''s instruction for running the instrument., Sequenced reads were trimmed for adaptor sequence, and masked for low-complexity or low-quality sequence, then mapped to Cyanidioschyzon merolae 10D reference genome (assembly ID:ASM9120v1) using TopHat (v2.0.5)., Differential expression analysis was conducted by using cuffdiff tool in cufflink suite (v2.2)\"\n", - " supplementary_files_format_and_content: \"Excel spreadsheet includes FPKM values for Darkness and Blue-Light exposed samples with p and q values of cuffdiff output.\"\n", - " # End of project metadata\n", - " \n", - "\n", - " # Adding sra convert looper pipeline\n", - " SRR_files: SRA\n", - "\n", - " derive:\n", - " attributes: [read1, read2, SRR_files]\n", - " sources:\n", - " SRA: \"${SRABAM}/{srr}.bam\"\n", - " FQ: \"${SRAFQ}/{srr}.fastq.gz\"\n", - " FQ1: \"${SRAFQ}/{srr}_1.fastq.gz\"\n", - " FQ2: \"${SRAFQ}/{srr}_2.fastq.gz\"\n", - " imply:\n", - " - if:\n", - " organism: \"Mus musculus\"\n", - " then:\n", - " genome: mm10\n", - " - if:\n", - " organism: \"Homo sapiens\"\n", - " then:\n", - " genome: hg38\n", - " - if:\n", - " read_type: \"PAIRED\"\n", - " then:\n", - " read1: FQ1\n", - " read2: FQ2\n", - " - if:\n", - " read_type: \"SINGLE\"\n", - " then:\n", - " read1: FQ1\n", - "\n", - "project_modifiers:\n", - " amend:\n", - " sra_convert:\n", - " looper:\n", - " results_subdir: sra_convert_results\n", - " sample_modifiers:\n", - " append:\n", - " SRR_files: SRA\n", - " pipeline_interfaces: ${CODE}/geofetch/pipeline_interface_convert.yaml\n", - " derive:\n", - " attributes: [read1, read2, SRR_files]\n", - " sources:\n", - " SRA: \"${SRARAW}/{srr}/{srr}.sra\"\n", - 
" FQ: \"${SRAFQ}/{srr}.fastq.gz\"\n", - " FQ1: \"${SRAFQ}/{srr}_1.fastq.gz\"\n", - " FQ2: \"${SRAFQ}/{srr}_2.fastq.gz\"\n", - "\n", - "\n", - "\n", - "\n" - ] - } - ], - "source": [ - "cat ./red_algae/GSE67303_PEP/GSE67303_PEP.yaml" - ] - }, - { - "cell_type": "markdown", - "id": "13a128a6", - "metadata": {}, - "source": [ - "To run pipeline, you should set up few enviromental variables:\n", - "1) SRARAW - folder where SRA files were downloaded\n", - "2) SRAFQ -folder where fastq should be produced\n", - "3) CODE - (first you should clone geofetch), and $CODE is where geofetch folder is located" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "d4af5280", - "metadata": {}, - "outputs": [], - "source": [ - "# Set SRARAW env\n", - "export SRARAW=`pwd`" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "981f6073", - "metadata": {}, - "outputs": [], - "source": [ - "# Create folder where you want to store fq\n", - "mkdir fq_folder" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "c2cb5330", - "metadata": {}, - "outputs": [], - "source": [ - "# Set SRAFQ env\n", - "export SRAFQ=`pwd`/fq_folder" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "45bee81f", - "metadata": {}, - "outputs": [], - "source": [ - "# Unfortunately you have to pull gefetch folder from github, and set CODE variable:\n", - "mkdir code && cd code && git clone https://github.com/pepkit/geofetch.git && export CODE=`pwd` && cd .." 
- ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "1153dab2", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001B[0m\u001B[01;34mbuild\u001B[0m processed-data-downloading.ipynb \u001B[01;34mSRR1930183\u001B[0m\n", - "\u001B[01;34mcode\u001B[0m python-usage.ipynb \u001B[01;34mSRR1930184\u001B[0m\n", - "\u001B[01;34mfq_folder\u001B[0m raw-data-downloading.ipynb \u001B[01;34mSRR1930185\u001B[0m\n", - "how_to_fastq_from_sra.ipynb \u001B[01;34mred_algae\u001B[0m \u001B[01;34mSRR1930186\u001B[0m\n" - ] - } - ], - "source": [ - "ls" - ] - }, - { - "cell_type": "markdown", - "id": "d03578ac", - "metadata": {}, - "source": [ - "### Now install looper if you don't have it" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "b4aa8176", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "looper 1.4.3\n", - "\u001B[0m\n" - ] - } - ], - "source": [ - "looper --version" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "0bcd03a7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001B[0m\u001B[01;34mGSE67303_PEP\u001B[0m\n" - ] - } - ], - "source": [ - "ls red_algae" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "a9a67e5c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Looper version: 1.4.3\n", - "Command: run\n", - "Using default config. 
No config found in env var: ['DIVCFG']\n", - "Using amendments: sra_convert\n", - "Activating compute package 'local'\n", - "Pipestat compatible: False\n", - "\u001B[36m## [1 of 4] sample: cm_bluelight_rep1; pipeline: sra_convert\u001B[0m\n", - "Writing script to /home/bnt4me/virginia/repos/geofetch/docs_jupyter/submission/sra_convert_cm_bluelight_rep1.sub\n", - "Job script (n=1; 0.06Gb): ./submission/sra_convert_cm_bluelight_rep1.sub\n", - "Compute node: bnt4me-Precision-5560\n", - "Start time: 2023-08-01 13:06:42\n", - "Using outfolder: ./sra_convert_results/SRR1930183\n", - "### Pipeline run code and environment:\n", - "\n", - "* Command: `/home/bnt4me/virginia/venv/jupyter/bin/sraconvert --srr /home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930183/SRR1930183.sra -O ./sra_convert_results`\n", - "* Compute host: bnt4me-Precision-5560\n", - "* Working dir: /home/bnt4me/virginia/repos/geofetch/docs_jupyter\n", - "* Outfolder: ./sra_convert_results/SRR1930183/\n", - "* Pipeline started at: (08-01 13:06:42) elapsed: 0.0 _TIME_\n", - "\n", - "### Version log:\n", - "\n", - "* Python version: 3.10.6\n", - "* Pypiper dir: `/home/bnt4me/virginia/venv/jupyter/lib/python3.10/site-packages/pypiper`\n", - "* Pypiper version: 0.12.3\n", - "* Pipeline dir: `/home/bnt4me/virginia/venv/jupyter/bin`\n", - "* Pipeline version: None\n", - "\n", - "### Arguments passed to pipeline:\n", - "\n", - "* `bamfolder`: ``\n", - "* `config_file`: `sraconvert.yaml`\n", - "* `format`: `fastq`\n", - "* `fqfolder`: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder`\n", - "* `keep_sra`: `False`\n", - "* `logdev`: `False`\n", - "* `mode`: `convert`\n", - "* `output_parent`: `./sra_convert_results`\n", - "* `recover`: `False`\n", - "* `sample_name`: `None`\n", - "* `silent`: `False`\n", - "* `srafolder`: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter`\n", - "* `srr`: `['/home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930183/SRR1930183.sra']`\n", - "* `verbosity`: 
`None`\n", - "\n", - "----------------------------------------\n", - "\n", - "Processing 1 of 1 files: SRR1930183\n", - "Target to produce: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder/SRR1930183_1.fastq.gz` \n", - "\n", - "> `fasterq-dump /home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930183/SRR1930183.sra -O /home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder` (744928)\n", - "
\n",
-      "spots read      : 1,068,319\n",
-      "reads read      : 2,136,638\n",
-      "reads written   : 2,136,638\n",
-      "
\n", - "Command completed. Elapsed time: 0:00:02. Running peak memory: 0.08GB. \n", - " PID: 744928;\tCommand: fasterq-dump;\tReturn code: 0;\tMemory used: 0.08GB\n", - "\n", - "Already completed files: []\n", - "\n", - "### Pipeline completed. Epilogue\n", - "* Elapsed time (this run): 0:00:02\n", - "* Total elapsed time (all runs): 0:00:02\n", - "* Peak memory (this run): 0.0803 GB\n", - "* Pipeline completed time: 2023-08-01 13:06:44\n", - "\u001B[36m## [2 of 4] sample: cm_bluelight_rep2; pipeline: sra_convert\u001B[0m\n", - "Writing script to /home/bnt4me/virginia/repos/geofetch/docs_jupyter/submission/sra_convert_cm_bluelight_rep2.sub\n", - "Job script (n=1; 0.04Gb): ./submission/sra_convert_cm_bluelight_rep2.sub\n", - "Compute node: bnt4me-Precision-5560\n", - "Start time: 2023-08-01 13:06:44\n", - "Using outfolder: ./sra_convert_results/SRR1930184\n", - "### Pipeline run code and environment:\n", - "\n", - "* Command: `/home/bnt4me/virginia/venv/jupyter/bin/sraconvert --srr /home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930184/SRR1930184.sra -O ./sra_convert_results`\n", - "* Compute host: bnt4me-Precision-5560\n", - "* Working dir: /home/bnt4me/virginia/repos/geofetch/docs_jupyter\n", - "* Outfolder: ./sra_convert_results/SRR1930184/\n", - "* Pipeline started at: (08-01 13:06:45) elapsed: 0.0 _TIME_\n", - "\n", - "### Version log:\n", - "\n", - "* Python version: 3.10.6\n", - "* Pypiper dir: `/home/bnt4me/virginia/venv/jupyter/lib/python3.10/site-packages/pypiper`\n", - "* Pypiper version: 0.12.3\n", - "* Pipeline dir: `/home/bnt4me/virginia/venv/jupyter/bin`\n", - "* Pipeline version: None\n", - "\n", - "### Arguments passed to pipeline:\n", - "\n", - "* `bamfolder`: ``\n", - "* `config_file`: `sraconvert.yaml`\n", - "* `format`: `fastq`\n", - "* `fqfolder`: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder`\n", - "* `keep_sra`: `False`\n", - "* `logdev`: `False`\n", - "* `mode`: `convert`\n", - "* `output_parent`: 
`./sra_convert_results`\n", - "* `recover`: `False`\n", - "* `sample_name`: `None`\n", - "* `silent`: `False`\n", - "* `srafolder`: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter`\n", - "* `srr`: `['/home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930184/SRR1930184.sra']`\n", - "* `verbosity`: `None`\n", - "\n", - "----------------------------------------\n", - "\n", - "Processing 1 of 1 files: SRR1930184\n", - "Target to produce: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder/SRR1930184_1.fastq.gz` \n", - "\n", - "> `fasterq-dump /home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930184/SRR1930184.sra -O /home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder` (744973)\n", - "
\n",
-      "spots read      : 762,229\n",
-      "reads read      : 1,524,458\n",
-      "reads written   : 1,524,458\n",
-      "
\n", - "Command completed. Elapsed time: 0:00:02. Running peak memory: 0.012GB. \n", - " PID: 744973;\tCommand: fasterq-dump;\tReturn code: 0;\tMemory used: 0.012GB\n", - "\n", - "Already completed files: []\n", - "\n", - "### Pipeline completed. Epilogue\n", - "* Elapsed time (this run): 0:00:02\n", - "* Total elapsed time (all runs): 0:00:02\n", - "* Peak memory (this run): 0.0118 GB\n", - "* Pipeline completed time: 2023-08-01 13:06:47\n", - "\u001B[36m## [3 of 4] sample: cm_darkness_rep1; pipeline: sra_convert\u001B[0m\n", - "Writing script to /home/bnt4me/virginia/repos/geofetch/docs_jupyter/submission/sra_convert_cm_darkness_rep1.sub\n", - "Job script (n=1; 0.09Gb): ./submission/sra_convert_cm_darkness_rep1.sub\n", - "Compute node: bnt4me-Precision-5560\n", - "Start time: 2023-08-01 13:06:47\n", - "Using outfolder: ./sra_convert_results/SRR1930185\n", - "### Pipeline run code and environment:\n", - "\n", - "* Command: `/home/bnt4me/virginia/venv/jupyter/bin/sraconvert --srr /home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930185/SRR1930185.sra -O ./sra_convert_results`\n", - "* Compute host: bnt4me-Precision-5560\n", - "* Working dir: /home/bnt4me/virginia/repos/geofetch/docs_jupyter\n", - "* Outfolder: ./sra_convert_results/SRR1930185/\n", - "* Pipeline started at: (08-01 13:06:47) elapsed: 0.0 _TIME_\n", - "\n", - "### Version log:\n", - "\n", - "* Python version: 3.10.6\n", - "* Pypiper dir: `/home/bnt4me/virginia/venv/jupyter/lib/python3.10/site-packages/pypiper`\n", - "* Pypiper version: 0.12.3\n", - "* Pipeline dir: `/home/bnt4me/virginia/venv/jupyter/bin`\n", - "* Pipeline version: None\n", - "\n", - "### Arguments passed to pipeline:\n", - "\n", - "* `bamfolder`: ``\n", - "* `config_file`: `sraconvert.yaml`\n", - "* `format`: `fastq`\n", - "* `fqfolder`: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder`\n", - "* `keep_sra`: `False`\n", - "* `logdev`: `False`\n", - "* `mode`: `convert`\n", - "* `output_parent`: 
`./sra_convert_results`\n", - "* `recover`: `False`\n", - "* `sample_name`: `None`\n", - "* `silent`: `False`\n", - "* `srafolder`: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter`\n", - "* `srr`: `['/home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930185/SRR1930185.sra']`\n", - "* `verbosity`: `None`\n", - "\n", - "----------------------------------------\n", - "\n", - "Processing 1 of 1 files: SRR1930185\n", - "Target to produce: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder/SRR1930185_1.fastq.gz` \n", - "\n", - "> `fasterq-dump /home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930185/SRR1930185.sra -O /home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder` (745021)\n", - "
\n",
-      "spots read      : 1,707,508\n",
-      "reads read      : 3,415,016\n",
-      "reads written   : 3,415,016\n",
-      "
\n", - "Command completed. Elapsed time: 0:00:03. Running peak memory: 0.079GB. \n", - " PID: 745021;\tCommand: fasterq-dump;\tReturn code: 0;\tMemory used: 0.079GB\n", - "\n", - "Already completed files: []\n", - "\n", - "### Pipeline completed. Epilogue\n", - "* Elapsed time (this run): 0:00:03\n", - "* Total elapsed time (all runs): 0:00:03\n", - "* Peak memory (this run): 0.0793 GB\n", - "* Pipeline completed time: 2023-08-01 13:06:50\n", - "\u001B[36m## [4 of 4] sample: cm_darkness_rep2; pipeline: sra_convert\u001B[0m\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Writing script to /home/bnt4me/virginia/repos/geofetch/docs_jupyter/submission/sra_convert_cm_darkness_rep2.sub\n", - "Job script (n=1; 0.07Gb): ./submission/sra_convert_cm_darkness_rep2.sub\n", - "Compute node: bnt4me-Precision-5560\n", - "Start time: 2023-08-01 13:06:50\n", - "Using outfolder: ./sra_convert_results/SRR1930186\n", - "### Pipeline run code and environment:\n", - "\n", - "* Command: `/home/bnt4me/virginia/venv/jupyter/bin/sraconvert --srr /home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930186/SRR1930186.sra -O ./sra_convert_results`\n", - "* Compute host: bnt4me-Precision-5560\n", - "* Working dir: /home/bnt4me/virginia/repos/geofetch/docs_jupyter\n", - "* Outfolder: ./sra_convert_results/SRR1930186/\n", - "* Pipeline started at: (08-01 13:06:51) elapsed: 0.0 _TIME_\n", - "\n", - "### Version log:\n", - "\n", - "* Python version: 3.10.6\n", - "* Pypiper dir: `/home/bnt4me/virginia/venv/jupyter/lib/python3.10/site-packages/pypiper`\n", - "* Pypiper version: 0.12.3\n", - "* Pipeline dir: `/home/bnt4me/virginia/venv/jupyter/bin`\n", - "* Pipeline version: None\n", - "\n", - "### Arguments passed to pipeline:\n", - "\n", - "* `bamfolder`: ``\n", - "* `config_file`: `sraconvert.yaml`\n", - "* `format`: `fastq`\n", - "* `fqfolder`: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder`\n", - "* `keep_sra`: `False`\n", - "* `logdev`: `False`\n", - 
"* `mode`: `convert`\n", - "* `output_parent`: `./sra_convert_results`\n", - "* `recover`: `False`\n", - "* `sample_name`: `None`\n", - "* `silent`: `False`\n", - "* `srafolder`: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter`\n", - "* `srr`: `['/home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930186/SRR1930186.sra']`\n", - "* `verbosity`: `None`\n", - "\n", - "----------------------------------------\n", - "\n", - "Processing 1 of 1 files: SRR1930186\n", - "Target to produce: `/home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder/SRR1930186_1.fastq.gz` \n", - "\n", - "> `fasterq-dump /home/bnt4me/virginia/repos/geofetch/docs_jupyter/SRR1930186/SRR1930186.sra -O /home/bnt4me/virginia/repos/geofetch/docs_jupyter/fq_folder` (745069)\n", - "
\n",
-      "spots read      : 1,224,029\n",
-      "reads read      : 2,448,058\n",
-      "reads written   : 2,448,058\n",
-      "
\n", - "Command completed. Elapsed time: 0:00:02. Running peak memory: 0.081GB. \n", - " PID: 745069;\tCommand: fasterq-dump;\tReturn code: 0;\tMemory used: 0.081GB\n", - "\n", - "Already completed files: []\n", - "\n", - "### Pipeline completed. Epilogue\n", - "* Elapsed time (this run): 0:00:02\n", - "* Total elapsed time (all runs): 0:00:02\n", - "* Peak memory (this run): 0.0813 GB\n", - "* Pipeline completed time: 2023-08-01 13:06:53\n", - "\n", - "Looper finished\n", - "Samples valid for job generation: 4 of 4\n", - "Commands submitted: 4 of 4\n", - "Jobs submitted: 4\n", - "\u001B[0m\n" - ] - } - ], - "source": [ - "looper run red_algae/GSE67303_PEP/GSE67303_PEP.yaml -a sra_convert -p local --output-dir ." - ] - }, - { - "cell_type": "markdown", - "id": "36d24512", - "metadata": {}, - "source": [ - "### Check if everything worked:" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "2a79f578", - "metadata": {}, - "outputs": [], - "source": [ - "cd fq_folder" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "fefdf187", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "SRR1930183_1.fastq SRR1930184_1.fastq SRR1930185_1.fastq SRR1930186_1.fastq\n", - "SRR1930183_2.fastq SRR1930184_2.fastq SRR1930185_2.fastq SRR1930186_2.fastq\n" - ] - } - ], - "source": [ - "ls" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Bash", - "language": "bash", - "name": "bash" - }, - "language_info": { - "codemirror_mode": "shell", - "file_extension": ".sh", - "mimetype": "text/x-sh", - "name": "bash" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs_jupyter/processed-data-downloading.ipynb b/docs_jupyter/processed-data-downloading.ipynb deleted file mode 100644 index b386a74..0000000 --- a/docs_jupyter/processed-data-downloading.ipynb +++ /dev/null @@ -1,522 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# geofetch 
tutorial for processed data\n", - "\n", - "The [GSE185701 data set](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE185701) has about 355 Mb of processed data that contains 57 Supplementary files, so it's a quick download for a test case. Let's take a quick peek at the geofetch version:" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "geofetch 0.10.1\n" - ] - } - ], - "source": [ - "geofetch --version" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To see your CLI options, invoke `geofetch -h`:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Calling geofetch will do 4 tasks: \n", - "\n", - "1. download all or filtered processed files from `GSE#####` into your geo folder.\n", - "2. download all metadata from GEO and store in your metadata folder.\n", - "2. produce a PEP-compatible sample table, `PROJECT_NAME_sample_processed.csv` and `PROJECT_NAME_series_processed.csv`, in your metadata folder.\n", - "3. produce a PEP-compatible project configuration file, `PROJECT_NAME_sample_processed.yaml` and `PROJECT_NAME_series_processed.yaml`, in your metadata folder.\n", - "\n", - "Complete details about geofetch outputs is cataloged in the [metadata outputs reference](metadata_output.md)." 
- ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "from IPython.core.display import SVG\n", - "SVG(filename='logo.svg')" - ] - }, - { - "attachments": { - "arguments_outputs.svg": { - "image/svg+xml": [ - "" - ] - } - }, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![arguments_outputs.svg](attachment:arguments_outputs.svg)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Download the data\n", - "\n", - "First, create the metadata for processed data (by adding --processed and --just-metadata):" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Metadata folder: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/bright_test\n", - "Trying GSE185701 (not a file) as accession...\n", - "Skipped 0 accessions. Starting now.\n", - "\u001b[38;5;200mProcessing accession 1 of 1: 'GSE185701'\u001b[0m\n", - "--2022-07-08 12:34:57-- https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=gse&acc=GSE185701&form=text&view=full\n", - "Resolving www.ncbi.nlm.nih.gov (www.ncbi.nlm.nih.gov)... 2607:f220:41e:4290::110, 130.14.29.110\n", - "Connecting to www.ncbi.nlm.nih.gov (www.ncbi.nlm.nih.gov)|2607:f220:41e:4290::110|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: unspecified [geo/text]\n", - "Saving to: ‘/home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/bright_test/GSE185701_GSE.soft’\n", - "\n", - "/home/bnt4me/Virgin [ <=> ] 2.82K --.-KB/s in 0s \n", - "\n", - "2022-07-08 12:34:57 (973 MB/s) - ‘/home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/bright_test/GSE185701_GSE.soft’ saved [2885]\n", - "\n", - "--2022-07-08 12:34:57-- https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=gsm&acc=GSE185701&form=text&view=full\n", - "Resolving www.ncbi.nlm.nih.gov (www.ncbi.nlm.nih.gov)... 
2607:f220:41e:4290::110, 130.14.29.110\n", - "Connecting to www.ncbi.nlm.nih.gov (www.ncbi.nlm.nih.gov)|2607:f220:41e:4290::110|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: unspecified [geo/text]\n", - "Saving to: ‘/home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/bright_test/GSE185701_GSM.soft’\n", - "\n", - "/home/bnt4me/Virgin [ <=> ] 39.51K 132KB/s in 0.3s \n", - "\n", - "2022-07-08 12:34:58 (132 KB/s) - ‘/home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/bright_test/GSE185701_GSM.soft’ saved [40454]\n", - "\n", - "\u001b[38;5;242m\n", - "--2022-07-08 12:34:58-- ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE185nnn/GSE185701/suppl/filelist.txt\n", - " => ‘/home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/bright_test/GSE185701_file_list.txt’\n", - "Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 2607:f220:41e:250::10, 2607:f220:41e:250::7, 165.112.9.229, ...\n", - "Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|2607:f220:41e:250::10|:21... connected.\n", - "Logging in as anonymous ... Logged in!\n", - "==> SYST ... done. ==> PWD ... done.\n", - "==> TYPE I ... done. ==> CWD (1) /geo/series/GSE185nnn/GSE185701/suppl ... done.\n", - "==> SIZE filelist.txt ... 794\n", - "==> EPSV ... done. ==> RETR filelist.txt ... done.\n", - "Length: 794 (unauthoritative)\n", - "\n", - "filelist.txt 100%[===================>] 794 --.-KB/s in 0s \n", - "\n", - "2022-07-08 12:34:58 (219 MB/s) - ‘/home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/bright_test/GSE185701_file_list.txt’ saved [794]\n", - "\n", - "\u001b[38;5;242m0\u001b[0m\n", - "\u001b[0m\n", - "Total number of processed SAMPLES files found is: 8\n", - "Total number of processed SERIES files found is: 1\n", - "Expanding metadata list...\n", - "Expanding metadata list...\n", - "Finished processing 1 accession(s)\n", - "Unifying and saving of metadata... 
\n", - "\u001b[92mFile /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/bright_test/PEP_samples/GSE185701_samples.csv has been saved successfully\u001b[0m\n", - " Config file: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/bright_test/PEP_samples/GSE185701_samples.yaml\n" - ] - } - ], - "source": [ - "geofetch -i GSE185701 --processed -n bright_test --just-metadata" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "GSE185701_file_list.txt GSE185701_GSE.soft GSE185701_GSM.soft \u001b[0m\u001b[01;34mPEP_samples\u001b[0m\n" - ] - } - ], - "source": [ - "ls bright_test" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `.soft` files are the direct output from GEO, which contain all the metadata as stored by GEO, for both the experiment (`_GSE`) and for the individual samples (`_GSM`). Geofetch also produces a `csv` file with the SRA metadata. The filtered version (ending in `_filt`) would contain only the specified subset of the samples if we didn't request them all, but in this case, since we only gave an accession, it is identical to the complete file. Additionally, file_list.txt is downloaded, that contains information about size, type and creation date of all sample files.\n", - "\n", - "Finally, there are the 2 files that make up the PEP: the `_config.yaml` file and the `_annotation.csv` file (for samples and series). Let's see what's in these files now." 
- ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "# Autogenerated by geofetch\n", - "\n", - "pep_version: 2.1.0\n", - "project_name: GSE185701\n", - "sample_table: GSE185701_samples.csv\n", - "\n", - "sample_modifiers:\n", - " append:\n", - " output_file_path: FILES\n", - " sample_growth_protocol_ch1: Huh 7 was cultured in Dulbecco’s modified Eagle’s medium (DMEM) (Invitrogen, Carlsbad, CA, USA) containing 10% fetal bovine serum (FBS) (HyClone, Logan, UT, USA) and antibiotics (penicillin and streptomycin, Invitrogen) at 37 °C in 5% CO2.\n", - " \n", - " derive:\n", - " attributes: [output_file_path]\n", - " sources:\n", - " FILES: /{gse}/{file}\n", - "\n", - "\n", - "\n" - ] - } - ], - "source": [ - "cat bright_test/PEP_samples/GSE185701_samples.yaml" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "There are few important things to note in this file:\n", - "\n", - "* First, see in the PEP that `sample_table` points to the csv file produced by geofetch.\n", - "* Second: output_file_path is location of all the files. \n", - "* Third: sample_modifier Sample_growth_protocol_ch1 is constant sample character and is larger then 50 characters so it is deleted from csv file. 
For large project it can significantly reduced size of the metadata\n", - "\n", - "Now let's look at the first 100 characters of the csv file:" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "sample_taxid_ch1,sample_geo_accession,sample_channel_count,sample_instrument_model,biosample,supplem\n", - "9606,GSM5621756,1,HiSeq X Ten,https://www.ncbi.nlm.nih.gov/biosample/SAMN22223730,wig files were gen\n", - "9606,GSM5621756,1,HiSeq X Ten,https://www.ncbi.nlm.nih.gov/biosample/SAMN22223730,wig files were gen\n", - "9606,GSM5621758,1,HiSeq X Ten,https://www.ncbi.nlm.nih.gov/biosample/SAMN22223732,wig files were gen\n", - "9606,GSM5621758,1,HiSeq X Ten,https://www.ncbi.nlm.nih.gov/biosample/SAMN22223732,wig files were gen\n", - "9606,GSM5621760,1,HiSeq X Ten,https://www.ncbi.nlm.nih.gov/biosample/SAMN22223728,wig files were gen\n", - "9606,GSM5621760,1,HiSeq X Ten,https://www.ncbi.nlm.nih.gov/biosample/SAMN22223728,wig files were gen\n", - "9606,GSM5621761,1,HiSeq X Ten,https://www.ncbi.nlm.nih.gov/biosample/SAMN22223729,wig files were gen\n", - "9606,GSM5621761,1,HiSeq X Ten,https://www.ncbi.nlm.nih.gov/biosample/SAMN22223729,wig files were gen\n" - ] - } - ], - "source": [ - "cut -c -100 bright_test/PEP_samples/GSE185701_samples.csv" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now let's download the actual data. 
This time we will will be downloading data from the [GSE185701 data set](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE185701) .\n", - "\n", - "Let's additionally add few arguments:\n", - "\n", - "* _geo-folder_ (required) - path to the location where processed files have to be saved\n", - "* _filter_ argument, to download only _bed_ files (--filter \".Bed.gz$\")\n", - "* _data-source_ argument, to download files only from sample location (--data-source samples)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Metadata folder: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter\n", - "Trying GSE185701 (not a file) as accession...\n", - "Skipped 0 accessions. Starting now.\n", - "\u001b[38;5;200mProcessing accession 1 of 1: 'GSE185701'\u001b[0m\n", - "--2022-07-08 12:36:16-- https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=gse&acc=GSE185701&form=text&view=full\n", - "Resolving www.ncbi.nlm.nih.gov (www.ncbi.nlm.nih.gov)... 2607:f220:41e:4290::110, 130.14.29.110\n", - "Connecting to www.ncbi.nlm.nih.gov (www.ncbi.nlm.nih.gov)|2607:f220:41e:4290::110|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: unspecified [geo/text]\n", - "Saving to: ‘/home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/GSE185701_GSE.soft’\n", - "\n", - "/home/bnt4me/Virgin [ <=> ] 2.82K --.-KB/s in 0s \n", - "\n", - "2022-07-08 12:36:16 (245 MB/s) - ‘/home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/GSE185701_GSE.soft’ saved [2885]\n", - "\n", - "--2022-07-08 12:36:16-- https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=gsm&acc=GSE185701&form=text&view=full\n", - "Resolving www.ncbi.nlm.nih.gov (www.ncbi.nlm.nih.gov)... 2607:f220:41e:4290::110, 130.14.29.110\n", - "Connecting to www.ncbi.nlm.nih.gov (www.ncbi.nlm.nih.gov)|2607:f220:41e:4290::110|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: unspecified [geo/text]\n", - "Saving to: ‘/home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/GSE185701_GSM.soft’\n", - "\n", - "/home/bnt4me/Virgin [ <=> ] 39.51K --.-KB/s in 0.1s \n", - "\n", - "2022-07-08 12:36:16 (269 KB/s) - ‘/home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/GSE185701_GSM.soft’ saved [40454]\n", - "\n", - "\u001b[38;5;242m\n", - "--2022-07-08 12:36:16-- ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE185nnn/GSE185701/suppl/filelist.txt\n", - " => ‘/home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/GSE185701_file_list.txt’\n", - "Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 2607:f220:41e:250::12, 2607:f220:41e:250::13, 130.14.250.13, ...\n", - "Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|2607:f220:41e:250::12|:21... connected.\n", - "Logging in as anonymous ... Logged in!\n", - "==> SYST ... done. ==> PWD ... done.\n", - "==> TYPE I ... done. ==> CWD (1) /geo/series/GSE185nnn/GSE185701/suppl ... done.\n", - "==> SIZE filelist.txt ... 794\n", - "==> EPSV ... done. ==> RETR filelist.txt ... 
done.\n", - "Length: 794 (unauthoritative)\n", - "\n", - "filelist.txt 100%[===================>] 794 --.-KB/s in 0s \n", - "\n", - "2022-07-08 12:36:17 (2.55 MB/s) - ‘/home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/GSE185701_file_list.txt’ saved [794]\n", - "\n", - "\u001b[38;5;242m0\u001b[0m\n", - "\u001b[0m\n", - "Total number of processed SAMPLES files found is: 8\n", - "\u001b[32mTotal number of files after filter is: 4 \u001b[0m\n", - "Total number of processed SERIES files found is: 1\n", - "\u001b[32mTotal number of files after filter is: 0 \u001b[0m\n", - "Expanding metadata list...\n", - "Expanding metadata list...\n", - "\u001b[38;5;242m\n", - "--2022-07-08 12:36:17-- ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM5621nnn/GSM5621756/suppl/GSM5621756_ChIPseq_Huh7_siNC_H3K27ac_summits.bed.gz\n", - " => ‘/home/bnt4me/Virginia/for_docs/geo/GSE185701/GSM5621756_ChIPseq_Huh7_siNC_H3K27ac_summits.bed.gz’\n", - "Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 2607:f220:41e:250::13, 2607:f220:41e:250::12, 165.112.9.229, ...\n", - "Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|2607:f220:41e:250::13|:21... connected.\n", - "Logging in as anonymous ... Logged in!\n", - "==> SYST ... done. ==> PWD ... done.\n", - "==> TYPE I ... done. ==> CWD (1) /geo/samples/GSM5621nnn/GSM5621756/suppl ... done.\n", - "==> SIZE GSM5621756_ChIPseq_Huh7_siNC_H3K27ac_summits.bed.gz ... 785486\n", - "==> EPSV ... done. ==> RETR GSM5621756_ChIPseq_Huh7_siNC_H3K27ac_summits.bed.gz ... 
done.\n", - "Length: 785486 (767K) (unauthoritative)\n", - "\n", - "GSM5621756_ChIPseq_ 100%[===================>] 767.08K 1.64MB/s in 0.5s \n", - "\n", - "2022-07-08 12:36:19 (1.64 MB/s) - ‘/home/bnt4me/Virginia/for_docs/geo/GSE185701/GSM5621756_ChIPseq_Huh7_siNC_H3K27ac_summits.bed.gz’ saved [785486]\n", - "\n", - "\u001b[38;5;242m0\u001b[0m\n", - "\u001b[0m\n", - "\u001b[92mFile /home/bnt4me/Virginia/for_docs/geo/GSE185701/GSM5621756_ChIPseq_Huh7_siNC_H3K27ac_summits.bed.gz has been downloaded successfully\u001b[0m\n", - "\u001b[38;5;242m\n", - "--2022-07-08 12:36:19-- ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM5621nnn/GSM5621758/suppl/GSM5621758_ChIPseq_Huh7_siDHX37_H3K27ac_summits.bed.gz\n", - " => ‘/home/bnt4me/Virginia/for_docs/geo/GSE185701/GSM5621758_ChIPseq_Huh7_siDHX37_H3K27ac_summits.bed.gz’\n", - "Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 2607:f220:41e:250::13, 2607:f220:41e:250::12, 165.112.9.229, ...\n", - "Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|2607:f220:41e:250::13|:21... connected.\n", - "Logging in as anonymous ... Logged in!\n", - "==> SYST ... done. ==> PWD ... done.\n", - "==> TYPE I ... done. ==> CWD (1) /geo/samples/GSM5621nnn/GSM5621758/suppl ... done.\n", - "==> SIZE GSM5621758_ChIPseq_Huh7_siDHX37_H3K27ac_summits.bed.gz ... 784432\n", - "==> EPSV ... done. ==> RETR GSM5621758_ChIPseq_Huh7_siDHX37_H3K27ac_summits.bed.gz ... 
done.\n", - "Length: 784432 (766K) (unauthoritative)\n", - "\n", - "GSM5621758_ChIPseq_ 100%[===================>] 766.05K 1.03MB/s in 0.7s \n", - "\n", - "2022-07-08 12:36:20 (1.03 MB/s) - ‘/home/bnt4me/Virginia/for_docs/geo/GSE185701/GSM5621758_ChIPseq_Huh7_siDHX37_H3K27ac_summits.bed.gz’ saved [784432]\n", - "\n", - "\u001b[38;5;242m0\u001b[0m\n", - "\u001b[0m\n", - "\u001b[92mFile /home/bnt4me/Virginia/for_docs/geo/GSE185701/GSM5621758_ChIPseq_Huh7_siDHX37_H3K27ac_summits.bed.gz has been downloaded successfully\u001b[0m\n", - "\u001b[38;5;242m\n", - "--2022-07-08 12:36:21-- ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM5621nnn/GSM5621760/suppl/GSM5621760_CUTTag_Huh7_DHX37_summits.bed.gz\n", - " => ‘/home/bnt4me/Virginia/for_docs/geo/GSE185701/GSM5621760_CUTTag_Huh7_DHX37_summits.bed.gz’\n", - "Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 2607:f220:41e:250::13, 2607:f220:41e:250::12, 165.112.9.229, ...\n", - "Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|2607:f220:41e:250::13|:21... connected.\n", - "Logging in as anonymous ... Logged in!\n", - "==> SYST ... done. ==> PWD ... done.\n", - "==> TYPE I ... done. ==> CWD (1) /geo/samples/GSM5621nnn/GSM5621760/suppl ... done.\n", - "==> SIZE GSM5621760_CUTTag_Huh7_DHX37_summits.bed.gz ... 163441\n", - "==> EPSV ... done. ==> RETR GSM5621760_CUTTag_Huh7_DHX37_summits.bed.gz ... 
done.\n", - "Length: 163441 (160K) (unauthoritative)\n", - "\n", - "GSM5621760_CUTTag_H 100%[===================>] 159.61K 816KB/s in 0.2s \n", - "\n", - "2022-07-08 12:36:21 (816 KB/s) - ‘/home/bnt4me/Virginia/for_docs/geo/GSE185701/GSM5621760_CUTTag_Huh7_DHX37_summits.bed.gz’ saved [163441]\n", - "\n", - "\u001b[38;5;242m0\u001b[0m\n", - "\u001b[0m\n", - "\u001b[92mFile /home/bnt4me/Virginia/for_docs/geo/GSE185701/GSM5621760_CUTTag_Huh7_DHX37_summits.bed.gz has been downloaded successfully\u001b[0m\n", - "\u001b[38;5;242m\n", - "--2022-07-08 12:36:22-- ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM5621nnn/GSM5621761/suppl/GSM5621761_CUTTag_Huh7_PLRG1_summits.bed.gz\n", - " => ‘/home/bnt4me/Virginia/for_docs/geo/GSE185701/GSM5621761_CUTTag_Huh7_PLRG1_summits.bed.gz’\n", - "Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 2607:f220:41e:250::13, 2607:f220:41e:250::12, 165.112.9.229, ...\n", - "Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|2607:f220:41e:250::13|:21... connected.\n", - "Logging in as anonymous ... Logged in!\n", - "==> SYST ... done. ==> PWD ... done.\n", - "==> TYPE I ... done. ==> CWD (1) /geo/samples/GSM5621nnn/GSM5621761/suppl ... done.\n", - "==> SIZE GSM5621761_CUTTag_Huh7_PLRG1_summits.bed.gz ... 117250\n", - "==> EPSV ... done. ==> RETR GSM5621761_CUTTag_Huh7_PLRG1_summits.bed.gz ... done.\n", - "Length: 117250 (115K) (unauthoritative)\n", - "\n", - "GSM5621761_CUTTag_H 100%[===================>] 114.50K 318KB/s in 0.4s \n", - "\n", - "2022-07-08 12:36:23 (318 KB/s) - ‘/home/bnt4me/Virginia/for_docs/geo/GSE185701/GSM5621761_CUTTag_Huh7_PLRG1_summits.bed.gz’ saved [117250]\n", - "\n", - "\u001b[38;5;242m0\u001b[0m\n", - "\u001b[0m\n", - "\u001b[92mFile /home/bnt4me/Virginia/for_docs/geo/GSE185701/GSM5621761_CUTTag_Huh7_PLRG1_summits.bed.gz has been downloaded successfully\u001b[0m\n", - "Finished processing 1 accession(s)\n", - "Unifying and saving of metadata... 
\n", - "\u001b[92mFile /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/PEP_samples/GSE185701_samples.csv has been saved successfully\u001b[0m\n", - " Config file: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/PEP_samples/GSE185701_samples.yaml\n" - ] - } - ], - "source": [ - "geofetch -i GSE185701 --processed -n bright_test --filter \".bed.gz$\" --data-source samples \\\n", - "--geo-folder /home/bnt4me/Virginia/for_docs/geo" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now lets list the folder to see what data is there. And let's see what's in pep files now." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[0m\u001b[01;31mGSM5621756_ChIPseq_Huh7_siNC_H3K27ac_summits.bed.gz\u001b[0m\n", - "\u001b[01;31mGSM5621758_ChIPseq_Huh7_siDHX37_H3K27ac_summits.bed.gz\u001b[0m\n", - "\u001b[01;31mGSM5621760_CUTTag_Huh7_DHX37_summits.bed.gz\u001b[0m\n", - "\u001b[01;31mGSM5621761_CUTTag_Huh7_PLRG1_summits.bed.gz\u001b[0m\n" - ] - } - ], - "source": [ - "ls /home/bnt4me/Virginia/for_docs/geo/GSE185701" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "cut: cat: No such file or directory\n", - "sample_platform_id,sample_library_strategy,sample_contact_country,sample_contact_name,sample_contact\n", - "GPL20795,ChIP-Seq,China,\"Xianghuo,,He\",Shanghai,HCC,\"transfected with siNC using Lipofectamine RNAiM\n", - "GPL20795,ChIP-Seq,China,\"Xianghuo,,He\",Shanghai,HCC,\"transfected with siDHX37 using Lipofectamine RN\n", - "GPL20795,OTHER,China,\"Xianghuo,,He\",Shanghai,HCC,\"transfected with Flag-DHX37 lentivirus, renew the \n", - "GPL20795,OTHER,China,\"Xianghuo,,He\",Shanghai,HCC,untreated,SRA,Huh 7,hg38,Homo sapiens,HiSeq X Ten,h\n" - ] - }, - { - "ename": "", - "evalue": "1", - "output_type": "error", - 
"traceback": [] - } - ], - "source": [ - "cut -c -100 cat PEP_samples/GSE185701_samples.csv" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "# Autogenerated by geofetch\n", - "\n", - "pep_version: 2.1.0\n", - "project_name: GSE185701\n", - "sample_table: GSE185701_samples.csv\n", - "\n", - "sample_modifiers:\n", - " append:\n", - " output_file_path: FILES\n", - " sample_growth_protocol_ch1: Huh 7 was cultured in Dulbecco’s modified Eagle’s medium (DMEM) (Invitrogen, Carlsbad, CA, USA) containing 10% fetal bovine serum (FBS) (HyClone, Logan, UT, USA) and antibiotics (penicillin and streptomycin, Invitrogen) at 37 °C in 5% CO2.\n", - " \n", - " derive:\n", - " attributes: [output_file_path]\n", - " sources:\n", - " FILES: /home/bnt4me/Virginia/for_docs/geo/{gse}/{file}\n", - "\n", - "\n", - "\n" - ] - } - ], - "source": [ - "cat PEP_samples/GSE185701_samples.yaml" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we have easy access to this data by using [peppy](http://peppy.databio.org/en/latest/) package in python or [pepr](https://code.databio.org/pepr/) in r in further analysis " - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Bash", - "language": "bash", - "name": "bash" - }, - "language_info": { - "codemirror_mode": "shell", - "file_extension": ".sh", - "mimetype": "text/x-sh", - "name": "bash" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs_jupyter/python-usage.ipynb b/docs_jupyter/python-usage.ipynb deleted file mode 100644 index 39e6111..0000000 --- a/docs_jupyter/python-usage.ipynb +++ /dev/null @@ -1,718 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "67fc2596", - "metadata": {}, - "source": [ - "# Tutorial of usage geofetch as python package" - ] - }, - { - "cell_type": "markdown", - "id": "3ced4b1e", - "metadata": {}, - "source": [ - "♪♫*•♪♪♫*•♪♪♫*•♪♪♫*•♪♪♫*" - 
] - }, - { - "cell_type": "markdown", - "id": "0093b8ef", - "metadata": {}, - "source": [ - "Geofetch provides python fuctions to fetch metadata and metadata from GEO and SRA by using python language. `get_project` function returns dictionary of peppy projects that were found using filters and input you specified.\n", - " peppy is a Python package that provides an API for handling standardized project and sample metadata. \n", - " \n", - "More information you can get here:\n", - " \n", - "http://peppy.databio.org/en/latest/\n", - "\n", - "http://pep.databio.org/en/2.0.0/" - ] - }, - { - "cell_type": "markdown", - "id": "64746e18", - "metadata": {}, - "source": [ - "### First let's import geofetch" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "60b65668", - "metadata": {}, - "outputs": [], - "source": [ - "from geofetch import Geofetcher" - ] - }, - { - "cell_type": "markdown", - "id": "b6edbdd7", - "metadata": {}, - "source": [ - "### Initiate Geofetch object by specifing parameters that you want to use for downloading metadata/data" - ] - }, - { - "cell_type": "markdown", - "id": "dc107c16", - "metadata": {}, - "source": [ - "1) If you won't specify any parameters, defaul parameters will be used" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "af268078", - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Metadata folder: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/project_name\n" - ] - } - ], - "source": [ - "geof = Geofetcher()" - ] - }, - { - "cell_type": "markdown", - "id": "1916922e", - "metadata": {}, - "source": [ - "2) To download processed data with samples and series specify this two arguments:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "d451856a", - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Metadata folder: 
/home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/project_name\n" - ] - } - ], - "source": [ - "geof = Geofetcher(processed=True, data_source=\"all\")" - ] - }, - { - "cell_type": "markdown", - "id": "8debdd11", - "metadata": {}, - "source": [ - "3) To tune project parameter, where metadata should be stored use next parameters:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "f8edb462", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Metadata folder: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/project_name\n" - ] - } - ], - "source": [ - "geof = Geofetcher(\n", - " processed=True,\n", - " data_source=\"all\",\n", - " const_limit_project=20,\n", - " const_limit_discard=500,\n", - " attr_limit_truncate=10000,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "d2739b13", - "metadata": {}, - "source": [ - "4) To add more filter of other options see documentation" - ] - }, - { - "cell_type": "markdown", - "id": "00b66d4a", - "metadata": {}, - "source": [ - "## Run Geofetch" - ] - }, - { - "cell_type": "markdown", - "id": "5e6c5df8", - "metadata": {}, - "source": [ - "### By default: \n", - "1) No actual data will be downloaded (just_metadata=True)\n", - "\n", - "2) No soft files will be saved on the disc (discard_soft=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "12d70387", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Trying GSE95654 (not a file) as accession...\n", - "Trying GSE95654 (not a file) as accession...\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "0f96c1a1ee8c48f4af31e0dc939fe116", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Output()" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Skipped 0 accessions. 
Starting now.\n", - "\u001b[38;5;200mProcessing accession 1 of 1: 'GSE95654'\u001b[0m\n", - "\n", - "Total number of processed SAMPLES files found is: 40\n", - "Total number of processed SERIES files found is: 0\n", - "Expanding metadata list...\n", - "Expanding metadata list...\n" - ] - }, - { - "data": { - "text/html": [ - "
\n"
-      ],
-      "text/plain": []
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "
\n",
-       "
\n" - ], - "text/plain": [ - "\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Finished processing 1 accession(s)\n", - "Cleaning soft files ...\n", - "Unifying and saving of metadata... \n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "b2c4c738728b4b43938fa6e7f29615ef", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Output()" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n"
-      ],
-      "text/plain": []
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "
\n",
-       "
\n" - ], - "text/plain": [ - "\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n",
-       "
\n" - ], - "text/plain": [ - "\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "02401b3d938a4a588052ba99af677f84", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Output()" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n"
-      ],
-      "text/plain": []
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "
\n",
-       "
\n" - ], - "text/plain": [ - "\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n",
-       "
\n" - ], - "text/plain": [ - "\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "No files found. No data to save. File /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/project_name/GSE95654_series/GSE95654_series.csv won't be created\n" - ] - } - ], - "source": [ - "projects = geof.get_projects(\"GSE95654\")" - ] - }, - { - "cell_type": "markdown", - "id": "bc198009", - "metadata": {}, - "source": [ - "Check if projects were created by checking dict keys:" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "95896f25", - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['GSE95654_samples'])" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "projects.keys()" - ] - }, - { - "cell_type": "markdown", - "id": "4e27f971", - "metadata": {}, - "source": [ - "project for smaples was created! Now let's look into it." - ] - }, - { - "cell_type": "markdown", - "id": "fa2d0bda", - "metadata": {}, - "source": [ - "\\* the values of the dictionary are peppy projects. More information about peppy Project you can find in the documentation: http://peppy.databio.org/en/latest/" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "e8642711", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "40" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(projects[\"GSE95654_samples\"].samples)" - ] - }, - { - "cell_type": "markdown", - "id": "a4d50082", - "metadata": {}, - "source": [ - "We got 40 samples from GSE95654 project. If you want to check if it's correct information go into: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE95654" - ] - }, - { - "cell_type": "markdown", - "id": "d0cd958a", - "metadata": {}, - "source": [ - "Now let's see actuall data. 
first 15 project and 5 clolumns:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "ba7be762", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
sample_namesample_library_strategygenome_buildtissuesample_organism_ch1
sample_name
RRBS_on_CRC_patient_8RRBS_on_CRC_patient_8Bisulfite-Seqhg19primary tumorHomo sapiens
RRBS_on_adjacent_normal_colon_patient_8RRBS_on_adjacent_normal_colon_patient_8Bisulfite-Seqhg19adjacent normal colonHomo sapiens
RRBS_on_CRC_patient_32RRBS_on_CRC_patient_32Bisulfite-Seqhg19primary tumorHomo sapiens
RRBS_on_adjacent_normal_colon_patient_32RRBS_on_adjacent_normal_colon_patient_32Bisulfite-Seqhg19adjacent normal colonHomo sapiens
RRBS_on_CRC_patient_41RRBS_on_CRC_patient_41Bisulfite-Seqhg19primary tumorHomo sapiens
RRBS_on_adjacent_normal_colon_patient_41RRBS_on_adjacent_normal_colon_patient_41Bisulfite-Seqhg19adjacent normal colonHomo sapiens
RRBS_on_CRC_patient_42RRBS_on_CRC_patient_42Bisulfite-Seqhg19primary tumorHomo sapiens
RRBS_on_adjacent_normal_colon_patient_42RRBS_on_adjacent_normal_colon_patient_42Bisulfite-Seqhg19adjacent normal colonHomo sapiens
RRBS_on_ACF_patient_173RRBS_on_ACF_patient_173Bisulfite-Seqhg19aberrant crypt fociHomo sapiens
RRBS_on_ACF_patient_515RRBS_on_ACF_patient_515Bisulfite-Seqhg19aberrant crypt fociHomo sapiens
RRBS_on_normal_crypts_patient_139RRBS_on_normal_crypts_patient_139Bisulfite-Seqhg19normal colonic cryptHomo sapiens
RRBS_on_ACF_patient_143RRBS_on_ACF_patient_143Bisulfite-Seqhg19aberrant crypt fociHomo sapiens
RRBS_on_normal_crypts_patient_143RRBS_on_normal_crypts_patient_143Bisulfite-Seqhg19normal colonic cryptHomo sapiens
RRBS_on_normal_crypts_patient_165RRBS_on_normal_crypts_patient_165Bisulfite-Seqhg19normal colonic cryptHomo sapiens
RRBS_on_ACF_patient_165RRBS_on_ACF_patient_165Bisulfite-Seqhg19aberrant crypt fociHomo sapiens
\n", - "
" - ], - "text/plain": [ - " sample_name \\\n", - "sample_name \n", - "RRBS_on_CRC_patient_8 RRBS_on_CRC_patient_8 \n", - "RRBS_on_adjacent_normal_colon_patient_8 RRBS_on_adjacent_normal_colon_patient_8 \n", - "RRBS_on_CRC_patient_32 RRBS_on_CRC_patient_32 \n", - "RRBS_on_adjacent_normal_colon_patient_32 RRBS_on_adjacent_normal_colon_patient_32 \n", - "RRBS_on_CRC_patient_41 RRBS_on_CRC_patient_41 \n", - "RRBS_on_adjacent_normal_colon_patient_41 RRBS_on_adjacent_normal_colon_patient_41 \n", - "RRBS_on_CRC_patient_42 RRBS_on_CRC_patient_42 \n", - "RRBS_on_adjacent_normal_colon_patient_42 RRBS_on_adjacent_normal_colon_patient_42 \n", - "RRBS_on_ACF_patient_173 RRBS_on_ACF_patient_173 \n", - "RRBS_on_ACF_patient_515 RRBS_on_ACF_patient_515 \n", - "RRBS_on_normal_crypts_patient_139 RRBS_on_normal_crypts_patient_139 \n", - "RRBS_on_ACF_patient_143 RRBS_on_ACF_patient_143 \n", - "RRBS_on_normal_crypts_patient_143 RRBS_on_normal_crypts_patient_143 \n", - "RRBS_on_normal_crypts_patient_165 RRBS_on_normal_crypts_patient_165 \n", - "RRBS_on_ACF_patient_165 RRBS_on_ACF_patient_165 \n", - "\n", - " sample_library_strategy genome_build \\\n", - "sample_name \n", - "RRBS_on_CRC_patient_8 Bisulfite-Seq hg19 \n", - "RRBS_on_adjacent_normal_colon_patient_8 Bisulfite-Seq hg19 \n", - "RRBS_on_CRC_patient_32 Bisulfite-Seq hg19 \n", - "RRBS_on_adjacent_normal_colon_patient_32 Bisulfite-Seq hg19 \n", - "RRBS_on_CRC_patient_41 Bisulfite-Seq hg19 \n", - "RRBS_on_adjacent_normal_colon_patient_41 Bisulfite-Seq hg19 \n", - "RRBS_on_CRC_patient_42 Bisulfite-Seq hg19 \n", - "RRBS_on_adjacent_normal_colon_patient_42 Bisulfite-Seq hg19 \n", - "RRBS_on_ACF_patient_173 Bisulfite-Seq hg19 \n", - "RRBS_on_ACF_patient_515 Bisulfite-Seq hg19 \n", - "RRBS_on_normal_crypts_patient_139 Bisulfite-Seq hg19 \n", - "RRBS_on_ACF_patient_143 Bisulfite-Seq hg19 \n", - "RRBS_on_normal_crypts_patient_143 Bisulfite-Seq hg19 \n", - "RRBS_on_normal_crypts_patient_165 Bisulfite-Seq hg19 \n", - 
"RRBS_on_ACF_patient_165 Bisulfite-Seq hg19 \n", - "\n", - " tissue \\\n", - "sample_name \n", - "RRBS_on_CRC_patient_8 primary tumor \n", - "RRBS_on_adjacent_normal_colon_patient_8 adjacent normal colon \n", - "RRBS_on_CRC_patient_32 primary tumor \n", - "RRBS_on_adjacent_normal_colon_patient_32 adjacent normal colon \n", - "RRBS_on_CRC_patient_41 primary tumor \n", - "RRBS_on_adjacent_normal_colon_patient_41 adjacent normal colon \n", - "RRBS_on_CRC_patient_42 primary tumor \n", - "RRBS_on_adjacent_normal_colon_patient_42 adjacent normal colon \n", - "RRBS_on_ACF_patient_173 aberrant crypt foci \n", - "RRBS_on_ACF_patient_515 aberrant crypt foci \n", - "RRBS_on_normal_crypts_patient_139 normal colonic crypt \n", - "RRBS_on_ACF_patient_143 aberrant crypt foci \n", - "RRBS_on_normal_crypts_patient_143 normal colonic crypt \n", - "RRBS_on_normal_crypts_patient_165 normal colonic crypt \n", - "RRBS_on_ACF_patient_165 aberrant crypt foci \n", - "\n", - " sample_organism_ch1 \n", - "sample_name \n", - "RRBS_on_CRC_patient_8 Homo sapiens \n", - "RRBS_on_adjacent_normal_colon_patient_8 Homo sapiens \n", - "RRBS_on_CRC_patient_32 Homo sapiens \n", - "RRBS_on_adjacent_normal_colon_patient_32 Homo sapiens \n", - "RRBS_on_CRC_patient_41 Homo sapiens \n", - "RRBS_on_adjacent_normal_colon_patient_41 Homo sapiens \n", - "RRBS_on_CRC_patient_42 Homo sapiens \n", - "RRBS_on_adjacent_normal_colon_patient_42 Homo sapiens \n", - "RRBS_on_ACF_patient_173 Homo sapiens \n", - "RRBS_on_ACF_patient_515 Homo sapiens \n", - "RRBS_on_normal_crypts_patient_139 Homo sapiens \n", - "RRBS_on_ACF_patient_143 Homo sapiens \n", - "RRBS_on_normal_crypts_patient_143 Homo sapiens \n", - "RRBS_on_normal_crypts_patient_165 Homo sapiens \n", - "RRBS_on_ACF_patient_165 Homo sapiens " - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "projects[\"GSE95654_samples\"].sample_table.iloc[:15, :5]" - ] - } - ], - "metadata": { - "kernelspec": { - 
"display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs_jupyter/raw-data-downloading.ipynb b/docs_jupyter/raw-data-downloading.ipynb deleted file mode 100644 index 831e98c..0000000 --- a/docs_jupyter/raw-data-downloading.ipynb +++ /dev/null @@ -1,519 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# geofetch tutorial for raw data\n", - "\n", - "The [GSE67303 data set](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE67303) has about 250 mb of data across 4 samples, so it's a quick download for a test case. Let's take a quick peek at the geofetch version:" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "geofetch 0.10.1\n" - ] - } - ], - "source": [ - "geofetch --version" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To see your CLI options, invoke `geofetch -h`:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "usage: geofetch [-h] [-V] -i INPUT [-n NAME] [-m METADATA_ROOT]\n", - " [-u METADATA_FOLDER] [--just-metadata] [-r]\n", - " [--config-template CONFIG_TEMPLATE]\n", - " [--pipeline-samples PIPELINE_SAMPLES]\n", - " [--pipeline-project PIPELINE_PROJECT] [-k SKIP] [--acc-anno]\n", - " [--discard-soft] [--const-limit-project CONST_LIMIT_PROJECT]\n", - " [--const-limit-discard CONST_LIMIT_DISCARD]\n", - " [--attr-limit-truncate ATTR_LIMIT_TRUNCATE] [--add-dotfile]\n", - " [-p] [--data-source 
{all,samples,series}] [--filter FILTER]\n", - " [--filter-size FILTER_SIZE] [-g GEO_FOLDER] [-x]\n", - " [-b BAM_FOLDER] [-f FQ_FOLDER] [--use-key-subset] [--silent]\n", - " [--verbosity V] [--logdev]\n", - "\n", - "Automatic GEO and SRA data downloader\n", - "\n", - "optional arguments:\n", - " -h, --help show this help message and exit\n", - " -V, --version show program's version number and exit\n", - " -i INPUT, --input INPUT\n", - " required: a GEO (GSE) accession, or a file with a list\n", - " of GSE numbers\n", - " -n NAME, --name NAME Specify a project name. Defaults to GSE number\n", - " -m METADATA_ROOT, --metadata-root METADATA_ROOT\n", - " Specify a parent folder location to store metadata.\n", - " The project name will be added as a subfolder\n", - " [Default: $SRAMETA:]\n", - " -u METADATA_FOLDER, --metadata-folder METADATA_FOLDER\n", - " Specify an absolute folder location to store metadata.\n", - " No subfolder will be added. Overrides value of\n", - " --metadata-root [Default: Not used (--metadata-root is\n", - " used by default)]\n", - " --just-metadata If set, don't actually run downloads, just create\n", - " metadata\n", - " -r, --refresh-metadata\n", - " If set, re-download metadata even if it exists.\n", - " --config-template CONFIG_TEMPLATE\n", - " Project config yaml file template.\n", - " --pipeline-samples PIPELINE_SAMPLES\n", - " Optional: Specify one or more filepaths to SAMPLES\n", - " pipeline interface yaml files. These will be added to\n", - " the project config file to make it immediately\n", - " compatible with looper. [Default: null]\n", - " --pipeline-project PIPELINE_PROJECT\n", - " Optional: Specify one or more filepaths to PROJECT\n", - " pipeline interface yaml files. These will be added to\n", - " the project config file to make it immediately\n", - " compatible with looper. [Default: null]\n", - " -k SKIP, --skip SKIP Skip some accessions. 
[Default: no skip].\n", - " --acc-anno Optional: Produce annotation sheets for each\n", - " accession. Project combined PEP for the whole project\n", - " won't be produced.\n", - " --discard-soft Optional: After creation of PEP files, all soft and\n", - " additional files will be deleted\n", - " --const-limit-project CONST_LIMIT_PROJECT\n", - " Optional: Limit of the number of the constant sample\n", - " characters that should not be in project yaml.\n", - " [Default: 50]\n", - " --const-limit-discard CONST_LIMIT_DISCARD\n", - " Optional: Limit of the number of the constant sample\n", - " characters that should not be discarded [Default: 250]\n", - " --attr-limit-truncate ATTR_LIMIT_TRUNCATE\n", - " Optional: Limit of the number of sample characters.Any\n", - " attribute with more than X characters will truncate to\n", - " the first X, where X is a number of characters\n", - " [Default: 500]\n", - " --add-dotfile Optional: Add .pep.yaml file that points .yaml PEP\n", - " file\n", - " --silent Silence logging. Overrides verbosity.\n", - " --verbosity V Set logging level (1-5 or logging module level name)\n", - " --logdev Expand content of logging message format.\n", - "\n", - "processed:\n", - " -p, --processed Download processed data [Default: download raw data].\n", - " --data-source {all,samples,series}\n", - " Optional: Specifies the source of data on the GEO\n", - " record to retrieve processed data, which may be\n", - " attached to the collective series entity, or to\n", - " individual samples. Allowable values are: samples,\n", - " series or both (all). Ignored unless 'processed' flag\n", - " is set. [Default: samples]\n", - " --filter FILTER Optional: Filter regex for processed filenames\n", - " [Default: None].Ignored unless 'processed' flag is\n", - " set.\n", - " --filter-size FILTER_SIZE\n", - " Optional: Filter size for processed files that are\n", - " stored as sample repository [Default: None]. Works\n", - " only for sample data. 
Supported input formats : 12B,\n", - " 12KB, 12MB, 12GB. Ignored unless 'processed' flag is\n", - " set.\n", - " -g GEO_FOLDER, --geo-folder GEO_FOLDER\n", - " Optional: Specify a location to store processed GEO\n", - " files. Ignored unless 'processed' flag is\n", - " set.[Default: $GEODATA:]\n", - "\n", - "raw:\n", - " -x, --split-experiments\n", - " Split SRR runs into individual samples. By default,\n", - " SRX experiments with multiple SRR Runs will have a\n", - " single entry in the annotation table, with each run as\n", - " a separate row in the subannotation table. This\n", - " setting instead treats each run as a separate sample\n", - " -b BAM_FOLDER, --bam-folder BAM_FOLDER\n", - " Optional: Specify folder of bam files. Geofetch will\n", - " not download sra files when corresponding bam files\n", - " already exist. [Default: $SRABAM:]\n", - " -f FQ_FOLDER, --fq-folder FQ_FOLDER\n", - " Optional: Specify folder of fastq files. Geofetch will\n", - " not download sra files when corresponding fastq files\n", - " already exist. [Default: $SRAFQ:]\n", - " --use-key-subset Use just the keys defined in this module when writing\n", - " out metadata.\n" - ] - } - ], - "source": [ - "geofetch -h" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Calling geofetch will do 4 tasks: \n", - "\n", - "1. download all `.sra` files from `GSE#####` into your SRA folder (wherever you have configured `sratools` to stick data).\n", - "2. download all metadata from GEO and SRA and store in your metadata folder.\n", - "2. produce a PEP-compatible sample table, `PROJECT_NAME_annotation.csv`, in your metadata folder.\n", - "3. produce a PEP-compatible project configuration file, `PROJECT_NAME_config.yaml`, in your metadata folder.\n", - "\n", - "Complete details about geofetch outputs is cataloged in the [metadata outputs reference](metadata_output.md)." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Download the data\n", - "\n", - "First, create the metadata:" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Metadata folder: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/red_algae\n", - "Trying GSE67303 (not a file) as accession...\n", - "Skipped 0 accessions. Starting now.\n", - "\u001b[38;5;200mProcessing accession 1 of 1: 'GSE67303'\u001b[0m\n", - "--2022-07-08 12:39:24-- https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=gse&acc=GSE67303&form=text&view=full\n", - "Resolving www.ncbi.nlm.nih.gov (www.ncbi.nlm.nih.gov)... 2607:f220:41e:4290::110, 130.14.29.110\n", - "Connecting to www.ncbi.nlm.nih.gov (www.ncbi.nlm.nih.gov)|2607:f220:41e:4290::110|:443... connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: unspecified [geo/text]\n", - "Saving to: ‘/home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/red_algae/GSE67303_GSE.soft’\n", - "\n", - "/home/bnt4me/Virgin [ <=> ] 3.19K --.-KB/s in 0s \n", - "\n", - "2022-07-08 12:39:24 (134 MB/s) - ‘/home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/red_algae/GSE67303_GSE.soft’ saved [3266]\n", - "\n", - "--2022-07-08 12:39:24-- https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?targ=gsm&acc=GSE67303&form=text&view=full\n", - "Resolving www.ncbi.nlm.nih.gov (www.ncbi.nlm.nih.gov)... 2607:f220:41e:4290::110, 130.14.29.110\n", - "Connecting to www.ncbi.nlm.nih.gov (www.ncbi.nlm.nih.gov)|2607:f220:41e:4290::110|:443... connected.\n", - "HTTP request sent, awaiting response... 
200 OK\n", - "Length: unspecified [geo/text]\n", - "Saving to: ‘/home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/red_algae/GSE67303_GSM.soft’\n", - "\n", - "/home/bnt4me/Virgin [ <=> ] 10.70K --.-KB/s in 0.05s \n", - "\n", - "2022-07-08 12:39:24 (218 KB/s) - ‘/home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/red_algae/GSE67303_GSM.soft’ saved [10956]\n", - "\n", - "Processed 4 samples.\n", - "Found SRA Project accession: SRP056574\n", - "Downloading SRP056574 sra metadata\n", - "Parsing SRA file to download SRR records\n", - "sample_name does not exist, creating new...\n", - "Getting SRR: SRR1930183 (SRX969073)\n", - "Dry run (no raw data will be download)\n", - "sample_name does not exist, creating new...\n", - "Getting SRR: SRR1930184 (SRX969074)\n", - "Dry run (no raw data will be download)\n", - "sample_name does not exist, creating new...\n", - "Getting SRR: SRR1930185 (SRX969075)\n", - "Dry run (no raw data will be download)\n", - "sample_name does not exist, creating new...\n", - "Getting SRR: SRR1930186 (SRX969076)\n", - "Dry run (no raw data will be download)\n", - "Finished processing 1 accession(s)\n", - "Creating complete project annotation sheets and config file...\n", - "Sample annotation sheet: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/red_algae/GSE67303_annotation.csv\n", - "Writing: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/red_algae/GSE67303_annotation.csv\n", - " Config file: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/red_algae/GSE67303_config.yaml\n" - ] - } - ], - "source": [ - "geofetch -i GSE67303 -n red_algae -m `pwd` --just-metadata" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `-m` parameter specifies to use the current directory, storing the data according to the name (`-n`) parameter. So, we'll now have a `red_alga` subfolder, where the results will be saved. 
Inside that folder you'll see the output of the command:" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "GSE67303_annotation.csv GSE67303_GSE.soft GSE67303_SRA.csv\n", - "GSE67303_config.yaml GSE67303_GSM.soft\n" - ] - } - ], - "source": [ - "ls red_algae" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `.soft` files are the direct output from GEO, which contain all the metadata as stored by GEO, for both the experiment (`_GSE`) and for the individual samples (`_GSM`). Geofetch also produces a `csv` file with the SRA metadata. The filtered version (ending in `_filt`) would contain only the specified subset of the samples if we didn't request them all, but in this case, since we only gave an accession, it is identical to the complete file.\n", - "\n", - "Finally, there are the 2 files that make up the PEP: the `_config.yaml` file and the `_annotation.csv` file. Let's see what's in these files now." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "# Autogenerated by geofetch\n", - "\n", - "name: GSE67303\n", - "pep_version: 2.1.0\n", - "sample_table: GSE67303_annotation.csv\n", - "subsample_table: null\n", - "\n", - "looper:\n", - " output_dir: GSE67303\n", - " pipeline_interfaces: {pipeline_interfaces}\n", - "\n", - "sample_modifiers:\n", - " append:\n", - " Sample_growth_protocol_ch1: Cyanidioschyzon merolae cells were grown in 2xMA media\n", - " Sample_data_processing: Supplementary_files_format_and_content: Excel spreadsheet includes FPKM values for Darkness and Blue-Light exposed samples with p and q values of cuffdiff output.\n", - " Sample_extract_protocol_ch1: RNA libraries were prepared for sequencing using standard Illumina protocols\n", - " Sample_treatment_protocol_ch1: Cells were exposed to blue-light (15 µmole m-2s-1) for 30 minutes\n", - " SRR_files: SRA\n", - " \n", - " derive:\n", - " attributes: [read1, read2, SRR_files]\n", - " sources:\n", - " SRA: \"${SRABAM}/{SRR}.bam\"\n", - " FQ: \"${SRAFQ}/{SRR}.fastq.gz\"\n", - " FQ1: \"${SRAFQ}/{SRR}_1.fastq.gz\"\n", - " FQ2: \"${SRAFQ}/{SRR}_2.fastq.gz\" \n", - " imply:\n", - " - if: \n", - " organism: \"Mus musculus\"\n", - " then:\n", - " genome: mm10\n", - " - if: \n", - " organism: \"Homo sapiens\"\n", - " then:\n", - " genome: hg38 \n", - " - if: \n", - " read_type: \"PAIRED\"\n", - " then:\n", - " read1: FQ1\n", - " read2: FQ2 \n", - " - if: \n", - " read_type: \"SINGLE\"\n", - " then:\n", - " read1: FQ1\n", - "\n", - "project_modifiers:\n", - " amend:\n", - " sra_convert:\n", - " looper:\n", - " results_subdir: sra_convert_results\n", - " sample_modifiers:\n", - " append:\n", - " SRR_files: SRA\n", - " pipeline_interfaces: ${CODE}/geofetch/pipeline_interface_convert.yaml\n", - " derive:\n", - " attributes: [read1, read2, SRR_files]\n", - " sources:\n", - " SRA: 
\"${SRARAW}/{SRR}.sra\"\n", - " FQ: \"${SRAFQ}/{SRR}.fastq.gz\"\n", - " FQ1: \"${SRAFQ}/{SRR}_1.fastq.gz\"\n", - " FQ2: \"${SRAFQ}/{SRR}_2.fastq.gz\"\n", - "\n", - "\n", - "\n" - ] - } - ], - "source": [ - "cat red_algae/GSE67303_config.yaml" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "There are two important things to note in his file: First, see in the PEP that `sample_table` points to the csv file produced by geofetch. Second, look at the amendment called `sra_convert`. This adds a pipeline interface to the sra conversion pipeline, and adds derived attributes for SRA files and fastq files that rely on environment variables called `$SRARAW` and `$SRAFQ`. These environment variables should point to folders where you store your raw .sra files and the converted fastq files.\n", - "\n", - "Now let's look at the first 100 characters of the csv file:" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "sample_name,protocol,organism,read_type,data_source,SRR,SRX,Sample_title,Sample_geo_accession,Sample\n", - "Cm_BlueLight_Rep1,cDNA,Cyanidioschyzon merolae strain 10D,PAIRED,SRA,SRR1930183,SRX969073,Cm_BlueLig\n", - "Cm_BlueLight_Rep2,cDNA,Cyanidioschyzon merolae strain 10D,PAIRED,SRA,SRR1930184,SRX969074,Cm_BlueLig\n", - "Cm_Darkness_Rep1,cDNA,Cyanidioschyzon merolae strain 10D,PAIRED,SRA,SRR1930185,SRX969075,Cm_Darkness\n", - "Cm_Darkness_Rep2,cDNA,Cyanidioschyzon merolae strain 10D,PAIRED,SRA,SRR1930186,SRX969076,Cm_Darkness\n" - ] - } - ], - "source": [ - "cut -c -100 red_algae/GSE67303_annotation.csv" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now let's download the actual data." 
- ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Metadata folder: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/red_algae\n", - "Trying GSE67303 (not a file) as accession...\n", - "Skipped 0 accessions. Starting now.\n", - "\u001b[38;5;200mProcessing accession 1 of 1: 'GSE67303'\u001b[0m\n", - "Found previous GSE file: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/red_algae/GSE67303_GSE.soft\n", - "Found previous GSM file: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/red_algae/GSE67303_GSM.soft\n", - "Processed 4 samples.\n", - "Found SRA Project accession: SRP056574\n", - "Found SRA metadata, opening..\n", - "Parsing SRA file to download SRR records\n", - "sample_name does not exist, creating new...\n", - "Getting SRR: SRR1930183 (SRX969073)\n", - "\n", - "2022-07-08T16:40:20 prefetch.2.11.2: Current preference is set to retrieve SRA Normalized Format files with full base quality scores.\n", - "2022-07-08T16:40:20 prefetch.2.11.2: 1) Downloading 'SRR1930183'...\n", - "2022-07-08T16:40:20 prefetch.2.11.2: SRA Normalized Format file is being retrieved, if this is different from your preference, it may be due to current file availability.\n", - "2022-07-08T16:40:20 prefetch.2.11.2: Downloading via HTTPS...\n", - "2022-07-08T16:41:28 prefetch.2.11.2: HTTPS download succeed\n", - "2022-07-08T16:41:28 prefetch.2.11.2: 'SRR1930183' is valid\n", - "2022-07-08T16:41:28 prefetch.2.11.2: 1) 'SRR1930183' was downloaded successfully\n", - "2022-07-08T16:41:28 prefetch.2.11.2: 'SRR1930183' has 0 unresolved dependencies\n", - "sample_name does not exist, creating new...\n", - "Getting SRR: SRR1930184 (SRX969074)\n", - "\n", - "2022-07-08T16:41:39 prefetch.2.11.2: Current preference is set to retrieve SRA Normalized Format files with full base quality scores.\n", - "2022-07-08T16:41:40 prefetch.2.11.2: 1) Downloading 'SRR1930184'...\n", - 
"2022-07-08T16:41:40 prefetch.2.11.2: SRA Normalized Format file is being retrieved, if this is different from your preference, it may be due to current file availability.\n", - "2022-07-08T16:41:40 prefetch.2.11.2: Downloading via HTTPS...\n", - "2022-07-08T16:42:43 prefetch.2.11.2: HTTPS download succeed\n", - "2022-07-08T16:42:43 prefetch.2.11.2: 'SRR1930184' is valid\n", - "2022-07-08T16:42:43 prefetch.2.11.2: 1) 'SRR1930184' was downloaded successfully\n", - "2022-07-08T16:42:43 prefetch.2.11.2: 'SRR1930184' has 0 unresolved dependencies\n", - "sample_name does not exist, creating new...\n", - "Getting SRR: SRR1930185 (SRX969075)\n", - "\n", - "2022-07-08T16:42:54 prefetch.2.11.2: Current preference is set to retrieve SRA Normalized Format files with full base quality scores.\n", - "2022-07-08T16:42:55 prefetch.2.11.2: 1) Downloading 'SRR1930185'...\n", - "2022-07-08T16:42:55 prefetch.2.11.2: SRA Normalized Format file is being retrieved, if this is different from your preference, it may be due to current file availability.\n", - "2022-07-08T16:42:55 prefetch.2.11.2: Downloading via HTTPS...\n", - "2022-07-08T16:45:00 prefetch.2.11.2: HTTPS download succeed\n", - "2022-07-08T16:45:00 prefetch.2.11.2: 'SRR1930185' is valid\n", - "2022-07-08T16:45:00 prefetch.2.11.2: 1) 'SRR1930185' was downloaded successfully\n", - "2022-07-08T16:45:00 prefetch.2.11.2: 'SRR1930185' has 0 unresolved dependencies\n", - "sample_name does not exist, creating new...\n", - "Getting SRR: SRR1930186 (SRX969076)\n", - "\n", - "2022-07-08T16:45:11 prefetch.2.11.2: Current preference is set to retrieve SRA Normalized Format files with full base quality scores.\n", - "2022-07-08T16:45:12 prefetch.2.11.2: 1) Downloading 'SRR1930186'...\n", - "2022-07-08T16:45:12 prefetch.2.11.2: SRA Normalized Format file is being retrieved, if this is different from your preference, it may be due to current file availability.\n", - "2022-07-08T16:45:12 prefetch.2.11.2: Downloading via HTTPS...\n", - 
"2022-07-08T16:46:49 prefetch.2.11.2: HTTPS download succeed\n", - "2022-07-08T16:46:49 prefetch.2.11.2: 'SRR1930186' is valid\n", - "2022-07-08T16:46:49 prefetch.2.11.2: 1) 'SRR1930186' was downloaded successfully\n", - "2022-07-08T16:46:49 prefetch.2.11.2: 'SRR1930186' has 0 unresolved dependencies\n", - "Finished processing 1 accession(s)\n", - "Creating complete project annotation sheets and config file...\n", - "Sample annotation sheet: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/red_algae/GSE67303_annotation.csv\n", - "Writing: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/red_algae/GSE67303_annotation.csv\n", - " Config file: /home/bnt4me/Virginia/repos/geof2/geofetch/docs_jupyter/red_algae/GSE67303_config.yaml\n" - ] - } - ], - "source": [ - "geofetch -i GSE67303 -n red_algae -m `pwd`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "## Finalize the project config and sample annotation\n", - "\n", - "That's basically it! `geofetch` will have produced a general-purpose PEP for you, but you'll need to modify it for whatever purpose you have. For example, one common thing is to link to the pipeline you want to use by adding a `pipeline_interface` to the project config file. You may also need to adjust the `sample_annotation` file to make sure you have the right column names and attributes needed by the pipeline you're using. GEO submitters are notoriously bad at getting the metadata correct.\n", - "\n", - "\n", - "## Selecting samples to download.\n", - "\n", - "By default, `geofetch` downloads all the data for one accession of interest. 
If you need more fine-grained control, either because you have multiple accessions or you need a subset of samples within them, you can use the [file-based sample specification](file-specification.md).\n", - "\n", - "\n", - "## Tips\n", - "\n", - "* Set an environment variable for `$SRABAM` (where `.bam` files will live), and `geofetch` will check to see if you have an already-converted bamfile there before issuing the command to download the `sra` file. In this way, you can delete old `sra` files after conversion and not have to worry about re-downloading them. \n", - "\n", - "* The config template uses an environment variable `$SRARAW` for where `.sra` files will live. If you set this variable to the same place you instructed `sratoolkit` to download `sra` files, you won't have to tweak the config file. For more information refer to the [`sratools` page](howto-location.md).\n", - "\n", - "You can find a complete example of [using `geofetch` for RNA-seq data](https://github.com/databio/example-projects/tree/master/rna-seq). \n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Bash", - "language": "bash", - "name": "bash" - }, - "language_info": { - "codemirror_mode": "shell", - "file_extension": ".sh", - "mimetype": "text/x-sh", - "name": "bash" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/update_usage_docs.sh b/update_usage_docs.sh deleted file mode 100755 index 924e8fb..0000000 --- a/update_usage_docs.sh +++ /dev/null @@ -1,27 +0,0 @@ -# Run this script to build and deploy the mkdocs docs in /docs - -JUPYTER_SOURCE="" -JUPYTER_BUILD="" -AUTODOC_MODULES=() -AUTODOC_BUILD="" -USAGE_TEMPLATE="docs/usage_template.md" -USAGE_CMDS=("geofetch --help") - - -# Build an auto-usage page in markdown -if [ ! 
-z "$USAGE_CMDS" ] -then - cp $USAGE_TEMPLATE usage_template.md - for cmd in "$USAGE_CMDS"; do - echo $cmd - echo -e "\n\`$cmd\`" >> usage_template.md - echo -e '```{console}' >> usage_template.md - $cmd >> usage_template.md 2>&1 - echo -e '```' >> usage_template.md - done - mv usage_template.md docs/usage.md - cat docs/usage.md -else - echo "No USAGE_CMDS provided." -fi -