Bump to v0.2.0, to include bugfixes and additions

BioinfoMachineLearning · Jul 5, 2024 · 8221fe2 · 8221fe2
1 parent c67f41a
commit 8221fe2
Show file tree

Hide file tree

Showing 41 changed files with 450 additions and 100 deletions.
diff --git a/.gitignore b/.gitignore
@@ -173,6 +173,7 @@ configs/local/default.yaml
 /forks/FABind/FABind/
 /forks/NeuralPLexer/NeuralPLexer/
 /forks/NeuralPLexer/**/neuralplexermodels*
+/forks/P2Rank/
 /forks/*/inference*/
 /forks/RoseTTAFold-All-Atom/blast-2.2.26
 /forks/RoseTTAFold-All-Atom/rf2aa/config/inference/*_rfaa_inference.yaml

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,12 @@
+### 0.2.0 - 07/04/2024
+
+- Added P2Rank as a new binding site prediction method available to use with AutoDock-Vina
+- Added OpenJDK to the `PoseBench` Conda environment to enable P2Rank inference
+- Added a script to benchmark the required compute resources for each baseline method
+- Updated citation
+- Corrected directory navigation instructions (i.e., `cd` references) in `README.md` to reflect the directory structure of each Zenodo archive file
+- Corrected Biopython, NumPy, and ProDy versions in the DiffDock Conda environment to avoid GCC compilation errors
+
 ### 0.1.0 - 06/08/2024
 
 - First public release
diff --git a/README.md b/README.md
@@ -85,6 +85,10 @@ cd forks/RoseTTAFold-All-Atom/rf2aa/SE3Transformer/ && pip3 install --no-cache-d
 # - AutoDock Vina Tools environment (~1 GB)
 mamba env create -f environments/adfr_environment.yaml --prefix forks/Vina/ADFR/
 conda activate forks/Vina/ADFR/  # NOTE: one still needs to use `conda` to (de)activate environments
+# - P2Rank (~0.5 GB)
+wget -P forks/P2Rank/ https://github.com/rdk/p2rank/releases/download/2.4.2/p2rank_2.4.2.tar.gz
+tar -xzf forks/P2Rank/p2rank_2.4.2.tar.gz -C forks/P2Rank/
+rm forks/P2Rank/p2rank_2.4.2.tar.gz
 ```
 
 Download checkpoints (~8.25 GB total)
@@ -132,7 +136,6 @@ of how to extend `PoseBench`, as outlined below.
 
 ```bash
 # fetch, extract, and clean-up preprocessed Astex Diverse, PoseBusters Benchmark, DockGen, and CASP15 data (~3 GB) #
-cd data/
 wget https://zenodo.org/records/11477766/files/astex_diverse_set.tar.gz
 wget https://zenodo.org/records/11477766/files/posebusters_benchmark_set.tar.gz
 wget https://zenodo.org/records/11477766/files/dockgen_set.tar.gz
@@ -145,50 +148,41 @@ rm astex_diverse_set.tar.gz
 rm posebusters_benchmark_set.tar.gz
 rm dockgen_set.tar.gz
 rm casp15_set.tar.gz
-cd ../
 ```
 
 ### Downloading benchmark method predictions
 
 ```bash
 # fetch, extract, and clean-up benchmark method predictions to reproduce paper results (~19 GB) #
 # DiffDock predictions and results
-cd forks/DiffDock/
 wget https://zenodo.org/records/11477766/files/diffdock_benchmark_method_predictions.tar.gz
 tar -xzf diffdock_benchmark_method_predictions.tar.gz
 rm diffdock_benchmark_method_predictions.tar.gz
 # FABind predictions and results
-cd forks/FABind/
 wget https://zenodo.org/records/11477766/files/fabind_benchmark_method_predictions.tar.gz
 tar -xzf fabind_benchmark_method_predictions.tar.gz
 rm fabind_benchmark_method_predictions.tar.gz
 # DynamicBind predictions and results
-cd forks/DynamicBind/
 wget https://zenodo.org/records/11477766/files/dynamicbind_benchmark_method_predictions.tar.gz
 tar -xzf dynamicbind_benchmark_method_predictions.tar.gz
 rm dynamicbind_benchmark_method_predictions.tar.gz
 # NeuralPLexer predictions and results
-cd forks/NeuralPLexer/
 wget https://zenodo.org/records/11477766/files/neuralplexer_benchmark_method_predictions.tar.gz
 tar -xzf neuralplexer_benchmark_method_predictions.tar.gz
 rm neuralplexer_benchmark_method_predictions.tar.gz
 # RoseTTAFold-All-Atom predictions and results
-cd forks/RoseTTAFold-All-Atom/
 wget https://zenodo.org/records/11477766/files/rfaa_benchmark_method_predictions.tar.gz
 tar -xzf rfaa_benchmark_method_predictions.tar.gz
 rm rfaa_benchmark_method_predictions.tar.gz
 # TULIP predictions and results
-cd forks/TULIP/
 wget https://zenodo.org/records/11477766/files/tulip_benchmark_method_predictions.tar.gz
 tar -xzf tulip_benchmark_method_predictions.tar.gz
 rm tulip_benchmark_method_predictions.tar.gz
 # AutoDock Vina predictions and results
-cd forks/Vina/
 wget https://zenodo.org/records/11477766/files/vina_benchmark_method_predictions.tar.gz
 tar -xzf vina_benchmark_method_predictions.tar.gz
 rm vina_benchmark_method_predictions.tar.gz
 # Astex Diverse, PoseBusters Benchmark (w/ pocket-only results), DockGen, and CASP15 consensus ensemble predictions and results
-cd data/test_cases/
 wget https://zenodo.org/records/11477766/files/astex_diverse_ensemble_benchmark_method_predictions.tar.gz
 wget https://zenodo.org/records/11477766/files/posebusters_benchmark_ensemble_benchmark_method_predictions.tar.gz
 wget https://zenodo.org/records/11477766/files/dockgen_ensemble_benchmark_method_predictions.tar.gz
@@ -892,7 +886,8 @@ If you use the code or benchmark method predictions associated with this reposit
   title={Deep Learning for Protein-Ligand Docking: Are We There Yet?},
   author={Morehead, Alex and Giri, Nabin and Liu, Jian and Cheng, Jianlin},
   booktitle={ICML AI4Science Workshop},
-  year={2024}
+  year={2024},
+  note={selected as a spotlight presentation},
 }
 ```
 

diff --git a/citation.bib b/citation.bib
@@ -2,5 +2,6 @@ @inproceedings{morehead2024posebench
   title={Deep Learning for Protein-Ligand Docking: Are We There Yet?},
   author={Morehead, Alex and Giri, Nabin and Liu, Jian and Cheng, Jianlin},
   booktitle={ICML AI4Science Workshop},
-  year={2024}
+  year={2024},
+  note={selected as a spotlight presentation},
 }
diff --git a/configs/analysis/inference_analysis.yaml b/configs/analysis/inference_analysis.yaml
@@ -1,6 +1,6 @@
 full_report: true # whether to generate a full PoseBusters report (i.e. with all metrics) or a summary report (i.e. with only the most important metrics)
 method: diffdock # the method for which to score predictions - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`, `vina`, `ensemble`)
-vina_binding_site_method: diffdock # the method to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`)
+vina_binding_site_method: diffdock # the method to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`, `p2rank`)
 dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
 ensemble_ranking_method: consensus # the method with which to rank-order and select the top ensemble prediction for each target - NOTE: must be one of (`consensus`, `ff`)
 input_csv_path: ${resolve_method_input_csv_path:${method},${dataset}} # the input CSV filepath with which to run inference

diff --git a/configs/model/diffdock_inference.yaml b/configs/model/diffdock_inference.yaml
@@ -14,3 +14,4 @@ actual_steps: 19 # the actual number of inference steps to run (i.e., after how
 no_final_step_noise: true # whether to disable the final inference step's noise from being added
 repeat_index: 1 # the repeat index to use for inference
 skip_existing: true # whether to skip inference for existing output directories
+max_num_inputs: null # if provided, the number of (dataset subset) inputs over which to run inference
diff --git a/configs/model/dynamicbind_inference.yaml b/configs/model/dynamicbind_inference.yaml
@@ -13,3 +13,4 @@ num_workers: 1 # the number of workers to use for native relaxation during infer
 skip_existing: true # whether to skip existing predictions
 repeat_index: 1 # the repeat index to use for inference
 pocket_only_baseline: false # whether to run the pocket-only baseline
+max_num_inputs: null # if provided, the number of (dataset subset) inputs over which to run inference
diff --git a/configs/model/fabind_inference.yaml b/configs/model/fabind_inference.yaml
@@ -11,3 +11,4 @@ ckpt_path: ${oc.env:PROJECT_ROOT}/forks/FABind/ckpt/best_model.bin # the checkpo
 output_dir: ${oc.env:PROJECT_ROOT}/forks/FABind/inference/fabind_${dataset}_output_${repeat_index} # the output directory to which to save the inference results
 repeat_index: 1 # the repeat index to use for inference
 pocket_only_baseline: false # whether to run the pocket-only baseline
+max_num_inputs: null # if provided, the number of (dataset subset) inputs over which to run inference
diff --git a/configs/model/inference_relaxation.yaml b/configs/model/inference_relaxation.yaml
@@ -1,5 +1,5 @@
 method: diffdock # the method for which to relax predictions - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`, `vina`, `tulip`)
-vina_binding_site_method: diffdock # the method to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`)
+vina_binding_site_method: diffdock # the method to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`, `p2rank`)
 dataset: posebusters_benchmark # the dataset for which to relax predictions - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
 ensemble_ranking_method: consensus # the method with which to rank-order and select the top ensemble prediction for each target - NOTE: must be one of (`consensus`, `ff`)
 num_processes: 1 # the number of parallel processes to use for relaxation

diff --git a/configs/model/neuralplexer_inference.yaml b/configs/model/neuralplexer_inference.yaml
@@ -26,3 +26,4 @@ rank_outputs_by_confidence: true # whether to rank the output conformations, by
 plddt_ranking_type: ligand # the type of plDDT ranking to apply to generated samples - NOTE: must be one of (`protein`, `ligand`, `protein_ligand`)
 csv_path: null # the CSV filepath from which to parse benchmarking input data
 repeat_index: 1 # the repeat index to use for inference
+max_num_inputs: null # if provided, the number of (dataset subset) inputs over which to run inference
diff --git a/configs/model/rfaa_inference.yaml b/configs/model/rfaa_inference.yaml
@@ -11,3 +11,4 @@ inference_config_name: null # the name of the inference config to use - NOTE: if
 inference_dir_name: null # the name of the inference output directory to use
 repeat_index: 1 # the repeat index to use for inference
 skip_existing: true # whether to skip running inference if the prediction for a target already exists
+max_num_inputs: null # if provided, the number of (dataset subset) inputs over which to run inference
diff --git a/configs/model/vina_inference.yaml b/configs/model/vina_inference.yaml
@@ -1,7 +1,8 @@
 dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
-method: diffdock # the method from which to use binding site predictions - NOTE: must be one of (`diffdock`, `dynamicbind`, `neuralplexer`, `ensemble`)
+method: diffdock # the method from which to use binding site predictions - NOTE: must be one of (`diffdock`, `dynamicbind`, `neuralplexer`, `p2rank`, `ensemble`), where `ensemble` does not include `p2rank`
 ensemble_ranking_method: consensus # the method with which to rank-order and select the top ensemble prediction for each target - NOTE: must be one of (`consensus`, `ff`)
 python2_exec_path: ${oc.env:PROJECT_ROOT}/forks/Vina/ADFR/bin/python # the path to the Python 2 executable
+p2rank_exec_path: ${oc.env:PROJECT_ROOT}/forks/P2Rank/p2rank_2.4.2/prank # the path to the P2Rank executable
 prepare_receptor_script_path: ${oc.env:PROJECT_ROOT}/forks/Vina/ADFR/CCSBpckgs/AutoDockTools/Utilities24/prepare_receptor4.py # the path to the prepare_receptor.py script
 input_dir: ${resolve_method_output_dir:${method},${dataset},${method},${ensemble_ranking_method},${repeat_index}} # the input directory with which to run inference
 input_protein_structure_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set/${dataset}_holo_aligned_esmfold_structures # the input protein structure directory to parse
@@ -23,3 +24,7 @@ apo_protein_filepath: null # the apo protein file path to use for inference
 input_id: null # the input ID to use for inference
 repeat_index: 1 # the repeat index to use for inference
 pocket_only_baseline: false # whether to run the pocket-only baseline
+p2rank_exec_utility: predict # the P2Rank executable utility to use for inference
+p2rank_config: alphafold # the P2Rank configuration to use for inference
+p2rank_enable_pymol_visualizations: false # whether to enable P2Rank's PyMOL visualizations
+max_num_inputs: null # if provided, the number of (dataset subset) inputs over which to run inference
diff --git a/configs/scripts/benchmark_baseline_compute_resources.yaml b/configs/scripts/benchmark_baseline_compute_resources.yaml
@@ -0,0 +1,6 @@
+method: diffdock # the method for which to benchmark compute resource usage - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`, `vina`, `ensemble`)
+vina_binding_site_method: diffdock # the method to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `rfaa`, `p2rank`)
+dataset: astex_diverse # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
+repeat_index: 1 # the repeat index which was used for inference
+max_num_inputs: 20 # the number of (dataset subset) inputs over which to benchmark each baseline method's compute resource usage
+cuda_device_index: 0 # the CUDA device index to use for inference (for all methods except AutoDock-Vina)
diff --git a/docs/source/acknowledgements.rst b/docs/source/acknowledgements.rst
@@ -2,5 +2,5 @@ Acknowledgements
 ================
 
 .. mdinclude:: ../../README.md
-    :start-line: 868
-    :end-line: 885
+    :start-line: 862
+    :end-line: 879
diff --git a/docs/source/available_methods.rst b/docs/source/available_methods.rst
@@ -2,8 +2,8 @@ Available inference methods
 ================
 
 .. mdinclude:: ../../README.md
-    :start-line: 279
-    :end-line: 316
+    :start-line: 273
+    :end-line: 310
 
 .. note::
     Have a new method to add? Please let us know by creating a pull request. We would be happy to work with you to integrate new methodology into this benchmark!
diff --git a/docs/source/bonus.rst b/docs/source/bonus.rst
@@ -2,8 +2,8 @@ Bonus
 ================
 
 .. mdinclude:: ../../README.md
-    :start-line: 902
-    :end-line: 904
+    :start-line: 897
+    :end-line: 899
 
 .. image:: ./_static/WorkBench.jpeg
   :alt: My brain after building PoseBench

diff --git a/docs/source/citing_this_work.rst b/docs/source/citing_this_work.rst
@@ -2,5 +2,5 @@ Citing this work
 ================
 
 .. mdinclude:: ../../README.md
-    :start-line: 887
-    :end-line: 898
+    :start-line: 881
+    :end-line: 893
diff --git a/docs/source/comparative_plots.rst b/docs/source/comparative_plots.rst
@@ -2,5 +2,5 @@ How to create comparative plots of inference results
 ================
 
 .. mdinclude:: ../../README.md
-    :start-line: 815
-    :end-line: 824
+    :start-line: 809
+    :end-line: 818
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -9,7 +9,7 @@
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
 project = "PoseBench"
 author = "Alex Morehead"
-release = "0.1.0"
+release = "0.2.0"
 copyright = f"{datetime.datetime.now().year}, {author}"
 
 # -- General configuration ---------------------------------------------------

diff --git a/docs/source/data_preparation.rst b/docs/source/data_preparation.rst
@@ -2,5 +2,5 @@ How to prepare `PoseBench` data
 ================
 
 .. mdinclude:: ../../README.md
-    :start-line: 130
-    :end-line: 273
+    :start-line: 134
+    :end-line: 267
diff --git a/docs/source/ensemble_inference.rst b/docs/source/ensemble_inference.rst
@@ -2,8 +2,8 @@ How to run inference with a method ensemble
 ================
 
 .. mdinclude:: ../../README.md
-    :start-line: 756
-    :end-line: 807
+    :start-line: 750
+    :end-line: 801
 
 .. note::
     In addition to having `consensus` as an available value for `ensemble_ranking_method`, one can also set `ensemble_ranking_method=ff` to have the method ensemble's top-ranked predictions selected using the criterion of "minimum (molecular dynamics) force field energy" (albeit while incurring a very large runtime complexity).
diff --git a/docs/source/for_developers.rst b/docs/source/for_developers.rst
@@ -2,5 +2,5 @@ For developers
 ================
 
 .. mdinclude:: ../../README.md
-    :start-line: 830
-    :end-line: 864
+    :start-line: 824
+    :end-line: 858
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
@@ -3,4 +3,4 @@ Installation
 
 .. mdinclude:: ../../README.md
     :start-line: 40
-    :end-line: 112
+    :end-line: 116
diff --git a/docs/source/method_inference.rst b/docs/source/method_inference.rst
@@ -2,5 +2,5 @@ How to run inference with individual methods
 ================
 
 .. mdinclude:: ../../README.md
-    :start-line: 324
-    :end-line: 750
+    :start-line: 318
+    :end-line: 744
diff --git a/docs/source/tutorials.rst b/docs/source/tutorials.rst
@@ -2,5 +2,5 @@ Tutorials
 ================
 
 .. mdinclude:: ../../README.md
-    :start-line: 118
-    :end-line: 124
+    :start-line: 122
+    :end-line: 128
diff --git a/environments/diffdock_environment.yaml b/environments/diffdock_environment.yaml
@@ -215,7 +215,7 @@ dependencies:
       - autopage==0.5.1
       - backcall==0.2.0
       - biopandas==0.5.1.dev0
-      - biopython==1.76
+      - biopython==1.79
       - bioservices==1.11.2
       - boto3==1.28.66
       - botocore==1.31.66
@@ -282,7 +282,7 @@ dependencies:
       - natsort==8.4.0
       - networkx==2.8.7
       - ninja==1.11.1
-      - numpy==1.23.0
+      - numpy==1.23.5
       - omegaconf==2.3.0
       - git+https://github.com/amorehead/openfold.git@fe1275099639bf7e617e09ef24d6af778647dd64
       - opt-einsum==3.3.0
@@ -296,7 +296,7 @@ dependencies:
       - pickleshare==0.7.5
       - plotly==5.17.0
       - prettytable==3.8.0
-      - prody==2.2.0
+      - prody==2.4.1
       - prompt-toolkit==3.0.39
       - protobuf==4.24.1
       - pure-eval==0.2.2

diff --git a/environments/posebench_environment.yaml b/environments/posebench_environment.yaml
@@ -212,6 +212,7 @@ dependencies:
   - openff-units=0.2.2=pyhca7485f_0
   - openff-utilities=0.1.12=pyhd8ed1ab_0
   - openh264=2.1.1=h780b84a_0
+  - openjdk=11.0.1=h600c080_1018
   - openjpeg=2.5.0=h7d73246_0
   - openmm=8.1.1=py310h43b6314_1
   - openmmforcefields=0.12.0=pyhd8ed1ab_0