
Commit

Updated main module name to ddcuimap. Revised requirements and pyproject.toml. Other minor changes.
Kevin Armengol committed May 13, 2023
1 parent 5ace9bd commit 38818ba
Showing 62 changed files with 216 additions and 235 deletions.
12 changes: 6 additions & 6 deletions .gitignore
@@ -1,12 +1,12 @@
# User-specific stuff
-.idea/**/workspace.xml
-.idea/**/tasks.xml
-.idea/**/usage.statistics.xml
-.env
.idea/**/dictionaries
.idea/**/shelf
-/notebooks/.ipynb_checkpoints
+.idea/**/tasks.xml
+.idea/**/usage.statistics.xml
+.idea/**/workspace.xml
/.idea/
+.env
/dist/
-__pycache__/
/docs
+/notebooks/.ipynb_checkpoints
+__pycache__/
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -11,7 +11,7 @@ repos:
- id: end-of-file-fixer
- id: check-yaml
- id: check-added-large-files
-  exclude: data_dictionary_cui_mapping/semantic_search/resources/dict_umls_upsert_ids.pkl
+  exclude: ddcuimap/semantic_search/resources/dict_umls_upsert_ids.pkl
- id: debug-statements

#- repo: https://github.com/PyCQA/flake8
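The large-file hook's `exclude` now points at the renamed package path. A quick hedged way to confirm the hook still passes (assumes pre-commit is installed; `check-added-large-files` is the hook id shown above):

```bash
pre-commit run check-added-large-files --all-files
```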
13 changes: 7 additions & 6 deletions README.md
@@ -30,7 +30,7 @@ Below is a sample data dictionary format that can be used as input for this pack
In order to run and customize these pipelines, you will need to create/edit yaml configuration files located in configs. Run configurations are saved and can be reloaded.

```bash
-├───data_dictionary_cui_mapping
+├───ddcuimap
│ ├───configs
│ │ │ config.yaml
│ │ │ __init__.py
@@ -54,18 +54,19 @@ In order to run and customize these pipelines, you will need to create/edit yaml
## UMLS API and MetaMap Batch Queries

#### Import modules

```python
# import batch_query_pipeline modules from metamap OR umls package
-from data_dictionary_cui_mapping.metamap import batch_query_pipeline as mm_bqp
-from data_dictionary_cui_mapping.umls import batch_query_pipeline as umls_bqp
+from ddcuimap.metamap import batch_query_pipeline as mm_bqp
+from ddcuimap.umls import batch_query_pipeline as umls_bqp

# import helper functions for loading, viewing, composing configurations for pipeline run
-from data_dictionary_cui_mapping.utils import helper
+from ddcuimap.utils import helper
from omegaconf import OmegaConf

# import modules to create data dictionary with curated CUIs and check the file for missing mappings
-from data_dictionary_cui_mapping.curation import create_dictionary_import_file
-from data_dictionary_cui_mapping.curation import check_cuis
+from ddcuimap.curation import create_dictionary_import_file
+from ddcuimap.curation import check_cuis
```
#### Load/edit configuration files
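The code under this heading is truncated in the diff view. As a hedged sketch only — using the `helper` and `OmegaConf` imports above, with override names borrowed from call sites elsewhere in this commit — loading and editing a configuration might look like:

```python
# Sketch, not the README's exact contents: compose a run configuration,
# inspect it, and override a value before running a pipeline.
cfg = helper.compose_config.fn(overrides=["custom=de", "apis=config_umls_api"])
print(OmegaConf.to_yaml(cfg))  # view the merged configuration
cfg.custom.settings.pipeline_name = "umls_api_run"  # hypothetical edit
```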
1 change: 0 additions & 1 deletion data_dictionary_cui_mapping/metamap/__init__.py

This file was deleted.

File renamed without changes.
@@ -1,4 +1,4 @@
-from data_dictionary_cui_mapping.utils.helper import (
+from ddcuimap.utils.helper import (
load_config,
save_config,
compose_config,
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -52,7 +52,7 @@ curation_settings:
'title_extracted_tokens', 'definition_extracted_tokens']
order_cols_curation: ['variable name', 'title', 'definition', 'permissible values',
'permissible value descriptions', 'preferred question text',
-'pipeline_name', 'search_ID', 'query_term_1',
+'pipeline_name', 'pipeline_name_alpha', 'search_ID', 'query_term_1',
'query_term_stopwords_removed_1', 'query_term_2',
'query_term_stopwords_removed_2', 'query_term_used',
'query_term_used_col', 'searchType', 'MetaMap_input', 'PMID',
@@ -68,8 +68,7 @@ curation_settings:
'definition_extracted_sparse_vecs_idx2token', 'title_extracted_tokens',
'definition_extracted_tokens', 'metadata', 'result_id', 'semantic_type',
'definition_source', 'overall_count', 'average_score', 'title_str_rank',
-'title_str_score', 'title_def_rank', 'title_def_score',
-'definition_str_rank', 'definition_str_score', 'definition_def_rank',
+'title_str_score', 'definition_def_rank',
'definition_def_score', 'keep']
format_cols_curation: # TODO: pipeline name, overall rank, score, etc.

File renamed without changes.
File renamed without changes.
@@ -87,17 +87,17 @@ query:
- definition_extracted
include_columns:
- 'preferred question text'
-  alpha: 0.5 # [1.0, 0.75, 0.5, 0.25, 0.0]
+  alpha: [1.0, 0.0] # [1.0, 0.75, 0.5, 0.25, 0.0]
namespace: [ 'STR', 'DEF' ]
top_k: 20
queries:
dense:
title_str: [ 'title_extracted_dense_vecs', 'STR' ]
-title_def: [ 'title_extracted_dense_vecs', 'DEF' ]
-definition_str: [ 'definition_extracted_dense_vecs', 'STR' ]
+# title_def: [ 'title_extracted_dense_vecs', 'DEF' ]
+# definition_str: [ 'definition_extracted_dense_vecs', 'STR' ]
definition_def: [ 'definition_extracted_dense_vecs', 'DEF' ]
hybrid:
title_str: [ 'title_extracted', 'STR' ]
-title_def: [ 'title_extracted', 'DEF' ]
-definition_str: [ 'definition_extracted', 'STR' ]
+# title_def: [ 'title_extracted', 'DEF' ]
+# definition_str: [ 'definition_extracted', 'STR' ]
definition_def: [ 'definition_extracted', 'DEF' ]
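Changing `alpha` from a scalar to a list lets a single batch sweep several dense/sparse weightings. The convex-combination convention sketched below is an assumption based on common hybrid-search practice; the commit itself only shows the config value:

```python
# Assumed convention: alpha scales the dense query vector and (1 - alpha) the
# sparse one, so alpha=1.0 is pure dense search and alpha=0.0 is pure sparse.
def weight_by_alpha(dense_vec, sparse_vec, alpha):
    if not 0.0 <= alpha <= 1.0:
        raise ValueError("alpha must be between 0 and 1")
    weighted_dense = [v * alpha for v in dense_vec]
    weighted_sparse = {
        "indices": sparse_vec["indices"],
        "values": [v * (1.0 - alpha) for v in sparse_vec["values"]],
    }
    return weighted_dense, weighted_sparse
```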
File renamed without changes.
@@ -8,8 +8,8 @@
from prefect import flow
import pandas as pd
import numpy as np
-from data_dictionary_cui_mapping.utils import helper as helper
-from data_dictionary_cui_mapping.curation.utils import dictionary_functions as dictfn
+from ddcuimap.utils import helper as helper
+from ddcuimap.curation.utils import dictionary_functions as dictfn


# @hydra.main(version_base=None, config_path="../configs", config_name="config")
@@ -6,8 +6,8 @@

from prefect import flow
from pathlib import Path
-from data_dictionary_cui_mapping.utils import helper as helper
-from data_dictionary_cui_mapping.curation.utils import curation_functions as cur
+from ddcuimap.utils import helper as helper
+from ddcuimap.curation.utils import curation_functions as cur


# @hydra.main(version_base=None, config_path="../configs", config_name="config")
File renamed without changes.
@@ -12,7 +12,7 @@
from prefect import flow, task
from prefect.task_runners import SequentialTaskRunner

-from data_dictionary_cui_mapping.utils import helper as helper
+from ddcuimap.utils import helper as helper


@task(name="Adding search_ID column")
File renamed without changes.
@@ -8,12 +8,12 @@
from prefect import flow
from pathlib import Path

-import data_dictionary_cui_mapping.utils.helper as helper
-import data_dictionary_cui_mapping.utils.process_data_dictionary as proc_dd
-import data_dictionary_cui_mapping.curation.utils.curation_functions as cur
-import data_dictionary_cui_mapping.umls.batch_query_pipeline as umls
-import data_dictionary_cui_mapping.metamap.batch_query_pipeline as mm
-import data_dictionary_cui_mapping.semantic_search.batch_hybrid_query_pipeline as ss
+import ddcuimap.utils.helper as helper
+import ddcuimap.utils.process_data_dictionary as proc_dd
+import ddcuimap.curation.utils.curation_functions as cur
+import ddcuimap.umls.batch_query_pipeline as umls
+import ddcuimap.metamap.batch_query_pipeline as mm
+import ddcuimap.semantic_search.batch_hybrid_query_pipeline as ss

cfg = helper.compose_config.fn(overrides=["custom=hydra_base"])
cfg_umls = helper.compose_config.fn(overrides=["custom=de", "apis=config_umls_api"])
@@ -49,43 +49,27 @@ def run_hydra_batch(cfg, cfg_umls, cfg_mm, cfg_ss, **kwargs):
f"{cfg.custom.curation_settings.file_settings.directory_prefix}_Step-1_umls-api-search"
)
)
-    df_umls = umls.run_umls_batch(cfg_umls, df_dd=df_dd, dir_step1=dir_step1_umls)
+    df_umls, cfg_umls = umls.run_umls_batch(
+        cfg_umls, df_dd=df_dd, dir_step1=dir_step1_umls
+    )

## METAMAP API ##
dir_step1_mm = helper.create_folder(
Path(dir_step1).joinpath(
f"{cfg.custom.curation_settings.file_settings.directory_prefix}_Step-1_metamap-search"
)
)
-    df_metamap = mm.run_mm_batch(cfg_mm, df_dd=df_dd, dir_step1=dir_step1_mm)
+    df_metamap, cfg_mm = mm.run_mm_batch(cfg_mm, df_dd=df_dd, dir_step1=dir_step1_mm)

## SEMANTIC SEARCH ##
-    ls_df_alphas = []
-    alphas = cfg_ss.semantic_search.query.alpha
-    if type(alphas) != list:
-        alphas = [alphas]
-    for alpha in alphas:
-        dir_step1_ss = helper.create_folder(
-            Path(dir_step1).joinpath(
-                f"{cfg.custom.curation_settings.file_settings.directory_prefix}_Step-1_hybrid-semantic-search_alpha={alpha}"
-            )
-        )
-        cfg_ss.custom.settings.pipeline_name = f"hybrid_semantic_search (custom={cfg.custom.settings.custom_config}, alpha={alpha})"
-        cfg_ss.semantic_search.query.alpha = alpha
-        cfg_ss.semantic_search.query.filepath_embeddings = None
-        df_run, cfg_ss = ss.run_hybrid_ss_batch(
-            cfg_ss, df_dd=df_dd, dir_step1=dir_step1_ss
-        )
-        ls_df_alphas.append(df_run)
-    df_semantic_search = pd.concat(ls_df_alphas, axis=0)
-    cfg_ss.semantic_search.query.alpha = alphas
-    dir_step1_ss_alphas = helper.create_folder(
+
+    dir_step1_ss = helper.create_folder(
Path(dir_step1).joinpath(
f"{cfg.custom.curation_settings.file_settings.directory_prefix}_Step-1_hybrid-semantic-search_alpha={alphas}"
f"{cfg.custom.curation_settings.file_settings.directory_prefix}_Step-1_hybrid-semantic-search_alpha={cfg_ss.semantic_search.query.alpha}"
)
)
-    df_semantic_search.to_csv(
-        Path(dir_step1_ss_alphas).joinpath("results.csv"), index=False
+    df_semantic_search, cfg_ss = ss.run_hybrid_ss_batch(
+        cfg_ss, df_dd=df_dd, dir_step1=dir_step1_ss
)

## COMBINE RESULTS ##
@@ -94,7 +78,6 @@ def run_hydra_batch(cfg, cfg_umls, cfg_mm, cfg_ss, **kwargs):
[df_umls, df_metamap, df_semantic_search], axis=0, ignore_index=True
)
df_results.to_csv(Path(dir_step1).joinpath("hydra_search_results.csv"), index=False)
-    # df_results = pd.read_csv(Path(dir_step1).joinpath("hydra_search_results.csv"))

# FORMAT CURATION DATAFRAME
df_dd_preprocessed = proc_dd.process_data_dictionary(df_dd, cfg)
1 change: 1 addition & 0 deletions ddcuimap/metamap/__init__.py
@@ -0,0 +1 @@
+# from ddcuimap.metamap import utils, skr_web_api
@@ -7,13 +7,13 @@
from prefect import flow
from prefect.task_runners import SequentialTaskRunner
from pathlib import Path
-import data_dictionary_cui_mapping.utils.helper as helper
-import data_dictionary_cui_mapping.utils.process_data_dictionary as proc_dd
+import ddcuimap.utils.helper as helper
+import ddcuimap.utils.process_data_dictionary as proc_dd

# MetaMap API
-from data_dictionary_cui_mapping.curation.utils import curation_functions as cur
-from data_dictionary_cui_mapping.metamap.utils.api_connection import check_credentials
-from data_dictionary_cui_mapping.metamap.utils import (
+from ddcuimap.curation.utils import curation_functions as cur
+from ddcuimap.metamap.utils.api_connection import check_credentials
+from ddcuimap.metamap.utils import (
metamap_query_processing_functions as mm_qproc,
)

@@ -86,8 +86,8 @@ def run_mm_batch(cfg, **kwargs):
helper.save_config(cfg, dir_step1)
print("FINISHED MetaMap batch query pipeline!!!")

-    return df_final
+    return df_final, cfg


if __name__ == "__main__":
-    df_final = run_mm_batch(cfg)
+    df_final, cfg = run_mm_batch(cfg)
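With this change, `run_mm_batch` (like the UMLS and semantic-search pipelines elsewhere in this commit) returns the results frame together with its updated config. A hedged calling sketch; the `apis=config_metamap_api` override name is an assumption modeled on the `apis=config_umls_api` override shown above:

```python
from ddcuimap.utils import helper
from ddcuimap.metamap import batch_query_pipeline as mm_bqp

# override names are assumptions; compose_config.fn(...) mirrors this commit's call sites
cfg_mm = helper.compose_config.fn(overrides=["custom=de", "apis=config_metamap_api"])
df_final, cfg_mm = mm_bqp.run_mm_batch(cfg_mm)  # new (DataFrame, config) return shape
```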
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -12,8 +12,8 @@
from prefect import flow, task
from prefect.task_runners import SequentialTaskRunner

-from data_dictionary_cui_mapping.metamap.skr_web_api import Submission
-from data_dictionary_cui_mapping.utils.text_processing import (
+from ddcuimap.metamap.skr_web_api import Submission
+from ddcuimap.utils.text_processing import (
check_query_terms_valid,
unescape_string,
)
File renamed without changes.
@@ -13,18 +13,18 @@
from prefect import flow
from pathlib import Path

-import data_dictionary_cui_mapping.utils.helper as helper
-import data_dictionary_cui_mapping.utils.process_data_dictionary as proc_dd
-import data_dictionary_cui_mapping.curation.utils.curation_functions as cur
+import ddcuimap.utils.helper as helper
+import ddcuimap.utils.process_data_dictionary as proc_dd
+import ddcuimap.curation.utils.curation_functions as cur

# Semantic Search with Pinecone
-from data_dictionary_cui_mapping.semantic_search.utils.api_connection import (
+from ddcuimap.semantic_search.utils.api_connection import (
check_credentials,
connect_to_pinecone,
)

-from data_dictionary_cui_mapping.semantic_search.utils import builders
-from data_dictionary_cui_mapping.semantic_search.utils import runners as run
+from ddcuimap.semantic_search.utils import builders
+from ddcuimap.semantic_search.utils import runners as run

cfg = helper.compose_config.fn(
overrides=[
@@ -99,13 +99,9 @@ def run_hybrid_ss_batch(cfg, **kwargs):
cfg.semantic_search.query.metadata.tokenize_columns,
cfg.semantic_search.query.metadata.tokenizer.model_name,
)
-    df_query_embeddings.to_pickle(Path(dir_step1, "df_query_embeddings.pkl"))
-    cfg.semantic_search.query.filepath_embeddings = str(
-        Path(
-            dir_step1
-            / f"df_query_embeddings_alpha={cfg.semantic_search.query.alpha}.pkl"
-        ).resolve()
-    )
+    fp_embeddings = str(Path(dir_step1 / f"df_query_embeddings_raw.pkl").resolve())
+    df_query_embeddings.to_pickle(fp_embeddings)
+    cfg.semantic_search.query.filepath_embeddings = fp_embeddings

# RETRIEVE UMLS VECTOR ID AS DICTIONARY
ids = importlib.resources.read_binary(
@@ -115,27 +111,32 @@
# dict_umls_upsert_ids = run.fetch_id_metadata(index, cfg) #TODO: need to work on this

# RUN BATCH QUERY
-    var_results = run.hybrid_search_runner(
-        df_query_embeddings, cfg.semantic_search.query.alpha, cfg
-    )
-    # fp_var_results = Path(dir_step1, f"var_results_{ cfg.custom.settings.pipeline_name}.json") # TODO: fix this to serialize QueryResponseObject to json
-    # with open(fp_var_results, "w") as f:
-    #     json.dump(var_results, f)
-
-    # AGGREGATE AND RANK RESULTS
-    df_agg = run.aggregate_results(var_results, dict_umls_upsert_ids, cfg)
-    df_agg = df_agg.rename(
-        columns={
-            "cui": "data element concept identifiers",
-            "title": "data element concept names",
-            "title_source": "data element terminology sources",
-        }
-    )
-    df_agg.insert(2, "recCount", cfg.semantic_search.query.top_k)
+    ls_df_alphas = []
+    alphas = cfg.semantic_search.query.alpha
+    if not isinstance(alphas, list):
+        # list(alphas) crashes on a lone float; wrap scalars, convert other sequences
+        alphas = [alphas] if isinstance(alphas, (int, float)) else list(alphas)
+    for alpha in alphas:
+        pipeline_name_alpha = f"hybrid_semantic_search (custom={cfg.custom.settings.custom_config}, alpha={alpha})"
+        cfg.semantic_search.query.alpha = alpha
+        var_results = run.hybrid_search_runner(df_query_embeddings, alpha, cfg)
+        # AGGREGATE AND RANK RESULTS
+        df_agg = run.aggregate_results(var_results, dict_umls_upsert_ids, cfg)
+        df_agg = df_agg.rename(
+            columns={
+                "cui": "data element concept identifiers",
+                "title": "data element concept names",
+                "title_source": "data element terminology sources",
+            }
+        )
+        df_agg.insert(2, "recCount", cfg.semantic_search.query.top_k)
+        df_agg.insert(1, "pipeline_name_alpha", pipeline_name_alpha)
+        ls_df_alphas.append(df_agg)
+    df_results = pd.concat(ls_df_alphas, axis=0)

# CREATE CURATION FILE
+    cfg.semantic_search.query.alpha = alphas
df_final = cur.create_curation_file(
-        dir_step1, df_dd, df_dd_preprocessed, df_curation, df_agg, cfg
+        dir_step1, df_dd, df_dd_preprocessed, df_curation, df_results, cfg
) # TODO: may want to include sparse tokens and scoring in curation file

helper.save_config(cfg, dir_step1, "config_query.yaml")
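Because each alpha variant is tagged with the new `pipeline_name_alpha` column before the per-alpha frames are concatenated, comparing alphas after a run reduces to a group-by. A hedged sketch; the file path is hypothetical and `average_score` is taken from the curation config above, not from code in this commit:

```python
import pandas as pd

df_results = pd.read_csv("results.csv")  # hypothetical path to a saved batch output
# mean score per alpha-tagged pipeline variant, best first
summary = (
    df_results.groupby("pipeline_name_alpha")["average_score"]
    .mean()
    .sort_values(ascending=False)
)
print(summary)
```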
@@ -9,7 +9,7 @@
"""

from prefect import flow
-import data_dictionary_cui_mapping.utils.helper as helper
+import ddcuimap.utils.helper as helper
from semantic_search.configure_umls_index import (
step1_select_umls_subset as step1,
step2_embed_umls_subset as step2,
@@ -12,8 +12,8 @@
from pandas.api.types import CategoricalDtype
from tqdm import tqdm

-import data_dictionary_cui_mapping.utils.helper as helper
-from data_dictionary_cui_mapping.utils.text_processing import clean_text
+import ddcuimap.utils.helper as helper
+from ddcuimap.utils.text_processing import clean_text

cfg = helper.compose_config.fn(
config_path="../configs/semantic_search", config_name="embeddings", overrides=[]
@@ -8,8 +8,8 @@
import pandas as pd
from pathlib import Path

-import data_dictionary_cui_mapping.utils.helper as helper
-from data_dictionary_cui_mapping.semantic_search.utils import builders
+import ddcuimap.utils.helper as helper
+from ddcuimap.semantic_search.utils import builders

cfg = helper.compose_config.fn(
config_path="../configs/semantic_search", config_name="embeddings", overrides=[]
