Merge pull request #55 from ncbo/uploading
Add new Jenkins build process
caufieldjh authored Sep 3, 2024
2 parents 4755b6d + ab41a39 commit 55711f6
Showing 3 changed files with 41 additions and 78 deletions.
92 changes: 20 additions & 72 deletions Jenkinsfile
@@ -2,7 +2,7 @@ pipeline {
agent {
docker {
reuseNode false
image 'caufieldjh/ubuntu20-python-3-8-5-dev:4-with-dbs-v6'
image 'caufieldjh/ubuntu20-python-3-9-14-dev:2'
}
}
// No scheduled builds for now
@@ -14,10 +14,7 @@ pipeline {
S3BUCKETNAME = 'kg-hub-public-data'
S3PROJECTDIR = 'kg-bioportal' // no trailing slash
MERGEDKGNAME_BASE = "kg_bioportal"
MERGEDKGNAME_GENERIC = "merged-kg"

// Ontologies to merge in this run, if not using --merge_all flag
ONTOSET = 'CCONT,GRO-CPGA,STY,HP,PMO,CDPEO,GRO-CPD,ISO19115CC,TEDDY,NMOBR,IDQA,RDFS,LUNGMAP_M_CELL,PCO,ISSVA,IOBC,APADISORDERS,TESTEX,ONL-DP,XEO,EXTRACT,CHEMINF,ECSO,FAST-GENREFORM,VODANAKENYA,CTX,ISO19115DI,CARO,TEO,COMODI,IRD,OGDI,VEO,OHPI,GEXO,CIDO,GMM,RNAO,BCTT,MADS-RDF,GAZ,OBA,OSM,TRANS,BP-METADATA,PE,PCMO,UO,NMR,NEOMARK3,EVI,MCHVODANATERMS,EO1,APACOMPUTER,ICECI,DISDRIV,ONTONEO,ENM,ONTODM-CORE,UBERON,ISO19115TCC,SBO,CU-VO,SHR,ETHOPD,SPO,HOIP,ISO19115ROLES,DCT,WETAXTOPICS,PECO,IRDG,SEQ,HL7,SEDI,CASE-BASE-ONTO,AHOL,AD-DROP,TM-CONST,MATR,APATANDT,BCO,FLYGLYCODB,RXNORM,HOOM,HIO,PTS,CRISP,OCMR,TAXRANK,OMO,SO,ODNAE,ROCKNROLLTEST,GO,OBI,FOBI,PLANA,HIVO004,AGROMOP,ONTOPBM,ADMO,PCAO,EDAM,BE,ONE,CODO,FOVT,OCE,OFSMR,OMIM,KISAO,NOMEN,DEB,HCDR,ID-AMR,DERMLEX,BTO_ONTOLOGY,OBOREL,MOC,ALLERGYDETECTOR,ADALAB,MS,RDL,AERO,TML,MATRCOMPOUND,CEDARVS,PACO,MEGO,BRSO,TGMA,RPO,EHDAA2,GENO,MCBCC,HAMIDEHSGH,RNPRIO,FAST-TITLE,CWD,VODANA-MIGRANTS,AMINO-ACID,INTO,TADS,RCTONT,MIM,SITBAC,PP,OM,DLORO,ETANC,SIO,IMGT-ONTOLOGY,CLO,RVO,APO,HMIS033B,RXNO,MOOCCUADO,KENYAANC,UPA,EXO,OBS,SYMP,IBD,IAML-MOP,OBOE-SBC,EPO,FIX,OLATDV,OA,CONTSONTO,SNOMEDCT,NCBITAXON,ERO,ISO-ANNOTATIONS,BRCT,HRDO,MAMO,CHEAR,BCGO,RADLEX,MATRROCKIGNEOUS,MOSAIC,CYTO,PDO_CAS,PDO,AGROCYMAC,VODANA-UG,MIXSCV,FB-BT,CANCO,SD3,REPRODUCE-ME,BCS7,CN,NCCO,EP,PDQ,FENICS,VDOT,NEOMARK4,FISH-AST,EPIE,MA,PANET,TCO,CLAO,OGR,ODAE,PPO,NATPRO,FAST-EVENT-SKOS,WEAR,CVAO,GLYCORDF,ISO19108TO,CMPO,OAE,ISO19115PR,PIERO,MPO,TAO,PHMAMMADO,STO-DRAFT,NPOKB,EDAM-BIOIMAGING,CISAVIADO,ROLEO,DCM,ONTOPARON_SOCIAL,MNV,INFRARISK,NCRO,CDO,RNRMU,NMOSP,BCTEO,ONTOTOXNUC,DERMO,ICDO,WB-BT,ATO,VFB_DRIVERS,MDDB,NLN,GMO,SAO,EMAPA,BHN,DOID,OCRE,TCDO,TM-MER,ISO19115CON,GEOSPECIES,VARIO,UGANDA_DISEASES,SCIO,AHSO,TM-OTHER-FACTORS,KORO,ENVO,MCCV,ECG,UNITSONT,ONTOSINASC,ECAO,REX,NEO,AO,ACESO,FAST-FORMGENRE,EHDAA,LOINC,NERO,CLYH,MERA,ONTODM-KDD,PLIO,CANONT,TRAK,PO,PHYLONT,MOP,BSAO,OPTION-ONTOLOGY,ELD,CVDO,TDWGSPEC,RDA-ISSUANCE,TEST_A,FHHO,ZONMW-GENERIC,COHSI2STUDY,IDO-COVID-19,ADW,NIHSS,GFO,PEAO,DDPHENO,TRON,HAROREADO,CKDO,OARCS,LUNGMAP-HUMAN,ICO,HIVMT,PATEL,GLYCO,CARRE,EDDA_PT,suicideo,BRO,PATO,REXO,MMUSDV,BIOMO,ICD10,CHIRO,LAND-SURFACE,MLTX,GO-PLUS,OBIWS,DCAT-FDC,HOM,CHD,MCCL,MELO,NIFDYS,ONTOAVIDA,ECTO,HSO,PE-O,HUPSON,SOS,NCIT,PR,BIOMODELS,ESFO,MFO,LEPAO,BAO,EHDA,FIRE,ADO,ATC,REPO,JERM,EDDA,NMDCO,PHFUMIADO,COPDO,OMRSE,GRO,FYPO,LUNGMAP-MOUSE,TXPO,BDO'
}
options {
timestamps()
@@ -42,11 +39,10 @@ pipeline {
sh 'echo "$GIT_BRANCH"'
sh 'cat env.txt'
sh 'cat branch.txt'
sh "echo $BUILDSTARTDATE > dow.txt"
sh "echo $BUILDSTARTDATE"
sh "echo $MERGEDKGNAME_BASE"
sh "echo $MERGEDKGNAME_GENERIC"
sh "python3.8 --version"
sh "python3.9 --version"
sh "id"
sh "whoami" // this should be jenkinsuser
// if the above fails, then the docker host didn't start the docker
@@ -66,88 +62,48 @@ pipeline {
url: 'https://github.com/ncbo/kg-bioportal',
branch: 'main'
)
sh '/usr/bin/python3.8 -m venv venv'
sh '/usr/bin/python3.9 -m venv venv'
sh '. venv/bin/activate'
// Now move on to the actual install + reqs
sh './venv/bin/pip install .'
sh './venv/bin/pip install awscli boto3 s3cmd'
}
}
}

// The download step uses s3cmd instead of the standard kghub_downloader
// so that we can access the private object.

stage('Download') {
steps {
dir('./gitrepo') {
script {
// Get the names of all BioPortal ontologies
sh ". venv/bin/activate && kgbioportal get-ontology-list --api_key ${NCBO_API_KEY} --output data/raw/"

// Eventually this will download all ontologies.
// For now, just do a few:
sh "printf 'ENVO\nPO\nSEPIO\n' > data/raw/ontologylist.tsv"

// Verify that the project directory is defined; otherwise everything
// will be uploaded to the wrong directory and make a mess.
if (S3PROJECTDIR.replaceAll("\\s","") == '') {
error("Project name contains only whitespace. Will not continue.")
}
withCredentials([file(credentialsId: 's3cmd_kg_hub_push_configuration', variable: 'S3CMD_CFG')]) {
sh '. venv/bin/activate && s3cmd -c $S3CMD_CFG get s3://$S3BUCKETNAME/frozen_incoming_data/bioportal_transformed/bioportal_transformed.tar.gz data/raw/bioportal_transformed.tar.gz'
}
// Download the ontologies
sh ". venv/bin/activate && kbbioportal download --api_key ${NCBO_API_KEY} --ontology_file data/raw/ontologylist.tsv --output_dir data/raw/"

}
}
}
}
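
For reference, a minimal sketch of the ontology-list step above: query the NCBO BioPortal REST API for all ontologies and write their acronyms to a TSV. This is an illustration, not the kgbioportal implementation; the endpoint and the "acronym" field follow the public BioPortal API docs, and the output filename mirrors the pipeline above.

import csv

import requests


def get_ontology_list(api_key: str, output_path: str) -> None:
    """Fetch all BioPortal ontology acronyms and write them to a TSV."""
    response = requests.get(
        "https://data.bioontology.org/ontologies",
        params={"apikey": api_key},
        timeout=60,
    )
    response.raise_for_status()
    with open(output_path, "w", newline="") as f:
        writer = csv.writer(f, delimiter="\t")
        for ontology in response.json():
            # Each entry in the response describes one ontology.
            writer.writerow([ontology["acronym"]])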

// Transform step just moves and decompresses the raw sources

// Transform the downloaded ontologies
stage('Transform') {
steps {
dir('./gitrepo') {
sh '. venv/bin/activate && env && mv data/raw/* ../ && tar -xvzf ../bioportal_transformed.tar.gz -C ../'
sh 'du -a ../'
sh 'pwd'
sh ". venv/bin/activate && kgbioportal transform --input_dir data/raw/ --output_dir data/transformed/"
}
}
}

// Currently using cat-merge
stage('Merge') {
steps {
dir('./gitrepo') {
sh 'echo "Starting that big merge."'
sh '. venv/bin/activate && python3.8 run.py catmerge --merge_all'
sh 'echo "Finished that big merge."'
//sh '. venv/bin/activate && python3.8 run.py catmerge --exclude NCBITAXON,GAZ,DRON,BERO,SNOMEDCT'
sh 'gunzip data/merged/merged-kg.tar.gz'
sh 'tar -rvf data/merged/merged-kg.tar data/merged/qc/'
sh 'tar -rvf data/merged/merged-kg.tar data/merged/merged-kg_nodes.tsv'
sh 'gzip data/merged/merged-kg.tar'
//sh '. venv/bin/activate && python3.8 run.py catmerge --include_only $ONTOSET'
//sh 'cp merged_graph_stats.yaml merged_graph_stats_$BUILDSTARTDATE.yaml'
//sh 'tar -rvfz data/merged/merged-kg.tar.gz merged_graph_stats_$BUILDSTARTDATE.yaml'
}
}
}
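
A note on the gunzip/tar/gzip sequence in the Merge stage: tar cannot append to a compressed archive, so the pipeline decompresses, appends the QC reports and the node file, then recompresses. A Python sketch of the same idea, assuming the paths used above:

import gzip
import shutil
import tarfile

archive = "data/merged/merged-kg.tar"

# gunzip data/merged/merged-kg.tar.gz
with gzip.open(archive + ".gz", "rb") as src, open(archive, "wb") as dst:
    shutil.copyfileobj(src, dst)

# tar -rvf: append mode only works on an uncompressed archive
with tarfile.open(archive, "a") as tar:
    tar.add("data/merged/qc/")
    tar.add("data/merged/merged-kg_nodes.tsv")

# gzip data/merged/merged-kg.tar
with open(archive, "rb") as src, gzip.open(archive + ".gz", "wb") as dst:
    shutil.copyfileobj(src, dst)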

stage('Publish') {
steps {
dir('./gitrepo') {
script {

// make sure we aren't going to clobber existing data
withCredentials([file(credentialsId: 's3cmd_kg_hub_push_configuration', variable: 'S3CMD_CFG')]) {
REMOTE_BUILD_DIR_CONTENTS = sh (
script: '. venv/bin/activate && s3cmd -c $S3CMD_CFG ls s3://$S3BUCKETNAME/$S3PROJECTDIR/$BUILDSTARTDATE/',
returnStdout: true
).trim()
echo "REMOTE_BUILD_DIR_CONTENTS (THIS SHOULD BE EMPTY): '${REMOTE_BUILD_DIR_CONTENTS}'"
if("${REMOTE_BUILD_DIR_CONTENTS}" != ''){
echo "Will not overwrite existing remote S3 directory: $S3PROJECTDIR/$BUILDSTARTDATE"
sh 'exit 1'
} else {
echo "remote directory $S3PROJECTDIR/$BUILDSTARTDATE is empty, proceeding"
}
}
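
The check above guards against clobbering a previous build. Here is an equivalent sketch using boto3 (which the Setup stage installs) instead of s3cmd; the bucket and prefix values are illustrative stand-ins for the environment variables in this Jenkinsfile:

import sys

import boto3


def ensure_prefix_empty(bucket: str, prefix: str) -> None:
    """Fail the build if anything already exists under the remote prefix."""
    s3 = boto3.client("s3")
    response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix, MaxKeys=1)
    if response.get("KeyCount", 0) > 0:
        print(f"Will not overwrite existing remote S3 directory: {prefix}")
        sys.exit(1)
    print(f"Remote directory {prefix} is empty, proceeding")


ensure_prefix_empty("kg-hub-public-data", "kg-bioportal/20240903/")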

if (env.GIT_BRANCH != 'origin/main') {
echo "Will not push if not on main branch."
} else {
@@ … @@
string(credentialsId: 'aws_kg_hub_access_key', variable: 'AWS_ACCESS_KEY_ID'),
string(credentialsId: 'aws_kg_hub_secret_key', variable: 'AWS_SECRET_ACCESS_KEY')]) {

//
// make $BUILDSTARTDATE/ directory and sync to s3 bucket
// Don't create any index - none of this will be public
//
sh 'mkdir $BUILDSTARTDATE/'
sh 'cp -p data/merged/merged-kg.tar.gz $BUILDSTARTDATE/${MERGEDKGNAME_BASE}.tar.gz'
sh 'cp Jenkinsfile $BUILDSTARTDATE/'

// Add updated stats
sh '. venv/bin/activate && s3cmd -c $S3CMD_CFG put -pr graph_stats.yaml $BUILDSTARTDATE s3://$S3BUCKETNAME/$S3PROJECTDIR/graph_stats.yaml'

sh '. venv/bin/activate && s3cmd -c $S3CMD_CFG put -pr $BUILDSTARTDATE s3://$S3BUCKETNAME/$S3PROJECTDIR/'
sh '. venv/bin/activate && s3cmd -c $S3CMD_CFG rm -r s3://$S3BUCKETNAME/$S3PROJECTDIR/current/'
sh '. venv/bin/activate && s3cmd -c $S3CMD_CFG put -pr $BUILDSTARTDATE/* s3://$S3BUCKETNAME/$S3PROJECTDIR/current/'
// Index, then upload
sh '. venv/bin/activate && multi_indexer -v --directory data/transformed/ --prefix https://kghub.io/$S3PROJECTDIR/ -x -u'
sh '. venv/bin/activate && s3cmd -c $S3CMD_CFG put -pr --acl-public --cf-invalidate data/transformed/ s3://kg-hub-public-data/$S3PROJECTDIR/'

// Now update the index for the whole project
sh '. venv/bin/activate && multi_indexer -v --prefix https://kghub.io/$S3PROJECTDIR/ -b kg-hub-public-data -r $S3PROJECTDIR -x'
sh '. venv/bin/activate && s3cmd -c $S3CMD_CFG put -pr --acl-public --cf-invalidate ./index.html s3://kg-hub-public-data/$S3PROJECTDIR/'
}

}
1 change: 1 addition & 0 deletions src/kg_bioportal/downloader.py
@@ -52,6 +52,7 @@ def __init__(

return None

# TODO: save NCBO ID and version for each ontology, then pass to transformer
def download(self, onto_list: list = []) -> None:
"""Downloads data files from list of ontologies into data directory.
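
The TODO above calls for capturing each ontology's NCBO ID and version during download. A hedged sketch of one way to do that, via the public BioPortal API's latest_submission endpoint; the return shape and the fallback are assumptions, not current downloader behavior:

import requests


def get_latest_version(acronym: str, api_key: str) -> str:
    """Return the version string of an ontology's latest BioPortal submission."""
    response = requests.get(
        f"https://data.bioontology.org/ontologies/{acronym}/latest_submission",
        params={"apikey": api_key},
        timeout=60,
    )
    response.raise_for_status()
    # Fall back to "latest" when the submission has no version field,
    # matching the placeholder used by transform_all() below.
    return response.json().get("version") or "latest"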
26 changes: 20 additions & 6 deletions src/kg_bioportal/transformer.py
@@ -12,6 +12,8 @@
# TODO: Fix KGX hijacking logging
# TODO: Save KGX logs to a file for each ontology
# TODO: Address BNodes
# TODO: get version from BioPortal API (in the downloader)


class Transformer:

@@ -77,14 +79,15 @@ def transform_all(self) -> None:
logging.info(f"Found {len(filepaths)} ontologies to transform.")

for filepath in filepaths:
if not self.transform(filepath):
if not self.transform(filepath, version="latest"):
logging.error(f"Error transforming {filepath}.")
else:
logging.info(f"Transformed {filepath}.")

return None

def transform(self, ontology: str) -> bool:
# TODO: use NCBO ID to name the output, not the filename
def transform(self, ontology: str, version: str) -> bool:
"""Transforms a single ontology to KGX nodes and edges.
Args:
@@ -97,7 +100,9 @@ def transform(self, ontology: str) -> bool:

logging.info(f"Transforming {ontology} to nodes and edges.")
ontology_name = os.path.splitext(os.path.basename(ontology))[0]
owl_output_path = os.path.join(self.output_dir, f"{ontology_name}.owl")
owl_output_path = os.path.join(
self.output_dir, f"{ontology_name}", f"{version}", f"{ontology_name}.owl"
)

# Convert
if not robot_convert(
@@ -109,7 +114,12 @@ def transform(self, ontology: str) -> bool:
status = False

# Relax
relaxed_outpath = os.path.join(self.output_dir, f"{ontology_name}_relaxed.owl")
relaxed_outpath = os.path.join(
self.output_dir,
f"{ontology_name}",
f"{version}",
f"{ontology_name}_relaxed.owl",
)
if not robot_relax(
robot_path=self.robot_path,
input_path=owl_output_path,
@@ -120,7 +130,9 @@ def transform(self, ontology: str) -> bool:

# Transform to KGX nodes + edges
txr = KGXTransformer(stream=True)
outfilename = os.path.join(self.output_dir, f"{ontology_name}")
outfilename = os.path.join(
self.output_dir, f"{ontology_name}", f"{version}", f"{ontology_name}"
)
nodefilename = outfilename + "_nodes.tsv"
edgefilename = outfilename + "_edges.tsv"
input_args = {
@@ -139,7 +151,9 @@ def transform(self, ontology: str) -> bool:
input_args=input_args,
output_args=output_args,
)
logging.info(f"Nodes and edges written to {nodefilename} and {edgefilename}.")
logging.info(
f"Nodes and edges written to {nodefilename} and {edgefilename}."
)
status = True
except Exception as e:
logging.error(f"Error transforming {ontology} to KGX nodes and edges: {e}")
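
The changes above build the same <output_dir>/<ontology_name>/<version>/ path three times. A small helper like this (hypothetical, not part of the commit) would keep the layout in one place:

import os


def versioned_path(output_dir: str, name: str, version: str, suffix: str) -> str:
    """Return <output_dir>/<name>/<version>/<name><suffix>."""
    return os.path.join(output_dir, name, version, f"{name}{suffix}")


# owl_output_path = versioned_path(self.output_dir, ontology_name, version, ".owl")
# relaxed_outpath = versioned_path(self.output_dir, ontology_name, version, "_relaxed.owl")
# outfilename = versioned_path(self.output_dir, ontology_name, version, "")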
