Merge pull request #55 from ncbo/uploading
Add new Jenkins build process
caufieldjh authored Sep 3, 2024
2 parents 4755b6d + ab41a39 commit 55711f6
Showing 3 changed files with 41 additions and 78 deletions.
92 changes: 20 additions & 72 deletions Jenkinsfile
@@ -2,7 +2,7 @@ pipeline {
agent {
docker {
reuseNode false
image 'caufieldjh/ubuntu20-python-3-8-5-dev:4-with-dbs-v6'
image 'caufieldjh/ubuntu20-python-3-9-14-dev:2'
}
}
// No scheduled builds for now
@@ -14,10 +14,7 @@ pipeline {
S3BUCKETNAME = 'kg-hub-public-data'
S3PROJECTDIR = 'kg-bioportal' // no trailing slash
MERGEDKGNAME_BASE = "kg_bioportal"
MERGEDKGNAME_GENERIC = "merged-kg"

// Ontologies to merge in this run, if not using --merge_all flag
ONTOSET = 'CCONT,GRO-CPGA,STY,HP,PMO,CDPEO,GRO-CPD,ISO19115CC,TEDDY,NMOBR,IDQA,RDFS,LUNGMAP_M_CELL,PCO,ISSVA,IOBC,APADISORDERS,TESTEX,ONL-DP,XEO,EXTRACT,CHEMINF,ECSO,FAST-GENREFORM,VODANAKENYA,CTX,ISO19115DI,CARO,TEO,COMODI,IRD,OGDI,VEO,OHPI,GEXO,CIDO,GMM,RNAO,BCTT,MADS-RDF,GAZ,OBA,OSM,TRANS,BP-METADATA,PE,PCMO,UO,NMR,NEOMARK3,EVI,MCHVODANATERMS,EO1,APACOMPUTER,ICECI,DISDRIV,ONTONEO,ENM,ONTODM-CORE,UBERON,ISO19115TCC,SBO,CU-VO,SHR,ETHOPD,SPO,HOIP,ISO19115ROLES,DCT,WETAXTOPICS,PECO,IRDG,SEQ,HL7,SEDI,CASE-BASE-ONTO,AHOL,AD-DROP,TM-CONST,MATR,APATANDT,BCO,FLYGLYCODB,RXNORM,HOOM,HIO,PTS,CRISP,OCMR,TAXRANK,OMO,SO,ODNAE,ROCKNROLLTEST,GO,OBI,FOBI,PLANA,HIVO004,AGROMOP,ONTOPBM,ADMO,PCAO,EDAM,BE,ONE,CODO,FOVT,OCE,OFSMR,OMIM,KISAO,NOMEN,DEB,HCDR,ID-AMR,DERMLEX,BTO_ONTOLOGY,OBOREL,MOC,ALLERGYDETECTOR,ADALAB,MS,RDL,AERO,TML,MATRCOMPOUND,CEDARVS,PACO,MEGO,BRSO,TGMA,RPO,EHDAA2,GENO,MCBCC,HAMIDEHSGH,RNPRIO,FAST-TITLE,CWD,VODANA-MIGRANTS,AMINO-ACID,INTO,TADS,RCTONT,MIM,SITBAC,PP,OM,DLORO,ETANC,SIO,IMGT-ONTOLOGY,CLO,RVO,APO,HMIS033B,RXNO,MOOCCUADO,KENYAANC,UPA,EXO,OBS,SYMP,IBD,IAML-MOP,OBOE-SBC,EPO,FIX,OLATDV,OA,CONTSONTO,SNOMEDCT,NCBITAXON,ERO,ISO-ANNOTATIONS,BRCT,HRDO,MAMO,CHEAR,BCGO,RADLEX,MATRROCKIGNEOUS,MOSAIC,CYTO,PDO_CAS,PDO,AGROCYMAC,VODANA-UG,MIXSCV,FB-BT,CANCO,SD3,REPRODUCE-ME,BCS7,CN,NCCO,EP,PDQ,FENICS,VDOT,NEOMARK4,FISH-AST,EPIE,MA,PANET,TCO,CLAO,OGR,ODAE,PPO,NATPRO,FAST-EVENT-SKOS,WEAR,CVAO,GLYCORDF,ISO19108TO,CMPO,OAE,ISO19115PR,PIERO,MPO,TAO,PHMAMMADO,STO-DRAFT,NPOKB,EDAM-BIOIMAGING,CISAVIADO,ROLEO,DCM,ONTOPARON_SOCIAL,MNV,INFRARISK,NCRO,CDO,RNRMU,NMOSP,BCTEO,ONTOTOXNUC,DERMO,ICDO,WB-BT,ATO,VFB_DRIVERS,MDDB,NLN,GMO,SAO,EMAPA,BHN,DOID,OCRE,TCDO,TM-MER,ISO19115CON,GEOSPECIES,VARIO,UGANDA_DISEASES,SCIO,AHSO,TM-OTHER-FACTORS,KORO,ENVO,MCCV,ECG,UNITSONT,ONTOSINASC,ECAO,REX,NEO,AO,ACESO,FAST-FORMGENRE,EHDAA,LOINC,NERO,CLYH,MERA,ONTODM-KDD,PLIO,CANONT,TRAK,PO,PHYLONT,MOP,BSAO,OPTION-ONTOLOGY,ELD,CVDO,TDWGSPEC,RDA-ISSUANCE,TEST_A,FHHO,ZONMW-GENERIC,COHSI2STUDY,IDO-COVID-19,ADW,NIHSS,GFO,PEAO,DDPHENO,TRON,HAROREADO,CKDO,OARCS,LUNGMAP-HUMAN,ICO,HIVMT,PATEL,GLYCO,CARRE,EDDA_PT,suicideo,BRO,PATO,REXO,MMUSDV,BIOMO,ICD10,CHIRO,LAND-SURFACE,MLTX,GO-PLUS,OBIWS,DCAT-FDC,HOM,CHD,MCCL,MELO,NIFDYS,ONTOAVIDA,ECTO,HSO,PE-O,HUPSON,SOS,NCIT,PR,BIOMODELS,ESFO,MFO,LEPAO,BAO,EHDA,FIRE,ADO,ATC,REPO,JERM,EDDA,NMDCO,PHFUMIADO,COPDO,OMRSE,GRO,FYPO,LUNGMAP-MOUSE,TXPO,BDO'
}
options {
timestamps()
@@ -42,11 +39,10 @@ pipeline {
sh 'echo "$GIT_BRANCH"'
sh 'cat env.txt'
sh 'cat branch.txt'
sh "echo $BUILDSTARTDATE > dow.txt"
sh "echo $BUILDSTARTDATE"
sh "echo $MERGEDKGNAME_BASE"
sh "echo $MERGEDKGNAME_GENERIC"
sh "python3.8 --version"
sh "python3.9 --version"
sh "id"
sh "whoami" // this should be jenkinsuser
// if the above fails, then the docker host didn't start the docker
@@ -66,88 +62,48 @@ pipeline {
url: 'https://github.com/ncbo/kg-bioportal',
branch: 'main'
)
sh '/usr/bin/python3.8 -m venv venv'
sh '/usr/bin/python3.9 -m venv venv'
sh '. venv/bin/activate'
// Now move on to the actual install + reqs
sh './venv/bin/pip install .'
sh './venv/bin/pip install awscli boto3 s3cmd'
}
}
}

// The download step uses s3cmd instead of the standard kghub_downloader
// so that we can access the private object.

stage('Download') {
steps {
dir('./gitrepo') {
script {
// Get the names of all BioPortal ontologies
sh ". venv/bin/activate && kgbioportal get-ontology-list --api_key ${NCBO_API_KEY} --output data/raw/"

// Eventually this will download all ontologies.
// For now, just do a few:
sh "printf 'ENVO\nPO\nSEPIO\n' > data/raw/ontologylist.tsv"

// Verify that the project directory is defined; otherwise everything
// will be uploaded to the wrong directory and make a mess.
if (S3PROJECTDIR.replaceAll("\\s","") == '') {
error("Project name contains only whitespace. Will not continue.")
}
withCredentials([file(credentialsId: 's3cmd_kg_hub_push_configuration', variable: 'S3CMD_CFG')]) {
sh '. venv/bin/activate && s3cmd -c $S3CMD_CFG get s3://$S3BUCKETNAME/frozen_incoming_data/bioportal_transformed/bioportal_transformed.tar.gz data/raw/bioportal_transformed.tar.gz'
}
// Download the ontologies
sh ". venv/bin/activate && kbbioportal download --api_key ${NCBO_API_KEY} --ontology_file data/raw/ontologylist.tsv --output_dir data/raw/"

}
}
}
}
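
For reference, a minimal sketch of the ontology-list step above: query the NCBO BioPortal REST API for all ontologies and write their acronyms to a TSV. This is an illustration, not the kgbioportal implementation; the endpoint and the "acronym" field follow the public BioPortal API docs, and the output filename mirrors the pipeline above.

import csv

import requests


def get_ontology_list(api_key: str, output_path: str) -> None:
    """Fetch all BioPortal ontology acronyms and write them to a TSV."""
    response = requests.get(
        "https://data.bioontology.org/ontologies",
        params={"apikey": api_key},
        timeout=60,
    )
    response.raise_for_status()
    with open(output_path, "w", newline="") as f:
        writer = csv.writer(f, delimiter="\t")
        for ontology in response.json():
            # Each entry in the response describes one ontology.
            writer.writerow([ontology["acronym"]])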

// Transform step just moves and decompresses the raw sources

// Transform the downloaded ontologies
stage('Transform') {
steps {
dir('./gitrepo') {
sh '. venv/bin/activate && env && mv data/raw/* ../ && tar -xvzf ../bioportal_transformed.tar.gz -C ../'
sh 'du -a ../'
sh 'pwd'
sh ". venv/bin/activate && kgbioportal transform --input_dir data/raw/ --output_dir data/transformed/"
}
}
}

// Currently using cat-merge
stage('Merge') {
steps {
dir('./gitrepo') {
sh 'echo "Starting that big merge."'
sh '. venv/bin/activate && python3.8 run.py catmerge --merge_all'
sh 'echo "Finished that big merge."'
//sh '. venv/bin/activate && python3.8 run.py catmerge --exclude NCBITAXON,GAZ,DRON,BERO,SNOMEDCT'
sh 'gunzip data/merged/merged-kg.tar.gz'
sh 'tar -rvf data/merged/merged-kg.tar data/merged/qc/'
sh 'tar -rvf data/merged/merged-kg.tar data/merged/merged-kg_nodes.tsv'
sh 'gzip data/merged/merged-kg.tar'
//sh '. venv/bin/activate && python3.8 run.py catmerge --include_only $ONTOSET'
//sh 'cp merged_graph_stats.yaml merged_graph_stats_$BUILDSTARTDATE.yaml'
//sh 'tar -rvfz data/merged/merged-kg.tar.gz merged_graph_stats_$BUILDSTARTDATE.yaml'
}
}
}
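
A note on the gunzip/tar/gzip sequence in the Merge stage: tar cannot append to a compressed archive, so the pipeline decompresses, appends the QC reports and the node file, then recompresses. A Python sketch of the same idea, assuming the paths used above:

import gzip
import shutil
import tarfile

archive = "data/merged/merged-kg.tar"

# gunzip data/merged/merged-kg.tar.gz
with gzip.open(archive + ".gz", "rb") as src, open(archive, "wb") as dst:
    shutil.copyfileobj(src, dst)

# tar -rvf: append mode only works on an uncompressed archive
with tarfile.open(archive, "a") as tar:
    tar.add("data/merged/qc/")
    tar.add("data/merged/merged-kg_nodes.tsv")

# gzip data/merged/merged-kg.tar
with open(archive, "rb") as src, gzip.open(archive + ".gz", "wb") as dst:
    shutil.copyfileobj(src, dst)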

stage('Publish') {
steps {
dir('./gitrepo') {
script {

// make sure we aren't going to clobber existing data
withCredentials([file(credentialsId: 's3cmd_kg_hub_push_configuration', variable: 'S3CMD_CFG')]) {
REMOTE_BUILD_DIR_CONTENTS = sh (
script: '. venv/bin/activate && s3cmd -c $S3CMD_CFG ls s3://$S3BUCKETNAME/$S3PROJECTDIR/$BUILDSTARTDATE/',
returnStdout: true
).trim()
echo "REMOTE_BUILD_DIR_CONTENTS (THIS SHOULD BE EMPTY): '${REMOTE_BUILD_DIR_CONTENTS}'"
if("${REMOTE_BUILD_DIR_CONTENTS}" != ''){
echo "Will not overwrite existing remote S3 directory: $S3PROJECTDIR/$BUILDSTARTDATE"
sh 'exit 1'
} else {
echo "remote directory $S3PROJECTDIR/$BUILDSTARTDATE is empty, proceeding"
}
}
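
The check above guards against clobbering a previous build. Here is an equivalent sketch using boto3 (which the Setup stage installs) instead of s3cmd; the bucket and prefix values are illustrative stand-ins for the environment variables in this Jenkinsfile:

import sys

import boto3


def ensure_prefix_empty(bucket: str, prefix: str) -> None:
    """Fail the build if anything already exists under the remote prefix."""
    s3 = boto3.client("s3")
    response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix, MaxKeys=1)
    if response.get("KeyCount", 0) > 0:
        print(f"Will not overwrite existing remote S3 directory: {prefix}")
        sys.exit(1)
    print(f"Remote directory {prefix} is empty, proceeding")


ensure_prefix_empty("kg-hub-public-data", "kg-bioportal/20240903/")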

if (env.GIT_BRANCH != 'origin/main') {
echo "Will not push if not on main branch."
} else {
@@ … @@
string(credentialsId: 'aws_kg_hub_access_key', variable: 'AWS_ACCESS_KEY_ID'),
string(credentialsId: 'aws_kg_hub_secret_key', variable: 'AWS_SECRET_ACCESS_KEY')]) {

//
// make $BUILDSTARTDATE/ directory and sync to s3 bucket
// Don't create any index - none of this will be public
//
sh 'mkdir $BUILDSTARTDATE/'
sh 'cp -p data/merged/merged-kg.tar.gz $BUILDSTARTDATE/${MERGEDKGNAME_BASE}.tar.gz'
sh 'cp Jenkinsfile $BUILDSTARTDATE/'

// Add updated stats
sh '. venv/bin/activate && s3cmd -c $S3CMD_CFG put -pr graph_stats.yaml $BUILDSTARTDATE s3://$S3BUCKETNAME/$S3PROJECTDIR/graph_stats.yaml'

sh '. venv/bin/activate && s3cmd -c $S3CMD_CFG put -pr $BUILDSTARTDATE s3://$S3BUCKETNAME/$S3PROJECTDIR/'
sh '. venv/bin/activate && s3cmd -c $S3CMD_CFG rm -r s3://$S3BUCKETNAME/$S3PROJECTDIR/current/'
sh '. venv/bin/activate && s3cmd -c $S3CMD_CFG put -pr $BUILDSTARTDATE/* s3://$S3BUCKETNAME/$S3PROJECTDIR/current/'
// Index, then upload
sh '. venv/bin/activate && multi_indexer -v --directory data/transformed/ --prefix https://kghub.io/$S3PROJECTDIR/ -x -u'
sh '. venv/bin/activate && s3cmd -c $S3CMD_CFG put -pr --acl-public --cf-invalidate data/transformed/ s3://kg-hub-public-data/$S3PROJECTDIR/'

// Now update the index for the whole project
sh '. venv/bin/activate && multi_indexer -v --prefix https://kghub.io/$S3PROJECTDIR/ -b kg-hub-public-data -r $S3PROJECTDIR -x'
sh '. venv/bin/activate && s3cmd -c $S3CMD_CFG put -pr --acl-public --cf-invalidate ./index.html s3://kg-hub-public-data/$S3PROJECTDIR/'
}

}
1 change: 1 addition & 0 deletions src/kg_bioportal/downloader.py
@@ -52,6 +52,7 @@ def __init__(

return None

# TODO: save NCBO ID and version for each ontology, then pass to transformer
def download(self, onto_list: list = []) -> None:
"""Downloads data files from list of ontologies into data directory.
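
The TODO above calls for capturing each ontology's NCBO ID and version during download. A hedged sketch of one way to do that, via the public BioPortal API's latest_submission endpoint; the return shape and the fallback are assumptions, not current downloader behavior:

import requests


def get_latest_version(acronym: str, api_key: str) -> str:
    """Return the version string of an ontology's latest BioPortal submission."""
    response = requests.get(
        f"https://data.bioontology.org/ontologies/{acronym}/latest_submission",
        params={"apikey": api_key},
        timeout=60,
    )
    response.raise_for_status()
    # Fall back to "latest" when the submission has no version field,
    # matching the placeholder used by transform_all() below.
    return response.json().get("version") or "latest"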
26 changes: 20 additions & 6 deletions src/kg_bioportal/transformer.py
@@ -12,6 +12,8 @@
# TODO: Fix KGX hijacking logging
# TODO: Save KGX logs to a file for each ontology
# TODO: Address BNodes
# TODO: get version from BioPortal API (in the downloader)


class Transformer:

@@ -77,14 +79,15 @@ def transform_all(self) -> None:
logging.info(f"Found {len(filepaths)} ontologies to transform.")

for filepath in filepaths:
if not self.transform(filepath):
if not self.transform(filepath, version="latest"):
logging.error(f"Error transforming {filepath}.")
else:
logging.info(f"Transformed {filepath}.")

return None

def transform(self, ontology: str) -> bool:
# TODO: use NCBO ID to name the output, not the filename
def transform(self, ontology: str, version: str) -> bool:
"""Transforms a single ontology to KGX nodes and edges.
Args:
@@ -97,7 +100,9 @@ def transform(self, ontology: str) -> bool:

logging.info(f"Transforming {ontology} to nodes and edges.")
ontology_name = os.path.splitext(os.path.basename(ontology))[0]
owl_output_path = os.path.join(self.output_dir, f"{ontology_name}.owl")
owl_output_path = os.path.join(
self.output_dir, f"{ontology_name}", f"{version}", f"{ontology_name}.owl"
)

# Convert
if not robot_convert(
@@ -109,7 +114,12 @@ def transform(self, ontology: str) -> bool:
status = False

# Relax
relaxed_outpath = os.path.join(self.output_dir, f"{ontology_name}_relaxed.owl")
relaxed_outpath = os.path.join(
self.output_dir,
f"{ontology_name}",
f"{version}",
f"{ontology_name}_relaxed.owl",
)
if not robot_relax(
robot_path=self.robot_path,
input_path=owl_output_path,
@@ -120,7 +130,9 @@ def transform(self, ontology: str) -> bool:

# Transform to KGX nodes + edges
txr = KGXTransformer(stream=True)
outfilename = os.path.join(self.output_dir, f"{ontology_name}")
outfilename = os.path.join(
self.output_dir, f"{ontology_name}", f"{version}", f"{ontology_name}"
)
nodefilename = outfilename + "_nodes.tsv"
edgefilename = outfilename + "_edges.tsv"
input_args = {
@@ -139,7 +151,9 @@ def transform(self, ontology: str) -> bool:
input_args=input_args,
output_args=output_args,
)
logging.info(f"Nodes and edges written to {nodefilename} and {edgefilename}.")
logging.info(
f"Nodes and edges written to {nodefilename} and {edgefilename}."
)
status = True
except Exception as e:
logging.error(f"Error transforming {ontology} to KGX nodes and edges: {e}")
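
The changes above build the same <output_dir>/<ontology_name>/<version>/ path three times. A small helper like this (hypothetical, not part of the commit) would keep the layout in one place:

import os


def versioned_path(output_dir: str, name: str, version: str, suffix: str) -> str:
    """Return <output_dir>/<name>/<version>/<name><suffix>."""
    return os.path.join(output_dir, name, version, f"{name}{suffix}")


# owl_output_path = versioned_path(self.output_dir, ontology_name, version, ".owl")
# relaxed_outpath = versioned_path(self.output_dir, ontology_name, version, "_relaxed.owl")
# outfilename = versioned_path(self.output_dir, ontology_name, version, "")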
