Skip to content

Commit

Permalink
Generalize config loading code in cluster-checker (#58)
Browse files Browse the repository at this point in the history
Support loading the operator config yaml for MLBatch on vanilla
Kubernetes and on appwrapper dev clusters.
  • Loading branch information
dgrove-oss authored Sep 13, 2024
1 parent 077e046 commit b5ab2e7
Showing 1 changed file with 21 additions and 5 deletions.
26 changes: 21 additions & 5 deletions tools/cluster-checker/checker.js
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,24 @@ class Client {
return res.body
}

async readOperatorConfig() {
const options = [
{ns: 'redhat-ods-applications', cm: 'codeflare-operator-config', key: 'config.yaml', f: cm => cm.appwrapper?.Config },
{ns: 'mlbatch-system', cm: 'appwrapper-operator-config', key: 'config.yaml', f: cm => cm.appwrapper },
{ns: 'appwrapper-system', cm: 'appwrapper-operator-config', key: 'config.yaml', f: cm => cm.appwrapper }
]
for (const opt of options) {
try {
const configMap = await this.readConfigMap(opt.cm, opt.ns)
const cm = k8s.loadYaml(configMap.data[opt.key])
return opt.f(cm)
} catch (error) {
}
}
console.log('WARNING: Failed to read operator config')
return {}
}

async clusterQueues () {
const res = await this.custom.listClusterCustomObject(
'kueue.x-k8s.io',
Expand Down Expand Up @@ -184,11 +202,9 @@ async function main () {
let quotaGPUs = 0 // nominal GPU quota (excluding slack queue)
let slackGPUs = 0 // lending limit on slack queue

// load codeflare operator configuration
const configMap = await client.readConfigMap('codeflare-operator-config', 'redhat-ods-applications')
const config = k8s.loadYaml(configMap.data['config.yaml'])
const taints = config.appwrapper?.Config?.autopilot?.resourceTaints?.['nvidia.com/gpu']
const slackQueueName = config.appwrapper?.Config?.slackQueueName
const config = await client.readOperatorConfig()
const taints = config.autopilot?.resourceTaints?.['nvidia.com/gpu']
const slackQueueName = config.slackQueueName

// compute GPU counts
const nodes = await client.nodes()
Expand Down

0 comments on commit b5ab2e7

Please sign in to comment.