-
Notifications
You must be signed in to change notification settings - Fork 1
/
getJobStateFcn.m
108 lines (93 loc) · 3.63 KB
/
getJobStateFcn.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
function state = getJobStateFcn(cluster, job, state)
%GETJOBSTATEFCN Gets the state of a job from SLURM
%
% Set your cluster's GetJobStateFcn to this function using the following
% command:
%     set(cluster, 'GetJobStateFcn', @getJobStateFcn);
%
% Inputs:
%   cluster - parallel.Cluster object whose HasSharedFilesystem is true
%   job     - job whose cluster data (ClusterJobIDs) is looked up
%   state   - MATLAB's current state for the job
%
% Output:
%   state   - the state extracted from the sacct output, or the input
%             state unchanged when the job is already 'finished' or
%             'failed', has no cluster data, or sacct reports nothing
%             recognisable
%
% Errors:
%   parallelexamples:GenericSLURM:SubmitFcnError
%   parallelexamples:GenericSLURM:FailedToRetrieveJobID
%   parallelexamples:GenericSLURM:FailedToGetJobState

% Copyright 2010-2012 The MathWorks, Inc.

% Store the current filename for the errors, warnings and dctSchedulerMessages
currFilename = mfilename;
if ~isa(cluster, 'parallel.Cluster')
    error('parallelexamples:GenericSLURM:SubmitFcnError', ...
        'The function %s is for use with clusters created using the parcluster command.', currFilename)
end
if ~cluster.HasSharedFilesystem
    error('parallelexamples:GenericSLURM:SubmitFcnError', ...
        'The function %s is for use with shared filesystems.', currFilename)
end

% Shortcut if the job state is already finished or failed
jobInTerminalState = strcmp(state, 'finished') || strcmp(state, 'failed');
if jobInTerminalState
    return;
end

% Get the information about the actual cluster used
data = cluster.getJobClusterData(job);
if isempty(data)
    % This indicates that the job has not been submitted, so just return
    dctSchedulerMessage(1, '%s: Job cluster data was empty for job with ID %d.', currFilename, job.ID);
    return
end
try
    jobIDs = data.ClusterJobIDs;
catch err
    ex = MException('parallelexamples:GenericSLURM:FailedToRetrieveJobID', ...
        'Failed to retrieve cluster''s job IDs from the job cluster data.');
    ex = ex.addCause(err);
    throw(ex);
end
if isempty(jobIDs)
    % Nothing to ask the scheduler about. Querying with zero IDs would make
    % the "all jobs finished" comparison trivially true, so keep MATLAB's state.
    dctSchedulerMessage(1, '%s: No cluster job IDs were stored for job with ID %d.', currFilename, job.ID);
    return
end

% sacct's -j option takes a comma-separated list of job(.step) entries, so
% build "<id>.0,<id>.0,...". num2str leaves char IDs untouched and converts
% numeric ones.
jobSteps = cellfun(@(id) sprintf('%s.0', num2str(id)), jobIDs, 'UniformOutput', false);
jobList = sprintf('%s,', jobSteps{:});
jobList = jobList(1:end-1); % drop the trailing comma
commandToRun = sprintf('sacct -j %s -n -P -ostate', jobList);
dctSchedulerMessage(4, '%s: Querying cluster for job state using command:\n\t%s', currFilename, commandToRun);

try
    % We will ignore the status returned from the state command because
    % a non-zero status is returned if the job no longer exists
    % Make the shelled out call to run the command.
    [~, cmdOut] = system(commandToRun);
catch err
    ex = MException('parallelexamples:GenericSLURM:FailedToGetJobState', ...
        'Failed to get job state from cluster.');
    % MException is a value class: addCause returns the modified copy.
    ex = ex.addCause(err);
    throw(ex);
end

clusterState = iExtractJobState(cmdOut, numel(jobIDs));
dctSchedulerMessage(6, '%s: State %s was extracted from cluster output:\n\t%s', currFilename, clusterState, cmdOut);

% If we could determine the cluster's state, we'll use that, otherwise
% stick with MATLAB's job state.
if ~strcmp(clusterState, 'unknown')
    state = clusterState;
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function state = iExtractJobState(sacctOut, numJobs)
% Function to extract the job state from the output of sacct
%
% Inputs:
%   sacctOut - text printed by the sacct state query
%   numJobs  - number of cluster job IDs that were queried
%
% Output:
%   state    - 'finished', 'running', 'queued', 'failed' or 'unknown'
%
% Each count below is the number of occurrences of the listed state names
% anywhere in sacctOut.

% States treated as still waiting in the queue
numPending = numel(regexp(sacctOut, 'PENDING|SUSPENDED|COMPLETING|CONFIGURING|PREEMPTED'));
% States treated as running
numRunning = numel(regexp(sacctOut, 'RUNNING|UNKNOWN'));
% States treated as failed
numFailed = numel(regexp(sacctOut, 'FAILED|TIMEOUT'));
% States treated as finished. Slurm spells the state "CANCELLED" (often
% followed by " by <uid>"); the single-L spelling is still accepted.
numFinished = numel(regexp(sacctOut, 'COMPLETED|CANCELL?ED|NODE_FAIL|SPECIAL_EXIT'));

% If the number of finished jobs is the same as the number of jobs that we
% asked about then the entire job has finished.
if numFinished == numJobs
    state = 'finished';
    return;
end

% Any running indicates that the job is running
if numRunning > 0
    state = 'running';
    return
end

% We know numRunning == 0 so if there are some still pending then the
% job must be queued again, even if there are some finished
if numPending > 0
    state = 'queued';
    return
end

% Deal with any tasks that have failed
if numFailed > 0
    % Set this job to be failed
    state = 'failed';
    return
end

% Nothing recognisable in the output
state = 'unknown';