scontrol.py
1 |
#!/usr/bin/env python
|
---|---|
2 |
'''
|
3 |
Created on March 23rd 2016
|
4 |
|
5 |
@author: mkuemmel@usm.lmu.de
|
6 |
|
7 |
Date: $Date: 2018-05-24 10:25:02 +0200 (Thu, 24 May 2018) $
|
8 |
Revision: $Revision: 340 $
|
9 |
Author: $Author: Martin.Kuemmel $
|
10 |
'''
|
11 |
import os.path |
12 |
import subprocess |
13 |
|
14 |
def theNodeStatus():
    """
    Collect the status of the nodes in the 'usm-cl' partition via 'sinfo'.

    The command 'sinfo -p usm-cl -o "%n %t %e %m %O %C"' is run through
    execCommand(); its captured stdout is parsed row by row and both
    output files are deleted afterwards.

    Returns:
        dict which maps the first column of every row (used as node key)
        to a dict with the entries
          'state': the second column, exactly as printed
          'load':  the fifth column as float, or '---'
          'mem':   100 * column 3 / column 4, formatted with '%4.0f%%',
                   or '---'
          'cpu':   first and last element of the '/'-separated sixth
                   column, joined as 'first/last', or '---'
        The '---' placeholders are used for nodes in state 'down' and for
        rows whose numeric columns can not be evaluated.

    Raises:
        Exception: if the command finishes with a non-zero exit code
                   (the output files are left in place in that case).
    """
    # run the 'sinfo' command
    # to list all nodes (no quotes necessary for format string)
    comSeq = ['sinfo', '-p', 'usm-cl', '-o', '%n %t %e %m %O %C']
    allReturns = execCommand('sinfo', comSeq, prompt='execCommand>')
    if allReturns['retcode']:
        errMsg = 'Command: "%s" finished with exit code: %i!' % (comSeq, allReturns['retcode'])
        raise Exception(errMsg)

    # read in the complete output; the 'with' block makes sure
    # the file is closed before it gets deleted
    with open(allReturns['stdout']) as outFile:
        allLines = outFile.readlines()

    # destroy the output files
    os.unlink(allReturns['stdout'])
    os.unlink(allReturns['stderr'])

    allNodes = {}

    # skip the first line (presumably the column header printed by sinfo)
    for aLine in allLines[1:]:

        # isolate the items
        lineItems = aLine.split()
        if len(lineItems) < 2:
            # blank or truncated row, there is no node to report on
            continue

        # get key and state
        key = lineItems[0]
        state = lineItems[1]

        # start from the placeholders and replace
        # them only if the entire row can be evaluated
        fmStr = '---'
        load = '---'
        cpuUsed = '---'
        if state != 'down':
            try:
                fracMem = 100.0*float(lineItems[2])/float(lineItems[3])
                nodeLoad = float(lineItems[4])
                cpuParts = lineItems[5].split("/")
            except (IndexError, ValueError, ZeroDivisionError):
                # the node is not flagged as plain 'down' but delivers no
                # usable numbers (NOTE(review): expected e.g. for nodes
                # that do not respond - confirm against the sinfo output);
                # keep the placeholders instead of aborting the report
                pass
            else:
                fmStr = '%4.0f%s' % (fracMem, '%')
                load = nodeLoad
                cpuUsed = '%s/%s' % (cpuParts[0], cpuParts[-1])

        # append the info to the keylist
        allNodes[key] = {'state': state, 'load': load, 'mem': fmStr, 'cpu': cpuUsed}

    # return the node dictionary
    return allNodes
61 |
|
62 |
def execCommand(command, commSequence, prompt='execCommand>', verbose=True):
    """
    Run an external command and capture its stdout and stderr in files.

    The output is written to '<command>.out' and '<command>.err' in the
    current working directory. Files with these names that exist already
    are removed before the command is started.

    Args:
        command:      label from which the two output file names are built
        commSequence: list with the executable and its arguments, handed
                      to subprocess.call() without a shell
        prompt:       not used in here; kept such that existing calls
                      with this keyword continue to work
        verbose:      if true, the blank-joined command line is written
                      as first line into the stderr file

    Returns:
        dict with the entries
          'retcode': exit code of the command
          'stdout':  name of the file with the captured stdout
          'stderr':  name of the file with the captured stderr

    Raises:
        Whatever subprocess.call() raises if the command can not be
        started; the output files are closed in that case as well.
    """
    # make stdout and stderr files ready
    stdoutFile = '%s.out' % command
    stderrFile = '%s.err' % command
    for oldFile in (stdoutFile, stderrFile):
        if os.path.isfile(oldFile):
            os.unlink(oldFile)

    # the 'with' block closes both files on every exit path,
    # also if the command can not be started at all
    with open(stdoutFile, "w+") as sout, open(stderrFile, "w+") as serr:

        # print the entire command
        if verbose:
            allCommand = ' '.join(commSequence)
            serr.write(allCommand+'\n')
            # flush such that the line precedes the output of the command
            serr.flush()

        # run the command
        retcode = subprocess.call(commSequence, shell=False, env=None, stdout=sout, stderr=serr)

    # give back the return
    return {'retcode': retcode, 'stdout': stdoutFile, 'stderr': stderrFile}
90 |
|
91 |
def _parseScontrolJobs(lines):
    """
    Convert the lines of a 'scontrol show job' output into job dicts.

    Every line which contains 'JobId' opens a new job. All lines are cut
    into blank-separated tokens and every token again into comma-separated
    parts; a part of the form 'key=value' is stored in the current job
    (only the text up to a second '=' is kept as value, hence
    'TRES=cpu=4,mem=16G' results in the keys 'TRES' and 'mem').

    Returns:
        list with one dict per job, in the order of appearance
    """
    allJobs = []
    oneJob = None
    for aLine in lines:

        # identify a new job and open a new dict for it
        if 'JobId' in aLine:
            oneJob = {}
            allJobs.append(oneJob)

        # text in front of the first job belongs to no job at all
        if oneJob is None:
            continue

        # split at the blanks first and only then at ','; splitting a
        # line with both separators at ',' alone would glue several
        # 'key=value' pairs together and spoil the values
        for aToken in aLine.split():
            for aPart in aToken.split(','):
                # identify keywords and append to the dict
                jobParts = aPart.split('=')
                if len(jobParts) < 2:
                    continue
                oneJob[jobParts[0]] = jobParts[1]

    return allJobs


def getScontrolList():
    """
    List the running and pending jobs of the 'usm-cl' partition.

    The command 'scontrol show job' is run through execCommand(), its
    captured stdout is parsed and both output files are deleted
    afterwards. Jobs in other partitions or in other states are not
    reported.

    Returns:
        dict with the entries
          'usm-clRun':  list of job dicts with JobState 'RUNNING'
          'usm-clPend': list of job dicts with JobState 'PENDING'
        Every job dict holds the 'key=value' pairs found for that job.

    Raises:
        Exception: if the command finishes with a non-zero exit code
                   (the output files are left in place in that case).
    """
    # run the 'scontrol' command
    # to list all jobs
    comSeq = ['scontrol', 'show', 'job']
    allReturns = execCommand('scontrol', comSeq, prompt='execCommand>')
    if allReturns['retcode']:
        errMsg = 'Command: "%s" finished with exit code: %i!' % (comSeq, allReturns['retcode'])
        raise Exception(errMsg)

    # go over all lines of the output file;
    # the 'with' block closes the file before it gets deleted
    with open(allReturns['stdout']) as outFile:
        allJobs = _parseScontrolJobs(outFile)

    # destroy the output files
    os.unlink(allReturns['stdout'])
    os.unlink(allReturns['stderr'])

    # sort the jobs of the partition according to their state;
    # jobs without partition or state information are left out
    dbRunJobs = []
    dbPendJobs = []
    for oneJob in allJobs:
        if oneJob.get('Partition') != 'usm-cl':
            continue
        jobState = oneJob.get('JobState')
        if jobState == 'RUNNING':
            dbRunJobs.append(oneJob)
        elif jobState == 'PENDING':
            dbPendJobs.append(oneJob)

    # return the various job lists
    return {'usm-clRun': dbRunJobs, 'usm-clPend': dbPendJobs}
162 |
|
163 |
def _memToGb(memText):
    """
    Convert a memory string such as '16G', '2000M' or '4000' to Gb.

    A trailing 'K', 'M', 'G' or 'T' is taken as unit, a number without
    unit is taken as Mb. The conversion uses factors of 1000.

    Raises:
        ValueError: if the numeric part can not be converted
    """
    if memText[-1:] in ('K', 'M', 'G', 'T'):
        memValue = float(memText[:-1])
        memUnit = memText[-1]
    else:
        memValue = float(memText)
        memUnit = 'M'

    # convert to Gb if necessary
    if memUnit == 'K':
        return memValue / 1000000.0
    if memUnit == 'M':
        return memValue / 1000.0
    if memUnit == 'T':
        return memValue * 1000.0
    return memValue


def _jobMemoryInGb(oneJob):
    """
    Return the memory of a job in Gb, or 0.0 if the job has no memory key.

    The keys 'mem', 'MinMemoryNode' and 'MinMemoryCPU' are tried in that
    order, since the information differs for pending and running jobs.
    """
    for memKey in ('mem', 'MinMemoryNode', 'MinMemoryCPU'):
        if memKey in oneJob:
            return _memToGb(oneJob[memKey])
    return 0.0


def _jobMemoryAsText(oneJob):
    """
    Return the unconverted memory string of a job, or '---' if there is none.
    """
    for memKey in ('MinMemoryNode', 'mem', 'MinMemoryCPU'):
        if memKey in oneJob:
            return str(oneJob[memKey])
    return '---'


def getUserInfo(jobLists):
    """
    Combine the jobs in every job list to one summary per user.

    Args:
        jobLists: dict which maps a list name to a list of job dicts; the
                  job dicts need the keys 'JobId', 'UserId' and 'NumCPUs'
                  and may have 'mem', 'MinMemoryNode' or 'MinMemoryCPU'

    Returns:
        dict which maps every non-empty list name to a dict of user
        summaries. A summary has the entries 'hasNCpu', 'njobs', 'ncpu',
        'memsize' and 'jobids'.
        Jobs with an integer 'NumCPUs' and a convertible memory are stored
        under the plain 'UserId' with 'hasNCpu' True, 'ncpu' as summed
        integer and 'memsize' as summed float in Gb.
        All other jobs are stored under the 'UserId' with an 'S' appended
        and 'hasNCpu' False; 'ncpu' and 'memsize' are then comma-separated
        strings of the unconverted values.
    """
    # go over all lists
    allLists = {}
    for aJob in jobLists:

        # skip empty lists
        if len(jobLists[aJob]) < 1:
            continue

        # make a dict for the users
        allUsers = {}
        for oneJob in jobLists[aJob]:
            # try getting the number of CPU's and the memory
            # as numbers; the user ID is defined accordingly
            try:
                nCpu = int(oneJob['NumCPUs'])
                userMem = _jobMemoryInGb(oneJob)
                hasNCpu = True
            except ValueError:
                hasNCpu = False

            if hasNCpu:
                # append the job to an ID
                # or create a new ID
                userId = oneJob['UserId']
                if userId not in allUsers:
                    allUsers[userId] = {'hasNCpu': True, 'njobs': 1, 'ncpu': nCpu, 'memsize': userMem, 'jobids': [oneJob['JobId']]}
                else:
                    allUsers[userId]['njobs'] += 1
                    allUsers[userId]['ncpu'] += nCpu
                    allUsers[userId]['memsize'] += userMem
                    allUsers[userId]['jobids'].append(oneJob['JobId'])
            else:
                # no numbers available, the values are collected as text
                userId = '%sS' % oneJob['UserId']
                memText = _jobMemoryAsText(oneJob)
                if userId not in allUsers:
                    allUsers[userId] = {'hasNCpu': False, 'njobs': 1, 'ncpu': oneJob['NumCPUs'], 'memsize': memText, 'jobids': [oneJob['JobId']]}
                else:
                    allUsers[userId]['njobs'] += 1
                    allUsers[userId]['ncpu'] += ',%s' % str(oneJob['NumCPUs'])
                    allUsers[userId]['memsize'] += ',%s' % memText
                    allUsers[userId]['jobids'].append(oneJob['JobId'])

        # append the list the combined list
        allLists[aJob] = allUsers

    # return the combined list
    return allLists
232 |
|
233 |
def printUserJobList(userJobLists):
    """
    Print one line per user for every queue in the user job lists.

    Args:
        userJobLists: dict which maps a queue name to a dict of user
                      summaries with the entries 'hasNCpu', 'njobs',
                      'ncpu' and 'memsize'

    Summaries with a true 'hasNCpu' are printed with numeric formats for
    'ncpu' and 'memsize', all others with these two values as strings.
    An empty line is printed after every queue.
    """
    numberFormat = '%10s> %-15s njobs: %3i ncpus: %4i memory: %6.1fG'
    stringFormat = '%10s> %-15s njobs: %3i ncpus: %s memory: %s'

    # go over all queues
    for queueName, queueUsers in userJobLists.items():

        # print all users for a queue
        for userName, userSummary in queueUsers.items():
            lineFormat = numberFormat if userSummary['hasNCpu'] else stringFormat
            lineValues = (queueName, userName, userSummary['njobs'], userSummary['ncpu'], userSummary['memsize'])
            print(lineFormat % lineValues)

        # print a separator
        print('')
247 |
|
248 |
def doScontrol():
    """
    Print the per-user job summary.

    The job lists delivered by getScontrolList() are condensed with
    getUserInfo() and the result is handed to printUserJobList().
    """
    # jobs -> user information -> printed report
    printUserJobList(getUserInfo(getScontrolList()))
260 |
|
261 |
def checkNodes(nodeNames=None):
    """
    Print a status table for a list of cluster nodes.

    The node information comes from theNodeStatus(). Every requested node
    which is found there is printed with its state, CPU slots, memory and
    load; requested nodes which are not found are left out. The table
    ends with the sums of the used and the total CPUs of the printed
    nodes.

    Args:
        nodeNames: names of the nodes to report on, in printing order;
                   if None, the eight 'usm-cl-bt0[12]n[1-4]' nodes are used
    """
    # the list of nodes to check for
    if nodeNames is None:
        nodeNames = ['usm-cl-bt01n1', 'usm-cl-bt01n2', 'usm-cl-bt01n3', 'usm-cl-bt01n4', 'usm-cl-bt02n1', 'usm-cl-bt02n2', 'usm-cl-bt02n3', 'usm-cl-bt02n4']

    allNodes = theNodeStatus()

    # NOTE(review): the title row is shorter than the two border rows,
    # the box does not line up - confirm the intended padding
    print("\n"
          "+-----------------------------------------------+\n"
          "+ USM cluster at the physics depart. +\n"
          "+-----------------------------------------------+")

    rowFormat = "%15s%8s%8s%8s%8s |"
    print(rowFormat % ("Node: ", "Status", "Slots", " Mem ", "Load"))
    print(rowFormat % ("----------", "------", "-----", " --- ", "----"))

    numTot = 0
    numUsed = 0
    for oneNode in nodeNames:
        if oneNode not in allNodes:
            continue
        nodeInfo = allNodes[oneNode]
        print(rowFormat % (oneNode.ljust(10), nodeInfo['state'], nodeInfo['cpu'], nodeInfo['mem'], nodeInfo['load']))

        # only an entry of the form 'used/total' goes into the
        # sums, a placeholder without '/' is left out
        if '/' in nodeInfo['cpu']:
            procInfo = nodeInfo['cpu'].split('/')
            numUsed += int(procInfo[0])
            numTot += int(procInfo[1])

    print('CPUs used/total: % 3i/%03i |' % (numUsed, numTot))
    print("+-----------------------------------------------+\n")
287 |
|
288 |
def main():
    """
    Print the node status table followed by the per-user job summary.
    """

    checkNodes()
    doScontrol()
292 |
|
293 |
# run the report only if the file is executed as a script
if __name__ == '__main__':
    main()