Spaces:
Running
Running
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import subprocess | |
import time | |
import traceback | |
from xml.dom import minidom | |
def collect_gpu_usage(node_id):
    """Run ``nvidia-smi -q -x`` and return the parsed GPU metrics.

    Args:
        node_id: Not used by this function; kept so existing callers that
            pass it keep working.

    Returns:
        The dictionary produced by ``parse_nvidia_smi_result`` for the
        command output. If running the command (or the parser call itself)
        raises, the traceback is printed and the result of
        ``gen_empty_gpu_metric`` is returned instead.
    """
    command = ['nvidia-smi', '-q', '-x']
    try:
        return parse_nvidia_smi_result(subprocess.check_output(command))
    except Exception:
        # Best-effort collection: a missing or failing nvidia-smi must not
        # crash the caller, so report the failure and fall back to the
        # "no GPUs" metric.
        traceback.print_exc()
        return gen_empty_gpu_metric()
def _leaf_text(parent, *tag_path):
    """Return the text found by following *tag_path* below *parent*.

    Each step moves to the first descendant element with the given tag
    name; the result is the ``data`` of the final element's first child
    node. Lookup failures (missing tag, empty element) propagate to the
    caller.
    """
    node = parent
    for tag in tag_path:
        node = node.getElementsByTagName(tag)[0]
    return node.childNodes[0].data


def parse_nvidia_smi_result(smi):
    """Convert nvidia-smi XML output into a GPU metrics dictionary.

    Args:
        smi: XML document (anything ``minidom.parseString`` accepts)
            containing one ``<gpu>`` element per device.

    Returns:
        A dict with the keys ``"Timestamp"`` (local time formatted by
        ``time.asctime``), ``"gpuCount"`` (number of ``<gpu>`` elements) and
        ``"gpuInfos"`` (one dict per GPU, in document order). Each GPU dict
        holds ``index``, ``gpuUtil``, ``gpuMemUtil``, ``activeProcessNum``,
        ``gpuType``, ``gpuMemTotal``, ``gpuMemUsed`` and ``gpuMemFree``.
        Utilization values are strings with ``%`` removed, memory values are
        strings with ``MiB`` removed, and ``activeProcessNum`` is an int.

        If anything goes wrong while parsing, the traceback is printed and
        an empty dict is returned.
    """
    try:
        gpus = minidom.parseString(smi).getElementsByTagName('gpu')
        output = {
            "Timestamp": time.asctime(time.localtime()),
            "gpuCount": len(gpus),
            "gpuInfos": [],
        }
        for index, gpu in enumerate(gpus):
            # Active processes are counted as <process_info> entries under
            # the GPU's first <processes> element.
            processes = gpu.getElementsByTagName('processes')[0]
            output["gpuInfos"].append({
                'index': index,
                'gpuUtil': _leaf_text(gpu, 'utilization', 'gpu_util')
                .replace("%", "").strip(),
                'gpuMemUtil': _leaf_text(gpu, 'utilization', 'memory_util')
                .replace("%", "").strip(),
                'activeProcessNum': len(
                    processes.getElementsByTagName('process_info')),
                'gpuType': _leaf_text(gpu, 'product_name'),
                'gpuMemTotal': _leaf_text(gpu, 'fb_memory_usage', 'total')
                .replace("MiB", "").strip(),
                'gpuMemUsed': _leaf_text(gpu, 'fb_memory_usage', 'used')
                .replace("MiB", "").strip(),
                'gpuMemFree': _leaf_text(gpu, 'fb_memory_usage', 'free')
                .replace("MiB", "").strip(),
            })
    except Exception:
        # A malformed or unexpected document discards the whole result
        # rather than returning partially filled metrics.
        traceback.print_exc()
        output = {}
    return output
def gen_empty_gpu_metric():
    """Build the metrics dictionary that reports zero GPUs.

    Returns:
        A dict with ``"Timestamp"`` (local time formatted by
        ``time.asctime``), ``"gpuCount"`` set to 0 and ``"gpuInfos"`` set to
        a new empty list. If building it raises, the traceback is printed
        and an empty dict is returned.
    """
    try:
        output = {
            "Timestamp": time.asctime(time.localtime()),
            "gpuCount": 0,
            "gpuInfos": [],
        }
    except Exception:
        traceback.print_exc()
        output = {}
    return output