Skip to content

Commit

Permalink
Fix handling of the N/A power value when nvidia-smi reports no power usage
Browse files Browse the repository at this point in the history
  • Loading branch information
bigsk1 committed Nov 25, 2024
1 parent e6ff52a commit 2539ca2
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 37 deletions.
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -366,4 +366,8 @@ If this failed proceed to [Installation Prerequisites](#installation-prerequisit


## License
[![License](https://img.shields.io/github/license/bigsk1/gpu-monitor)](https://github.com/bigsk1/gpu-monitor/blob/main/LICENSE)
[![License](https://img.shields.io/github/license/bigsk1/gpu-monitor)](https://github.com/bigsk1/gpu-monitor/blob/main/LICENSE)

Docker Scout Score

![Docker Scout](https://imagedelivery.net/WfhVb8dSNAAvdXUdMfBuPQ/969b5bc9-6c2c-44c7-32a8-359287378a00/public)
67 changes: 37 additions & 30 deletions gpu-stats.html
Original file line number Diff line number Diff line change
Expand Up @@ -858,14 +858,14 @@ <h2>Alert Settings</h2>
});

function updateGaugeColor(value, element, max = 100) {
// Handle N/A or invalid values
if (value === 'N/A' || value === null || isNaN(value)) {
value = 0;
}
const percentage = (value / max) * 100;
let colorClass = 'success';

if (percentage > 70) {
colorClass = 'danger';
} else if (percentage > 50) {
colorClass = 'warning';
}
if (percentage > 70) colorClass = 'danger';
else if (percentage > 50) colorClass = 'warning';

element.className = 'gauge-fill ' + colorClass;
element.style.width = Math.min(percentage, 100) + '%';
Expand Down Expand Up @@ -932,20 +932,16 @@ <h2>Alert Settings</h2>
console.warn('No data available');
return;
}

// Calculate cutoff time with explicit handling of time frames

const cutoff = new Date();
if (hours === 0.25) {
cutoff.setMinutes(cutoff.getMinutes() - 15);
console.log('Setting 15 minute timeframe');
} else if (hours === 0.5) {
cutoff.setMinutes(cutoff.getMinutes() - 30);
console.log('Setting 30 minute timeframe');
} else {
cutoff.setHours(cutoff.getHours() - hours);
console.log(`Setting ${hours} hour timeframe`);
}

const currentYear = new Date().getFullYear();
const filteredData = {
timestamps: [],
Expand All @@ -959,10 +955,11 @@ <h2>Alert Settings</h2>
const dt = new Date(`${currentYear} ${timestamp}`);
if (dt >= cutoff) {
filteredData.timestamps.push(timestamp);
filteredData.temperatures.push(data.temperatures[i]);
filteredData.utilizations.push(data.utilizations[i]);
filteredData.memory.push(data.memory[i]);
filteredData.power.push(data.power[i]);
// Ensure all values are valid numbers
filteredData.temperatures.push(Number(data.temperatures[i]) || 0);
filteredData.utilizations.push(Number(data.utilizations[i]) || 0);
filteredData.memory.push(Number(data.memory[i]) || 0);
filteredData.power.push(Number(data.power[i]) || 0);
}
});

Expand All @@ -972,22 +969,26 @@ <h2>Alert Settings</h2>
chart.data.datasets[1].data = filteredData.utilizations;
chart.data.datasets[2].data = filteredData.memory;
chart.data.datasets[3].data = filteredData.power;
chart.update();
chart.update('none'); // 'none' mode applies the new data without animating the transition

// Update performance indicators
document.getElementById('peak-temp').textContent =
`${Math.max(...filteredData.temperatures)}°C`;
document.getElementById('avg-util').textContent =
`${(filteredData.utilizations.reduce((a, b) => a + b, 0) / filteredData.utilizations.length).toFixed(1)}%`;
document.getElementById('max-mem').textContent =
`${Math.max(...filteredData.memory)} MiB`;

const avgPower = filteredData.power.reduce((a, b) => a + b, 0) / filteredData.power.length;
const avgUtil = filteredData.utilizations.reduce((a, b) => a + b, 0) / filteredData.utilizations.length || 1;
document.getElementById('power-efficiency').textContent =
`${(avgPower / avgUtil).toFixed(1)} W/%`;
// performance indicators
if (filteredData.timestamps.length > 0) {
document.getElementById('peak-temp').textContent =
`${Math.max(...filteredData.temperatures)}°C`;
document.getElementById('avg-util').textContent =
`${(filteredData.utilizations.reduce((a, b) => a + b, 0) / filteredData.utilizations.length).toFixed(1)}%`;
document.getElementById('max-mem').textContent =
`${Math.max(...filteredData.memory)} MiB`;

const avgPower = filteredData.power.reduce((a, b) => a + b, 0) / filteredData.power.length;
const avgUtil = filteredData.utilizations.reduce((a, b) => a + b, 0) / filteredData.utilizations.length || 1;
document.getElementById('power-efficiency').textContent =
`${(avgPower / avgUtil).toFixed(1)} W/%`;
}
})
.catch(error => console.error('Error loading historical data:', error));
.catch(error => {
console.error('Error updating chart:', error);
});
}

function updateTimeframeStats(data) {
Expand All @@ -1010,6 +1011,12 @@ <h2>Alert Settings</h2>

function smoothlyUpdateText(element, newValue, unit = '') {
if (element) {
// Handle N/A or invalid values
if (newValue === 'N/A' || newValue === null || isNaN(newValue)) {
element.textContent = '--' + unit;
return;
}

// Special handling for MiB values - no decimals
if (unit.includes('MiB')) {
element.textContent = `${Math.round(newValue)}${unit}`;
Expand Down
33 changes: 27 additions & 6 deletions monitor_gpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -92,12 +92,17 @@ data_buffer = []
for line in sys.stdin:
try:
timestamp, temp, util, mem, power = line.strip().split(',')
try:
power_val = float(power) if power.strip() != 'N/A' else 0
except (ValueError, AttributeError):
power_val = 0
data_buffer.append({
"timestamp": timestamp,
"temperature": float(temp),
"utilization": float(util),
"memory": float(mem),
"power": float(power)
"power": power_val
})
if len(data_buffer) >= BATCH_SIZE:
Expand Down Expand Up @@ -144,7 +149,7 @@ process_24hr_stats() {
return
fi

cat > /tmp/process_stats.py << 'EOF'
cat > /tmp/process_stats.py << 'EOF'
import sys
from datetime import datetime, timedelta
import json
Expand All @@ -166,22 +171,31 @@ for line in sys.stdin:
temp = float(temp)
util = float(util)
mem = float(mem)
power = float(power)
# Handle N/A power values
try:
power = float(power) if power.strip() != 'N/A' else 0
except (ValueError, AttributeError):
power = 0
temp_min = min(temp_min, temp)
temp_max = max(temp_max, temp)
util_min = min(util_min, util)
util_max = max(util_max, util)
mem_min = min(mem_min, mem)
mem_max = max(mem_max, mem)
power_min = min(power_min, power)
power_max = max(power_max, power)
if power > 0: # Only update power min/max if power is reported
power_min = min(power_min, power)
power_max = max(power_max, power)
except:
continue
# Handle case where no data was processed
if temp_min == float('inf'):
temp_min = temp_max = util_min = util_max = mem_min = mem_max = power_min = power_max = 0
temp_min = temp_max = util_min = util_max = mem_min = mem_max = 0
# Special handling for power stats when not available
if power_min == float('inf') or power_max == float('-inf'):
power_min = power_max = 0
stats = {
"stats": {
Expand Down Expand Up @@ -238,6 +252,8 @@ rotate_logs() {
update_stats() {
# Get current stats
local timestamp=$(date '+%m-%d %H:%M:%S')
# For testing, replace the nvidia-smi command with:
# local gpu_stats="44, 0, 3, N/A"
local gpu_stats=$(nvidia-smi --query-gpu=temperature.gpu,utilization.gpu,memory.used,power.draw \
--format=csv,noheader,nounits 2>/dev/null)

Expand All @@ -251,6 +267,11 @@ update_stats() {
local mem=$(echo "$gpu_stats" | cut -d',' -f3 | tr -d ' ')
local power=$(echo "$gpu_stats" | cut -d',' -f4 | tr -d ' ')

# Handle N/A power value
if [[ "$power" == "N/A" || -z "$power" ]]; then
power="0" # Using 0 as default for N/A power values
fi

cat > "$JSON_FILE" << EOF
{
"timestamp": "$timestamp",
Expand Down

0 comments on commit 2539ca2

Please sign in to comment.