MOON
Server: Apache
System: Linux server1.studioinfinity.com.br 2.6.32-954.3.5.lve1.4.90.el6.x86_64 #1 SMP Tue Feb 21 12:26:30 UTC 2023 x86_64
User: artinside (517)
PHP: 7.4.33
Disabled: exec,passthru,shell_exec,system
Upload Files
File: //opt/netdata/usr/local/local/lib/netdata/conf.d/health.d/dcgm.conf
# DCGM GPU reliability alerts.

 # Alert template attached to every chart with context dcgm.gpu.reliability.xid.
 # Every 30s it looks up the maximum (as an absolute value) of the `xid`
 # dimension over the last minute and raises WARNING while that value is
 # above 0. No critical threshold is defined.
 # NOTE(review): the value is presumably the XID code reported by the driver
 # (units: code), so any non-zero code triggers the alert - confirm against
 # the collector.
 # Notifications are delayed 30s on escalation and 5m on recovery; the delay
 # grows by a factor of 1.5 while the alert keeps changing state, capped at 1h.
 template: dcgm_gpu_xid_errors
       on: dcgm.gpu.reliability.xid
    class: Errors
     type: GPU
component: NVIDIA
   lookup: max -1m unaligned absolute of xid
    units: code
    every: 30s
     warn: $this > 0
    delay: up 30s down 5m multiplier 1.5 max 1h
  summary: DCGM reported XID error on GPU ${label:gpu}
     info: NVIDIA driver reported a GPU XID error (metric ${label:chart_context}).
       to: sysadmin

 # Alert template attached to every chart with context
 # dcgm.gpu.reliability.row_remap_status.
 # Every 30s it looks up the maximum (as an absolute value) of the
 # `row_remap_failure` dimension over the last minute and raises WARNING while
 # that value is above 0. No critical threshold is defined.
 # NOTE(review): the dimension is presumably a 0/1 flag (units: state) - confirm
 # against the collector.
 # Notifications are delayed 30s on escalation and 5m on recovery; the delay
 # grows by a factor of 1.5 while the alert keeps changing state, capped at 1h.
 template: dcgm_gpu_row_remap_failure
       on: dcgm.gpu.reliability.row_remap_status
    class: Errors
     type: GPU
component: NVIDIA
   lookup: max -1m unaligned absolute of row_remap_failure
    units: state
    every: 30s
     warn: $this > 0
    delay: up 30s down 5m multiplier 1.5 max 1h
  summary: DCGM row remap failure on GPU ${label:gpu}
     info: Row remapping has failed, indicating a persistent memory reliability problem.
       to: sysadmin

 # Alert template attached to every chart with context
 # dcgm.gpu.reliability.row_remap_events.
 # Every 30s it sums the `uncorrectable_remapped_rows` dimension (absolute
 # values) over the last 5 minutes and raises WARNING while the sum is above 0.
 # No critical threshold is defined.
 # NOTE(review): summing only reflects *new* events if the dimension is stored
 # as a per-interval delta rather than a running total - confirm against the
 # collector.
 # Notifications are delayed 30s on escalation and 10m on recovery; the delay
 # grows by a factor of 1.5 while the alert keeps changing state, capped at 1h.
 template: dcgm_gpu_uncorrectable_remapped_rows
       on: dcgm.gpu.reliability.row_remap_events
    class: Errors
     type: GPU
component: NVIDIA
   lookup: sum -5m unaligned absolute of uncorrectable_remapped_rows
    units: rows
    every: 30s
     warn: $this > 0
    delay: up 30s down 10m multiplier 1.5 max 1h
  summary: DCGM uncorrectable remapped rows on GPU ${label:gpu}
     info: New uncorrectable row remap events were detected in the last 5 minutes.
       to: sysadmin

# DCGM throttle violation alerts.

 # Alert template attached to every chart with context
 # dcgm.gpu.throttle.violations (the same context the thermal-violation
 # template in this file uses; this one reads the `power_violation` dimension).
 # Every 30s it sums `power_violation` (absolute values) over the last
 # 5 minutes and raises WARNING while the sum is above 0, i.e. on any
 # throttled time in the window. No critical threshold is defined.
 # Notifications are delayed 1m on escalation and 10m on recovery; the delay
 # grows by a factor of 1.5 while the alert keeps changing state, capped at 1h.
 template: dcgm_gpu_power_violation
       on: dcgm.gpu.throttle.violations
    class: Workload
     type: GPU
component: NVIDIA
   lookup: sum -5m unaligned absolute of power_violation
    units: milliseconds
    every: 30s
     warn: $this > 0
    delay: up 1m down 10m multiplier 1.5 max 1h
  summary: DCGM power throttling detected on GPU ${label:gpu}
     info: The GPU was power-throttled during the last 5 minutes.
       to: sysadmin

 # Alert template attached to every chart with context
 # dcgm.gpu.throttle.violations (the same context the power-violation
 # template in this file uses; this one reads the `thermal_violation` dimension).
 # Every 30s it sums `thermal_violation` (absolute values) over the last
 # 5 minutes and raises WARNING while the sum is above 0, i.e. on any
 # throttled time in the window. No critical threshold is defined.
 # Notifications are delayed 1m on escalation and 10m on recovery; the delay
 # grows by a factor of 1.5 while the alert keeps changing state, capped at 1h.
 template: dcgm_gpu_thermal_violation
       on: dcgm.gpu.throttle.violations
    class: Workload
     type: GPU
component: NVIDIA
   lookup: sum -5m unaligned absolute of thermal_violation
    units: milliseconds
    every: 30s
     warn: $this > 0
    delay: up 1m down 10m multiplier 1.5 max 1h
  summary: DCGM thermal throttling detected on GPU ${label:gpu}
     info: The GPU was thermally throttled during the last 5 minutes.
       to: sysadmin