Skip to content

Commit

Permalink
Add a couple more metrics.
Browse files Browse the repository at this point in the history
  • Loading branch information
pcarranza committed May 6, 2018
1 parent 2081d21 commit 5941a62
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 28 deletions.
19 changes: 11 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,13 @@ recording metrics from any hosts regardless of it being the leader or not.
|nomad_client_errors_total | Number of errors that were accounted for. | |
|nomad_leader | Whether the current host is the cluster leader. | |
|nomad_jobs_total | How many jobs are there in the cluster. | |
|nomad_node_info | Node information. | name, version, class, status, drain, datacenter, scheduling_eligibility |
|nomad_raft_peers | How many peers (servers) are in the Raft cluster. | |
|nomad_serf_lan_members | How many members are in the cluster. | |
|nomad_serf_lan_member_status | Describe member state. | datacenter, class, node, drain |
|nomad_allocation | Allocation labeled with runtime information. | status, desired_status, job_type, job_id, task_group, node_id, node |
|nomad_allocation | Allocation labeled with runtime information. | status, desired_status, job_type, job_id, task_group, node |
|nomad_evals_total | The number of evaluations. | status |
|nomad_tasks_total | The number of tasks. | state, failed, job_type, node_id, node |
|nomad_tasks_total | The number of tasks. | state, failed, job_type, node |
|nomad_deployments_total | The number of deployments. | status, job_id |
|nomad_deployment_task_group_desired_canaries_total | The number of desired canaries for the task group. | job_id, deployment_id, task_group, promoted, auto_revert |
|nomad_deployment_task_group_desired_total | The number of desired allocs for the task group. | job_id, deployment_id, task_group, promoted, auto_revert |
Expand All @@ -56,12 +57,14 @@ recording metrics from any hosts regardless of it being the leader or not.
|nomad_task_cpu_total_ticks | Task CPU total ticks. | job, group, alloc, region, datacenter, node, task |
|nomad_task_cpu_percent | Task CPU usage percent. | job, group, alloc, region, datacenter, node, task |
|nomad_task_memory_rss_bytes | Task memory RSS usage in bytes. | job, group, alloc, region, datacenter, node, task |
|nomad_node_resource_memory_bytes | Amount of allocatable memory the node has in bytes. | node_id, node, datacenter |
|nomad_node_allocated_memory_bytes | Amount of memory allocated to tasks on the node in bytes. | node_id, node, datacenter |
|nomad_node_used_memory_bytes | Amount of memory used on the node in bytes. | node_id, node, datacenter |
|nomad_node_resource_cpu_megahertz | Amount of allocatable CPU the node has in MHz. | node_id, node, datacenter |
|nomad_node_allocated_cpu_megahertz | Amount of allocated CPU on the node in MHz. | node_id, node, datacenter |
|nomad_node_used_cpu_megahertz | Amount of CPU used on the node in MHz. | node_id, node, datacenter |
|nomad_node_resource_memory_bytes | Amount of allocatable memory the node has in bytes. | node, datacenter |
|nomad_node_allocated_memory_bytes | Amount of memory allocated to tasks on the node in bytes. | node, datacenter |
|nomad_node_used_memory_bytes | Amount of memory used on the node in bytes. | node, datacenter |
|nomad_node_resource_cpu_megahertz | Amount of allocatable CPU the node has in MHz. | node, datacenter |
|nomad_node_resource_iops | Amount of allocatable IOPS the node has. | node, datacenter |
|nomad_node_resource_disk_bytes | Amount of allocatable disk bytes the node has. | node, datacenter |
|nomad_node_allocated_cpu_megahertz | Amount of allocated CPU on the node in MHz. | node, datacenter |
|nomad_node_used_cpu_megahertz | Amount of CPU used on the node in MHz. | node, datacenter |

## Usage

Expand Down
69 changes: 49 additions & 20 deletions nomad-exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,12 @@ var (
"How many peers (servers) are in the Raft cluster.",
nil, nil,
)
nodeInfo = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "node_info"),
"Node information",
[]string{"name", "version", "class", "status", "drain", "datacenter", "scheduling_eligibility"},
nil,
)
serfLanMembers = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "serf_lan_members"),
"How many members are in the cluster.",
Expand All @@ -59,12 +65,12 @@ var (
"How many jobs are there in the cluster.",
nil, nil,
)
allocationMemotyBytes = prometheus.NewDesc(
allocationMemoryBytes = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "allocation_memory_rss_bytes"),
"Allocation memory usage",
[]string{"job", "group", "alloc", "region", "datacenter", "node"}, nil,
)
allocationMemotyBytesLimit = prometheus.NewDesc(
allocationMemoryBytesLimit = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "allocation_memory_rss_bytes_limit"),
"Allocation memory limit.",
[]string{"job", "group", "alloc", "region", "datacenter", "node"}, nil,
Expand Down Expand Up @@ -98,32 +104,42 @@ var (
nodeResourceMemory = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "node_resource_memory_bytes"),
"Amount of allocatable memory the node has in bytes",
[]string{"node_id", "node", "datacenter"}, nil,
[]string{"node", "datacenter"}, nil,
)
nodeAllocatedMemory = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "node_allocated_memory_bytes"),
"Amount of memory allocated to tasks on the node in bytes.",
[]string{"node_id", "node", "datacenter"}, nil,
[]string{"node", "datacenter"}, nil,
)
nodeUsedMemory = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "node_used_memory_bytes"),
"Amount of memory used on the node in bytes.",
[]string{"node_id", "node", "datacenter"}, nil,
[]string{"node", "datacenter"}, nil,
)
nodeResourceCPU = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "node_resource_cpu_megahertz"),
"Amount of allocatable CPU the node has in MHz",
[]string{"node_id", "node", "datacenter"}, nil,
[]string{"node", "datacenter"}, nil,
)
nodeResourceIOPS = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "node_resource_iops"),
"Amount of allocatable IOPS the node has.",
[]string{"node", "datacenter"}, nil,
)
nodeResourceDiskBytes = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "node_resource_disk_bytes"),
"Amount of allocatable disk bytes the node has.",
[]string{"node", "datacenter"}, nil,
)
nodeAllocatedCPU = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "node_allocated_cpu_megahertz"),
"Amount of allocated CPU on the node in MHz.",
[]string{"node_id", "node", "datacenter"}, nil,
[]string{"node", "datacenter"}, nil,
)
nodeUsedCPU = prometheus.NewDesc(
prometheus.BuildFQName(namespace, "", "node_used_cpu_megahertz"),
"Amount of CPU used on the node in MHz.",
[]string{"node_id", "node", "datacenter"}, nil,
[]string{"node", "datacenter"}, nil,
)

allocation = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Expand All @@ -137,7 +153,6 @@ var (
"job_type",
"job_id",
"task_group",
"node_id",
"node",
},
)
Expand All @@ -157,7 +172,6 @@ var (
"state",
"failed",
"job_type",
"node_id",
"node",
},
)
Expand Down Expand Up @@ -352,21 +366,24 @@ func (e *Exporter) shouldReadMetrics() bool {
// Describe implements Collector interface.
func (e *Exporter) Describe(ch chan<- *prometheus.Desc) {
ch <- up
ch <- nodeInfo
ch <- clusterServers
ch <- serfLanMembers
ch <- serfLanMembersStatus
ch <- jobsTotal
ch <- allocationMemotyBytes
ch <- allocationMemoryBytes
ch <- allocationCPU
ch <- allocationCPUThrottled
ch <- allocationMemotyBytesLimit
ch <- allocationMemoryBytesLimit
ch <- taskCPUPercent
ch <- taskCPUTotalTicks
ch <- taskMemoryRssBytes
ch <- nodeResourceMemory
ch <- nodeAllocatedMemory
ch <- nodeUsedMemory
ch <- nodeResourceCPU
ch <- nodeResourceIOPS
ch <- nodeResourceDiskBytes
ch <- nodeAllocatedCPU
ch <- nodeUsedCPU

Expand Down Expand Up @@ -488,7 +505,8 @@ func (e *Exporter) collectNodes(ch chan<- prometheus.Metric) error {
}

opts := &api.QueryOptions{}
nodes, _, err := e.client.Nodes().List(&api.QueryOptions{})

nodes, _, err := e.client.Nodes().List(opts)
if err != nil {
return fmt.Errorf("failed to get nodes list: %s", err)
}
Expand All @@ -503,8 +521,14 @@ func (e *Exporter) collectNodes(ch chan<- prometheus.Metric) error {
w.Add(1)

state := 1

drain := strconv.FormatBool(node.Drain)

ch <- prometheus.MustNewConstMetric(
nodeInfo, prometheus.GaugeValue, 1,
node.Name, node.Version, node.NodeClass, node.Status,
drain, node.Datacenter, node.SchedulingEligibility,
)

if node.Status == "down" {
state = 0
}
Expand Down Expand Up @@ -548,7 +572,7 @@ func (e *Exporter) collectNodes(ch chan<- prometheus.Metric) error {
allocatedMemory += *alloc.Resources.MemoryMB
}

nodeLabels := []string{node.ID, node.Name, node.Datacenter}
nodeLabels := []string{node.Name, node.Datacenter}
ch <- prometheus.MustNewConstMetric(
nodeResourceMemory, prometheus.GaugeValue, float64(*node.Resources.MemoryMB)*1024*1024,
nodeLabels...,
Expand All @@ -565,6 +589,14 @@ func (e *Exporter) collectNodes(ch chan<- prometheus.Metric) error {
nodeResourceCPU, prometheus.GaugeValue, float64(*node.Resources.CPU),
nodeLabels...,
)
ch <- prometheus.MustNewConstMetric(
nodeResourceIOPS, prometheus.GaugeValue, float64(*node.Resources.IOPS),
nodeLabels...,
)
ch <- prometheus.MustNewConstMetric(
nodeResourceDiskBytes, prometheus.GaugeValue, float64(*node.Resources.DiskMB)*1024*1024,
nodeLabels...,
)

nodeStats, err := e.client.Nodes().Stats(a.ID, opts)
if err != nil {
Expand All @@ -581,7 +613,6 @@ func (e *Exporter) collectNodes(ch chan<- prometheus.Metric) error {
nodeUsedCPU, prometheus.GaugeValue, float64(math.Floor(nodeStats.CPUTicksConsumed)),
nodeLabels...,
)

}
}(node)
}
Expand Down Expand Up @@ -663,7 +694,6 @@ func (e *Exporter) collectAllocations(ch chan<- prometheus.Metric) error {
"job_type": *job.Type,
"job_id": alloc.JobID,
"task_group": alloc.TaskGroup,
"node_id": node.ID,
"node": node.Name,
}).Add(1)

Expand All @@ -674,7 +704,6 @@ func (e *Exporter) collectAllocations(ch chan<- prometheus.Metric) error {
"state": task.State,
"failed": strconv.FormatBool(task.Failed),
"job_type": *job.Type,
"node_id": node.ID,
"node": node.Name,
}).Add(1)
}
Expand Down Expand Up @@ -705,10 +734,10 @@ func (e *Exporter) collectAllocations(ch chan<- prometheus.Metric) error {
allocationCPUThrottled, prometheus.GaugeValue, float64(stats.ResourceUsage.CpuStats.ThrottledTime), allocationLabels...,
)
ch <- prometheus.MustNewConstMetric(
allocationMemotyBytes, prometheus.GaugeValue, float64(stats.ResourceUsage.MemoryStats.RSS), allocationLabels...,
allocationMemoryBytes, prometheus.GaugeValue, float64(stats.ResourceUsage.MemoryStats.RSS), allocationLabels...,
)
ch <- prometheus.MustNewConstMetric(
allocationMemotyBytesLimit, prometheus.GaugeValue, float64(*alloc.Resources.MemoryMB)*1024*1024, allocationLabels...,
allocationMemoryBytesLimit, prometheus.GaugeValue, float64(*alloc.Resources.MemoryMB)*1024*1024, allocationLabels...,
)

for taskName, taskStats := range stats.Tasks {
Expand Down

0 comments on commit 5941a62

Please sign in to comment.