Kubernetes Metrics Server Guide — Resource Metrics Collection
In this tutorial, you'll learn about the Kubernetes Metrics Server. We cover key concepts, practical examples, and best practices to help you understand and apply this topic effectively.
Kubernetes Metrics Server collects resource metrics (CPU and memory) from Kubelets and makes them available through the Metrics API for kubectl top, HPA, and VPA.
What You'll Learn
You'll master the Metrics Server — installation and configuration, collecting CPU/memory usage per pod and node, integration with HPA and VPA, troubleshooting common issues, and production deployment patterns.
Why This Problem Matters
Without the Metrics Server, kubectl top returns an error. HPA cannot scale based on CPU/memory. VPA cannot generate recommendations. The Metrics Server is the foundation of autoscaling and resource visibility in Kubernetes.
Real-World Use
Doda Browser's infrastructure team uses the Metrics Server to monitor resource utilization across 200+ pods. The kubectl top pods command is the first debugging step when investigating performance issues. HPA relies on Metrics Server data to scale Web Services.
Metrics Server Architecture
flowchart TB
subgraph ControlPlane
APIServer[Kubernetes API Server]
Aggregator[API Aggregation Layer]
MS[Metrics Server<br/>Pod in kube-system]
end
subgraph Node1
K1[Kubelet]
C1[cAdvisor<br/>Built-in]
PodA[Pod A]
PodB[Pod B]
end
subgraph Node2
K2[Kubelet]
C2[cAdvisor]
PodC[Pod C]
end
subgraph Consumers
HPA[HorizontalPodAutoscaler]
VPA[VerticalPodAutoscaler]
Kubectl[kubectl top]
Dashboard[Dashboard]
end
C1 -->|Resource usage| K1
C2 -->|Resource usage| K2
K1 -->|/metrics/resource| MS
K2 -->|/metrics/resource| MS
MS -->|Metrics API| Aggregator
Aggregator --> APIServer
APIServer --> HPA
APIServer --> VPA
APIServer --> Kubectl
APIServer --> Dashboard
Installing Metrics Server
# Install via kubectl (recommended)
kubectl apply -f https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml
# Verify installation
kubectl get deployment metrics-server -n kube-system
kubectl get apiservice v1beta1.metrics.k8s.io
Expected output:
NAME READY UP-TO-DATE AVAILABLE AGE
metrics-server 1/1 1 1 30s
NAME SERVICE AVAILABLE AGE
v1beta1.metrics.k8s.io kube-system/metrics-server True 30s
Using kubectl top
# Node resource usage
kubectl top nodes
# Pod resource usage
kubectl top pods -n production
# Pods sorted by CPU or memory
kubectl top pods -n production --sort-by=cpu
kubectl top pods -n production --sort-by=memory
# All pods across all namespaces
kubectl top pods --all-namespaces
# Per-container usage within each pod
kubectl top pods -n production --containers
Expected output:
NAME CPU(cores) MEMORY%
node-1 450m 65%
node-2 380m 58%
node-3 520m 72%
NAME CPU(cores) MEMORY(bytes)
web-app-6b4f9c7d8b-abc12 125m 256Mi
web-app-6b4f9c7d8b-def34 118m 244Mi
worker-7d9f8c6b5-ghi56 890m 1.2Gi
Metrics Server API Programmatic Access
import kubernetes
from kubernetes import client, config
import json
class MetricsClient:
    """Read node and pod usage from the ``metrics.k8s.io/v1beta1`` API.

    CPU quantities are converted to millicores and memory quantities to KiB;
    the unparsed strings are kept alongside as ``raw_cpu`` / ``raw_memory``.
    """

    # Bytes per unit for every memory suffix ``_parse_memory`` accepts:
    # binary (power-of-two) suffixes first, then decimal (power-of-ten) ones.
    _MEMORY_UNIT_BYTES = (
        ("Ki", 1024),
        ("Mi", 1024 ** 2),
        ("Gi", 1024 ** 3),
        ("Ti", 1024 ** 4),
        ("Pi", 1024 ** 5),
        ("Ei", 1024 ** 6),
        ("k", 1000),
        ("M", 1000 ** 2),
        ("G", 1000 ** 3),
        ("T", 1000 ** 4),
        ("P", 1000 ** 5),
        ("E", 1000 ** 6),
    )

    def __init__(self):
        """Load cluster credentials and create the custom-objects API client.

        In-cluster configuration is tried first; if that raises
        ``ConfigException`` the local kubeconfig is loaded instead.
        """
        try:
            config.load_incluster_config()
        except config.ConfigException:
            config.load_kube_config()
        self.custom_api = client.CustomObjectsApi()

    def get_node_metrics(self) -> list:
        """Return one dict per node.

        Each dict has the keys ``name``, ``cpu`` (millicores), ``memory``
        (KiB), ``raw_cpu`` and ``raw_memory``.
        """
        metrics = self.custom_api.list_cluster_custom_object(
            "metrics.k8s.io", "v1beta1", "nodes"
        )
        return [
            {
                "name": item["metadata"]["name"],
                "cpu": self._parse_cpu(item["usage"]["cpu"]),
                "memory": self._parse_memory(item["usage"]["memory"]),
                "raw_cpu": item["usage"]["cpu"],
                "raw_memory": item["usage"]["memory"],
            }
            for item in metrics.get("items", [])
        ]

    def get_pod_metrics(self, namespace: str = None) -> list:
        """Return one dict per container.

        Args:
            namespace: Namespace to query. When falsy, pods are listed
                across the whole cluster.

        Returns:
            A list of dicts with the keys ``pod``, ``container``, ``cpu``
            (millicores), ``memory`` (KiB), ``raw_cpu`` and ``raw_memory``.
        """
        if namespace:
            metrics = self.custom_api.list_namespaced_custom_object(
                "metrics.k8s.io", "v1beta1",
                namespace, "pods"
            )
        else:
            metrics = self.custom_api.list_cluster_custom_object(
                "metrics.k8s.io", "v1beta1", "pods"
            )
        return [
            {
                "pod": item["metadata"]["name"],
                "container": container["name"],
                "cpu": self._parse_cpu(container["usage"]["cpu"]),
                "memory": self._parse_memory(container["usage"]["memory"]),
                "raw_cpu": container["usage"]["cpu"],
                "raw_memory": container["usage"]["memory"],
            }
            for item in metrics.get("items", [])
            for container in item.get("containers", [])
        ]

    def _parse_cpu(self, cpu_str: str) -> float:
        """Convert a CPU quantity string to millicores.

        Accepts the ``n`` (nano), ``u`` (micro) and ``m`` (milli) suffixes as
        well as bare core counts, including fractional ones such as ``"0.5"``.
        Milli values and whole core counts come back as ``int``; everything
        else is a ``float``.

        Raises:
            ValueError: If the numeric part cannot be parsed.
        """
        if cpu_str.endswith("n"):
            return int(cpu_str[:-1]) / 1_000_000  # nano -> millicores
        if cpu_str.endswith("u"):
            return int(cpu_str[:-1]) / 1_000  # micro -> millicores
        if cpu_str.endswith("m"):
            return int(cpu_str[:-1])
        try:
            return int(cpu_str) * 1000
        except ValueError:
            # Not a whole number of cores, e.g. "0.5".
            return float(cpu_str) * 1000

    def _parse_memory(self, mem_str: str) -> int:
        """Convert a memory quantity string to whole KiB (rounded down).

        Accepts binary suffixes (``Ki`` .. ``Ei``), decimal suffixes
        (``k``, ``M``, ``G``, ``T``, ``P``, ``E``) and bare byte counts.

        Raises:
            ValueError: If the numeric part is not an integer.
        """
        for suffix, unit_bytes in self._MEMORY_UNIT_BYTES:
            if mem_str.endswith(suffix):
                return int(mem_str[:-len(suffix)]) * unit_bytes // 1024
        return int(mem_str) // 1024
# Use a name other than ``client`` for the instance: rebinding ``client``
# would hide the imported ``kubernetes.client`` module that
# ``MetricsClient.__init__`` relies on, breaking any later construction.
metrics_client = MetricsClient()
nodes = metrics_client.get_node_metrics()
for n in nodes:
    print(f"Node {n['name']}: {n['cpu']}m CPU, {n['memory']}Ki memory")
Expected output:
Node node-1: 450m CPU, 1310720Ki memory
Node node-2: 380m CPU, 1179648Ki memory
Node node-3: 520m CPU, 1441792Ki memory
Metrics Collection Internals
import time
import random
from collections import defaultdict
class KubeletMetricsCollector:
    """Simulates the Kubelet's cAdvisor-based metric collection.

    Usage figures are random fractions of each registered pod's requests.
    ``scrape()`` emits CPU with an ``m`` suffix and memory with a ``Ki``
    suffix, and it is the only method that records history in ``samples``.
    """

    def __init__(self):
        # "<namespace>/<name>" -> requests and latest simulated usage
        self.pods = {}
        # "<namespace>/<name>" -> list of {"cpu", "mem"} samples, oldest first
        self.samples = defaultdict(list)

    def register_pod(self, namespace: str, name: str,
                     cpu_request: int, mem_request: int):
        """Start tracking a pod; its usage is zero until the first collect."""
        self.pods[f"{namespace}/{name}"] = {
            "cpu_request": cpu_request,
            "mem_request": mem_request,
            "cpu_usage": 0,
            "mem_usage": 0,
        }

    def collect(self):
        """Refresh every pod's simulated usage in place.

        CPU is drawn from 10-90% of the request and memory from 20-80%, each
        floored at 1. Nothing is appended to ``samples`` here.
        """
        for pod in self.pods.values():
            pod["cpu_usage"] = max(
                1, pod["cpu_request"] * random.uniform(0.1, 0.9)
            )
            pod["mem_usage"] = max(
                1, pod["mem_request"] * random.uniform(0.2, 0.8)
            )

    def scrape(self) -> dict:
        """Collect fresh usage, record one sample per pod, and return it.

        Returns:
            A ``PodMetricsList``-shaped dict with one item per pod, each
            holding a single container named after the pod.
        """
        self.collect()
        metrics = {"kind": "PodMetricsList", "items": []}
        for key, pod in self.pods.items():
            # maxsplit=1 keeps the unpacking safe if a name contains "/".
            ns, name = key.split("/", 1)
            self.samples[key].append({
                "cpu": pod["cpu_usage"],
                "mem": pod["mem_usage"],
            })
            metrics["items"].append({
                "metadata": {"name": name, "namespace": ns},
                "containers": [{
                    "name": name,
                    "usage": {
                        "cpu": f"{pod['cpu_usage']:.0f}m",
                        "memory": f"{pod['mem_usage']:.0f}Ki",
                    },
                }],
            })
        return metrics

    def aggregate(self, window: int = 10) -> dict:
        """Summarise the most recent ``window`` samples of each pod.

        Pods with fewer than ``window`` recorded samples are left out.

        Args:
            window: Number of trailing samples to summarise; must be >= 1.

        Returns:
            Mapping of pod key to ``avg_cpu``, ``avg_mem``, ``max_cpu`` and
            ``max_mem``.

        Raises:
            ValueError: If ``window`` is smaller than 1.
        """
        if window < 1:
            # window == 0 would divide by zero; a negative window would slice
            # from the wrong end and divide by a negative count.
            raise ValueError(f"window must be >= 1, got {window}")
        result = {}
        for key, samples in self.samples.items():
            if len(samples) < window:
                continue
            recent = samples[-window:]
            cpu_values = [s["cpu"] for s in recent]
            mem_values = [s["mem"] for s in recent]
            result[key] = {
                "avg_cpu": sum(cpu_values) / window,
                "avg_mem": sum(mem_values) / window,
                "max_cpu": max(cpu_values),
                "max_mem": max(mem_values),
            }
        return result
collector = KubeletMetricsCollector()
collector.register_pod("production", "web", 500, 262144)
collector.register_pod("production", "worker", 2000, 1048576)
collector.register_pod("staging", "web", 250, 131072)
# scrape() is the call that appends to collector.samples (collect() only
# refreshes the current usage), and aggregate() skips any pod with fewer than
# `window` samples - so the loop has to scrape for the report to have data.
for _ in range(15):
    collector.scrape()
    time.sleep(0.05)
agg = collector.aggregate(window=10)
for key, vals in agg.items():
    print(f"{key:>25}: avg_cpu={vals['avg_cpu']:>8.0f}m "
          f"max_mem={vals['max_mem']:>8.0f}Ki")
Expected output:
production/web: avg_cpu= 267m max_mem= 214583Ki
production/worker: avg_cpu= 988m max_mem= 811423Ki
staging/web: avg_cpu= 131m max_mem= 93122Ki
Configuration Options
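# metrics-server-config.yaml
# NOTE: Metrics Server is configured through command-line flags (the `args`
# of the Deployment below); it does not load this ConfigMap. The values here
# only illustrate the available options - the Deployment args are what apply.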
apiVersion: v1
kind: ConfigMap
metadata:
name: metrics-server-config
namespace: kube-system
data:
config.yaml: |
kubelet-port: 10250
metric-resolution: 60s # How often to poll Kubelets (default 60s)
kubelet-insecure-tls: false # Set to true if using self-signed certs
kubelet-use-node-status-port: true
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: metrics-server
namespace: kube-system
spec:
template:
spec:
containers:
- name: metrics-server
image: registry.k8s.io/metrics-server/metrics-server:v0.7.1
args:
- --kubelet-insecure-tls
- --kubelet-preferred-address-types=InternalIP,Hostname,InternalDNS,ExternalDNS
- --metric-resolution=30s
Troubleshooting
# Check metrics-server logs
kubectl logs -n kube-system deployment/metrics-server
# Check if metrics API is available
kubectl get --raw /apis/metrics.k8s.io/v1beta1 | jq
# Check node connectivity
kubectl get --raw /api/v1/nodes/node-1/proxy/metrics/resource | head
# Verify Kubelet is responding
kubectl get --raw /api/v1/nodes/node-1/proxy/healthz
Expected output (healthy):
{"kind": "APIResourceList", "apiVersion": "v1", "groupVersion": "metrics.k8s.io/v1beta1", "resources": [{"name": "nodes", ...}, {"name": "pods", ...}]}
ok
Common Mistakes
1. Metrics Server Installation Without Kubelet TLS Verification
Self-signed Kubelet certificates cause Metrics Server to fail connecting to Kubelets. Solution: --kubelet-insecure-tls (development) or configure proper Kubelet certificate signing.
2. Using the Wrong Kubelet Address Type
Metrics Server must reach Kubelets. If nodes have multiple IPs (internal/external), use --kubelet-preferred-address-types=InternalIP.
3. Resource Limits Too Low
Metrics Server itself needs resources. With 100 nodes, set requests: CPU 100m, memory 200Mi. Without enough resources, the Metrics Server OOMs and returns no data.
4. Not Checking the API Aggregation Layer
If v1beta1.metrics.k8s.io apiservice shows False, the aggregation layer isn't working. Check kube-apiserver --enable-aggregator-routing=true and kubelet certs.
5. Expecting Metrics Immediately After Installation
Metrics Server polls Kubelets every 60 seconds. After installation, wait 90-120 seconds before kubectl top returns data.
6. No Metrics for Terminating Pods
Metrics Server does not collect metrics for terminating pods. When debugging a pod that's about to restart, check logs and events, not kubectl top.
7. Using Metrics Server for Long-Term Monitoring
Metrics Server only retains the latest sample (no historical data). For trend analysis and alerting, use Prometheus or another monitoring system.
Practice Questions
1. How does the Metrics Server collect resource usage?
The Metrics Server polls each Kubelet's /metrics/resource endpoint every 60 seconds (configurable). The Kubelet gets this data from cAdvisor, which reads cgroup statistics for each pod and container.
2. Why does kubectl top sometimes show higher memory than expected?
Metrics Server reports each container's working set, which is RSS (resident set size) plus the page cache the kernel still counts as active. The kernel doesn't always reclaim page cache immediately, so memory reported by kubectl top may be higher than the application's actual heap usage.
3. What is the difference between metrics.k8s.io and custom.metrics.k8s.io?
metrics.k8s.io is the resource metrics API (CPU/memory from Metrics Server). custom.metrics.k8s.io is the custom metrics API for application-level metrics (requests per second, queue depth, etc.) provided by adapters like Prometheus Adapter.
4. How does Metrics Server handle large clusters?
Metrics Server is single-instance by default. For clusters with 100+ nodes, the default resolution (60s) may be insufficient. Consider running multiple instances with leader election or reducing resolution. Alternatively, use Prometheus for metric collection.
5. Challenge: Design a cluster-wide resource utilization dashboard.
Your cluster has 50 nodes and 500 pods across 10 namespaces. Design a system that uses Metrics Server data to display: top-10 CPU-consuming pods, nodes approaching resource limits (80%+), namespace-level resource usage, and historical trends for capacity planning.
Mini Project: Resource Usage Reporter
import json
import random
import time
from collections import defaultdict
class ResourceReporter:
    """Accumulates per-pod CPU/memory samples and prints usage summaries."""

    def __init__(self):
        # "<namespace>/<pod>" -> samples in arrival order
        self.metrics = defaultdict(list)

    def collect_sample(self, namespace: str, pod: str,
                       cpu_m: int, mem_mb: int):
        """Append one timestamped sample for ``namespace/pod``."""
        sample = {
            "cpu": cpu_m,
            "memory": mem_mb,
            "timestamp": time.time(),
        }
        self.metrics[f"{namespace}/{pod}"].append(sample)

    def top_pods(self, metric: str = "cpu", limit: int = 5) -> list:
        """Return up to ``limit`` ``(pod_key, average)`` pairs, highest first.

        The average runs over every stored sample of ``metric`` for the pod;
        pods with equal averages keep their first-seen order.
        """
        ranked = [
            (pod_key, sum(entry[metric] for entry in history) / len(history))
            for pod_key, history in self.metrics.items()
            if history
        ]
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        return ranked[:limit]

    def namespace_summary(self) -> dict:
        """Total each namespace's latest per-pod sample and count its pods."""
        totals = {}
        for pod_key, history in self.metrics.items():
            if not history:
                continue
            namespace = pod_key.split("/")[0]
            bucket = totals.setdefault(
                namespace, {"cpu": 0, "memory": 0, "pods": 0}
            )
            newest = history[-1]
            bucket["cpu"] += newest["cpu"]
            bucket["memory"] += newest["memory"]
            bucket["pods"] += 1
        return totals

    def report(self):
        """Print the top five CPU consumers, then per-namespace totals."""
        print("\nTop 5 CPU consumers:")
        for pod_key, avg_cpu in self.top_pods("cpu", 5):
            print(f" {pod_key}: {avg_cpu:.0f}m")
        print("\nNamespace usage:")
        for namespace, totals in self.namespace_summary().items():
            line = (
                f" {namespace}: {totals['cpu']}m CPU, "
                f"{totals['memory']}Mi memory, "
                f"{totals['pods']} pods"
            )
            print(line)
# Feed the reporter 100 random samples spread over three namespaces and ten
# pod names, then print the summary.
reporter = ResourceReporter()
namespaces = ["production", "staging", "monitoring"]
for _ in range(100):
    # Arguments are evaluated left to right, so the random draws happen in
    # the order: namespace, pod number, CPU, memory.
    reporter.collect_sample(
        random.choice(namespaces),
        f"app-{random.randint(1, 10)}",
        random.randint(50, 2000),
        random.randint(64, 2048),
    )
reporter.report()
Expected output:
Top 5 CPU consumers:
production/app-7: 1834m
production/app-3: 1721m
monitoring/app-1: 1654m
staging/app-9: 1542m
production/app-1: 1489m
Namespace usage:
production: 10450m CPU, 11264Mi memory, 10 pods
staging: 8860m CPU, 9216Mi memory, 10 pods
monitoring: 7650m CPU, 8192Mi memory, 10 pods
FAQ
What's Next
Congratulations on completing this Metrics Server guide! Here's where to go from here:
- Practice daily — Use kubectl top to monitor your workloads
- Build a project — Create a resource usage dashboard using Metrics Server data
- Explore related topics — Prometheus, custom metrics API, kube-state-metrics, node exporter
- Join the community — Share your monitoring setups and get feedback
Remember: every expert was once a beginner. Keep measuring!
Built by the developers of DodaTech
Doda Browser, DodaZIP & Durga Antivirus Pro