Kubernetes Operators Guide — Automating Application Management
In this tutorial, you'll learn about Kubernetes Operators. We cover key concepts, practical examples, and best practices to help you understand and apply the operator pattern effectively.
A Kubernetes Operator is a custom controller that extends Kubernetes to manage complex applications and their lifecycles using domain-specific knowledge encoded as software.
What You'll Learn
You'll master the operator pattern — reconciliation loops, custom resource definitions, controller design, operator SDKs (kubebuilder, Operator SDK), and production patterns for building and deploying operators.
Why This Problem Matters
Running stateful applications (databases, message queues, monitoring stacks) on Kubernetes requires deep domain knowledge. Operators encode that knowledge into automated controllers that handle deployment, backup, scaling, and recovery without human intervention.
Real-World Use
Durga Antivirus Pro uses a custom operator to manage its signature distribution cluster. The operator handles scaling analysis workers based on queue depth, rotating TLS certificates, and restoring from backup after node failures.
Operator Architecture
flowchart TB
User[User] -->|kubectl apply| CR[Custom Resource]
CR -->|Watch| Controller[Operator Controller]
Controller -->|Reconcile| Desired[Desired State]
Desired --> Compare{Compare with<br/>Actual State}
Compare -->|Different| Action[Take Action]
Action -->|Create/Update/Delete| K8sResources[K8s Resources<br/>Deployments, Services, PVCs]
K8sResources --> Controller
Compare -->|Same| Done[Idle - Wait for Changes]
subgraph OperatorDeployment
Operator[Operator Pod]
CRD[CustomResourceDefinition]
RBAC[RBAC Permissions]
end
Controller --> Operator
CR --> CRD
Implementing a Simple Operator in Python
import kubernetes
from kubernetes import client, config, watch
import time
import json
class SimpleOperator:
    """Minimal operator that keeps one Deployment per ``MyApp`` custom resource.

    Watches ``myapps`` objects of ``example.com/v1`` across the cluster and,
    for every added or modified object, creates the Deployment of the same
    name or rescales it to ``spec.replicas``.
    """

    def __init__(self):
        """Load in-cluster credentials and create the API clients."""
        config.load_incluster_config()
        self.custom_api = client.CustomObjectsApi()
        self.apps_api = client.AppsV1Api()

    @staticmethod
    def _build_deployment(cr, replicas, image):
        """Return the Deployment object that backs custom resource *cr*."""
        name = cr["metadata"]["name"]
        labels = {"app": name}
        # The owner reference lets Kubernetes garbage-collect the Deployment
        # when the custom resource is deleted.
        owner = client.V1OwnerReference(
            api_version=cr["apiVersion"],
            kind=cr["kind"],
            name=name,
            uid=cr["metadata"]["uid"],
            controller=True,
            block_owner_deletion=True,
        )
        return client.V1Deployment(
            metadata=client.V1ObjectMeta(name=name, owner_references=[owner]),
            spec=client.V1DeploymentSpec(
                replicas=replicas,
                selector=client.V1LabelSelector(match_labels=labels),
                template=client.V1PodTemplateSpec(
                    metadata=client.V1ObjectMeta(labels=labels),
                    spec=client.V1PodSpec(
                        containers=[client.V1Container(name="app", image=image)]
                    ),
                ),
            ),
        )

    def reconcile(self, cr):
        """Converge the Deployment named after *cr* towards the CR's spec.

        Args:
            cr: The custom resource as a dict. ``metadata.name`` and
                ``metadata.namespace`` are required; ``spec.replicas``
                defaults to 1 and ``spec.image`` to ``"nginx"``.

        Raises:
            kubernetes.client.exceptions.ApiException: On any API failure
                other than "not found" when reading the Deployment.
        """
        name = cr["metadata"]["name"]
        namespace = cr["metadata"]["namespace"]
        spec = cr.get("spec", {})
        replicas = spec.get("replicas", 1)
        image = spec.get("image", "nginx")
        print(f"Reconciling {name}: {replicas} replicas of {image}")

        try:
            deploy = self.apps_api.read_namespaced_deployment(name, namespace)
        except client.exceptions.ApiException as e:
            # Only "not found" means "create it"; anything else (403, 500, ...)
            # is a real failure and must not be swallowed.
            if e.status != 404:
                raise
            deploy = None

        if deploy is None:
            print(f"Creating Deployment {name}")
            self.apps_api.create_namespaced_deployment(
                namespace, self._build_deployment(cr, replicas, image)
            )
            return

        if deploy.spec.replicas != replicas:
            print(f"Scaling {name} to {replicas}")
            # Send only the changed field: patching with the full object read
            # above would also submit its resourceVersion and status.
            self.apps_api.patch_namespaced_deployment(
                name, namespace, {"spec": {"replicas": replicas}}
            )

    def run(self):
        """Stream MyApp events and reconcile each added or modified object."""
        w = watch.Watch()
        for event in w.stream(
            self.custom_api.list_cluster_custom_object,
            "example.com", "v1", "myapps",
            timeout_seconds=0,
        ):
            # A DELETED event carries the last state of an object that no
            # longer exists; reconciling it would recreate its Deployment.
            if event["type"] not in ("ADDED", "MODIFIED"):
                continue
            self.reconcile(event["object"])
if __name__ == "__main__":
    # Start the watch loop only when this file is executed as a script.
    SimpleOperator().run()
CRD Definition
# myapp-crd.yaml
# Registers the namespaced MyApp kind (group example.com, version v1).
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
  name: myapps.example.com  # must be <plural>.<group>
spec:
  group: example.com
  versions:
    - name: v1
      served: true
      storage: true
      schema:
        openAPIV3Schema:
          type: object
          properties:
            spec:
              type: object
              properties:
                replicas:
                  type: integer
                  minimum: 1
                  maximum: 100
                image:
                  type: string
      # Without these, `kubectl get myapp` prints only NAME and AGE; they
      # produce the REPLICAS and IMAGE columns shown in the expected output.
      additionalPrinterColumns:
        - name: Replicas
          type: integer
          jsonPath: .spec.replicas
        - name: Image
          type: string
          jsonPath: .spec.image
  scope: Namespaced
  names:
    plural: myapps
    singular: myapp
    kind: MyApp
    shortNames:
      - ma
kubectl apply -f myapp-crd.yaml
kubectl get crd | grep myapp
Expected output:
myapps.example.com 2026-06-24T10:00:00Z
Custom Resource Instance
# myapp-instance.yaml
# One MyApp custom resource asking for three replicas of nginx:1.25.
apiVersion: example.com/v1
kind: MyApp
metadata:
  # No namespace is set here, so the object lands in whichever namespace
  # kubectl is currently using.
  name: myapp-prod
spec:
  replicas: 3
  image: nginx:1.25
kubectl apply -f myapp-instance.yaml
kubectl get myapp -o wide
Expected output:
NAME REPLICAS IMAGE
myapp-prod 3 nginx:1.25
Reconciliation Loop in Go (Kubebuilder)
// controllers/myapp_controller.go
package controllers
import (
"context"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
examplev1 "my-operator/api/v1"
)
// MyAppReconciler reconciles MyApp objects into Deployments.
type MyAppReconciler struct {
	// Embedded controller-runtime client; provides the Get, Create and
	// Update calls used by Reconcile.
	client.Client
	// Scheme is a *runtime.Scheme; presumably the type registry shared with
	// the manager. Confirm where the reconciler is constructed.
	Scheme *runtime.Scheme
}
// Reconcile drives the Deployment that backs a MyApp towards the state
// declared in the MyApp's spec.
//
// It fetches the MyApp named in req, then either creates a Deployment with
// the same name and namespace or updates the existing Deployment's replica
// count. A MyApp that no longer exists is treated as success. Any other API
// error is returned so controller-runtime requeues the request.
func (r *MyAppReconciler) Reconcile(
	ctx context.Context, req ctrl.Request,
) (ctrl.Result, error) {
	logger := log.FromContext(ctx)

	var myapp examplev1.MyApp
	if err := r.Get(ctx, req.NamespacedName, &myapp); err != nil {
		// Not-found is the normal outcome after a delete; only log real
		// failures so deletions do not show up as errors.
		if !errors.IsNotFound(err) {
			logger.Error(err, "unable to fetch MyApp")
		}
		return ctrl.Result{}, client.IgnoreNotFound(err)
	}

	desiredReplicas := myapp.Spec.Replicas
	desiredImage := myapp.Spec.Image

	deploy := &appsv1.Deployment{}
	err := r.Get(ctx, types.NamespacedName{
		Name:      myapp.Name,
		Namespace: myapp.Namespace,
	}, deploy)

	switch {
	case errors.IsNotFound(err):
		deploy = r.buildDeployment(myapp, desiredReplicas, desiredImage)
		// Mark the MyApp as controller-owner so the Deployment is garbage
		// collected together with it.
		if err := ctrl.SetControllerReference(&myapp, deploy, r.Scheme); err != nil {
			logger.Error(err, "failed to set owner reference on Deployment")
			return ctrl.Result{}, err
		}
		if err := r.Create(ctx, deploy); err != nil {
			logger.Error(err, "failed to create Deployment")
			return ctrl.Result{}, err
		}
		logger.Info("created Deployment", "replicas", desiredReplicas)

	case err != nil:
		return ctrl.Result{}, err

	// Replicas is a pointer; guard against nil before dereferencing.
	case deploy.Spec.Replicas == nil || *deploy.Spec.Replicas != desiredReplicas:
		deploy.Spec.Replicas = &desiredReplicas
		if err := r.Update(ctx, deploy); err != nil {
			return ctrl.Result{}, err
		}
		logger.Info("updated replicas", "count", desiredReplicas)
	}

	return ctrl.Result{}, nil
}
// buildDeployment assembles the Deployment for myapp: same name and
// namespace as the MyApp, the requested replica count, and a single
// container called "app" running image. Pods are labelled and selected by
// app=<MyApp name>.
func (r *MyAppReconciler) buildDeployment(
	myapp examplev1.MyApp,
	replicas int32,
	image string,
) *appsv1.Deployment {
	// One map serves as both the selector and the pod-template labels.
	podLabels := map[string]string{"app": myapp.Name}

	appContainer := corev1.Container{Name: "app", Image: image}

	podTemplate := corev1.PodTemplateSpec{
		ObjectMeta: metav1.ObjectMeta{Labels: podLabels},
		Spec: corev1.PodSpec{
			Containers: []corev1.Container{appContainer},
		},
	}

	deployment := &appsv1.Deployment{
		ObjectMeta: metav1.ObjectMeta{
			Name:      myapp.Name,
			Namespace: myapp.Namespace,
		},
	}
	deployment.Spec.Replicas = &replicas
	deployment.Spec.Selector = &metav1.LabelSelector{MatchLabels: podLabels}
	deployment.Spec.Template = podTemplate
	return deployment
}
Operator Lifecycle Manager (OLM)
# Install OLM (take the install script from the same release that is passed as the version argument)
curl -sL https://github.com/operator-framework/operator-lifecycle-manager/releases/download/v0.27.0/install.sh | bash -s v0.27.0
# Verify
kubectl get csv -n olm
# Find operators from OperatorHub (requires the kubectl-operator plugin;
# `kubectl operator list` shows only operators that are already installed)
kubectl operator list-available
# Install an operator
kubectl operator install prometheus --channel stable
Helm-Based Operator
# watches.yaml
# Maps one custom resource kind (group/version/kind) to a Helm chart.
---
- version: v1alpha1
  group: example.com
  kind: MyApp
  # Chart location; presumably a path inside the operator image (confirm).
  chart: /opt/helm-charts/myapp
  # Presumably the interval between periodic reconciles; confirm against the
  # Operator SDK Helm documentation.
  reconcilePeriod: 60s
# Create operator scaffold with Helm
# The API group is <group>.<domain>; "--domain com" with "--group example"
# yields example.com, matching the group in watches.yaml.
operator-sdk init --plugins helm --domain com
operator-sdk create api --group example --version v1alpha1 --kind MyApp
helm create myapp-chart
# `operator-sdk build` was removed in Operator SDK 1.x; the scaffolded
# Makefile builds and pushes the operator image instead.
make docker-build docker-push IMG=myregistry/myapp-operator:latest
Common Mistakes
1. Reconciliation Without Idempotency
Running the same reconcile twice should produce the same result. If your operator creates resources without checking for existence first, it will fail or duplicate resources.
2. Blocking the Reconciliation Loop
If reconcile takes 10 seconds but the operator watches events continuously, reconcile requests queue up. Use async operations, set appropriate rate limits, and return early when possible.
3. Not Handling Deletion Finalizers
When a CR is deleted, resources that carry an ownerReference to it (Deployments, PVCs) are garbage collected automatically. External resources (cloud load balancers, DNS records) are not, so add finalizers to run that cleanup before the CR is removed.
4. Overusing Status Updates
Writing status updates on every reconcile creates API server load. Batch status updates and use conditions to aggregate state changes.
5. Missing RBAC Permissions
Operators need permissions to create/read/update the resources they manage. If RBAC is too restrictive, the operator logs permission errors but doesn't crash. Check operator logs carefully during deployment.
6. No Leader Election
Running multiple replicas of an operator without leader election causes all replicas to reconcile the same resources simultaneously. Enable leader election with --leader-elect=true.
7. Hardcoding Configuration
Operator behavior (timeouts, retries, image registries) should be configurable through environment variables or ConfigMaps. Hardcoded values make operators hard to reuse across environments.
Practice Questions
1. What is the difference between a Kubernetes controller and an operator?
A controller is a loop that watches resources and reconciles desired state. An operator is a controller plus domain-specific knowledge about managing an application — it includes backup, restore, upgrade, scaling, and failure recovery procedures for that specific application.
2. How does the reconciliation loop work?
The controller watches a custom resource (the instances of a CRD) and receives events on create, update, and delete. It reads the desired state from the CR spec, compares it to the actual state in the cluster, and takes actions to converge them. After each action, it rechecks until the states match.
3. What are finalizers used for in operators?
Finalizers prevent deletion of a CR until the operator completes cleanup tasks. The operator watches for deletion timestamps, performs cleanup (deleting cloud resources, removing from DNS), then removes the finalizer, allowing the CR to be deleted.
4. How do you test an operator locally?
Use kubebuilder's envtest to run a local API server and etcd. Write integration tests that create CRs and verify the resulting cluster resources. The reconcile method is a standard Go function that can be unit-tested with fake clients.
5. Challenge: Design an operator for a PostgreSQL cluster.
The operator should handle: creating a primary-replica PostgreSQL cluster, automated failover when the primary fails, backup to S3 every 6 hours, point-in-time recovery, and minor version upgrades without downtime.
Mini Project: Operator Simulator
import time
import threading
class ResourceManager:
    """In-memory stand-in for a cluster's resource store.

    Every resource is kept in ``self.resources`` under a ``"<kind>/<name>"``
    key as ``{"spec": <spec>, "status": "creating"}``.
    """

    def __init__(self):
        self.resources = {}

    @staticmethod
    def _key(kind: str, name: str) -> str:
        """Build the storage key for a resource."""
        return f"{kind}/{name}"

    def create(self, kind: str, name: str, spec: dict):
        """Store *spec* for the resource, replacing any previous entry."""
        record = {"spec": spec, "status": "creating"}
        self.resources[self._key(kind, name)] = record

    def delete(self, kind: str, name: str):
        """Remove the resource; a missing resource is ignored."""
        target = self._key(kind, name)
        if target in self.resources:
            del self.resources[target]

    def get(self, kind: str, name: str) -> dict:
        """Return the stored record, or ``None`` when it does not exist."""
        try:
            return self.resources[self._key(kind, name)]
        except KeyError:
            return None
class Operator:
    """Toy controller that reconciles Deployments held by a resource manager."""

    def __init__(self, manager: ResourceManager):
        """Bind the operator to *manager*; ``running`` gates the ``run`` loop."""
        self.manager = manager
        self.running = True

    def reconcile(self, cr: dict):
        """Create or rescale the Deployment described by *cr*.

        Args:
            cr: Desired state with a required ``"name"`` and an optional
                ``"replicas"`` (defaults to 1).
        """
        name = cr["name"]
        replicas = cr.get("replicas", 1)
        existing = self.manager.get("Deployment", name)
        if existing is None:
            print(f" Creating Deployment/{name} with {replicas} replicas")
            self.manager.create("Deployment", name, {"replicas": replicas})
        elif existing["spec"]["replicas"] != replicas:
            print(f" Scaling Deployment/{name} to {replicas}")
            existing["spec"]["replicas"] = replicas

    def run(self, crs: list, interval: float = 2):
        """Reconcile every entry of *crs* repeatedly until ``running`` is False.

        Args:
            crs: Desired-state dicts; the list is re-read on every pass, so
                in-place edits are picked up by the next pass.
            interval: Seconds to sleep between passes (previously fixed at 2).
        """
        while self.running:
            for cr in crs:
                self.reconcile(cr)
            time.sleep(interval)
# Demo driver: run the operator on a background thread, change the desired
# state while it is running, then print what the manager ended up with.
operator = Operator(ResourceManager())
crs = [
    dict(name="web-app", replicas=3),
    dict(name="worker", replicas=5),
]

# Daemon thread, so the interpreter can exit even if the loop is mid-sleep.
t = threading.Thread(target=operator.run, kwargs={"crs": crs}, daemon=True)
t.start()

# The first pass runs immediately; edit the desired state before the second.
time.sleep(1)
crs[0].update(replicas=5)

# Wait long enough for the next pass to apply the change.
time.sleep(3)

print("\nFinal state:")
final_state = operator.manager.resources
for kind_name, res in final_state.items():
    print(f" {kind_name}: {res['spec']}")

# Ask the background loop to stop after its current pass.
operator.running = False
Expected output:
Creating Deployment/web-app with 3 replicas
Creating Deployment/worker with 5 replicas
Scaling Deployment/web-app to 5
Final state:
Deployment/web-app: {'replicas': 5}
Deployment/worker: {'replicas': 5}
FAQ
What's Next
Congratulations on completing this operators guide! Here's where to go from here:
- Practice daily — Deploy an existing operator (Prometheus, Cert-Manager)
- Build a project — Write a simple operator for your application
- Explore related topics — Controller-runtime, finalizers, leader election, admission webhooks
- Join the community — Share your operator designs and get feedback
Remember: every expert was once a beginner. Keep operating!
Built by the developers of DodaTech
Doda Browser, DodaZIP & Durga Antivirus Pro