From 32381d3f83c9373f0a78210548a1f6d38b479abb Mon Sep 17 00:00:00 2001
From: Shay Goldstein
Date: Thu, 12 Feb 2026 21:20:46 +0200
Subject: [PATCH] Migrating bitnami kafka to Strimzi Kafka operator

Add Strimzi Kafka operator configuration

Update values for Kafka deployment
---
 .github/workflows/release.yml                 |  2 +-
 .gitignore                                    |  2 +
 charts/mlrun-ce/Chart.yaml                    |  2 +-
 .../mlrun-ce/admin_installation_values.yaml   |  4 +
 ..._admin_cluster_ip_installation_values.yaml | 15 +--
 .../non_admin_installation_values.yaml        |  7 ++
 charts/mlrun-ce/requirements.lock             | 10 +-
 charts/mlrun-ce/requirements.yaml             |  8 +-
 .../kafka/kafka-bootstrap-alias.yaml          | 26 +++++
 .../templates/kafka/kafka-cluster.yaml        | 41 ++++++++
 .../templates/kafka/kafka-network-policy.yaml | 71 ++++++++++++++
 .../templates/kafka/kafka-nodepool.yaml       | 38 +++++++
 .../mlrun-ce/templates/kafka/kafka-rbac.yaml  | 89 +++++++++++++++++
 charts/mlrun-ce/values.yaml                   | 98 +++++++++++++------
 14 files changed, 364 insertions(+), 49 deletions(-)
 create mode 100644 charts/mlrun-ce/templates/kafka/kafka-bootstrap-alias.yaml
 create mode 100644 charts/mlrun-ce/templates/kafka/kafka-cluster.yaml
 create mode 100644 charts/mlrun-ce/templates/kafka/kafka-network-policy.yaml
 create mode 100644 charts/mlrun-ce/templates/kafka/kafka-nodepool.yaml
 create mode 100644 charts/mlrun-ce/templates/kafka/kafka-rbac.yaml

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 2a28171a..988fed32 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -31,7 +31,7 @@ jobs:
           helm repo add minio https://charts.min.io/
           helm repo add spark-operator https://kubeflow.github.io/spark-operator
           helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
-          helm repo add bitnami https://charts.bitnami.com/bitnami
+          helm repo add strimzi https://strimzi.io/charts/
 
       - name: Run chart-releaser
         uses: helm/chart-releaser-action@v1.7.0
diff --git a/.gitignore b/.gitignore
index ff02e9d9..950c9b99 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
 .idea/*
 charts/mlrun-ce/charts/*
 .DS_Store
+**/.DS_Store
+*.DS_Store
diff --git a/charts/mlrun-ce/Chart.yaml b/charts/mlrun-ce/Chart.yaml
index e04c6977..603ba5ef 100644
--- a/charts/mlrun-ce/Chart.yaml
+++ b/charts/mlrun-ce/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v1
 name: mlrun-ce
-version: 0.10.0-rc5
+version: 0.10.0-rc6
 description: MLRun Open Source Stack
 home: https://iguazio.com
 icon: https://www.iguazio.com/wp-content/uploads/2019/10/Iguazio-Logo.png
diff --git a/charts/mlrun-ce/admin_installation_values.yaml b/charts/mlrun-ce/admin_installation_values.yaml
index 7f148948..66d48a66 100644
--- a/charts/mlrun-ce/admin_installation_values.yaml
+++ b/charts/mlrun-ce/admin_installation_values.yaml
@@ -51,5 +51,9 @@ kube-prometheus-stack:
 tdengine:
   enabled: false
 
+strimzi-kafka-operator:
+  enabled: true
+  watchAnyNamespace: true
+
 kafka:
   enabled: false
diff --git a/charts/mlrun-ce/non_admin_cluster_ip_installation_values.yaml b/charts/mlrun-ce/non_admin_cluster_ip_installation_values.yaml
index d860d303..e407db6b 100644
--- a/charts/mlrun-ce/non_admin_cluster_ip_installation_values.yaml
+++ b/charts/mlrun-ce/non_admin_cluster_ip_installation_values.yaml
@@ -59,11 +59,12 @@ pipelines:
 priority_class:
   enabled: false
 
+strimzi-kafka-operator:
+  enabled: false
+
+kafka:
+  rbac:
+    operatorNamespace: controller
+
 kube-prometheus-stack:
-  prometheus-node-exporter:
-    fullnameOverride: node-exporter
-    hostNetwork: false
-    service:
-      port: 9100
-    hostRootFsMount:
-      enabled: false
+  enabled: false
diff --git a/charts/mlrun-ce/non_admin_installation_values.yaml b/charts/mlrun-ce/non_admin_installation_values.yaml
index a0c061da..460b1013 100644
--- a/charts/mlrun-ce/non_admin_installation_values.yaml
+++ b/charts/mlrun-ce/non_admin_installation_values.yaml
@@ -58,5 +58,12 @@ pipelines:
 priority_class:
   enabled: false
 
+strimzi-kafka-operator:
+  enabled: false
+
+kafka:
+  rbac:
+    operatorNamespace: controller
+
 kube-prometheus-stack:
   enabled: false
diff --git a/charts/mlrun-ce/requirements.lock b/charts/mlrun-ce/requirements.lock
index a65ccfd0..f3c1b7d9 100644
--- a/charts/mlrun-ce/requirements.lock
+++ b/charts/mlrun-ce/requirements.lock
@@ -17,8 +17,8 @@ dependencies:
 - name: kube-prometheus-stack
   repository: https://prometheus-community.github.io/helm-charts
   version: 72.1.1
-- name: kafka
-  repository: https://charts.bitnami.com/bitnami
-  version: 31.3.1
-digest: sha256:d92e2702f26b3fbbe527fd4439cec8ce50bc79ad54fc69e10c28301e04e0114a
-generated: "2025-11-04T09:39:37.92185Z"
+- name: strimzi-kafka-operator
+  repository: https://strimzi.io/charts/
+  version: 0.48.0
+digest: sha256:f45be2a1208958d753b2e8a95f33eee17718ad1e691317ec0b50e3c088a7cae8
+generated: "2026-02-03T14:08:48.606883+02:00"
diff --git a/charts/mlrun-ce/requirements.yaml b/charts/mlrun-ce/requirements.yaml
index 1e7e0941..2d362851 100644
--- a/charts/mlrun-ce/requirements.yaml
+++ b/charts/mlrun-ce/requirements.yaml
@@ -21,7 +21,7 @@ dependencies:
     repository: "https://prometheus-community.github.io/helm-charts"
     version: "72.1.1"
     condition: kube-prometheus-stack.enabled
-  - name: kafka
-    repository: "https://charts.bitnami.com/bitnami"
-    version: "31.3.1"
-    condition: kafka.enabled
+  - name: strimzi-kafka-operator
+    repository: "https://strimzi.io/charts/"
+    version: "0.48.0"
+    condition: strimzi-kafka-operator.enabled
diff --git a/charts/mlrun-ce/templates/kafka/kafka-bootstrap-alias.yaml b/charts/mlrun-ce/templates/kafka/kafka-bootstrap-alias.yaml
new file mode 100644
index 00000000..d6d9aacc
--- /dev/null
+++ b/charts/mlrun-ce/templates/kafka/kafka-bootstrap-alias.yaml
@@ -0,0 +1,26 @@
+{{- if and .Values.kafka.enabled .Values.kafka.bootstrapAlias.enabled }}
+---
+# Short-named alias for the Strimzi bootstrap Service. Rendered only when the
+# Kafka cluster is enabled, so a disabled Kafka leaves no endpoint-less Service.
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ .Values.kafka.bootstrapAlias.name }}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    app.kubernetes.io/name: kafka
+    app.kubernetes.io/component: bootstrap-alias
+    {{- include "mlrun-ce.common.labels" . | nindent 4 }}
+spec:
+  type: ClusterIP
+  ports:
+    - name: client
+      port: 9092
+      targetPort: 9092
+      protocol: TCP
+  selector:
+    strimzi.io/cluster: {{ .Values.kafka.name }}
+    strimzi.io/kind: Kafka
+    strimzi.io/name: {{ .Values.kafka.name }}-kafka
+{{- end }}
+
diff --git a/charts/mlrun-ce/templates/kafka/kafka-cluster.yaml b/charts/mlrun-ce/templates/kafka/kafka-cluster.yaml
new file mode 100644
index 00000000..9d936547
--- /dev/null
+++ b/charts/mlrun-ce/templates/kafka/kafka-cluster.yaml
@@ -0,0 +1,41 @@
+{{- if .Values.kafka.enabled }}
+apiVersion: kafka.strimzi.io/v1beta2
+kind: Kafka
+metadata:
+  name: {{ .Values.kafka.name }}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    app.kubernetes.io/name: kafka
+    app.kubernetes.io/component: cluster
+    {{- include "mlrun-ce.common.labels" . | nindent 4 }}
+  annotations:
+    # Delay Kafka CR creation until after CRDs are installed.
+    # NOTE(review): Helm does not track hook resources as part of the release
+    # (they are not removed by `helm uninstall`), and the default delete policy
+    # `before-hook-creation` deletes and re-creates this CR on every upgrade.
+    # Confirm this is acceptable for a stateful Kafka cluster.
+    helm.sh/hook: post-install,post-upgrade
+    helm.sh/hook-weight: "5"
+spec:
+  kafka:
+    listeners:
+      {{- range .Values.kafka.listeners }}
+      - name: {{ .name }}
+        port: {{ .port }}
+        type: {{ .type }}
+        tls: {{ .tls }}
+        {{- if .configuration }}
+        configuration:
+          {{- toYaml .configuration | nindent 10 }}
+        {{- end }}
+      {{- end }}
+    config:
+      {{- toYaml .Values.kafka.config | nindent 6 }}
+  {{- if gt (.Values.kafka.zookeeper.replicas | int) 0 }}
+  zookeeper:
+    replicas: {{ .Values.kafka.zookeeper.replicas }}
+    storage:
+      type: persistent-claim
+      size: 8Gi
+  {{- end }}
+{{- end }}
diff --git a/charts/mlrun-ce/templates/kafka/kafka-network-policy.yaml b/charts/mlrun-ce/templates/kafka/kafka-network-policy.yaml
new file mode 100644
index 00000000..9f079354
--- /dev/null
+++ b/charts/mlrun-ce/templates/kafka/kafka-network-policy.yaml
@@ -0,0 +1,71 @@
+{{- if .Values.kafka.rbac.enabled -}}
+{{- $kafkaName := .Values.kafka.name | default "kafka-stream" -}}
+{{- $currentNamespace := .Release.Namespace -}}
+---
+# NetworkPolicy: Kafka Isolation
+# Purpose: Ensure pods in this namespace can ONLY connect to Kafka in their OWN
+# namespace, preventing cross-tenant Kafka access in multi-namespace deployments.
+apiVersion: networking.k8s.io/v1
+kind: NetworkPolicy
+metadata:
+  name: kafka-isolation
+  namespace: {{ $currentNamespace }}
+  labels:
+    app.kubernetes.io/name: mlrun-ce
+    app.kubernetes.io/component: kafka-rbac
+    app.kubernetes.io/managed-by: {{ .Release.Service }}
+spec:
+  # Apply to all pods in this namespace
+  podSelector: {}
+
+  policyTypes:
+    - Egress
+
+  egress:
+    # =============================================================================
+    # Kafka ports (9092-9094): ONLY allowed to Kafka in the SAME namespace
+    # =============================================================================
+    - to:
+        - namespaceSelector:
+            matchLabels:
+              kubernetes.io/metadata.name: {{ $currentNamespace }}
+          podSelector:
+            matchLabels:
+              strimzi.io/cluster: {{ $kafkaName }}
+      ports:
+        - protocol: TCP
+          port: 9092  # client listener
+        - protocol: TCP
+          port: 9093  # controller listener
+        - protocol: TCP
+          port: 9094  # external listener
+
+    # =============================================================================
+    # All other traffic: ALLOWED (no restrictions)
+    # =============================================================================
+    # Allow all egress on non-Kafka ports. This ensures that services like:
+    # - Docker registries (Kaniko builds)
+    # - Kubernetes API server
+    # - DNS
+    # - External APIs
+    # ...continue to work without needing explicit whitelist rules.
+
+    # Allow all TCP traffic on ports below Kafka range
+    - ports:
+        - protocol: TCP
+          port: 1
+          endPort: 9091
+
+    # Allow all TCP traffic on ports above Kafka range
+    - ports:
+        - protocol: TCP
+          port: 9095
+          endPort: 65535
+
+    # Allow all UDP traffic (DNS, etc.)
+    - ports:
+        - protocol: UDP
+          port: 1
+          endPort: 65535
+
+{{- end }}
diff --git a/charts/mlrun-ce/templates/kafka/kafka-nodepool.yaml b/charts/mlrun-ce/templates/kafka/kafka-nodepool.yaml
new file mode 100644
index 00000000..7623a96d
--- /dev/null
+++ b/charts/mlrun-ce/templates/kafka/kafka-nodepool.yaml
@@ -0,0 +1,38 @@
+{{- if .Values.kafka.enabled }}
+apiVersion: kafka.strimzi.io/v1beta2
+kind: KafkaNodePool
+metadata:
+  name: {{ .Values.kafka.name }}-pool
+  namespace: {{ .Release.Namespace }}
+  labels:
+    app.kubernetes.io/name: kafka
+    app.kubernetes.io/component: nodepool
+    strimzi.io/cluster: {{ .Values.kafka.name }}
+    {{- include "mlrun-ce.common.labels" . | nindent 4 }}
+  annotations:
+    # Delay KafkaNodePool CR creation until after CRDs are installed.
+    # NOTE(review): Helm does not track hook resources as part of the release
+    # (they are not removed by `helm uninstall`), and the default delete policy
+    # `before-hook-creation` deletes and re-creates this CR on every upgrade.
+    # Confirm this is acceptable for a stateful Kafka node pool.
+    helm.sh/hook: post-install,post-upgrade
+    helm.sh/hook-weight: "5"
+spec:
+  replicas: {{ .Values.kafka.replicas }}
+  roles:
+    - controller
+    - broker
+  storage:
+    type: {{ .Values.kafka.storage.type }}
+    size: {{ .Values.kafka.storage.size }}
+    {{- if .Values.kafka.storage.class }}
+    class: {{ .Values.kafka.storage.class }}
+    {{- end }}
+  resources:
+    requests:
+      memory: {{ .Values.kafka.resources.requests.memory }}
+      cpu: {{ .Values.kafka.resources.requests.cpu }}
+    limits:
+      memory: {{ .Values.kafka.resources.limits.memory }}
+      cpu: {{ .Values.kafka.resources.limits.cpu }}
+{{- end }}
diff --git a/charts/mlrun-ce/templates/kafka/kafka-rbac.yaml b/charts/mlrun-ce/templates/kafka/kafka-rbac.yaml
new file mode 100644
index 00000000..f6dd019f
--- /dev/null
+++ b/charts/mlrun-ce/templates/kafka/kafka-rbac.yaml
@@ -0,0 +1,89 @@
+{{- if .Values.kafka.rbac.enabled -}}
+{{- $operatorNamespace := .Values.kafka.rbac.operatorNamespace | default .Release.Namespace -}}
+{{- $currentNamespace := .Release.Namespace -}}
+---
+# ServiceAccount for Kafka client applications
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: kafka-client
+  namespace: {{ $currentNamespace }}
+  labels:
+    app.kubernetes.io/name: mlrun-ce
+    app.kubernetes.io/component: kafka-rbac
+    app.kubernetes.io/managed-by: {{ .Release.Service }}
+---
+# Role: Allow managing Kafka resources via CRDs in the operator namespace
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: {{ $currentNamespace }}-kafka-resource-manager
+  namespace: {{ $operatorNamespace }}
+  labels:
+    app.kubernetes.io/name: mlrun-ce
+    app.kubernetes.io/component: kafka-rbac
+    app.kubernetes.io/managed-by: {{ .Release.Service }}
+    user-namespace: {{ $currentNamespace }}
+rules:
+  # Allow creating and managing KafkaTopic CRDs
+  - apiGroups:
+      - kafka.strimzi.io
+    resources:
+      - kafkatopics
+    verbs:
+      - get
+      - list
+      - watch
+      - create
+      - update
+      - patch
+      - delete
+  # Allow checking KafkaTopic status
+  - apiGroups:
+      - kafka.strimzi.io
+    resources:
+      - kafkatopics/status
+    verbs:
+      - get
+      - list
+      - watch
+  # Allow reading KafkaUser CRDs (if using SCRAM auth)
+  - apiGroups:
+      - kafka.strimzi.io
+    resources:
+      - kafkausers
+    verbs:
+      - get
+      - list
+      - watch
+  # Allow reading the Kafka cluster info
+  - apiGroups:
+      - kafka.strimzi.io
+    resources:
+      - kafkas
+    verbs:
+      - get
+      - list
+      - watch
+---
+# RoleBinding: Grant Kafka resource management permissions to ServiceAccount
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: {{ $currentNamespace }}-kafka-resource-manager
+  namespace: {{ $operatorNamespace }}
+  labels:
+    app.kubernetes.io/name: mlrun-ce
+    app.kubernetes.io/component: kafka-rbac
+    app.kubernetes.io/managed-by: {{ .Release.Service }}
+    user-namespace: {{ $currentNamespace }}
+subjects:
+  - kind: ServiceAccount
+    name: kafka-client
+    namespace: {{ $currentNamespace }}
+roleRef:
+  kind: Role
+  name: {{ $currentNamespace }}-kafka-resource-manager
+  apiGroup: rbac.authorization.k8s.io
+{{- end }}
+
diff --git a/charts/mlrun-ce/values.yaml b/charts/mlrun-ce/values.yaml
index 6edd8fdf..5a947c2e 100644
--- a/charts/mlrun-ce/values.yaml
+++ b/charts/mlrun-ce/values.yaml
@@ -382,7 +382,6 @@ pipelines:
       repository: minio/minio
       tag: "RELEASE.2025-10-15T17-29-55Z"
 
-
 kube-prometheus-stack:
   fullnameOverride: monitoring
   enabled: true
@@ -431,6 +430,7 @@ kube-prometheus-stack:
       nodePort: 30020
   kube-state-metrics:
     fullnameOverride: state-metrics
+
   prometheus-node-exporter:
     fullnameOverride: node-exporter
     hostNetwork: false
@@ -468,37 +468,73 @@ tdengine:
     CLUSTER: "0"
     TAOS_REPLICA: "1"
 
+strimzi-kafka-operator:
+  enabled: true
+  watchAnyNamespace: true
+# defaultImageRegistry: quay.io
+# defaultImageRepository: strimzi
+# defaultImageTag: 0.48.0
+
 kafka:
-  global:
-    security:
-      allowInsecureImages: true
   enabled: true
-  fullnameOverride: kafka-stream
-  image:
-    repository: 'bitnamilegacy/kafka'
-  extraConfigYaml:
-    default.replication.factor: "1"
-    offsets.topic.replication.factor: "1"
-    transaction.state.log.replication.factor: "1"
-    transaction.state.log.min.isr: "1"
+  name: kafka-stream
+
+  # Bootstrap service alias configuration
+  bootstrapAlias:
+    # Create a service alias for simpler Kafka bootstrap server name
+    # When enabled, creates: {name}.{namespace}.svc.cluster.local:9092
+    # instead of the default: {name}-kafka-bootstrap.{namespace}.svc.cluster.local:9092
+    enabled: true
+    # Name for the bootstrap service alias (only used if enabled is true)
+    name: kafka-stream
+
+  replicas: 1
 
-  controller:
-    replicaCount: 1
-    resourcesPreset: "medium"
   listeners:
-    client:
-      name: CLIENT
-      containerPort: 9092
-      protocol: PLAINTEXT
-    controller:
-      name: CONTROLLER
-      containerPort: 9093
-      protocol: PLAINTEXT
-    interbroker:
-      name: INTERNAL
-      containerPort: 9094
-      protocol: PLAINTEXT
-    advertisedListeners: >-
-      CLIENT://kafka-stream:9092
-      CONTROLLER://kafka-stream-controller-headless:9093,
-      INTERNAL://kafka-stream-controller-headless:9094,
+    - name: client
+      port: 9092
+      type: internal
+      tls: false
+    - name: controller
+      port: 9093
+      type: internal
+      tls: false
+    - name: internal
+      port: 9094
+      type: internal
+      tls: false
+
+  storage:
+    type: persistent-claim
+    size: 8Gi
+    class: ""
+
+  resources:
+    requests:
+      memory: "1Gi"
+      cpu: "500m"
+    limits:
+      memory: "2Gi"
+      cpu: "1000m"
+
+  config:
+    # Replication settings for single-node setup
+    default.replication.factor: 1
+    offsets.topic.replication.factor: 1
+    transaction.state.log.replication.factor: 1
+    transaction.state.log.min.isr: 1
+
+  zookeeper:
+    replicas: 0
+
+  # Kafka RBAC for user namespaces
+  # Enable this when installing in user namespaces (mlrun, mlrun1, etc.)
+  # When enabled, creates: ServiceAccount "kafka-client" + Role/RoleBinding + NetworkPolicy
+  rbac:
+    # Enable RBAC for this namespace to access Kafka
+    enabled: true
+
+    # Operator namespace (where Kafka operator/cluster is running)
+    # Empty means "use the release namespace"
+    # Example: "controller" if that's where you installed the operator
+    operatorNamespace: ""