From 28baddbcd60943f3e641cb82ce9a1dcd3acffc62 Mon Sep 17 00:00:00 2001
From: Alex Toker
Date: Mon, 16 Feb 2026 20:11:30 +0000
Subject: [PATCH] [Spark] Support spark-operator on multi-namespace deployments

The admin namespace holds the CRDs, the shared ClusterRole, and the mutating
webhook (no controller pods); each user namespace runs its own spark-operator
controller with a namespace-scoped RoleBinding to the shared ClusterRole.

- Add a CE-level `spark` config section to decouple the mlrun-spark-config
  ConfigMap from the subchart
- Add spark-controller-rbac.yaml template to close the user-namespace RBAC gap
- Update admin/non-admin values files with the spark-operator split config
- Add a `multi-ns` command to the Kind test that validates one admin plus two
  user namespaces
- Bump chart version to 0.11.0-rc9
---
 charts/mlrun-ce/Chart.yaml                    |   2 +-
 .../mlrun-ce/admin_installation_values.yaml   |  22 +
 ..._admin_cluster_ip_installation_values.yaml |  27 +-
 .../non_admin_installation_values.yaml        |  27 +-
 .../templates/config/mlrun-spark-config.yaml  |   2 +-
 .../spark-operator/spark-controller-rbac.yaml |  61 +++
 charts/mlrun-ce/values.yaml                   |  11 +
 tests/kind-test.sh                            | 477 +++++++++++++++++-
 8 files changed, 621 insertions(+), 8 deletions(-)
 create mode 100644 charts/mlrun-ce/templates/spark-operator/spark-controller-rbac.yaml

diff --git a/charts/mlrun-ce/Chart.yaml b/charts/mlrun-ce/Chart.yaml
index 01ac4c48..92bc7cb0 100644
--- a/charts/mlrun-ce/Chart.yaml
+++ b/charts/mlrun-ce/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v1
 name: mlrun-ce
-version: 0.11.0-rc8
+version: 0.11.0-rc9
 description: MLRun Open Source Stack
 home: https://iguazio.com
 icon: https://www.iguazio.com/wp-content/uploads/2019/10/Iguazio-Logo.png
diff --git a/charts/mlrun-ce/admin_installation_values.yaml b/charts/mlrun-ce/admin_installation_values.yaml
index 0ef34729..03b2141c 100644
--- a/charts/mlrun-ce/admin_installation_values.yaml
+++ b/charts/mlrun-ce/admin_installation_values.yaml
@@ -40,7 +40,29 @@ minio:
   enabled: false
 
 spark-operator:
+  enabled: true
+  fullnameOverride: spark-operator
+  controller:
+    replicas: 0 # No controller pods in the admin namespace
+    rbac:
+      create: true # Creates ClusterRole (shared by all user namespaces)
+    serviceAccount:
+      create: true
+  webhook:
+    enable: true
+    replicas: 1
+  spark:
+    jobNamespaces:
+      - "" # All namespaces (no namespaceSelector on webhook)
+    serviceAccount:
+      create: false # No sparkapp SA in admin
+    rbac:
+      create: false
+
+spark:
   enabled: false
+  rbac:
+    enabled: false
 
 pipelines:
   enabled: false
diff --git a/charts/mlrun-ce/non_admin_cluster_ip_installation_values.yaml b/charts/mlrun-ce/non_admin_cluster_ip_installation_values.yaml
index fb1f4de3..fbf8d772 100644
--- a/charts/mlrun-ce/non_admin_cluster_ip_installation_values.yaml
+++ b/charts/mlrun-ce/non_admin_cluster_ip_installation_values.yaml
@@ -53,7 +53,32 @@ timescaledb:
     nodePort: ""
 
 spark-operator:
-  enabled: false
+  enabled: true
+  fullnameOverride: spark-operator
+  controller:
+    replicas: 1
+    rbac:
+      create: false
+    serviceAccount:
+      create: true
+    leaderElection:
+      enable: true
+  webhook:
+    enable: false
+  spark:
+    jobNamespaces:
+      - mlrun
+    serviceAccount:
+      create: true
+      name: sparkapp
+    rbac:
+      create: true
+
+spark:
+  enabled: true
+  rbac:
+    enabled: true
+  serviceAccountName: sparkapp
 
 pipelines:
   service:
diff --git a/charts/mlrun-ce/non_admin_installation_values.yaml b/charts/mlrun-ce/non_admin_installation_values.yaml
index 460b1013..76267241 100644
--- a/charts/mlrun-ce/non_admin_installation_values.yaml
+++ b/charts/mlrun-ce/non_admin_installation_values.yaml
@@ -47,7 +47,32 @@ minio:
   replicas: 1
 
 spark-operator:
-  enabled: false
+  enabled: true
fullnameOverride: spark-operator + controller: + replicas: 1 # Controller runs in user namespace + rbac: + create: false # ClusterRole already exists from admin + serviceAccount: + create: true + leaderElection: + enable: true + webhook: + enable: false + spark: + jobNamespaces: + - mlrun # Override with actual namespace at install time + serviceAccount: + create: true + name: sparkapp + rbac: + create: true # Creates sparkapp Role + RoleBinding + +spark: + enabled: true + rbac: + enabled: true + serviceAccountName: sparkapp pipelines: service: diff --git a/charts/mlrun-ce/templates/config/mlrun-spark-config.yaml b/charts/mlrun-ce/templates/config/mlrun-spark-config.yaml index 02054f18..000eda4b 100644 --- a/charts/mlrun-ce/templates/config/mlrun-spark-config.yaml +++ b/charts/mlrun-ce/templates/config/mlrun-spark-config.yaml @@ -1,4 +1,4 @@ -{{- if index .Values "spark-operator" "enabled" -}} +{{- if .Values.spark.enabled -}} apiVersion: v1 kind: ConfigMap metadata: diff --git a/charts/mlrun-ce/templates/spark-operator/spark-controller-rbac.yaml b/charts/mlrun-ce/templates/spark-operator/spark-controller-rbac.yaml new file mode 100644 index 00000000..b027c42c --- /dev/null +++ b/charts/mlrun-ce/templates/spark-operator/spark-controller-rbac.yaml @@ -0,0 +1,61 @@ +{{- if and (index .Values "spark-operator" "enabled") (not (index .Values "spark-operator" "controller" "rbac" "create")) -}} +{{- /* + This template renders only in user multi-NS mode: + - spark-operator subchart is enabled (controller Deployment runs here) + - controller.rbac.create is false (ClusterRole already exists from admin namespace) + + It creates: + 1. RoleBinding: controller SA → shared ClusterRole (namespace-scoped access) + 2. Role + RoleBinding: leader election leases (coordination.k8s.io) +*/ -}} +--- +# RoleBinding: Grant controller SA access to the shared ClusterRole (namespace-scoped) +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: spark-operator-controller + labels: + app.kubernetes.io/name: mlrun-ce + app.kubernetes.io/component: spark-controller-rbac + app.kubernetes.io/managed-by: {{ .Release.Name }} +subjects: + - kind: ServiceAccount + name: spark-operator-controller + namespace: {{ .Release.Namespace }} +roleRef: + kind: ClusterRole + name: spark-operator-controller + apiGroup: rbac.authorization.k8s.io +--- +# Role: Leader election leases +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: spark-operator-controller-leases + labels: + app.kubernetes.io/name: mlrun-ce + app.kubernetes.io/component: spark-controller-rbac + app.kubernetes.io/managed-by: {{ .Release.Name }} +rules: + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["create", "get", "update"] +--- +# RoleBinding: Grant controller SA access to leader election leases +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: spark-operator-controller-leases + labels: + app.kubernetes.io/name: mlrun-ce + app.kubernetes.io/component: spark-controller-rbac + app.kubernetes.io/managed-by: {{ .Release.Name }} +subjects: + - kind: ServiceAccount + name: spark-operator-controller + namespace: {{ .Release.Namespace }} +roleRef: + kind: Role + name: spark-operator-controller-leases + apiGroup: rbac.authorization.k8s.io +{{- end }} diff --git a/charts/mlrun-ce/values.yaml b/charts/mlrun-ce/values.yaml index f403ec0b..147098dd 100644 --- a/charts/mlrun-ce/values.yaml +++ b/charts/mlrun-ce/values.yaml @@ -566,3 +566,14 @@ kafka: # Empty means "use the release 
namespace" # Example: "controller" if that's where you installed the operator operatorNamespace: "" + +# Spark configuration for multi-NS deployments +# Controls CE-level spark resources (mlrun-spark-config ConfigMap) +# In single-NS mode, both spark.enabled and spark-operator.enabled are true +# In multi-NS admin mode, spark.enabled is false (no ConfigMap needed) +# In multi-NS user mode, spark.enabled is true (ConfigMap needed for MLRun) +spark: + enabled: true + rbac: + enabled: true + serviceAccountName: sparkapp diff --git a/tests/kind-test.sh b/tests/kind-test.sh index c99a182e..60e138b9 100755 --- a/tests/kind-test.sh +++ b/tests/kind-test.sh @@ -23,6 +23,9 @@ set -o pipefail CLUSTER_NAME="${CLUSTER_NAME:-mlrun-ce-test}" NAMESPACE="${NAMESPACE:-mlrun}" RELEASE_NAME="${RELEASE_NAME:-mlrun}" +ADMIN_NAMESPACE="${ADMIN_NAMESPACE:-mlrun-admin}" +USER_NAMESPACE_1="${USER_NAMESPACE_1:-mlrun-user1}" +USER_NAMESPACE_2="${USER_NAMESPACE_2:-mlrun-user2}" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" CHART_DIR="${SCRIPT_DIR}/../charts/mlrun-ce" @@ -35,6 +38,7 @@ NC='\033[0m' # No Color log_info() { echo -e "${GREEN}[INFO]${NC} $1"; } log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; } log_error() { echo -e "${RED}[ERROR]${NC} $1"; } +fail() { log_error "$1"; exit 1; } cleanup() { if [[ "${CLEANUP_ON_EXIT:-false}" == "true" ]]; then @@ -222,6 +226,458 @@ verify_installation() { fi } +# --- Multi-NS test functions --- + +# Shared Kind-friendly resource limits for multi-ns test +_kind_resource_limits() { + cat <<'LIMITS' +mlrun: + api: + resources: + requests: { memory: "256Mi", cpu: "100m" } + limits: { memory: "512Mi", cpu: "500m" } + ui: + resources: + requests: { memory: "128Mi", cpu: "50m" } + limits: { memory: "256Mi", cpu: "200m" } +nuclio: + dashboard: + resources: + requests: { memory: "128Mi", cpu: "50m" } + limits: { memory: "256Mi", cpu: "200m" } +minio: + resources: + requests: { memory: "256Mi", cpu: "100m" } + limits: { memory: "512Mi", cpu: "500m" } + replicas: 1 + mode: standalone +timescaledb: + resources: + requests: { memory: "256Mi", cpu: "100m" } + limits: { memory: "512Mi", cpu: "500m" } +LIMITS +} + +install_admin_chart() { + log_info "Installing admin release in namespace '${ADMIN_NAMESPACE}'..." 
+ + local values_file + values_file=$(mktemp) + trap "rm -f '${values_file}'" RETURN + + cat > "${values_file}" < "${values_file}" </dev/null; then + log_info "SparkApplication CRD exists" + else + log_error "SparkApplication CRD not found" + errors=$((errors + 1)) + fi + + if kubectl get crd scheduledsparkapplications.sparkoperator.k8s.io &>/dev/null; then + log_info "ScheduledSparkApplication CRD exists" + else + log_error "ScheduledSparkApplication CRD not found" + errors=$((errors + 1)) + fi + + # ClusterRole exists + if kubectl get clusterrole spark-operator-controller &>/dev/null; then + log_info "ClusterRole spark-operator-controller exists" + else + log_error "ClusterRole spark-operator-controller not found" + errors=$((errors + 1)) + fi + + # No controller pods in admin (controller.replicas=0) + local controller_pod_count + controller_pod_count=$(kubectl get pods -n "${ADMIN_NAMESPACE}" -l app.kubernetes.io/component=controller -o name 2>/dev/null | wc -l) + if [[ "${controller_pod_count}" -eq 0 ]]; then + log_info "No spark-operator controller pods in admin namespace (expected)" + else + log_error "Found ${controller_pod_count} controller pods in admin namespace (expected 0)" + errors=$((errors + 1)) + fi + + # Webhook pod running in admin (webhook.replicas=1) + if kubectl wait --for=condition=ready pod \ + -l app.kubernetes.io/component=webhook \ + -n "${ADMIN_NAMESPACE}" --timeout=120s &>/dev/null; then + log_info "Webhook pod is Ready in admin namespace" + else + log_error "Webhook pod not ready in admin namespace" + errors=$((errors + 1)) + fi + + # MutatingWebhookConfiguration exists + if kubectl get mutatingwebhookconfiguration spark-operator-webhook &>/dev/null; then + log_info "MutatingWebhookConfiguration exists" + else + log_error "MutatingWebhookConfiguration not found" + errors=$((errors + 1)) + fi + + # No sparkapp SA in admin + if ! kubectl get sa sparkapp -n "${ADMIN_NAMESPACE}" &>/dev/null; then + log_info "No sparkapp SA in admin namespace (expected)" + else + log_error "sparkapp SA should not exist in admin namespace" + errors=$((errors + 1)) + fi + + # mlrun-spark-config NOT in admin + if ! kubectl get configmap mlrun-spark-config -n "${ADMIN_NAMESPACE}" &>/dev/null; then + log_info "mlrun-spark-config ConfigMap absent from admin namespace (expected)" + else + log_error "mlrun-spark-config ConfigMap should not exist in admin namespace" + errors=$((errors + 1)) + fi + + return "${errors}" +} + +# Verify a single user namespace. 
Args: $1 = namespace name +verify_user_ns() { + local user_ns="$1" + log_info "=== User namespace (${user_ns}) ===" + local errors=0 + + # spark-operator-controller pod is Running + if kubectl wait --for=condition=ready pod \ + -l app.kubernetes.io/name=spark-operator \ + -n "${user_ns}" --timeout=120s &>/dev/null; then + log_info "spark-operator-controller pod is Ready" + else + log_error "spark-operator-controller pod not ready" + kubectl get pods -n "${user_ns}" -l app.kubernetes.io/name=spark-operator + errors=$((errors + 1)) + fi + + # sparkapp SA exists + if kubectl get sa sparkapp -n "${user_ns}" &>/dev/null; then + log_info "sparkapp ServiceAccount exists" + else + log_error "sparkapp ServiceAccount not found" + errors=$((errors + 1)) + fi + + # CE-created RoleBinding → ClusterRole exists and is correct + if kubectl get rolebinding spark-operator-controller -n "${user_ns}" &>/dev/null; then + local rb_kind rb_name + rb_kind=$(kubectl get rolebinding spark-operator-controller -n "${user_ns}" -o jsonpath='{.roleRef.kind}') + rb_name=$(kubectl get rolebinding spark-operator-controller -n "${user_ns}" -o jsonpath='{.roleRef.name}') + if [[ "${rb_kind}" == "ClusterRole" && "${rb_name}" == "spark-operator-controller" ]]; then + log_info "RoleBinding spark-operator-controller -> ClusterRole spark-operator-controller (correct)" + else + log_error "RoleBinding references ${rb_kind}/${rb_name}, expected ClusterRole/spark-operator-controller" + errors=$((errors + 1)) + fi + else + log_error "RoleBinding spark-operator-controller not found" + errors=$((errors + 1)) + fi + + # Leader election Role + RoleBinding exist + if kubectl get role spark-operator-controller-leases -n "${user_ns}" &>/dev/null; then + log_info "Leader election Role exists" + else + log_error "Leader election Role spark-operator-controller-leases not found" + errors=$((errors + 1)) + fi + + if kubectl get rolebinding spark-operator-controller-leases -n "${user_ns}" &>/dev/null; then + log_info "Leader election RoleBinding exists" + else + log_error "Leader election RoleBinding spark-operator-controller-leases not found" + errors=$((errors + 1)) + fi + + # mlrun-spark-config ConfigMap exists + if kubectl get configmap mlrun-spark-config -n "${user_ns}" &>/dev/null; then + log_info "mlrun-spark-config ConfigMap exists" + else + log_error "mlrun-spark-config ConfigMap not found" + errors=$((errors + 1)) + fi + + # Functional check: submit a SparkApplication + log_info "Submitting SparkApplication in ${user_ns}..." + kubectl apply -n "${user_ns}" -f - <<'SPARK_EOF' +apiVersion: sparkoperator.k8s.io/v1beta2 +kind: SparkApplication +metadata: + name: spark-test +spec: + type: Scala + mode: cluster + image: spark:3.5.0 + mainClass: org.apache.spark.examples.SparkPi + mainApplicationFile: local:///opt/spark/examples/jars/spark-examples_2.12-3.5.0.jar + sparkVersion: "3.5.0" + driver: + serviceAccount: sparkapp + cores: 1 + memory: "512m" + executor: + cores: 1 + instances: 1 + memory: "512m" +SPARK_EOF + + log_info "Waiting for controller to process SparkApplication..." 
+ local status="" + local driver_pod="" + local attempt + for attempt in $(seq 1 12); do + sleep 5 + status=$(kubectl get sparkapplication spark-test -n "${user_ns}" -o jsonpath='{.status.applicationState.state}' 2>/dev/null || echo "") + driver_pod=$(kubectl get pod spark-test-driver -n "${user_ns}" -o name 2>/dev/null || echo "") + if [[ -n "${status}" ]]; then + log_info "SparkApplication status: ${status} (controller is processing)" + break + elif [[ -n "${driver_pod}" ]]; then + log_info "Driver pod created (controller is processing, status not yet set)" + break + fi + log_info "Attempt ${attempt}/12: waiting for controller to set status or create driver pod..." + done + if [[ -z "${status}" && -z "${driver_pod}" ]]; then + log_error "SparkApplication not processed after 60s — controller may not be working" + errors=$((errors + 1)) + fi + + # Cleanup + kubectl delete sparkapplication spark-test -n "${user_ns}" --ignore-not-found &>/dev/null + + return "${errors}" +} + +verify_multi_ns() { + log_info "Verifying multi-NS spark-operator split (1 admin + 2 user namespaces)..." + local total_errors=0 + local ns_errors=0 + + echo "" + ns_errors=0 + verify_admin_ns || ns_errors=$? + total_errors=$((total_errors + ns_errors)) + + echo "" + ns_errors=0 + verify_user_ns "${USER_NAMESPACE_1}" || ns_errors=$? + total_errors=$((total_errors + ns_errors)) + + echo "" + ns_errors=0 + verify_user_ns "${USER_NAMESPACE_2}" || ns_errors=$? + total_errors=$((total_errors + ns_errors)) + + # --- Summary --- + echo "" + if [[ "${total_errors}" -eq 0 ]]; then + log_info "All multi-NS checks passed! (admin + 2 user namespaces, no conflicts)" + else + log_error "${total_errors} check(s) failed" + exit 1 + fi +} + delete_cluster() { log_info "Deleting Kind cluster '${CLUSTER_NAME}'..." kind delete cluster --name "${CLUSTER_NAME}" @@ -235,15 +691,19 @@ Commands: create Create Kind cluster only install Install MLRun CE chart (assumes cluster exists) full Create cluster and install chart (default) + multi-ns Multi-NS test: admin + 2 user namespaces with spark-operator split verify Verify installation delete Delete Kind cluster help Show this help message Environment variables: - CLUSTER_NAME Kind cluster name (default: mlrun-ce-test) - NAMESPACE Kubernetes namespace (default: mlrun) - RELEASE_NAME Helm release name (default: mlrun) - CLEANUP_ON_EXIT Delete cluster on script exit (default: false) + CLUSTER_NAME Kind cluster name (default: mlrun-ce-test) + NAMESPACE Kubernetes namespace (default: mlrun) + RELEASE_NAME Helm release name (default: mlrun) + ADMIN_NAMESPACE Admin namespace for multi-ns (default: mlrun-admin) + USER_NAMESPACE_1 First user namespace for multi-ns (default: mlrun-user1) + USER_NAMESPACE_2 Second user namespace for multi-ns (default: mlrun-user2) + CLEANUP_ON_EXIT Delete cluster on script exit (default: false) Examples: $0 full # Full test: create cluster + install @@ -275,6 +735,15 @@ main() { install_chart verify_installation ;; + multi-ns) + create_kind_cluster + setup_helm_repos + build_dependencies + install_admin_chart + install_user_chart "${USER_NAMESPACE_1}" "mlrun-user1" + install_user_chart "${USER_NAMESPACE_2}" "mlrun-user2" + verify_multi_ns + ;; verify) verify_installation ;;
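
For reference, here is a minimal sketch of the two-phase install this patch enables, run directly with Helm rather than through the Kind test. It assumes the chart is installed from the local charts/mlrun-ce directory with its dependencies already built, and uses mlrun-admin / mlrun-user1 purely as example namespace and release names (the patch itself does not fix these names):

# Sketch only: the admin phase installs the CRDs, the shared ClusterRole, and
# the mutating webhook; controller.replicas=0 keeps controller pods out of the
# admin namespace.
helm install mlrun-admin ./charts/mlrun-ce \
  --namespace mlrun-admin --create-namespace \
  -f ./charts/mlrun-ce/admin_installation_values.yaml

# Sketch only: each user phase installs a namespace-local controller, the
# sparkapp ServiceAccount, and the CE RoleBinding to the shared ClusterRole.
# spark-operator.spark.jobNamespaces must be overridden to the target
# namespace, as noted in non_admin_installation_values.yaml.
helm install mlrun-user1 ./charts/mlrun-ce \
  --namespace mlrun-user1 --create-namespace \
  -f ./charts/mlrun-ce/non_admin_installation_values.yaml \
  --set "spark-operator.spark.jobNamespaces[0]=mlrun-user1"

The `multi-ns` command added to tests/kind-test.sh automates the same flow (one admin plus two user namespaces) and then runs verify_multi_ns against the result.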