From 031e00576319b50843da80c3c2121c8fe6fc34f3 Mon Sep 17 00:00:00 2001 From: Ram Lavi Date: Sun, 1 Feb 2026 10:34:17 +0200 Subject: [PATCH 1/3] metricsdocs: Support per-project GitHub org Move the GitHub org from a hardcoded global "kubevirt" string to a per-project field on projectInfo and project structs. All existing projects use the new defaultOrg constant ("kubevirt"), so there is no change in behavior. This prepares for adding projects hosted outside the kubevirt org. Assisted-by: Claude Sonnet 4.6 Signed-off-by: Ram Lavi --- tools/metricsdocs/metricsdocs.go | 19 ++++++++----------- tools/metricsdocs/types.go | 21 ++++++++++++--------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/tools/metricsdocs/metricsdocs.go b/tools/metricsdocs/metricsdocs.go index 20e61819..e89eeb8a 100644 --- a/tools/metricsdocs/metricsdocs.go +++ b/tools/metricsdocs/metricsdocs.go @@ -80,16 +80,12 @@ func parseArguments() *releaseData { log.Fatal("--config-file is a required argument") } - org := "kubevirt" - baseDir := fmt.Sprintf("%s/%s/", *cacheDir, org) - return &releaseData{ - org: org, - projects: createProjects(*configFile, baseDir, org), + projects: createProjects(*configFile, *cacheDir), } } -func createProjects(configFile string, baseDir string, org string) []*project { +func createProjects(configFile string, cacheDir string) []*project { config := getConfig(configFile) var projects []*project @@ -103,10 +99,11 @@ func createProjects(configFile string, baseDir string, org string) []*project { projects = append(projects, &project{ short: info.short, name: info.name, + org: info.org, version: version, - repoDir: baseDir + info.name, - repoUrl: fmt.Sprintf("https://github.com/%s/%s.git", org, info.name), + repoDir: fmt.Sprintf("%s/%s/%s", cacheDir, info.org, info.name), + repoUrl: fmt.Sprintf("https://github.com/%s/%s.git", info.org, info.name), metricsDocPath: info.metricsDocPath, }) } @@ -246,16 +243,16 @@ func (r *releaseData) buildOperators(operatorOrder 
[]string) []TemplateOperator } func (p *project) writeComponentMetrics() string { - resp, err := http.Get(fmt.Sprintf("https://api.github.com/repos/kubevirt/%s/releases/tags/%s", p.name, p.version)) + resp, err := http.Get(fmt.Sprintf("https://api.github.com/repos/%s/%s/releases/tags/%s", p.org, p.name, p.version)) if err == nil { defer resp.Body.Close() } if err != nil || resp.StatusCode != http.StatusOK { - return fmt.Sprintf("[%s](https://github.com/kubevirt/%s/tree/%s)", p.name, p.name, p.version) + return fmt.Sprintf("[%s](https://github.com/%s/%s/tree/%s)", p.name, p.org, p.name, p.version) } - return fmt.Sprintf("[%s - %s](https://github.com/kubevirt/%s/releases/tag/%s)", p.name, p.version, p.name, p.version) + return fmt.Sprintf("[%s - %s](https://github.com/%s/%s/releases/tag/%s)", p.name, p.version, p.org, p.name, p.version) } func readLines(path string) ([]string, error) { diff --git a/tools/metricsdocs/types.go b/tools/metricsdocs/types.go index a389a77e..bed3cd98 100644 --- a/tools/metricsdocs/types.go +++ b/tools/metricsdocs/types.go @@ -23,26 +23,30 @@ import ( "os" ) +const defaultOrg = "kubevirt" + var projectsInfo = []*projectInfo{ - {"KUBEVIRT", "kubevirt", "docs/observability/metrics.md"}, - {"CDI", "containerized-data-importer", "doc/metrics.md"}, - {"NETWORK_ADDONS", "cluster-network-addons-operator", "docs/metrics.md"}, - {"SSP", "ssp-operator", "docs/metrics.md"}, - {"NMO", "node-maintenance-operator", "docs/metrics.md"}, - {"HPPO", "hostpath-provisioner-operator", "docs/metrics.md"}, - {"HPP", "hostpath-provisioner", "docs/metrics.md"}, - {"HCO", "hyperconverged-cluster-operator", "docs/metrics.md"}, + {"KUBEVIRT", "kubevirt", defaultOrg, "docs/observability/metrics.md"}, + {"CDI", "containerized-data-importer", defaultOrg, "doc/metrics.md"}, + {"NETWORK_ADDONS", "cluster-network-addons-operator", defaultOrg, "docs/metrics.md"}, + {"SSP", "ssp-operator", defaultOrg, "docs/metrics.md"}, + {"NMO", "node-maintenance-operator", defaultOrg, 
"docs/metrics.md"}, + {"HPPO", "hostpath-provisioner-operator", defaultOrg, "docs/metrics.md"}, + {"HPP", "hostpath-provisioner", defaultOrg, "docs/metrics.md"}, + {"HCO", "hyperconverged-cluster-operator", defaultOrg, "docs/metrics.md"}, } type projectInfo struct { short string name string + org string metricsDocPath string } type project struct { short string name string + org string version string repoDir string @@ -64,7 +68,6 @@ type TemplateOperator struct { } type releaseData struct { - org string projects []*project outFile *os.File From 09d98b1cca78b9bf51dab91428e3d10b4b97c26c Mon Sep 17 00:00:00 2001 From: Ram Lavi Date: Tue, 17 Feb 2026 13:50:03 +0200 Subject: [PATCH 2/3] metricsdocs: Add kubemacpool metrics from k8snetworkplumbingwg org Add kubemacpool as a tracked project in the metrics documentation generator. Since kubemacpool is hosted under the k8snetworkplumbingwg GitHub org (not kubevirt), this uses the per-project org support introduced in the previous commit. Regenerate docs/metrics.md to include the new KMP metrics. Assisted-by: Claude Sonnet 4.6 Signed-off-by: Ram Lavi --- docs/metrics.md | 3 +++ tools/metricsdocs/config | 1 + tools/metricsdocs/types.go | 1 + 3 files changed, 5 insertions(+) diff --git a/docs/metrics.md b/docs/metrics.md index bd2b9ec9..500e19f1 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -14,6 +14,7 @@ They reflect and describe exactly what is being exposed. | [hostpath-provisioner](https://github.com/kubevirt/hostpath-provisioner/tree/main) | | [hostpath-provisioner-operator](https://github.com/kubevirt/hostpath-provisioner-operator/tree/main) | | [hyperconverged-cluster-operator](https://github.com/kubevirt/hyperconverged-cluster-operator/tree/main) | +| [kubemacpool](https://github.com/k8snetworkplumbingwg/kubemacpool/tree/main) | | [ssp-operator](https://github.com/kubevirt/ssp-operator/tree/main) | @@ -194,6 +195,8 @@ The following table contains all metrics from operators listed above. 
Each row r | hyperconverged-cluster-operator | `cluster:vmi_request_cpu_cores:sum` | Recording rule | Gauge | Sum of CPU core requests for all running virt-launcher VMIs across the entire KubeVirt cluster | | hyperconverged-cluster-operator | `cnv_abnormal` | Recording rule | Gauge | Monitors resources for potential problems | | hyperconverged-cluster-operator | `kubevirt_hyperconverged_operator_health_status` | Recording rule | Gauge | [Deprecated] Indicates whether HCO and its secondary resources health status is healthy (0), warning (1) or critical (2), based both on the firing alerts that impact the operator health, and on kubevirt_hco_system_health_status metric | +| kubemacpool | `kmp_mac_collisions` | Metric | Gauge | Count of running objects sharing the same MAC address (collision when > 1) | +| kubemacpool | `kubevirt_kmp_duplicate_macs` | Metric | Counter | [DEPRECATED] Total count of duplicate KubeMacPool MAC addresses. Use kmp_mac_collisions instead. | | ssp-operator | `kubevirt_ssp_common_templates_restored_total` | Metric | Counter | The total number of common templates restored by the operator back to their original state | | ssp-operator | `kubevirt_ssp_operator_reconcile_succeeded` | Metric | Gauge | Set to 1 if the reconcile process of all operands completes with no errors, and to 0 otherwise | | ssp-operator | `kubevirt_ssp_template_validator_rejected_total` | Metric | Counter | The total number of rejected template validators | diff --git a/tools/metricsdocs/config b/tools/metricsdocs/config index 37ce1191..269ab3aa 100644 --- a/tools/metricsdocs/config +++ b/tools/metricsdocs/config @@ -6,3 +6,4 @@ NMO_VERSION="master" HPPO_VERSION="main" HPP_VERSION="main" HCO_VERSION="main" +KMP_VERSION="main" diff --git a/tools/metricsdocs/types.go b/tools/metricsdocs/types.go index bed3cd98..f5de5c00 100644 --- a/tools/metricsdocs/types.go +++ b/tools/metricsdocs/types.go @@ -34,6 +34,7 @@ var projectsInfo = []*projectInfo{ {"HPPO", 
"hostpath-provisioner-operator", defaultOrg, "docs/metrics.md"}, {"HPP", "hostpath-provisioner", defaultOrg, "docs/metrics.md"}, {"HCO", "hyperconverged-cluster-operator", defaultOrg, "docs/metrics.md"}, + {"KMP", "kubemacpool", "k8snetworkplumbingwg", "doc/metrics.md"}, } type projectInfo struct { From 109eacd16621d63f55db72b7e6cbc4da43e05772 Mon Sep 17 00:00:00 2001 From: Ram Lavi Date: Tue, 13 Jan 2026 14:28:31 +0200 Subject: [PATCH 3/3] Introduce KubemacpoolMACCollisionDetected alert Runbook This commit introduces a new alert for running objects MAC collision. It also references the new alert on the deprecated alert. Signed-off-by: Ram Lavi --- .../KubeMacPoolDuplicateMacsFound.md | 2 + .../KubemacpoolMACCollisionDetected.md | 48 +++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 docs/runbooks/KubemacpoolMACCollisionDetected.md diff --git a/docs/deprecated_runbooks/KubeMacPoolDuplicateMacsFound.md b/docs/deprecated_runbooks/KubeMacPoolDuplicateMacsFound.md index f8a15ef1..62fbae28 100644 --- a/docs/deprecated_runbooks/KubeMacPoolDuplicateMacsFound.md +++ b/docs/deprecated_runbooks/KubeMacPoolDuplicateMacsFound.md @@ -4,6 +4,8 @@ This alert has been deprecated; It currently monitors VM MAC addresses instead of running VMI MACs, which can produce false positives. If triggered, it may be safely ignored and silenced. +Instead use KubemacpoolMACCollisionDetected alert that fires for running VMI +MAC collisions. ## Meaning diff --git a/docs/runbooks/KubemacpoolMACCollisionDetected.md b/docs/runbooks/KubemacpoolMACCollisionDetected.md new file mode 100644 index 00000000..d8ba141a --- /dev/null +++ b/docs/runbooks/KubemacpoolMACCollisionDetected.md @@ -0,0 +1,48 @@ +# KubemacpoolMACCollisionDetected + +## Meaning + +Multiple running workloads are using the same MAC address. + +## Impact + +MAC collisions cause network issues: packet loss, ARP table conflicts, and +traffic being delivered to the wrong targets. + +## Diagnosis + +1. 
Set the `KMP_NAMESPACE` environment variable: + + ```bash + $ export KMP_NAMESPACE="$(kubectl get pod -A --no-headers -l \ + control-plane=mac-controller-manager | awk '{print $1}')" + ``` + +2. Query the `kmp_mac_collisions` metric to see which MACs are colliding + (value > 1 means collision): + + ```bash + $ kubectl exec -n $KMP_NAMESPACE deployment/kubemacpool-mac-controller-manager \ + -c manager -- curl -s http://localhost:8080/metrics | grep kmp_mac_collisions + ``` + +3. For each colliding MAC, find the VMIs involved (replace `<MAC_ADDRESS>` with the colliding MAC): + + ```bash + $ kubectl get vmi -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.status.interfaces[*].mac}{"\n"}{end}' | grep -i "<MAC_ADDRESS>" + ``` + +## Mitigation + +Remove the collision by deleting or reconfiguring one of the colliding +workloads to use a different MAC address. + + + +If you cannot resolve the issue, see the following resources: + +- [OKD Help](https://okd.io/docs/community/help/) +- [#virtualization Slack channel](https://kubernetes.slack.com/channels/virtualization) +