diff --git a/docs/deprecated_runbooks/KubeMacPoolDuplicateMacsFound.md b/docs/deprecated_runbooks/KubeMacPoolDuplicateMacsFound.md index f8a15ef1..62fbae28 100644 --- a/docs/deprecated_runbooks/KubeMacPoolDuplicateMacsFound.md +++ b/docs/deprecated_runbooks/KubeMacPoolDuplicateMacsFound.md @@ -4,6 +4,8 @@ This alert has been deprecated; It currently monitors VM MAC addresses instead of running VMI MACs, which can produce false positives. If triggered, it may be safely ignored and silenced. +Use the KubemacpoolMACCollisionDetected alert instead; it fires on MAC +collisions between running VMIs. ## Meaning diff --git a/docs/metrics.md b/docs/metrics.md index bd2b9ec9..500e19f1 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -14,6 +14,7 @@ They reflect and describe exactly what is being exposed. | [hostpath-provisioner](https://github.com/kubevirt/hostpath-provisioner/tree/main) | | [hostpath-provisioner-operator](https://github.com/kubevirt/hostpath-provisioner-operator/tree/main) | | [hyperconverged-cluster-operator](https://github.com/kubevirt/hyperconverged-cluster-operator/tree/main) | +| [kubemacpool](https://github.com/k8snetworkplumbingwg/kubemacpool/tree/main) | | [ssp-operator](https://github.com/kubevirt/ssp-operator/tree/main) | @@ -194,6 +195,8 @@ The following table contains all metrics from operators listed above. 
Each row r | hyperconverged-cluster-operator | `cluster:vmi_request_cpu_cores:sum` | Recording rule | Gauge | Sum of CPU core requests for all running virt-launcher VMIs across the entire KubeVirt cluster | | hyperconverged-cluster-operator | `cnv_abnormal` | Recording rule | Gauge | Monitors resources for potential problems | | hyperconverged-cluster-operator | `kubevirt_hyperconverged_operator_health_status` | Recording rule | Gauge | [Deprecated] Indicates whether HCO and its secondary resources health status is healthy (0), warning (1) or critical (2), based both on the firing alerts that impact the operator health, and on kubevirt_hco_system_health_status metric | +| kubemacpool | `kmp_mac_collisions` | Metric | Gauge | Count of running objects sharing the same MAC address (collision when > 1) | +| kubemacpool | `kubevirt_kmp_duplicate_macs` | Metric | Counter | [DEPRECATED] Total count of duplicate KubeMacPool MAC addresses. Use kmp_mac_collisions instead. | | ssp-operator | `kubevirt_ssp_common_templates_restored_total` | Metric | Counter | The total number of common templates restored by the operator back to their original state | | ssp-operator | `kubevirt_ssp_operator_reconcile_succeeded` | Metric | Gauge | Set to 1 if the reconcile process of all operands completes with no errors, and to 0 otherwise | | ssp-operator | `kubevirt_ssp_template_validator_rejected_total` | Metric | Counter | The total number of rejected template validators | diff --git a/docs/runbooks/KubemacpoolMACCollisionDetected.md b/docs/runbooks/KubemacpoolMACCollisionDetected.md new file mode 100644 index 00000000..d8ba141a --- /dev/null +++ b/docs/runbooks/KubemacpoolMACCollisionDetected.md @@ -0,0 +1,48 @@ +# KubemacpoolMACCollisionDetected + +## Meaning + +Multiple running workloads are using the same MAC address. + +## Impact + +MAC collisions cause network issues: packet loss, ARP table conflicts, and +traffic being delivered to the wrong targets. + +## Diagnosis + +1. 
Set the `KMP_NAMESPACE` environment variable: + + ```bash + $ export KMP_NAMESPACE="$(kubectl get pod -A --no-headers -l \ + control-plane=mac-controller-manager | awk '{print $1}')" + ``` + +2. Query the `kmp_mac_collisions` metric to see which MACs are colliding + (value > 1 means collision): + + ```bash + $ kubectl exec -n $KMP_NAMESPACE deployment/kubemacpool-mac-controller-manager \ + -c manager -- curl -s http://localhost:8080/metrics | grep kmp_mac_collisions + ``` + +3. For each colliding MAC, find the VMIs involved: + + ```bash + $ kubectl get vmi -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.status.interfaces[*].mac}{"\n"}{end}' | grep -i "<mac-address>" + ``` + +## Mitigation + +Remove the collision by deleting or reconfiguring one of the colliding +workloads to use a different MAC address. + + + +If you cannot resolve the issue, see the following resources: + +- [OKD Help](https://okd.io/docs/community/help/) +- [#virtualization Slack channel](https://kubernetes.slack.com/channels/virtualization) + diff --git a/tools/metricsdocs/config b/tools/metricsdocs/config index 37ce1191..269ab3aa 100644 --- a/tools/metricsdocs/config +++ b/tools/metricsdocs/config @@ -6,3 +6,4 @@ NMO_VERSION="master" HPPO_VERSION="main" HPP_VERSION="main" HCO_VERSION="main" +KMP_VERSION="main" diff --git a/tools/metricsdocs/metricsdocs.go b/tools/metricsdocs/metricsdocs.go index 20e61819..e89eeb8a 100644 --- a/tools/metricsdocs/metricsdocs.go +++ b/tools/metricsdocs/metricsdocs.go @@ -80,16 +80,12 @@ func parseArguments() *releaseData { log.Fatal("--config-file is a required argument") } - org := "kubevirt" - baseDir := fmt.Sprintf("%s/%s/", *cacheDir, org) - return &releaseData{ - org: org, - projects: createProjects(*configFile, baseDir, org), + projects: createProjects(*configFile, *cacheDir), } } -func createProjects(configFile string, baseDir string, org string) []*project { +func createProjects(configFile string, cacheDir string) []*project { config 
:= getConfig(configFile) var projects []*project @@ -103,10 +99,11 @@ func createProjects(configFile string, baseDir string, org string) []*project { projects = append(projects, &project{ short: info.short, name: info.name, + org: info.org, version: version, - repoDir: baseDir + info.name, - repoUrl: fmt.Sprintf("https://github.com/%s/%s.git", org, info.name), + repoDir: fmt.Sprintf("%s/%s/%s", cacheDir, info.org, info.name), + repoUrl: fmt.Sprintf("https://github.com/%s/%s.git", info.org, info.name), metricsDocPath: info.metricsDocPath, }) } @@ -246,16 +243,16 @@ func (r *releaseData) buildOperators(operatorOrder []string) []TemplateOperator } func (p *project) writeComponentMetrics() string { - resp, err := http.Get(fmt.Sprintf("https://api.github.com/repos/kubevirt/%s/releases/tags/%s", p.name, p.version)) + resp, err := http.Get(fmt.Sprintf("https://api.github.com/repos/%s/%s/releases/tags/%s", p.org, p.name, p.version)) if err == nil { defer resp.Body.Close() } if err != nil || resp.StatusCode != http.StatusOK { - return fmt.Sprintf("[%s](https://github.com/kubevirt/%s/tree/%s)", p.name, p.name, p.version) + return fmt.Sprintf("[%s](https://github.com/%s/%s/tree/%s)", p.name, p.org, p.name, p.version) } - return fmt.Sprintf("[%s - %s](https://github.com/kubevirt/%s/releases/tag/%s)", p.name, p.version, p.name, p.version) + return fmt.Sprintf("[%s - %s](https://github.com/%s/%s/releases/tag/%s)", p.name, p.version, p.org, p.name, p.version) } func readLines(path string) ([]string, error) { diff --git a/tools/metricsdocs/types.go b/tools/metricsdocs/types.go index a389a77e..f5de5c00 100644 --- a/tools/metricsdocs/types.go +++ b/tools/metricsdocs/types.go @@ -23,26 +23,31 @@ import ( "os" ) +const defaultOrg = "kubevirt" + var projectsInfo = []*projectInfo{ - {"KUBEVIRT", "kubevirt", "docs/observability/metrics.md"}, - {"CDI", "containerized-data-importer", "doc/metrics.md"}, - {"NETWORK_ADDONS", "cluster-network-addons-operator", "docs/metrics.md"}, - {"SSP", 
"ssp-operator", "docs/metrics.md"}, - {"NMO", "node-maintenance-operator", "docs/metrics.md"}, - {"HPPO", "hostpath-provisioner-operator", "docs/metrics.md"}, - {"HPP", "hostpath-provisioner", "docs/metrics.md"}, - {"HCO", "hyperconverged-cluster-operator", "docs/metrics.md"}, + {"KUBEVIRT", "kubevirt", defaultOrg, "docs/observability/metrics.md"}, + {"CDI", "containerized-data-importer", defaultOrg, "doc/metrics.md"}, + {"NETWORK_ADDONS", "cluster-network-addons-operator", defaultOrg, "docs/metrics.md"}, + {"SSP", "ssp-operator", defaultOrg, "docs/metrics.md"}, + {"NMO", "node-maintenance-operator", defaultOrg, "docs/metrics.md"}, + {"HPPO", "hostpath-provisioner-operator", defaultOrg, "docs/metrics.md"}, + {"HPP", "hostpath-provisioner", defaultOrg, "docs/metrics.md"}, + {"HCO", "hyperconverged-cluster-operator", defaultOrg, "docs/metrics.md"}, + {"KMP", "kubemacpool", "k8snetworkplumbingwg", "doc/metrics.md"}, } type projectInfo struct { short string name string + org string metricsDocPath string } type project struct { short string name string + org string version string repoDir string @@ -64,7 +69,6 @@ type TemplateOperator struct { } type releaseData struct { - org string projects []*project outFile *os.File