Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 27 additions & 7 deletions pkg/status/cell.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,12 @@
package status

import (
"context"
"fmt"
"log/slog"

"github.com/cilium/hive/cell"
"github.com/cilium/hive/job"
"github.com/cilium/statedb"
"github.com/spf13/pflag"

Expand Down Expand Up @@ -59,8 +62,8 @@ type statusParams struct {
cell.In

Lifecycle cell.Lifecycle
JobGroup job.Group
Logger *slog.Logger
Health cell.Health

Config Config
DaemonConfig *option.DaemonConfig
Expand Down Expand Up @@ -112,11 +115,30 @@ func newStatusCollector(params statusParams) StatusCollector {
statusCollector: newCollector(params.Logger, params.Config),
}

params.JobGroup.Add(job.OneShot("probes", func(ctx context.Context, health cell.Health) error {
params.Logger.Debug("Starting probes")
collector.statusCollector.StartProbes(collector.getProbes())
defer collector.statusCollector.Close()
params.Logger.Debug("Successfully started probes")

waitCtx, cancelWait := context.WithTimeout(ctx, params.Config.StatusCollectorProbeCheckTimeout)
defer cancelWait()

// Report health whether all probes have been executed at least once.
if err := collector.statusCollector.WaitForFirstRun(waitCtx); err != nil {
params.Logger.Debug("Not all probes successfully executed at least once")
return fmt.Errorf("not all probes successfully executed at least once: %w", err)
}

collector.allProbesInitialized = true

params.Logger.Debug("All probes executed at least once")

<-ctx.Done()
return nil
}))
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probes stopped prematurely on initial timeout error

High Severity

When WaitForFirstRun times out, returning an error causes the defer collector.statusCollector.Close() to execute immediately, stopping all running probes. The previous implementation kept probes running even when the initial check failed, only closing them during OnStop. Now, if any probe takes longer than StatusCollectorProbeCheckTimeout (default 5 minutes) for its first run, the entire status collector becomes non-functional because all probes are terminated.

Fix in Cursor Fix in Web

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Health parameter received but never used for reporting

Medium Severity

The health cell.Health parameter passed to the job.OneShot function is never used. The old code explicitly called probeCheckHealth.Degraded() on failure and probeCheckHealth.OK() on success to report probe initialization status. The new code receives the health parameter but doesn't call any health methods, so the probe check health is never reported to the hive health system. The comment "Report health whether all probes have been executed at least once" remains, but the actual health reporting was omitted.

Fix in Cursor Fix in Web


params.Lifecycle.Append(cell.Hook{
OnStart: func(ctx cell.HookContext) error {
collector.startStatusCollector()
return nil
},
OnStop: func(_ cell.HookContext) error {
// If the KVstore state is not OK, print help for user.
if collector.statusResponse.Kvstore != nil &&
Expand All @@ -131,9 +153,7 @@ func newStatusCollector(params statusParams) StatusCollector {
logfields.Status, collector.statusResponse.Kvstore.Msg,
logfields.HelpMessage, helpMsg,
)

}
collector.statusCollector.Close()
return nil
},
})
Expand Down
25 changes: 0 additions & 25 deletions pkg/status/status_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -1186,28 +1186,3 @@ func (d *statusCollector) getProbes() []Probe {
},
}
}

func (d *statusCollector) startStatusCollector() {
d.statusParams.Logger.Debug("Starting probes")
d.statusCollector.StartProbes(d.getProbes())
d.statusParams.Logger.Debug("Successfully started probes")

go func() {
ctx, cancel := context.WithTimeout(context.Background(), d.statusCollector.config.StatusCollectorProbeCheckTimeout)
defer cancel()

probeCheckHealth := d.statusParams.Health.NewScope("probe-check")

// Report health whether all probes have been executed at least once.
if err := d.statusCollector.WaitForFirstRun(ctx); err != nil {
probeCheckHealth.Degraded("Not all probes successfully executed at least once", err)
d.statusParams.Logger.Debug("Not all probes successfully executed at least once")
return
}

d.allProbesInitialized = true

probeCheckHealth.OK("All probes executed at least once")
d.statusParams.Logger.Debug("All probes executed at least once")
}()
}