@@ -44,8 +44,10 @@ const (
4444 DefaultTimeToSync = 5 * time .Minute
4545)
4646
47- // NewPlugin creates a consensus reporting plugin for shard orchestration
4847func NewPlugin (store * Store , arbiterScaler pb.ArbiterScalerClient , config ocr3types.ReportingPluginConfig , lggr logger.Logger , cfg * ConsensusConfig ) (* Plugin , error ) {
48+ if arbiterScaler == nil {
49+ return nil , errors .New ("RingOCR arbiterScaler is required" )
50+ }
4951 if cfg == nil {
5052 cfg = & ConsensusConfig {
5153 BatchSize : DefaultBatchSize ,
@@ -86,20 +88,14 @@ func (p *Plugin) Observation(ctx context.Context, _ ocr3types.OutcomeContext, _
8688 var wantShards uint32
8789 shardStatus := make (map [uint32 ]* pb.ShardStatus )
8890
89- if p .arbiterScaler != nil {
90- status , err := p .arbiterScaler .Status (ctx , & emptypb.Empty {})
91- if err != nil {
92- p .lggr .Warnw ("failed to get arbiter scaler status" , "error" , err )
93- } else {
94- wantShards = status .WantShards
95- shardStatus = status .Status
96- }
91+ status , err := p .arbiterScaler .Status (ctx , & emptypb.Empty {})
92+ if err != nil {
93+ p .lggr .Warnw ("RingOCR failed to get arbiter scaler status" , "error" , err )
94+ wantShards = 0
95+ shardStatus = make (map [uint32 ]* pb.ShardStatus )
9796 } else {
98- // Fallback to store if no arbiter scaler configured
99- shardHealth := p .store .GetShardHealth ()
100- for shardID , healthy := range shardHealth {
101- shardStatus [shardID ] = & pb.ShardStatus {IsHealthy : healthy }
102- }
97+ wantShards = status .WantShards
98+ shardStatus = status .Status
10399 }
104100
105101 allWorkflowIDs := make ([]string , 0 )
@@ -108,9 +104,11 @@ func (p *Plugin) Observation(ctx context.Context, _ ocr3types.OutcomeContext, _
108104 }
109105
110106 pendingAllocs := p .store .GetPendingAllocations ()
111- allWorkflowIDs = append ( allWorkflowIDs , pendingAllocs ... )
107+ p . lggr . Infow ( "RingOCR Observation pending allocations" , " pendingAllocs" , pendingAllocs )
112108
109+ allWorkflowIDs = append (allWorkflowIDs , pendingAllocs ... )
113110 allWorkflowIDs = uniqueSorted (allWorkflowIDs )
111+ p .lggr .Infow ("RingOCR Observation all workflow IDs unique" , "allWorkflowIDs" , allWorkflowIDs , "wantShards" , wantShards )
114112
115113 observation := & pb.Observation {
116114 ShardStatus : shardStatus ,
@@ -122,23 +120,30 @@ func (p *Plugin) Observation(ctx context.Context, _ ocr3types.OutcomeContext, _
122120 return proto.MarshalOptions {Deterministic : true }.Marshal (observation )
123121}
124122
125- //coverage:ignore
126- func (p * Plugin ) ValidateObservation (_ context.Context , _ ocr3types.OutcomeContext , _ types.Query , _ types.AttributedObservation ) error {
123+ func (p * Plugin ) ValidateObservation (_ context.Context , _ ocr3types.OutcomeContext , _ types.Query , ao types.AttributedObservation ) error {
124+ observation := & pb.Observation {}
125+ if err := proto .Unmarshal (ao .Observation , observation ); err != nil {
126+ return err
127+ }
128+ if observation .Now == nil {
129+ return errors .New ("observation missing timestamp" )
130+ }
131+ if observation .WantShards == 0 {
132+ return errors .New ("observation missing WantShards" )
133+ }
127134 return nil
128135}
129136
130137func (p * Plugin ) ObservationQuorum (_ context.Context , _ ocr3types.OutcomeContext , _ types.Query , aos []types.AttributedObservation ) (quorumReached bool , err error ) {
131138 return quorumhelper .ObservationCountReachesObservationQuorum (quorumhelper .QuorumTwoFPlusOne , p .config .N , p .config .F , aos ), nil
132139}
133140
134- func (p * Plugin ) collectShardInfo (aos []types.AttributedObservation ) (shardHealth map [uint32 ]int , workflows []string , timestamps []time.Time , wantShardVotes [ ]uint32 ) {
141+ func (p * Plugin ) collectShardInfo (aos []types.AttributedObservation ) (shardHealth map [uint32 ]int , workflows []string , timestamps []time.Time , wantShardVotes map [commontypes. OracleID ]uint32 ) {
135142 shardHealth = make (map [uint32 ]int )
143+ wantShardVotes = make (map [commontypes.OracleID ]uint32 )
136144 for _ , ao := range aos {
137145 observation := & pb.Observation {}
138- if err := proto .Unmarshal (ao .Observation , observation ); err != nil {
139- p .lggr .Warnf ("failed to unmarshal observation: %v" , err )
140- continue
141- }
146+ _ = proto .Unmarshal (ao .Observation , observation ) // validated in ValidateObservation
142147
143148 for shardID , status := range observation .ShardStatus {
144149 if status != nil && status .IsHealthy {
@@ -147,14 +152,9 @@ func (p *Plugin) collectShardInfo(aos []types.AttributedObservation) (shardHealt
147152 }
148153
149154 workflows = append (workflows , observation .WorkflowIds ... )
155+ timestamps = append (timestamps , observation .Now .AsTime ())
150156
151- if observation .Now != nil {
152- timestamps = append (timestamps , observation .Now .AsTime ())
153- }
154-
155- if observation .WantShards > 0 {
156- wantShardVotes = append (wantShardVotes , observation .WantShards )
157- }
157+ wantShardVotes [ao .Observer ] = observation .WantShards
158158 }
159159 return shardHealth , workflows , timestamps , wantShardVotes
160160}
@@ -183,6 +183,7 @@ func (p *Plugin) Outcome(_ context.Context, outctx ocr3types.OutcomeContext, _ t
183183 }
184184
185185 currentShardHealth , allWorkflows , nows , wantShardVotes := p .collectShardInfo (aos )
186+ p .lggr .Infow ("RingOCR Outcome collect shard info" , "currentShardHealth" , currentShardHealth , "wantShardVotes" , wantShardVotes )
186187
187188 // Need at least F+1 timestamps; fewer means >F faulty nodes and we can't trust this round
188189 if len (nows ) < p .config .F + 1 {
@@ -193,18 +194,13 @@ func (p *Plugin) Outcome(_ context.Context, outctx ocr3types.OutcomeContext, _ t
193194 // Use the median timestamp to determine the current time
194195 now := nows [len (nows )/ 2 ]
195196
196- // Use median for wantShards consensus; fall back to current state if insufficient votes
197- var wantShards uint32
198- if len (wantShardVotes ) >= p .config .F + 1 {
199- slices .Sort (wantShardVotes )
200- wantShards = wantShardVotes [len (wantShardVotes )/ 2 ]
201- } else if rs := prior .State .GetRoutableShards (); rs > 0 {
202- wantShards = rs
203- } else if tr := prior .State .GetTransition (); tr != nil {
204- wantShards = tr .WantShards
205- } else {
206- wantShards = 1 // ultimate fallback
197+ // Use median for wantShards consensus (all validated observations have WantShards > 0)
198+ votes := make ([]uint32 , 0 , len (wantShardVotes ))
199+ for _ , v := range wantShardVotes {
200+ votes = append (votes , v )
207201 }
202+ slices .Sort (votes )
203+ wantShards := votes [len (votes )/ 2 ]
208204
209205 allWorkflows = uniqueSorted (allWorkflows )
210206
@@ -217,20 +213,23 @@ func (p *Plugin) Outcome(_ context.Context, outctx ocr3types.OutcomeContext, _ t
217213
218214 // Deterministic hashing ensures all nodes agree on workflow-to-shard assignments
219215 // without coordination, preventing protocol failures from inconsistent routing
216+ ring := newShardRing (healthyShards )
220217 routes := make (map [string ]* pb.WorkflowRoute )
221218 for _ , wfID := range allWorkflows {
222- assignedShard := getShardForWorkflow (wfID , healthyShards )
223- routes [wfID ] = & pb.WorkflowRoute {
224- Shard : assignedShard ,
219+ shard , err := locateShard (ring , wfID )
220+ if err != nil {
221+ p .lggr .Warnw ("RingOCR failed to locate shard for workflow" , "workflowID" , wfID , "error" , err )
222+ shard = 0 // fallback to shard 0 when no healthy shards
225223 }
224+ routes [wfID ] = & pb.WorkflowRoute {Shard : shard }
226225 }
227226
228227 outcome := & pb.Outcome {
229228 State : nextState ,
230229 Routes : routes ,
231230 }
232231
233- p .lggr .Infow ("Consensus Outcome" , "healthyShards" , len (healthyShards ), "totalObservations" , len (aos ), "workflowCount" , len (routes ))
232+ p .lggr .Infow ("RingOCR Outcome" , "healthyShards" , len (healthyShards ), "totalObservations" , len (aos ), "workflowCount" , len (routes ))
234233
235234 return proto.MarshalOptions {Deterministic : true }.Marshal (outcome )
236235}
0 commit comments