From 2fbe283d05e6082ca3758512bdac31fd1a5234e2 Mon Sep 17 00:00:00 2001 From: Calvin Zachman Date: Wed, 12 Nov 2025 17:20:00 -0500 Subject: [PATCH 1/6] switchrpc: add AttemptStore access This permits the Switch RPC server to manage the state of attempts. --- htlcswitch/switch.go | 5 +++++ lnrpc/switchrpc/config_active.go | 4 ++++ lnrpc/switchrpc/driver.go | 4 ++++ subrpcserver_config.go | 4 ++++ 4 files changed, 17 insertions(+) diff --git a/htlcswitch/switch.go b/htlcswitch/switch.go index c80a280f412..8b7a26e2076 100644 --- a/htlcswitch/switch.go +++ b/htlcswitch/switch.go @@ -561,6 +561,11 @@ func (s *Switch) CleanStore(keepPids map[uint64]struct{}) error { return s.attemptStore.CleanStore(keepPids) } +// AttemptStore provides access to the Switch's underlying attempt store. +func (s *Switch) AttemptStore() AttemptStore { + return s.attemptStore +} + // SendHTLC is used by other subsystems which aren't belong to htlc switch // package in order to send the htlc update. The attemptID used MUST be unique // for this HTLC, and MUST be used only once, otherwise the switch might reject diff --git a/lnrpc/switchrpc/config_active.go b/lnrpc/switchrpc/config_active.go index 851461d2a4a..af3691f0bc9 100644 --- a/lnrpc/switchrpc/config_active.go +++ b/lnrpc/switchrpc/config_active.go @@ -36,6 +36,10 @@ type Config struct { // be dispatched. HtlcDispatcher routing.PaymentAttemptDispatcher + // AttemptStore provides the means by which the RPC server can manage + // the state of an HTLC attempt, including initializing and failing it. + AttemptStore htlcswitch.AttemptStore + // ChannelInfoAccessor defines an interface for accessing channel // information necessary for dispatching payment attempts, specifically // methods for fetching links by public key. diff --git a/lnrpc/switchrpc/driver.go b/lnrpc/switchrpc/driver.go index c8ceb03e687..6e43de38075 100644 --- a/lnrpc/switchrpc/driver.go +++ b/lnrpc/switchrpc/driver.go @@ -49,6 +49,10 @@ func createNewSubServer(configRegistry lnrpc.SubServerConfigDispatcher) ( return nil, nil, fmt.Errorf("route processor must be set to " + "create SwitchRPC") } + if config.AttemptStore == nil { + return nil, nil, fmt.Errorf("attempt store must be set to " + + "create SwitchRPC") + } return New(config) } diff --git a/subrpcserver_config.go b/subrpcserver_config.go index 4350f67c240..7c1cda4aefd 100644 --- a/subrpcserver_config.go +++ b/subrpcserver_config.go @@ -331,6 +331,10 @@ func (s *subRPCServerConfigs) PopulateDependencies(cfg *Config, reflect.ValueOf(routerBackend), ) + subCfgValue.FieldByName("AttemptStore").Set( + reflect.ValueOf(htlcSwitch.AttemptStore()), + ) + case *watchtowerrpc.Config: subCfgValue := extractReflectValue(subCfg) From 957b3162782bc6eda935dd92512014840723b682 Mon Sep 17 00:00:00 2001 From: Calvin Zachman Date: Tue, 16 Dec 2025 14:56:14 -0600 Subject: [PATCH 2/6] switchrpc: make external htlc dispatch (SendOnion) idempotent Update SendOnion rpc implementation to make use of our new Switch attempt store primitives for idempotence. This will allow SendOnion to reject processing the same htlc attempt id more than once - providing strong assurances to callers that the endpoint is safe to retry when they encounter grpc client and network related failures (eg: timeouts, service unavaiable errors, etc.) Additionally, to guarantee a definitive outcome and prevent orphaned attempts, the handler synchronously transitions an initialized (PENDING) attempt to FAILED if subsequent validation or dispatch fails. This prevents indefinite hangs for TrackOnion queries. --- itest/lnd_sendonion_test.go | 22 ++- lnrpc/switchrpc/mock.go | 19 ++ lnrpc/switchrpc/switch_server.go | 245 +++++++++++++++++++++----- lnrpc/switchrpc/switch_server_test.go | 16 +- 4 files changed, 246 insertions(+), 56 deletions(-) diff --git a/itest/lnd_sendonion_test.go b/itest/lnd_sendonion_test.go index bff161eeaaa..1d5e2adfc04 100644 --- a/itest/lnd_sendonion_test.go +++ b/itest/lnd_sendonion_test.go @@ -1,6 +1,8 @@ package itest import ( + "context" + "github.com/btcsuite/btcd/btcec/v2" "github.com/btcsuite/btcd/btcutil" sphinx "github.com/lightningnetwork/lightning-onion" @@ -9,9 +11,12 @@ import ( "github.com/lightningnetwork/lnd/lnrpc/invoicesrpc" "github.com/lightningnetwork/lnd/lnrpc/switchrpc" "github.com/lightningnetwork/lnd/lntest" + "github.com/lightningnetwork/lnd/lntest/rpc" "github.com/lightningnetwork/lnd/lntypes" "github.com/lightningnetwork/lnd/lnwire" "github.com/stretchr/testify/require" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" ) // testSendOnion tests the basic success case for the SendOnion RPC. It @@ -178,13 +183,18 @@ func testSendOnionTwice(ht *lntest.HarnessTest) { // While the first onion is still in-flight, we'll send the same onion // again with the same attempt ID. This should error as our Switch will // detect duplicate ADDs for *in-flight* HTLCs. - resp = alice.RPC.SendOnion(sendReq) - ht.Logf("SendOnion resp: %+v, code: %v", resp, resp.ErrorCode) - require.False(ht, resp.Success, "expected failure on onion send") - require.Equal(ht, resp.ErrorCode, - switchrpc.ErrorCode_DUPLICATE_HTLC, + ctxt, cancel := context.WithTimeout(context.Background(), + rpc.DefaultTimeout) + defer cancel() + + _, err := alice.RPC.Switch.SendOnion(ctxt, sendReq) + require.Error(ht, err, "expected failure on onion send") + + // Check that we get the expected gRPC error. + s, ok := status.FromError(err) + require.True(ht, ok, "expected gRPC status error") + require.Equal(ht, codes.AlreadyExists, s.Code(), "unexpected error code") - require.Equal(ht, resp.ErrorMessage, htlcswitch.ErrDuplicateAdd.Error()) // Dave settles the invoice. dave.RPC.SettleInvoice(preimage[:]) diff --git a/lnrpc/switchrpc/mock.go b/lnrpc/switchrpc/mock.go index 0fc66c849d1..95d727f60d9 100644 --- a/lnrpc/switchrpc/mock.go +++ b/lnrpc/switchrpc/mock.go @@ -72,3 +72,22 @@ func (m *mockRouteProcessor) UnmarshallRoute(route *lnrpc.Route) ( return m.unmarshallRoute, m.unmarshallErr } + +// mockAttemptStore is a mock implementation of the AttemptStore interface. +type mockAttemptStore struct { + htlcswitch.AttemptStore + initErr error + failErr error +} + +// InitAttempt returns the mocked initErr. +func (m *mockAttemptStore) InitAttempt(attemptID uint64) error { + return m.initErr +} + +// FailPendingAttempt returns the mocked failErr. +func (m *mockAttemptStore) FailPendingAttempt(attemptID uint64, + reason *htlcswitch.LinkError) error { + + return m.failErr +} diff --git a/lnrpc/switchrpc/switch_server.go b/lnrpc/switchrpc/switch_server.go index 2f54a11908e..b72506da316 100644 --- a/lnrpc/switchrpc/switch_server.go +++ b/lnrpc/switchrpc/switch_server.go @@ -236,39 +236,209 @@ func (r *ServerShell) CreateSubServer( return subServer, macPermissions, nil } -// SendOnion handles the incoming request to send a payment using a -// preconstructed onion blob provided by the caller. +// SendOnion provides an idempotent API for dispatching a pre-formed onion +// packet. This RPC is the primary entry point for a remote router that wishes +// to forward a payment through this lnd instance. +// +// To safely handle network failures, a client can and should retry this RPC +// after a timeout or disconnection. Retries MUST use the exact same +// attempt id to allow the server to correctly detect duplicate requests. +// +// The server guarantees safety against duplicate attempts by following a +// "write-ahead" style approach. Upon receiving a request, it first writes a +// durable record of the intent to dispatch the payment, marking the attempt as +// PENDING. Only after this record is secured does it proceed with validation +// and dispatch. If any of these subsequent steps fail, the server synchronously +// transitions the PENDING record to a final FAILED state, ensuring attempts are +// not "orphaned" in an initialized but not actually dispatched state. +// +// A client interacting with this RPC must handle four distinct categories of +// outcomes: +// +// 1. SUCCESS: This is a definitive confirmation that the HTLC has been +// successfully dispatched and is now in-flight. The client can proceed to track +// the payment's final result via the `TrackOnion` RPC. +// +// 2. DUPLICATE ACKNOWLEDGMENT: This is not a payment-level failure. It is a +// definitive acknowledgment that a request with the same attempt id has already +// been successfully processed. A retrying client should interpret this as a +// success for the attempt and proceed to tracking the payment's result. +// +// 3. AMBIGUOUS FAILURE (e.g, gRPC `codes.Unavailable` status): +// This error is returned if the server fails during the critical initial step +// of writing the PENDING record. In this state, the server cannot be certain +// whether the HTLC was dispatched. The client MUST retry the *exact same +// request* with the original attempt id to resolve this ambiguity. A client +// MUST NOT treat this error as a definitive failure and move on to a new +// attempt ID; doing so is the definition of a "leaked attempt" and risks a +// duplicate payment if the original ambiguous attempt succeeded. +// +// 4. DEFINITIVE FAILURE (any other error): For all other failures (e.g., +// invalid parameters, local policy violations), this RPC will return a +// definitive error. A definitive failure is a guarantee that the HTLC was not +// and will not be dispatched. To prevent orphaned records, the server is +// responsible for transitioning the PENDING attempt to a FAILED state. func (s *Server) SendOnion(_ context.Context, req *SendOnionRequest) (*SendOnionResponse, error) { + // First, we initialize the attempt in our persistent store. This serves + // as a durable record of our intent to send and gates the attempt id + // for concurrent callers, allowing them to safely retry. + // + // NOTE: This durable write MUST happen before any dynamic validation + // (e.g., liquidity, peer connectivity). This ordering guarantees that a + // client's retry of a timed-out request will receive a stable acknowle- + // dgement as duplicate, rather than a new, misleading transient error. + // This prevents the client from incorrectly concluding the original + // attempt failed, which would risk a duplicate attempt. + attemptID := req.AttemptId + err := s.cfg.AttemptStore.InitAttempt(attemptID) + if err != nil { + // A record for this attempt ID already exists. This indicates a + // client-side retry of a request that was already successfully + // registered. We return a distinct error to signal that the + // dispatch request is acknowledged and that the caller can + // safely proceed to track the result. + if errors.Is(err, htlcswitch.ErrPaymentIDAlreadyExists) { + log.Debugf("Attempt id=%v already exists", attemptID) + + return nil, status.Errorf(codes.AlreadyExists, + "payment with attempt ID %d already exists", + attemptID) + } + + // If we receive an initialization error, we'll return the error + // directly to the caller so they can handle the ambiguity. + // + // TODO(calvin): actually transport the error signal across the + // rpc boundary possibly using grpc st.WithDetails(). + log.Errorf("Unable to initialize attempt id=%d: %v", attemptID, + err) + + return nil, status.Errorf(codes.Unavailable, "unable to "+ + "initialize attempt id=%d: %v", attemptID, err) + } + + // Perform all RPC-level pre-dispatch checks. + chanID, htlcAdd, validationErr := s.validateAndPrepareOnion(req) + + // If request validation fails, we transition the attempt from an + // initialized to a terminally failed state to prevent an orphaned + // attempt. An orphaned PENDING record would cause any subsequent + // TrackOnion call for this attempt to hang indefinitely. + if validationErr != nil { + log.Warnf("Validation failed for attempt %d: %v. Failing "+ + "pending attempt.", req.AttemptId, validationErr) + + s.failPendingAttempt(req.AttemptId, "validation failure") + + // Return the original, more specific validation error to the + // client. + return nil, validationErr + } + + log.Debugf("Dispatching HTLC attempt(id=%v, amt=%v) for payment=%x "+ + "via channel=%s", req.AttemptId, req.Amount, + htlcAdd.PaymentHash, chanID) + + // With the attempt initialized, we now dispatch the HTLC. + dispatchErr := s.cfg.HtlcDispatcher.SendHTLC(chanID, attemptID, htlcAdd) + if dispatchErr != nil { + // The core dispatch logic failed definitively. We'll + // transition the attempt from an initialized to a terminally + // failed state to prevent an orphaned attempt. + log.Warnf("Dispatch failed for attempt %d: %v. Failing "+ + "pending attempt.", req.AttemptId, dispatchErr) + + s.failPendingAttempt(req.AttemptId, "dispatch failure") + + // Translate the internal dispatch error into a gRPC status + // with rich details for the client. + message, code := translateErrorForRPC(dispatchErr) + + return &SendOnionResponse{ + Success: false, + ErrorMessage: message, + ErrorCode: code, + }, nil + } + + // The onion attempt was successfully dispatched. + return &SendOnionResponse{Success: true}, nil +} + +// failPendingAttempt is a helper which transitions the given attempt from an +// initialized (PENDING) state to a FAILED state in the persistent store. +func (s *Server) failPendingAttempt(attemptID uint64, context string) { + // We use a generic failure reason, as this is an internal failure. + // The original, more specific error is returned to the client. + failReason := &lnwire.FailTemporaryNodeFailure{} + + err := s.cfg.AttemptStore.FailPendingAttempt( + attemptID, htlcswitch.NewLinkError(failReason), + ) + if err != nil { + // If we fail to transition the attempt to a FAILED state + // here, the htlc switch's own recovery mechanisms will + // eventually clean up any orphaned PENDING attempts after a + // restart. + log.Errorf("Unable to fail pending attempt %d after %s: %v", + attemptID, context, err) + } +} + +// validateAndPrepareOnion performs the pre-checks and preparation for a +// SendOnion request. It returns the channel ID, the HTLC to be sent, and any +// validation error. +func (s *Server) validateAndPrepareOnion(req *SendOnionRequest) ( + lnwire.ShortChannelID, *lnwire.UpdateAddHTLC, error) { + + var ( + chanID lnwire.ShortChannelID + htlcAdd *lnwire.UpdateAddHTLC + err error + ) + if len(req.OnionBlob) != lnwire.OnionPacketSize { - return nil, status.Errorf(codes.InvalidArgument, + err = status.Errorf( + codes.InvalidArgument, "onion blob size=%d does not match expected %d bytes", - len(req.OnionBlob), lnwire.OnionPacketSize) + len(req.OnionBlob), lnwire.OnionPacketSize, + ) + + return chanID, htlcAdd, err } if len(req.PaymentHash) == 0 { - return nil, status.Error(codes.InvalidArgument, - "payment hash is required") + err = status.Error( + codes.InvalidArgument, "payment hash is required") + + return chanID, htlcAdd, err } if req.Amount <= 0 { - return nil, status.Error(codes.InvalidArgument, - "amount must be greater than zero") + err = status.Error(codes.InvalidArgument, + "amount must be greater than zero", + ) + + return chanID, htlcAdd, err } var ( amount = lnwire.MilliSatoshi(req.Amount) pubkeySet = len(req.FirstHopPubkey) != 0 channelIDSet = req.FirstHopChanId != 0 - chanID lnwire.ShortChannelID ) switch { case pubkeySet == channelIDSet: - return nil, status.Error(codes.InvalidArgument, + err = status.Error( + codes.InvalidArgument, "must specify exactly one of first_hop_pubkey or "+ - "first_hop_chan_id") + "first_hop_chan_id", + ) + + return chanID, htlcAdd, err case channelIDSet: // Case 1: The caller provided the first hop chan id directly. @@ -277,35 +447,44 @@ func (s *Server) SendOnion(_ context.Context, case pubkeySet: // Case 2: Convert the first hop pubkey into a format usable by // the forwarding subsystem. - firstHop, err := btcec.ParsePubKey(req.FirstHopPubkey) - if err != nil { - return nil, status.Errorf(codes.InvalidArgument, + firstHop, parseErr := btcec.ParsePubKey(req.FirstHopPubkey) + if parseErr != nil { + err = status.Errorf(codes.InvalidArgument, "invalid first hop pubkey=%x: %v", - req.FirstHopPubkey, err) + req.FirstHopPubkey, parseErr) + + return chanID, htlcAdd, err } // Find an eligible channel ID for the given first-hop pubkey. chanID, err = s.findEligibleChannelID(firstHop, amount) if err != nil { - return nil, status.Errorf(codes.Internal, + err = status.Errorf(codes.FailedPrecondition, "unable to find eligible channel for "+ "pubkey=%x: %v", firstHop.SerializeCompressed(), err) + + return chanID, htlcAdd, err } } - hash, err := lntypes.MakeHash(req.PaymentHash) - if err != nil { - return nil, status.Errorf(codes.InvalidArgument, - "invalid payment_hash=%x: %v", req.PaymentHash, err) + hash, parseErr := lntypes.MakeHash(req.PaymentHash) + if parseErr != nil { + err = status.Errorf(codes.InvalidArgument, + "invalid payment_hash=%x: %v", req.PaymentHash, + parseErr) + + return chanID, htlcAdd, err } var blindingPoint lnwire.BlindingPointRecord if len(req.BlindingPoint) > 0 { - pubkey, err := btcec.ParsePubKey(req.BlindingPoint) - if err != nil { - return nil, status.Errorf(codes.InvalidArgument, - "invalid blinding point: %v", err) + pubkey, parseErr := btcec.ParsePubKey(req.BlindingPoint) + if parseErr != nil { + err = status.Errorf(codes.InvalidArgument, + "invalid blinding point: %v", parseErr) + + return chanID, htlcAdd, err } blindingPoint = tlv.SomeRecordT( @@ -318,7 +497,7 @@ func (s *Server) SendOnion(_ context.Context, // Craft an HTLC packet to send to the htlcswitch. The metadata within // this packet will be used to route the payment through the network, // starting with the first-hop. - htlcAdd := &lnwire.UpdateAddHTLC{ + htlcAdd = &lnwire.UpdateAddHTLC{ Amount: amount, Expiry: req.Timelock, PaymentHash: hash, @@ -328,21 +507,7 @@ func (s *Server) SendOnion(_ context.Context, ExtraData: lnwire.ExtraOpaqueData(req.ExtraData), } - log.Debugf("Dispatching HTLC attempt(id=%v, amt=%v) for payment=%v "+ - "via channel=%s", req.AttemptId, req.Amount, hash, chanID) - - // Send the HTLC to the first hop directly by way of the HTLCSwitch. - err = s.cfg.HtlcDispatcher.SendHTLC(chanID, req.AttemptId, htlcAdd) - if err != nil { - message, code := translateErrorForRPC(err) - return &SendOnionResponse{ - Success: false, - ErrorMessage: message, - ErrorCode: code, - }, nil - } - - return &SendOnionResponse{Success: true}, nil + return chanID, htlcAdd, err } // findEligibleChannelID attempts to find an eligible channel based on the diff --git a/lnrpc/switchrpc/switch_server_test.go b/lnrpc/switchrpc/switch_server_test.go index 87fcde057dc..d0dd7fde69c 100644 --- a/lnrpc/switchrpc/switch_server_test.go +++ b/lnrpc/switchrpc/switch_server_test.go @@ -137,18 +137,13 @@ func TestSendOnion(t *testing.T) { setup: func(t *testing.T, s *Server, req *SendOnionRequest) { - // Mock a duplicate error from the dispatcher, - // which is the new way to signal a duplicate - // attempt for the same ID. - payer, ok := s.cfg.HtlcDispatcher.(*mockPayer) + // Mock a duplicate error from the attempt + // store. + store, ok := s.cfg.AttemptStore.(*mockAttemptStore) require.True(t, ok) - payer.sendErr = htlcswitch.ErrDuplicateAdd - }, - expectedResponse: &SendOnionResponse{ - Success: false, - ErrorMessage: htlcswitch.ErrDuplicateAdd.Error(), - ErrorCode: ErrorCode_DUPLICATE_HTLC, + store.initErr = htlcswitch.ErrPaymentIDAlreadyExists }, + expectedErrCode: codes.AlreadyExists, }, } @@ -160,6 +155,7 @@ func TestSendOnion(t *testing.T) { // isolation. server, _, err := New(&Config{ HtlcDispatcher: &mockPayer{}, + AttemptStore: &mockAttemptStore{}, }) require.NoError(t, err) From 0f4e39030f96c515c6c8f52df8bb16be25388a9f Mon Sep 17 00:00:00 2001 From: Calvin Zachman Date: Mon, 5 Jan 2026 11:52:02 -0500 Subject: [PATCH 3/6] switchrpc: update sendonion rpc docs Clarify useage of API for rpc clients. --- lnrpc/switchrpc/switch.proto | 31 +++++++++++++-- lnrpc/switchrpc/switch.swagger.json | 3 +- lnrpc/switchrpc/switch_grpc.pb.go | 62 ++++++++++++++++++++++++++--- 3 files changed, 86 insertions(+), 10 deletions(-) diff --git a/lnrpc/switchrpc/switch.proto b/lnrpc/switchrpc/switch.proto index 06fa60fbb67..bfff1c0c0a2 100644 --- a/lnrpc/switchrpc/switch.proto +++ b/lnrpc/switchrpc/switch.proto @@ -10,9 +10,34 @@ option go_package = "github.com/lightningnetwork/lnd/lnrpc/switchrpc"; // subsystem of the daemon. service Switch { /* - SendOnion attempts to make a payment via the specified onion. This - method differs from SendPayment in that the instance need not be aware of - the full details of the payment route. + SendOnion provides an idempotent API for dispatching a pre-formed onion + packet, which is the primary entry point for a remote router. + + To safely handle network failures, a client can and should retry this RPC + after a timeout or disconnection. Retries MUST use the exact same + attempt_id to allow the server to correctly detect duplicate requests. + + A client interacting with this RPC must handle four distinct categories of + outcomes, communicated via gRPC status codes: + + 1. SUCCESS (gRPC code OK): A definitive confirmation that the HTLC has + been successfully dispatched. The client can proceed to track the + payment's final result via the `TrackOnion` RPC. + + 2. DUPLICATE ACKNOWLEDGMENT (gRPC code AlreadyExists): A definitive + acknowledgment that a request with the same attempt_id has already + been successfully processed. A retrying client should interpret this + as a success and proceed to tracking the payment's result. + + 3. AMBIGUOUS FAILURE (gRPC code Unavailable or DeadlineExceeded): An + ambiguous error occurred (e.g., the server is shutting down or the + client timed out). The state of the HTLC dispatch is unknown. The + client MUST retry the exact same request to resolve the ambiguity. + + 4. DEFINITIVE FAILURE (gRPC code FailedPrecondition, InvalidArgument, etc.): + A definitive failure is a guarantee that the HTLC was not and will not be + dispatched. The client should fail the attempt and may retry with a new + route and/or new attempt_id. */ rpc SendOnion (SendOnionRequest) returns (SendOnionResponse); diff --git a/lnrpc/switchrpc/switch.swagger.json b/lnrpc/switchrpc/switch.swagger.json index 21cc6a95994..1c4523e9c75 100644 --- a/lnrpc/switchrpc/switch.swagger.json +++ b/lnrpc/switchrpc/switch.swagger.json @@ -52,7 +52,8 @@ }, "/v2/switch/onion": { "post": { - "summary": "SendOnion attempts to make a payment via the specified onion. This\nmethod differs from SendPayment in that the instance need not be aware of\nthe full details of the payment route.", + "summary": "SendOnion provides an idempotent API for dispatching a pre-formed onion\npacket, which is the primary entry point for a remote router.", + "description": "To safely handle network failures, a client can and should retry this RPC\nafter a timeout or disconnection. Retries MUST use the exact same\nattempt_id to allow the server to correctly detect duplicate requests.\n\nA client interacting with this RPC must handle four distinct categories of\noutcomes, communicated via gRPC status codes:\n\n1. SUCCESS (gRPC code OK): A definitive confirmation that the HTLC has\nbeen successfully dispatched. The client can proceed to track the\npayment's final result via the `TrackOnion` RPC.\n\n2. DUPLICATE ACKNOWLEDGMENT (gRPC code AlreadyExists): A definitive\nacknowledgment that a request with the same attempt_id has already\nbeen successfully processed. A retrying client should interpret this\nas a success and proceed to tracking the payment's result.\n\n3. AMBIGUOUS FAILURE (gRPC code Unavailable or DeadlineExceeded): An\nambiguous error occurred (e.g., the server is shutting down or the\nclient timed out). The state of the HTLC dispatch is unknown. The\nclient MUST retry the exact same request to resolve the ambiguity.\n\n4. DEFINITIVE FAILURE (gRPC code FailedPrecondition, InvalidArgument, etc.):\nA definitive failure is a guarantee that the HTLC was not and will not be\ndispatched. The client should fail the attempt and may retry with a new\nroute and/or new attempt_id.", "operationId": "Switch_SendOnion", "responses": { "200": { diff --git a/lnrpc/switchrpc/switch_grpc.pb.go b/lnrpc/switchrpc/switch_grpc.pb.go index cd06275a979..977eba1e29a 100644 --- a/lnrpc/switchrpc/switch_grpc.pb.go +++ b/lnrpc/switchrpc/switch_grpc.pb.go @@ -18,9 +18,34 @@ const _ = grpc.SupportPackageIsVersion7 // // For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream. type SwitchClient interface { - // SendOnion attempts to make a payment via the specified onion. This - // method differs from SendPayment in that the instance need not be aware of - // the full details of the payment route. + // SendOnion provides an idempotent API for dispatching a pre-formed onion + // packet, which is the primary entry point for a remote router. + // + // To safely handle network failures, a client can and should retry this RPC + // after a timeout or disconnection. Retries MUST use the exact same + // attempt_id to allow the server to correctly detect duplicate requests. + // + // A client interacting with this RPC must handle four distinct categories of + // outcomes, communicated via gRPC status codes: + // + // 1. SUCCESS (gRPC code OK): A definitive confirmation that the HTLC has + // been successfully dispatched. The client can proceed to track the + // payment's final result via the `TrackOnion` RPC. + // + // 2. DUPLICATE ACKNOWLEDGMENT (gRPC code AlreadyExists): A definitive + // acknowledgment that a request with the same attempt_id has already + // been successfully processed. A retrying client should interpret this + // as a success and proceed to tracking the payment's result. + // + // 3. AMBIGUOUS FAILURE (gRPC code Unavailable or DeadlineExceeded): An + // ambiguous error occurred (e.g., the server is shutting down or the + // client timed out). The state of the HTLC dispatch is unknown. The + // client MUST retry the exact same request to resolve the ambiguity. + // + // 4. DEFINITIVE FAILURE (gRPC code FailedPrecondition, InvalidArgument, etc.): + // A definitive failure is a guarantee that the HTLC was not and will not be + // dispatched. The client should fail the attempt and may retry with a new + // route and/or new attempt_id. SendOnion(ctx context.Context, in *SendOnionRequest, opts ...grpc.CallOption) (*SendOnionResponse, error) // TrackOnion allows callers to query whether or not a payment dispatched via // SendOnion succeeded or failed. @@ -78,9 +103,34 @@ func (c *switchClient) BuildOnion(ctx context.Context, in *BuildOnionRequest, op // All implementations must embed UnimplementedSwitchServer // for forward compatibility type SwitchServer interface { - // SendOnion attempts to make a payment via the specified onion. This - // method differs from SendPayment in that the instance need not be aware of - // the full details of the payment route. + // SendOnion provides an idempotent API for dispatching a pre-formed onion + // packet, which is the primary entry point for a remote router. + // + // To safely handle network failures, a client can and should retry this RPC + // after a timeout or disconnection. Retries MUST use the exact same + // attempt_id to allow the server to correctly detect duplicate requests. + // + // A client interacting with this RPC must handle four distinct categories of + // outcomes, communicated via gRPC status codes: + // + // 1. SUCCESS (gRPC code OK): A definitive confirmation that the HTLC has + // been successfully dispatched. The client can proceed to track the + // payment's final result via the `TrackOnion` RPC. + // + // 2. DUPLICATE ACKNOWLEDGMENT (gRPC code AlreadyExists): A definitive + // acknowledgment that a request with the same attempt_id has already + // been successfully processed. A retrying client should interpret this + // as a success and proceed to tracking the payment's result. + // + // 3. AMBIGUOUS FAILURE (gRPC code Unavailable or DeadlineExceeded): An + // ambiguous error occurred (e.g., the server is shutting down or the + // client timed out). The state of the HTLC dispatch is unknown. The + // client MUST retry the exact same request to resolve the ambiguity. + // + // 4. DEFINITIVE FAILURE (gRPC code FailedPrecondition, InvalidArgument, etc.): + // A definitive failure is a guarantee that the HTLC was not and will not be + // dispatched. The client should fail the attempt and may retry with a new + // route and/or new attempt_id. SendOnion(context.Context, *SendOnionRequest) (*SendOnionResponse, error) // TrackOnion allows callers to query whether or not a payment dispatched via // SendOnion succeeded or failed. From bd5ede5c7c046c79933bc8239dfe2a277dff85b5 Mon Sep 17 00:00:00 2001 From: Calvin Zachman Date: Wed, 12 Nov 2025 21:50:03 -0500 Subject: [PATCH 4/6] itest: update sendonion itest given duplicate protection We can now assert that making multiple calls to SendOnion for the same attempt ID is prevented. --- itest/lnd_sendonion_test.go | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/itest/lnd_sendonion_test.go b/itest/lnd_sendonion_test.go index 1d5e2adfc04..26900f656d7 100644 --- a/itest/lnd_sendonion_test.go +++ b/itest/lnd_sendonion_test.go @@ -183,8 +183,9 @@ func testSendOnionTwice(ht *lntest.HarnessTest) { // While the first onion is still in-flight, we'll send the same onion // again with the same attempt ID. This should error as our Switch will // detect duplicate ADDs for *in-flight* HTLCs. - ctxt, cancel := context.WithTimeout(context.Background(), - rpc.DefaultTimeout) + ctxt, cancel := context.WithTimeout( + context.Background(), rpc.DefaultTimeout, + ) defer cancel() _, err := alice.RPC.Switch.SendOnion(ctxt, sendReq) @@ -213,14 +214,20 @@ func testSendOnionTwice(ht *lntest.HarnessTest) { require.Equal(ht, preimage[:], trackResp.Preimage) // Now that the original HTLC attempt has settled, we'll send the same - // onion again with the same attempt ID. - // - // NOTE: Currently, this does not error. When we make SendOnion fully - // duplicate safe, this should be updated to assert an error is - // returned. - resp = alice.RPC.SendOnion(sendReq) - require.True(ht, resp.Success, "expected successful onion send") - require.Empty(ht, resp.ErrorMessage, "unexpected failure to send onion") + // onion again with the same attempt ID. Confirm that this is also + // prevented. + ctxt, cancel = context.WithTimeout(context.Background(), + rpc.DefaultTimeout) + defer cancel() + + _, err = alice.RPC.Switch.SendOnion(ctxt, sendReq) + require.Error(ht, err, "expected failure on onion send") + + // Check that we get the expected gRPC error. + s, ok = status.FromError(err) + require.True(ht, ok, "expected gRPC status error") + require.Equal(ht, codes.AlreadyExists, s.Code(), + "unexpected error code") } // testTrackOnion exercises the SwitchRPC server's TrackOnion endpoint, From 2a04b48469c75fa43c690ed8290f3e98de937988 Mon Sep 17 00:00:00 2001 From: Calvin Zachman Date: Sat, 6 Dec 2025 00:48:33 -0500 Subject: [PATCH 5/6] itest: add SendOnion concurrency test --- itest/list_on_test.go | 4 ++ itest/lnd_sendonion_test.go | 128 +++++++++++++++++++++++++++++++++--- 2 files changed, 123 insertions(+), 9 deletions(-) diff --git a/itest/list_on_test.go b/itest/list_on_test.go index a26ff062877..79b64032805 100644 --- a/itest/list_on_test.go +++ b/itest/list_on_test.go @@ -767,6 +767,10 @@ var allTestCases = []*lntest.TestCase{ Name: "send onion twice", TestFunc: testSendOnionTwice, }, + { + Name: "send onion concurrency", + TestFunc: testSendOnionConcurrency, + }, { Name: "track onion", TestFunc: testTrackOnion, diff --git a/itest/lnd_sendonion_test.go b/itest/lnd_sendonion_test.go index 26900f656d7..6fba408e982 100644 --- a/itest/lnd_sendonion_test.go +++ b/itest/lnd_sendonion_test.go @@ -2,6 +2,7 @@ package itest import ( "context" + "sync" "github.com/btcsuite/btcd/btcec/v2" "github.com/btcsuite/btcd/btcutil" @@ -41,8 +42,6 @@ func testSendOnion(ht *lntest.HarnessTest) { // Make sure Alice knows about all channels. aliceBobChan := ht.AssertChannelInGraph(alice, chanPoints[0]) - ht.AssertChannelInGraph(alice, chanPoints[1]) - ht.AssertChannelInGraph(alice, chanPoints[2]) const ( numPayments = 1 @@ -125,9 +124,6 @@ func testSendOnionTwice(ht *lntest.HarnessTest) { defer ht.CloseChannel(bob, chanPoints[1]) defer ht.CloseChannel(carol, chanPoints[2]) - ht.AssertChannelInGraph(alice, chanPoints[1]) - ht.AssertChannelInGraph(alice, chanPoints[2]) - const paymentAmt = 10000 // Create a preimage, that will be held by Dave. @@ -230,6 +226,124 @@ func testSendOnionTwice(ht *lntest.HarnessTest) { "unexpected error code") } +// testSendOnionConcurrency simulates a client that crashes and attempts to +// retry a payment with the same attempt ID concurrently. This test provides a +// strong guarantee that the SendOnion RPC is idempotent and correctly prevents +// duplicate payment attempts from succeeding. +func testSendOnionConcurrency(ht *lntest.HarnessTest) { + // Create a two-node context consisting of Alice and Bob. + const chanAmt = btcutil.Amount(100000) + const numNodes = 2 + nodeCfgs := make([][]string, numNodes) + chanPoints, nodes := ht.CreateSimpleNetwork( + nodeCfgs, lntest.OpenChannelParams{Amt: chanAmt}, + ) + alice, bob := nodes[0], nodes[1] + + // Make sure Alice knows about the channel. + aliceBobChan := ht.AssertChannelInGraph(alice, chanPoints[0]) + + const paymentAmt = 10000 + + // Request an invoice from Bob so he is expecting payment. + _, rHashes, invoices := ht.CreatePayReqs(bob, paymentAmt, 1) + paymentHash := rHashes[0] + + // Query for a route to pay from Alice to Bob. + routesReq := &lnrpc.QueryRoutesRequest{ + PubKey: bob.PubKeyStr, + Amt: paymentAmt, + } + routes := alice.RPC.QueryRoutes(routesReq) + route := routes.Routes[0] + finalHop := route.Hops[len(route.Hops)-1] + finalHop.MppRecord = &lnrpc.MPPRecord{ + PaymentAddr: invoices[0].PaymentAddr, + TotalAmtMsat: int64(lnwire.NewMSatFromSatoshis(paymentAmt)), + } + + // Construct the onion for the route. + onionReq := &switchrpc.BuildOnionRequest{ + Route: route, + PaymentHash: paymentHash, + } + onionResp := alice.RPC.BuildOnion(onionReq) + + // Create the SendOnion request that all goroutines will use. + // The AttemptId MUST be the same for all calls. + sendReq := &switchrpc.SendOnionRequest{ + FirstHopChanId: aliceBobChan.ChannelId, + Amount: route.TotalAmtMsat, + Timelock: route.TotalTimeLock, + PaymentHash: paymentHash, + OnionBlob: onionResp.OnionBlob, + AttemptId: 42, + } + + const numConcurrentRequests = 50 + var wg sync.WaitGroup + wg.Add(numConcurrentRequests) + + // Use channels to collect the results from each goroutine. + resultsChan := make(chan error, + numConcurrentRequests) + + // Launch all requests concurrently to simulate a retry storm. + for i := 0; i < numConcurrentRequests; i++ { + go func() { + defer wg.Done() + ctxt, cancel := context.WithTimeout( + context.Background(), + rpc.DefaultTimeout, + ) + defer cancel() + + _, err := alice.RPC.Switch.SendOnion(ctxt, sendReq) + resultsChan <- err + }() + } + + wg.Wait() + close(resultsChan) + + // We expect exactly one successful dispatch and the rest to be + // rejected as duplicates. + successCount := 0 + duplicateCount := 0 + + for err := range resultsChan { + // A nil error indicates a successful dispatch. + if err == nil { + successCount++ + continue + } + + // For non-nil errors, we should receive a gRPC status error. + s, ok := status.FromError(err) + + // If it's not a gRPC status error, it's an unexpected + // condition. + require.Truef(ht, ok, "unexpected error from SendOnion: %v, "+ + "code: %v", s.Err().Error(), s.Code()) + + // Check if the error code indicates a duplicate acknowledgment. + if s.Code() == codes.AlreadyExists { + duplicateCount++ + } else { + ht.Fatalf("unexpected error from SendOnion: %v, "+ + "code: %v", s.Err().Error(), s.Code()) + } + } + + // Confirm that only a single dispatch succeeds. + require.Equal(ht, 1, successCount, "expected exactly one success") + require.Equal(ht, numConcurrentRequests-1, duplicateCount, + "expected all other attempts to be duplicates") + + // The invoice should eventually show as settled for Bob. + ht.AssertInvoiceSettled(bob, invoices[0].PaymentAddr) +} + // testTrackOnion exercises the SwitchRPC server's TrackOnion endpoint, // confirming that we can receive the result of an onion dispatch and decrypt // the error result. We also verify that the error received from the dispatched @@ -249,10 +363,6 @@ func testTrackOnion(ht *lntest.HarnessTest) { defer ht.CloseChannel(bob, chanPoints[1]) defer ht.CloseChannel(carol, chanPoints[2]) - // Make sure Alice knows about all channels. - ht.AssertChannelInGraph(alice, chanPoints[1]) - ht.AssertChannelInGraph(alice, chanPoints[2]) - const paymentAmt = 10000 // Query for routes to pay from Alice to Dave. From 2e712207b41c14d91740ddea3ccd2695fe82ed6f Mon Sep 17 00:00:00 2001 From: Calvin Zachman Date: Tue, 16 Dec 2025 19:23:15 -0600 Subject: [PATCH 6/6] docs: update v0.21 release notes consolidate remote router doc updates --- docs/release-notes/release-notes-0.21.0.md | 30 ++++++++++++++-------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/docs/release-notes/release-notes-0.21.0.md b/docs/release-notes/release-notes-0.21.0.md index 8dc5aba7c36..1c45c884fc4 100644 --- a/docs/release-notes/release-notes-0.21.0.md +++ b/docs/release-notes/release-notes-0.21.0.md @@ -65,16 +65,6 @@ avoid modifications to the existing flows for dispatching local payments, preserving the existing battle-tested logic. -## RPC Additions - -* [Added support for coordinator-based MuSig2 signing - patterns](https://github.com/lightningnetwork/lnd/pull/10436) with two new - RPCs: `MuSig2RegisterCombinedNonce` allows registering a pre-aggregated - combined nonce for a session (useful when a coordinator aggregates all nonces - externally), and `MuSig2GetCombinedNonce` retrieves the combined nonce after - it becomes available. These methods provide an alternative to the standard - `MuSig2RegisterNonces` workflow and are only supported in MuSig2 v1.0.0rc2. - * Added a new [switchrpc RPC sub-system](https://github.com/lightningnetwork/lnd/pull/9489) with `SendOnion`, `BuildOnion`, and `TrackOnion` endpoints. This allows the daemon to offload path-finding, onion construction and payment life-cycle @@ -86,6 +76,26 @@ potential loss of funds. The compilation of the server is hidden behind the non-default `switchrpc` build tag. +* The `SendOnion` RPC is now fully [idempotent]( + https://github.com/lightningnetwork/lnd/pull/10473), providing a critical + reliability improvement for external payment orchestrators (such as a remote + `ChannelRouter`). Callers can now safely retry a `SendOnion` request after a + network timeout or ambiguous error without risking a duplicate payment. If a + request with the same `attempt_id` has already been processed, the RPC will + now return a `DUPLICATE_HTLC` error, serving as a definitive acknowledgment + that the dispatch was received. This allows clients to build more resilient + payment-sending logic. + +## RPC Additions + +* [Added support for coordinator-based MuSig2 signing + patterns](https://github.com/lightningnetwork/lnd/pull/10436) with two new + RPCs: `MuSig2RegisterCombinedNonce` allows registering a pre-aggregated + combined nonce for a session (useful when a coordinator aggregates all nonces + externally), and `MuSig2GetCombinedNonce` retrieves the combined nonce after + it becomes available. These methods provide an alternative to the standard + `MuSig2RegisterNonces` workflow and are only supported in MuSig2 v1.0.0rc2. + ## lncli Additions # Improvements