From 81d4da7ffd9bc799b46fbf0b1bbaaf916388f7bd Mon Sep 17 00:00:00 2001 From: Matt Morehouse Date: Mon, 2 Feb 2026 16:00:37 -0600 Subject: [PATCH 1/2] discovery: fix gossiper shutdown deadlock When processing a remote network announcement, it is possible for two error messages to be sent back on the errChan. Since Brontide doesn't actually read from errChan, and since errChan only buffered one error message, the sending goroutine would deadlock forever. This would only become apparent when the gossiper attempted to shut down and got hung up. For now, we can fix this simply by buffering up to two error messages on errChan. There is an existing TODO to restructure this logic entirely to use the actor model, and we can do a more thorough fix as part of that work. This bug was discovered while doing full node fuzz testing and was triggered by sending a specific channel_announcement message and then shutting down LND. (cherry picked from commit 21588acb3d5b9b4bb605b0852c91f10b6bb2a1c6) --- discovery/gossiper.go | 9 +++++- discovery/gossiper_test.go | 61 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 1 deletion(-) diff --git a/discovery/gossiper.go b/discovery/gossiper.go index d62d669919..5400abf4c1 100644 --- a/discovery/gossiper.go +++ b/discovery/gossiper.go @@ -876,7 +876,14 @@ func (d *AuthenticatedGossiper) stop() { func (d *AuthenticatedGossiper) ProcessRemoteAnnouncement(ctx context.Context, msg lnwire.Message, peer lnpeer.Peer) chan error { - errChan := make(chan error, 1) + // Buffer up to two messages on errChan since up to two messages may be + // written and not all callers of this function actually read from + // errChan. Without this buffer goroutines end up blocking on writes to + // errChan, which prevents the gossiper from shutting down cleanly. + // + // TODO(ziggie): Redesign this once the actor model pattern becomes + // available. See https://github.com/lightningnetwork/lnd/pull/9820. + errChan := make(chan error, 2) // For messages in the known set of channel series queries, we'll // dispatch the message directly to the GossipSyncer, and skip the main diff --git a/discovery/gossiper_test.go b/discovery/gossiper_test.go index 2776d4b3a0..e8c59fe147 100644 --- a/discovery/gossiper_test.go +++ b/discovery/gossiper_test.go @@ -5235,3 +5235,64 @@ func TestRecoverGossipPanicNilJobID(t *testing.T) { t.Fatal("timeout waiting for error") } } + +// TestGossiperShutdownWrongChainAnnouncement tests that the gossiper can shut +// down cleanly after processing a channel announcement with the wrong chain +// hash. This is a regression test for a bug where the gossiper would deadlock +// on shutdown because more errors were sent on the error channel than it would +// buffer, and no one was reading those error messages. +// +// In this test we trigger the sending of two error messages: +// 1. First send when rejecting the wrong-chain announcement +// 2. Second send when SignalDependents returns an error +// +// Since the error channel had a buffer of 1, the second send would block +// forever, preventing the goroutine from completing and causing Stop() to hang +// on wg.Wait(). +func TestGossiperShutdownWrongChainAnnouncement(t *testing.T) { + t.Parallel() + + // Create a test context with the gossiper configured for MainNet. + tCtx, err := createTestCtx(t, 0, false) + require.NoError(t, err) + + // Create a channel announcement with: + // 1. Wrong chain hash (SimNet instead of MainNet) + // 2. NodeID1 == NodeID2 + // + // The first condition triggers the first error message to be sent, and + // the second condition causes SignalDependents to attempt to remove the + // same dependent job twice, which then triggers the second error + // message to be sent. + wrongChainAnn := &lnwire.ChannelAnnouncement1{ + ChainHash: *chaincfg.SimNetParams.GenesisHash, + ShortChannelID: lnwire.ShortChannelID{ + BlockHeight: 1, + TxIndex: 0, + TxPosition: 0, + }, + Features: testFeatures, + } + // Use the SAME public key for NodeID1 and NodeID2 to trigger the + // second error message. + copy(wrongChainAnn.NodeID1[:], remoteKeyPub1.SerializeCompressed()) + copy(wrongChainAnn.NodeID2[:], remoteKeyPub1.SerializeCompressed()) + copy(wrongChainAnn.BitcoinKey1[:], bitcoinKeyPub1.SerializeCompressed()) + copy(wrongChainAnn.BitcoinKey2[:], bitcoinKeyPub2.SerializeCompressed()) + + nodePeer := &mockPeer{remoteKeyPub1, nil, nil, atomic.Bool{}} + + // Process the announcement without reading from the error channel, + // exactly as Brontide does. + _ = tCtx.gossiper.ProcessRemoteAnnouncement( + t.Context(), wrongChainAnn, nodePeer, + ) + + // Give the gossiper time to process the announcement. + time.Sleep(100 * time.Millisecond) + + // Now stop the gossiper. This should complete without hanging. + // If the bug is present, Stop() will hang forever because a goroutine + // is blocked trying to send to the error channel a second time. + require.NoError(t, tCtx.gossiper.Stop()) +} From f427fee341948aa48b0faf1713378b0e9314ec21 Mon Sep 17 00:00:00 2001 From: Matt Morehouse Date: Tue, 3 Feb 2026 09:12:56 -0600 Subject: [PATCH 2/2] docs: add release note for #10540 --- docs/release-notes/release-notes-0.20.1.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/release-notes/release-notes-0.20.1.md b/docs/release-notes/release-notes-0.20.1.md index 45ba4b4f82..02dd849f6b 100644 --- a/docs/release-notes/release-notes-0.20.1.md +++ b/docs/release-notes/release-notes-0.20.1.md @@ -82,6 +82,12 @@ The fix adds automatic format detection to handle both legacy (raw feature bits) and new (length-prefixed) formats. +* [Fixed a shutdown + deadlock](https://github.com/lightningnetwork/lnd/pull/10540) in the gossiper. + Certain gossip messages could cause multiple error messages to be sent on a + channel that was only expected to be used for a single message. The erring + goroutine would block on the second send, leading to a deadlock at shutdown. + # New Features ## Functional Enhancements @@ -152,4 +158,5 @@ * Abdulkbk * bitromortac +* Matt Morehouse * Ziggie