diff --git a/discovery/gossiper.go b/discovery/gossiper.go index d62d669919..5400abf4c1 100644 --- a/discovery/gossiper.go +++ b/discovery/gossiper.go @@ -876,7 +876,14 @@ func (d *AuthenticatedGossiper) stop() { func (d *AuthenticatedGossiper) ProcessRemoteAnnouncement(ctx context.Context, msg lnwire.Message, peer lnpeer.Peer) chan error { - errChan := make(chan error, 1) + // Buffer up to two messages on errChan since up to two messages may be + // written and not all callers of this function actually read from + // errChan. Without this buffer goroutines end up blocking on writes to + // errChan, which prevents the gossiper from shutting down cleanly. + // + // TODO(ziggie): Redesign this once the actor model pattern becomes + // available. See https://github.com/lightningnetwork/lnd/pull/9820. + errChan := make(chan error, 2) // For messages in the known set of channel series queries, we'll // dispatch the message directly to the GossipSyncer, and skip the main diff --git a/discovery/gossiper_test.go b/discovery/gossiper_test.go index 2776d4b3a0..e8c59fe147 100644 --- a/discovery/gossiper_test.go +++ b/discovery/gossiper_test.go @@ -5235,3 +5235,64 @@ func TestRecoverGossipPanicNilJobID(t *testing.T) { t.Fatal("timeout waiting for error") } } + +// TestGossiperShutdownWrongChainAnnouncement tests that the gossiper can shut +// down cleanly after processing a channel announcement with the wrong chain +// hash. This is a regression test for a bug where the gossiper would deadlock +// on shutdown because more errors were sent on the error channel than it would +// buffer, and no one was reading those error messages. +// +// In this test we trigger the sending of two error messages: +// 1. First send when rejecting the wrong-chain announcement +// 2. Second send when SignalDependents returns an error +// +// Since the error channel had a buffer of 1, the second send would block +// forever, preventing the goroutine from completing and causing Stop() to hang +// on wg.Wait(). +func TestGossiperShutdownWrongChainAnnouncement(t *testing.T) { + t.Parallel() + + // Create a test context with the gossiper configured for MainNet. + tCtx, err := createTestCtx(t, 0, false) + require.NoError(t, err) + + // Create a channel announcement with: + // 1. Wrong chain hash (SimNet instead of MainNet) + // 2. NodeID1 == NodeID2 + // + // The first condition triggers the first error message to be sent, and + // the second condition causes SignalDependents to attempt to remove the + // same dependent job twice, which then triggers the second error + // message to be sent. + wrongChainAnn := &lnwire.ChannelAnnouncement1{ + ChainHash: *chaincfg.SimNetParams.GenesisHash, + ShortChannelID: lnwire.ShortChannelID{ + BlockHeight: 1, + TxIndex: 0, + TxPosition: 0, + }, + Features: testFeatures, + } + // Use the SAME public key for NodeID1 and NodeID2 to trigger the + // second error message. + copy(wrongChainAnn.NodeID1[:], remoteKeyPub1.SerializeCompressed()) + copy(wrongChainAnn.NodeID2[:], remoteKeyPub1.SerializeCompressed()) + copy(wrongChainAnn.BitcoinKey1[:], bitcoinKeyPub1.SerializeCompressed()) + copy(wrongChainAnn.BitcoinKey2[:], bitcoinKeyPub2.SerializeCompressed()) + + nodePeer := &mockPeer{remoteKeyPub1, nil, nil, atomic.Bool{}} + + // Process the announcement without reading from the error channel, + // exactly as Brontide does. + _ = tCtx.gossiper.ProcessRemoteAnnouncement( + t.Context(), wrongChainAnn, nodePeer, + ) + + // Give the gossiper time to process the announcement. + time.Sleep(100 * time.Millisecond) + + // Now stop the gossiper. This should complete without hanging. + // If the bug is present, Stop() will hang forever because a goroutine + // is blocked trying to send to the error channel a second time. + require.NoError(t, tCtx.gossiper.Stop()) +} diff --git a/docs/release-notes/release-notes-0.20.1.md b/docs/release-notes/release-notes-0.20.1.md index 45ba4b4f82..02dd849f6b 100644 --- a/docs/release-notes/release-notes-0.20.1.md +++ b/docs/release-notes/release-notes-0.20.1.md @@ -82,6 +82,12 @@ The fix adds automatic format detection to handle both legacy (raw feature bits) and new (length-prefixed) formats. +* [Fixed a shutdown + deadlock](https://github.com/lightningnetwork/lnd/pull/10540) in the gossiper. + Certain gossip messages could cause multiple error messages to be sent on a + channel that was only expected to be used for a single message. The erring + goroutine would block on the second send, leading to a deadlock at shutdown. + # New Features ## Functional Enhancements @@ -152,4 +158,5 @@ * Abdulkbk * bitromortac +* Matt Morehouse * Ziggie