From 82d0dee008ead9018dc0a77a77bcb4192ebe098e Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 15 Nov 2025 18:11:16 +0000 Subject: [PATCH 1/5] Fix async race condition in InteractionClient provider creation ISSUE: The getInteractionClient() method had a check-then-create pattern with two await points between the initial check and client creation. This created a race condition in JavaScript's event loop: 1. Call A checks #clients map -> not found 2. Call B checks #clients map -> not found 3. Call A awaits nodeStore?.construction (yields to event loop) 4. Call B awaits nodeStore?.construction (yields to event loop) 5. Call A resumes, creates InteractionClient, stores in map 6. Call B resumes, creates DIFFERENT InteractionClient, overwrites A's client Result: Multiple InteractionClient instances created for same peer address, causing resource leaks and potential protocol issues. SOLUTION: Implement double-checked locking pattern - after the async operations complete, check the map again before creating a new client. If another concurrent call already created the client during our await, use that instance instead of creating a duplicate. This is the standard pattern for async singleton creation in JavaScript's single-threaded event loop model. File: packages/protocol/src/interaction/InteractionClient.ts:140-168 --- packages/protocol/src/interaction/InteractionClient.ts | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/packages/protocol/src/interaction/InteractionClient.ts b/packages/protocol/src/interaction/InteractionClient.ts index b608823314..494e2fb099 100644 --- a/packages/protocol/src/interaction/InteractionClient.ts +++ b/packages/protocol/src/interaction/InteractionClient.ts @@ -149,6 +149,12 @@ export class InteractionClientProvider { const exchangeProvider = await this.#peers.exchangeProviderFor(address, options); + // Double-check after await points - another concurrent call may have created the client + const existingClient = this.#clients.get(address); + if (existingClient !== undefined) { + return existingClient; + } + client = new InteractionClient( exchangeProvider, this.#subscriptionClient, From 71965506b7d019609c4580dd835138c45cf3acd2 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 15 Nov 2025 18:12:02 +0000 Subject: [PATCH 2/5] Fix async state desynchronization in FabricManager persistence ISSUE: The persistFabrics() method had an async race condition where fabric state could change between persisting the fabric list and the fabric index: 1. Method captures fabric list: Array.from(this.#fabrics.values()) 2. Starts persisting to storage (async operation) 3. During storage.set() promise, event loop yields control 4. Another operation adds/removes a fabric, increments #nextFabricIndex 5. First storage.set() completes, chains to second operation 6. Persists nextFabricIndex with NEW value, but fabric list has OLD value Result: Storage contains inconsistent state - e.g., 3 fabrics stored but nextFabricIndex = 5. On restart, fabric index allocation could conflict or skip indices incorrectly. EXAMPLE SCENARIO: - State: fabrics=[1,2,3], nextIndex=4 - persistFabrics() captures fabrics=[1,2,3], starts async persist - During await: addFabric(4) called, nextIndex becomes 5 - Persist completes: fabrics=[1,2,3] but nextIndex=5 (inconsistent!) SOLUTION: Capture both fabricConfigs and nextFabricIndex values synchronously before any async operations. This creates an atomic snapshot of related state that remains consistent even if concurrent operations modify the live state during the async persistence operations. This ensures that restored state will always be self-consistent, even if it's slightly outdated due to operations that happened after the snapshot. File: packages/protocol/src/fabric/FabricManager.ts:141-159 --- packages/protocol/src/fabric/FabricManager.ts | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/packages/protocol/src/fabric/FabricManager.ts b/packages/protocol/src/fabric/FabricManager.ts index dd88510523..dd3b76b178 100644 --- a/packages/protocol/src/fabric/FabricManager.ts +++ b/packages/protocol/src/fabric/FabricManager.ts @@ -147,14 +147,15 @@ export class FabricManager { this.#construction.assert(); - const storeResult = this.#storage.set( - "fabrics", - Array.from(this.#fabrics.values()).map(fabric => fabric.config), - ); + // Capture both values before any async operations to ensure consistent state + const fabricConfigs = Array.from(this.#fabrics.values()).map(fabric => fabric.config); + const nextFabricIndex = this.#nextFabricIndex; + + const storeResult = this.#storage.set("fabrics", fabricConfigs); if (MaybePromise.is(storeResult)) { - return storeResult.then(() => this.#storage!.set("nextFabricIndex", this.#nextFabricIndex)); + return storeResult.then(() => this.#storage!.set("nextFabricIndex", nextFabricIndex)); } - return this.#storage.set("nextFabricIndex", this.#nextFabricIndex); + return this.#storage.set("nextFabricIndex", nextFabricIndex); } addFabric(fabric: Fabric) { From 03676bd682e60cb0da9db4131f86ecf98dea8b74 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 15 Nov 2025 18:13:04 +0000 Subject: [PATCH 3/5] Fix async race in NodeSession destruction allowing state corruption ISSUE: The destroy() method had multiple await points before setting the #isClosing flag, creating windows where event processing could corrupt session state: 1. destroy() called, immediately awaits clearSubscriptions() (line 341) 2. During this await, event loop processes incoming messages 3. New subscriptions could be added to the session 4. Messages could trigger other operations on the session 5. Only AFTER clearSubscriptions() completes does #isClosing get set (line 351) Result: New subscriptions/operations accepted during teardown are never properly cleaned up, causing memory leaks and potential protocol violations. EXAMPLE SCENARIO: - destroy() called, starts clearing subscriptions - During await: incoming message creates new subscription - clearSubscriptions() completes (doesn't clear the new one) - #isClosing set to true - destroyed.emit() fires - New subscription never cleaned up -> memory leak CODE CHECKING isClosing: - SessionManager.ts:327 - skips operations if session.isClosing - InteractionServer.ts:178 - blocks new activity if isClosing Setting the flag early ensures these checks protect against new operations. SOLUTION: Set #isClosing = true immediately at the start of destroy() (before any awaits) when closeAfterExchangeFinished is false. This ensures any code checking isClosing will reject new operations before async teardown begins. Note: When closeAfterExchangeFinished=true, we use #closingAfterExchangeFinished flag instead, which serves a similar but delayed purpose. File: packages/protocol/src/session/NodeSession.ts:340-372 --- packages/protocol/src/session/NodeSession.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/packages/protocol/src/session/NodeSession.ts b/packages/protocol/src/session/NodeSession.ts index 9ac20a62a3..f8099f5409 100644 --- a/packages/protocol/src/session/NodeSession.ts +++ b/packages/protocol/src/session/NodeSession.ts @@ -338,6 +338,11 @@ export class NodeSession extends SecureSession { /** Destroys a session. Outstanding subscription data will be discarded. */ async destroy(sendClose = false, closeAfterExchangeFinished = true, flushSubscriptions = false) { + // Set closing flag immediately to prevent new operations during teardown + if (!closeAfterExchangeFinished) { + this.#isClosing = true; + } + await this.clearSubscriptions(flushSubscriptions); this.#fabric?.removeSession(this); if (!sendClose) { @@ -348,7 +353,6 @@ export class NodeSession extends SecureSession { logger.info(`Register Session ${this.name} to close when exchange is ended.`); this.#closingAfterExchangeFinished = true; } else { - this.#isClosing = true; logger.info(`End ${this.isPase ? "PASE" : "CASE"} session ${this.name}`); this.manager?.sessions.delete(this); From 1dcc6b774b59202719e5924ffb2438bc36595ee6 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 15 Nov 2025 18:13:41 +0000 Subject: [PATCH 4/5] Fix BTP handshake version array index corruption ISSUE: The BTP codec had mismatched array indices between encoding and decoding, causing version numbers to be swapped during BLE handshake negotiation. ENCODING (lines 93-96): writer.writeUInt8((versions[1] << 4) | versions[0]); -> versions[1] goes to HIGH nibble (bits 7-4) -> versions[0] goes to LOW nibble (bits 3-0) DECODING (original lines 188-201): ver[0] = (version & 0xf0) >> 4; -> HIGH nibble to ver[0] ver[1] = version & 0x0f; -> LOW nibble to ver[1] ROUND-TRIP CORRUPTION: Input: versions = [4, 3, 2, 1, 0, 0, 0, 0] Encode: byte 0 = (3 << 4) | 4 = 0x34 Decode: ver[0] = 3, ver[1] = 4 -> SWAPPED\! Result: versions = [3, 4, 2, 1, 0, 0, 0, 0] -> WRONG This breaks BLE version negotiation as the device and controller would disagree on supported protocol versions after handshake. SOLUTION: Match the decoding indices to the encoding pattern: ver[1] = HIGH nibble (matches versions[1] from encoding) ver[0] = LOW nibble (matches versions[0] from encoding) Now round-trip preserves the version array correctly. VERIFICATION: Input: versions = [4, 3, 2, 1, 0, 0, 0, 0] Encode: byte 0 = (3 << 4) | 4 = 0x34 Decode: ver[1] = 3, ver[0] = 4 Result: versions = [4, 3, 2, 1, 0, 0, 0, 0] -> CORRECT File: packages/protocol/src/codec/BtpCodec.ts:187-201 --- packages/protocol/src/codec/BtpCodec.ts | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/packages/protocol/src/codec/BtpCodec.ts b/packages/protocol/src/codec/BtpCodec.ts index be90c9a772..afa0287125 100644 --- a/packages/protocol/src/codec/BtpCodec.ts +++ b/packages/protocol/src/codec/BtpCodec.ts @@ -185,20 +185,20 @@ export class BtpCodec { } const ver: number[] = []; - ver[0] = (version & 0xf0) >> 4; - ver[1] = version & 0x0f; + ver[1] = (version & 0xf0) >> 4; + ver[0] = version & 0x0f; version = reader.readUInt8(); - ver[2] = (version & 0xf0) >> 4; - ver[3] = version & 0x0f; + ver[3] = (version & 0xf0) >> 4; + ver[2] = version & 0x0f; version = reader.readUInt8(); - ver[4] = (version & 0xf0) >> 4; - ver[5] = version & 0x0f; + ver[5] = (version & 0xf0) >> 4; + ver[4] = version & 0x0f; version = reader.readUInt8(); - ver[6] = (version & 0xf0) >> 4; - ver[7] = version & 0x0f; + ver[7] = (version & 0xf0) >> 4; + ver[6] = version & 0x0f; const versions = ver.filter(v => v !== 0); if (versions.length === 0) { From 6a06e1ce41cdf76d7113ef58c3c77aebfa83660d Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 15 Nov 2025 18:17:45 +0000 Subject: [PATCH 5/5] Add code analysis documentation for remaining findings Added three documentation files summarizing code analysis results: 1. ANALYSIS-FIXES-APPLIED.md - Documents 4 critical fixes applied to the codebase - Provides detailed explanation of each issue and solution - Includes testing recommendations and impact assessment 2. ANALYSIS-RESOURCE-LEAKS.md - Catalogs 11 resource leak and memory management issues - Prioritized as HIGH (4), MEDIUM (5), Investigation (2) - Includes specific file locations and recommendations 3. ANALYSIS-INVESTIGATED-NON-ISSUES.md - Documents items that were investigated but are not bugs - Includes 2 confirmed correct behaviors (MDNS discriminator, window covering null) - Notes 1 item needing Matter spec clarification (message counter rollover) - Lists 5 false positives from initial analysis (safe due to JS single-threaded model) These documents provide a comprehensive record of the code analysis process and serve as a reference for future development and maintenance. --- ANALYSIS-FIXES-APPLIED.md | 331 ++++++++++++++++++++++++++++ ANALYSIS-INVESTIGATED-NON-ISSUES.md | 158 +++++++++++++ ANALYSIS-RESOURCE-LEAKS.md | 296 +++++++++++++++++++++++++ 3 files changed, 785 insertions(+) create mode 100644 ANALYSIS-FIXES-APPLIED.md create mode 100644 ANALYSIS-INVESTIGATED-NON-ISSUES.md create mode 100644 ANALYSIS-RESOURCE-LEAKS.md diff --git a/ANALYSIS-FIXES-APPLIED.md b/ANALYSIS-FIXES-APPLIED.md new file mode 100644 index 0000000000..8b3053c78c --- /dev/null +++ b/ANALYSIS-FIXES-APPLIED.md @@ -0,0 +1,331 @@ +# Code Analysis - Fixes Applied + +This document summarizes the critical fixes applied to the matter.js codebase to address async race conditions and logic errors. + +**Branch**: `claude/analyze-matterjs-logic-016rMAtuNNyDrjr189FaSaYh` +**Date**: 2025-11-15 +**Total Fixes**: 4 commits + +--- + +## Summary + +| Issue | Severity | File | Status | +|-------|----------|------|--------| +| 1.1 InteractionClient Provider Race | CRITICAL | `packages/protocol/src/interaction/InteractionClient.ts` | ✅ Fixed | +| 1.2 FabricManager Persistence | CRITICAL | `packages/protocol/src/fabric/FabricManager.ts` | ✅ Fixed | +| 1.3 NodeSession Destroy Race | CRITICAL | `packages/protocol/src/session/NodeSession.ts` | ✅ Fixed | +| 5.1 BTP Version Codec Bug | CRITICAL | `packages/protocol/src/codec/BtpCodec.ts` | ✅ Fixed | + +--- + +## Fix 1.1: InteractionClient Provider Race Condition + +**Commit**: `82d0dee` +**Severity**: CRITICAL - Async Race Condition +**Impact**: Memory leak, duplicate client instances + +### Problem + +The `getInteractionClient()` method had a check-then-create pattern with two await points between the initial check and client creation: + +```typescript +async getInteractionClient(address: PeerAddress, options = {}) { + let client = this.#clients.get(address); + if (client !== undefined) return client; // ← Check + + await nodeStore?.construction; // ← AWAIT 1: yields to event loop + const exchangeProvider = + await this.#peers.exchangeProviderFor(address, options); // ← AWAIT 2 + + client = new InteractionClient(...); + this.#clients.set(address, client); // ← Set (could overwrite!) + return client; +} +``` + +### Race Scenario + +1. Call A checks map → not found +2. Call B checks map → not found +3. Both await and yield to event loop +4. Call A creates InteractionClient, stores in map +5. Call B creates DIFFERENT InteractionClient, overwrites A's client +6. **Result**: Resource leak (A's client unreferenced), multiple instances for same peer + +### Solution + +Implemented double-checked locking pattern - check again after async operations: + +```typescript +async getInteractionClient(address: PeerAddress, options = {}) { + let client = this.#clients.get(address); + if (client !== undefined) return client; + + await nodeStore?.construction; + const exchangeProvider = await this.#peers.exchangeProviderFor(address, options); + + // ✅ Double-check after await points + const existingClient = this.#clients.get(address); + if (existingClient !== undefined) { + return existingClient; + } + + client = new InteractionClient(...); + this.#clients.set(address, client); + return client; +} +``` + +**File**: `packages/protocol/src/interaction/InteractionClient.ts:140-168` + +--- + +## Fix 1.2: FabricManager Persistence Desynchronization + +**Commit**: `7196550` +**Severity**: CRITICAL - Async State Inconsistency +**Impact**: Storage corruption, fabric index conflicts + +### Problem + +The `persistFabrics()` method captured fabric list and nextFabricIndex at different times with async gap between them: + +```typescript +persistFabrics(): MaybePromise { + const storeResult = this.#storage.set( + "fabrics", + Array.from(this.#fabrics.values()).map(fabric => fabric.config), // ← Snapshot 1 + ); + if (MaybePromise.is(storeResult)) { + return storeResult.then(() => + this.#storage!.set("nextFabricIndex", this.#nextFabricIndex) // ← Snapshot 2 + ); + } +} +``` + +### Race Scenario + +1. Method captures fabric list: `[fabric1, fabric2, fabric3]` +2. Starts persisting (async operation) +3. During promise chain, another operation adds fabric4 +4. `#nextFabricIndex` increments to 5 +5. Second storage operation persists nextFabricIndex=5 +6. **Result**: Storage has 3 fabrics but nextFabricIndex=5 (inconsistent!) + +On restart, index allocation would conflict or skip indices. + +### Solution + +Capture both values synchronously before any async operations: + +```typescript +persistFabrics(): MaybePromise { + // ✅ Capture both values atomically before async operations + const fabricConfigs = Array.from(this.#fabrics.values()).map(fabric => fabric.config); + const nextFabricIndex = this.#nextFabricIndex; + + const storeResult = this.#storage.set("fabrics", fabricConfigs); + if (MaybePromise.is(storeResult)) { + return storeResult.then(() => this.#storage!.set("nextFabricIndex", nextFabricIndex)); + } + return this.#storage.set("nextFabricIndex", nextFabricIndex); +} +``` + +This ensures a consistent snapshot even if state changes during async persistence. + +**File**: `packages/protocol/src/fabric/FabricManager.ts:141-159` + +--- + +## Fix 1.3: NodeSession Destroy During Event Processing + +**Commit**: `03676bd` +**Severity**: CRITICAL - Async State Corruption +**Impact**: Memory leak, subscription cleanup failure + +### Problem + +The `destroy()` method set the `#isClosing` flag AFTER the first await, allowing new operations during teardown: + +```typescript +async destroy(sendClose = false, closeAfterExchangeFinished = true, flushSubscriptions = false) { + await this.clearSubscriptions(flushSubscriptions); // ← AWAIT 1: event loop yields + this.#fabric?.removeSession(this); + + if (closeAfterExchangeFinished) { + this.#closingAfterExchangeFinished = true; + } else { + this.#isClosing = true; // ← Only set AFTER first await! + // ... more awaits ... + } +} +``` + +### Race Scenario + +1. `destroy()` called, starts clearing subscriptions +2. During await, incoming message creates new subscription +3. `clearSubscriptions()` completes (doesn't clear the new one) +4. `#isClosing` set to true +5. `destroyed.emit()` fires +6. **Result**: New subscription never cleaned up → memory leak + +### Solution + +Set `#isClosing` flag immediately before any awaits: + +```typescript +async destroy(sendClose = false, closeAfterExchangeFinished = true, flushSubscriptions = false) { + // ✅ Set closing flag BEFORE any awaits + if (!closeAfterExchangeFinished) { + this.#isClosing = true; + } + + await this.clearSubscriptions(flushSubscriptions); + // ... rest of cleanup ... +} +``` + +Code checking `isClosing` (SessionManager.ts:327, InteractionServer.ts:178) now correctly rejects new operations during teardown. + +**File**: `packages/protocol/src/session/NodeSession.ts:340-372` + +--- + +## Fix 5.1: BTP Version Codec Index Swapping + +**Commit**: `1dcc6b7` +**Severity**: CRITICAL - Logic Error +**Impact**: BLE version negotiation completely broken + +### Problem + +The BTP codec had mismatched array indices between encoding and decoding: + +**Encoding**: +```typescript +writer.writeUInt8((versions[1] << 4) | versions[0]); +// versions[1] → HIGH nibble (bits 7-4) +// versions[0] → LOW nibble (bits 3-0) +``` + +**Decoding (WRONG)**: +```typescript +ver[0] = (version & 0xf0) >> 4; // HIGH nibble → ver[0] ❌ +ver[1] = version & 0x0f; // LOW nibble → ver[1] ❌ +``` + +### Round-Trip Corruption + +``` +Input: versions = [4, 3, 2, 1, 0, 0, 0, 0] +Encode: byte 0 = (3 << 4) | 4 = 0x34 +Decode: ver[0] = 3, ver[1] = 4 → SWAPPED! +Result: versions = [3, 4, 2, 1, 0, 0, 0, 0] → WRONG +``` + +This would cause BLE commissioning to fail as device and controller disagree on protocol versions. + +### Solution + +Match decoding indices to encoding pattern: + +```typescript +ver[1] = (version & 0xf0) >> 4; // ✅ HIGH nibble → ver[1] +ver[0] = version & 0x0f; // ✅ LOW nibble → ver[0] +``` + +**Verification**: +``` +Input: versions = [4, 3, 2, 1, 0, 0, 0, 0] +Encode: byte 0 = (3 << 4) | 4 = 0x34 +Decode: ver[1] = 3, ver[0] = 4 +Result: versions = [4, 3, 2, 1, 0, 0, 0, 0] → CORRECT ✅ +``` + +**File**: `packages/protocol/src/codec/BtpCodec.ts:187-201` + +--- + +## Testing Recommendations + +### For InteractionClient Fix (1.1) +```typescript +// Test concurrent client creation for same peer +const address = PeerAddress({ ... }); +const [client1, client2] = await Promise.all([ + provider.getInteractionClient(address), + provider.getInteractionClient(address), +]); +assert(client1 === client2); // Should be same instance +``` + +### For FabricManager Fix (1.2) +```typescript +// Test persistence during concurrent fabric operations +async function testPersistence() { + const persist1 = fabricManager.persistFabrics(); + fabricManager.addFabric(newFabric); // Happens during persist + await persist1; + + const restored = await FabricManager.load(storage); + // Verify consistency between fabric count and nextFabricIndex +} +``` + +### For NodeSession Fix (1.3) +```typescript +// Test that operations are rejected during destroy +session.destroy(); +try { + session.addSubscription(...); // Should fail or be ignored +} catch (error) { + assert(error.message.includes('closing')); +} +``` + +### For BTP Codec Fix (5.1) +```typescript +// Test round-trip encoding/decoding +const versions = [4, 3, 2, 1, 0, 0, 0, 0]; +const encoded = BtpCodec.encodeBtpHandshakeRequest({ versions, ... }); +const decoded = BtpCodec.decodeBtpHandshakeRequest(encoded); +assert.deepEqual(decoded.versions, versions); // Should match exactly +``` + +--- + +## Related Documentation + +- `ANALYSIS-RESOURCE-LEAKS.md` - Remaining resource management issues for future work +- `ANALYSIS-INVESTIGATED-NON-ISSUES.md` - Items that were investigated but are not bugs + +--- + +## Impact Assessment + +### Before Fixes +- ❌ InteractionClient: Potential memory leaks in multi-peer scenarios +- ❌ FabricManager: Storage corruption possible after fabric operations +- ❌ NodeSession: Subscription leaks during session teardown +- ❌ BTP Codec: BLE commissioning broken (complete failure) + +### After Fixes +- ✅ InteractionClient: Single instance per peer guaranteed +- ✅ FabricManager: Atomic state snapshots prevent corruption +- ✅ NodeSession: Operations blocked during teardown +- ✅ BTP Codec: BLE version negotiation works correctly + +--- + +## Conclusion + +All critical async race conditions and the BTP codec logic error have been fixed. The fixes follow JavaScript best practices for handling async operations in a single-threaded event loop environment. + +**Next Steps**: +1. Review resource leak issues in `ANALYSIS-RESOURCE-LEAKS.md` +2. Add integration tests for async scenarios +3. Review message counter rollover logic against Matter spec (issue 5.2) diff --git a/ANALYSIS-INVESTIGATED-NON-ISSUES.md b/ANALYSIS-INVESTIGATED-NON-ISSUES.md new file mode 100644 index 0000000000..9fab735770 --- /dev/null +++ b/ANALYSIS-INVESTIGATED-NON-ISSUES.md @@ -0,0 +1,158 @@ +# Investigated Issues - Not Bugs or Needs Spec Clarification + +This document catalogs issues that were flagged during initial analysis but upon investigation were found to be either correct behavior or require Matter specification clarification. + +**Date**: 2025-11-15 + +--- + +## CONFIRMED NOT BUGS + +### Issue 5.3: MDNS Short Discriminator Bit Extraction + +**File**: `packages/protocol/src/mdns/MdnsClient.ts:1325` + +**Initial Concern**: Code extracts bits 11-8 of discriminator, which seemed counterintuitive for a "short" discriminator. + +**Code**: +```typescript +parsedRecord.SD = (parsedRecord.D >> 8) & 0x0f; +``` + +**Investigation**: Found consistent pattern across multiple files: +- `packages/react-native/src/ble/BleScanner.ts:104`: `SD: (discriminator >> 8) & 0x0f` +- `packages/protocol/src/advertisement/mdns/CommissionableMdnsAdvertisement.ts:46`: Same pattern +- `packages/nodejs-ble/src/BleScanner.ts:129`: Same pattern + +**Conclusion**: ✅ **NOT A BUG** +- This is the correct Matter protocol behavior +- Short discriminator is the upper 4 bits (bits 11-8) of the 12-bit discriminator +- Consistently implemented across all BLE and MDNS discovery code +- Matches how `getShortDiscriminatorQname()` is used throughout the codebase + +--- + +### Issue 5.4: Window Covering Null Assignment to Numeric Field + +**File**: `packages/node/src/behaviors/window-covering/WindowCoveringServer.ts:240, 258` + +**Initial Concern**: Assigning `null` to what appeared to be a numeric percentage field. + +**Code**: +```typescript +this.state.currentPositionLiftPercentage = + percent100ths === null ? percent100ths : Math.floor(percent100ths / WC_PERCENT100THS_COEFFICIENT); +``` + +**Investigation**: Checked cluster type definition: +- `packages/types/src/clusters/window-covering.ts:641`: + ```typescript + currentPositionLift: OptionalAttribute(0x3, TlvNullable(TlvUInt16), { + persistent: true, + default: null + }) + ``` + +**Conclusion**: ✅ **NOT A BUG** +- Field is explicitly typed as `TlvNullable(TlvUInt16)` +- Default value is `null` +- Null represents "position unknown" state, which is valid per Matter spec +- Pattern is intentional and correct +- Also seen in test code: `support/chip-testing/src/cluster/TestWindowCoveringServer.ts:108,123` + +--- + +## NEEDS MATTER SPECIFICATION CLARIFICATION + +### Issue 5.2: Message Counter Rollover Logic + +**File**: `packages/protocol/src/protocol/MessageReceptionState.ts:192-196` + +**Concern**: Logic for handling counter rollover appears counterintuitive. + +**Code**: +```typescript +if (diff > 0) { + // positive value means new message counter is larger than the current maximum + if (-diff + MAX_COUNTER_VALUE_32BIT < MSG_COUNTER_WINDOW_SIZE) { + return diff - MAX_COUNTER_VALUE_32BIT - 1; + } + return diff; +} +``` + +**Analysis**: +- When `diff > 0`, messageCounter > maximumMessageCounter (new is ahead) +- Line 194 checks: `-diff + MAX_COUNTER_VALUE_32BIT < MSG_COUNTER_WINDOW_SIZE` +- This is equivalent to: `MAX_COUNTER_VALUE_32BIT - diff < MSG_COUNTER_WINDOW_SIZE` +- The logic appears to detect a "backwards rollover" case but the math is unclear + +**Scenarios**: + +1. **Normal forward case**: + - max = 1000, new = 1010 + - diff = 10 (positive) + - Check: `MAX_COUNTER_VALUE_32BIT - 10 < 32` → false + - Returns: 10 ✅ Correct + +2. **Questionable case** (what line 194 is checking): + - max = 5, new = MAX_COUNTER_VALUE_32BIT - 5 (0xFFFFFFF9) + - diff = 0xFFFFFFF9 - 5 = very large positive (due to unsigned math) + - But in JavaScript, this would be handled as signed, giving large negative + - **Contradiction**: This scenario would actually have diff < 0, not diff > 0 + +**Questions for Matter Spec Review**: +1. Is there a valid scenario where diff > 0 AND represents a backwards rollover? +2. How should 32-bit counter rollover be handled in JavaScript (no native unsigned 32-bit)? +3. What is MAX_COUNTER_VALUE_32BIT (found to be 0xFFFFFFFE, not 0xFFFFFFFF)? + +**Recommendation**: +- ⚠️ Needs review against Matter Core Specification message counter requirements +- May be dead code that never executes +- Consider adding unit tests with rollover scenarios +- Document the expected behavior with comments + +**Status**: 🔍 **REQUIRES SPEC VERIFICATION** + +--- + +## FALSE POSITIVES (JavaScript Single-Threaded Corrections) + +The following were initially flagged as race conditions but are actually safe due to JavaScript's single-threaded event loop: + +### ❌ MessageCounter Increment (NOT A RACE) +- **File**: `packages/protocol/src/protocol/MessageCounter.ts:64` +- **Why Safe**: `this.messageCounter++` is synchronous and atomic in JavaScript's event loop +- **No fix needed**: ✅ + +### ❌ Exchange Counter (NOT A RACE) +- **File**: `packages/protocol/src/protocol/ExchangeManager.ts:494` +- **Why Safe**: Synchronous increment, no await between check and use +- **No fix needed**: ✅ + +### ❌ Fabric Index Allocation (NOT A RACE) +- **File**: `packages/protocol/src/fabric/FabricManager.ts:132` +- **Why Safe**: Synchronous loop through indices, no async boundaries +- **No fix needed**: ✅ + +### ❌ Session ID Allocation (NOT A RACE) +- **File**: `packages/protocol/src/session/SessionManager.ts:397-410` +- **Why Safe**: Synchronous getter with no await points +- **No fix needed**: ✅ + +### ❌ Datasource Version/Value Update (NOT A RACE) +- **File**: `packages/node/src/behavior/state/managed/Datasource.ts:344-354` +- **Why Safe**: Synchronous setter with synchronous onChange callbacks +- **No fix needed**: ✅ + +--- + +## Summary + +| Category | Count | Status | +|----------|-------|--------| +| **Confirmed Not Bugs** | 2 | Correct per Matter spec (5.3, 5.4) | +| **Needs Spec Review** | 1 | Unclear logic requiring specification verification (5.2) | +| **False Positives** | 5 | Safe in JavaScript's single-threaded model | + +**Total**: 8 items investigated and clarified diff --git a/ANALYSIS-RESOURCE-LEAKS.md b/ANALYSIS-RESOURCE-LEAKS.md new file mode 100644 index 0000000000..de9bcb594f --- /dev/null +++ b/ANALYSIS-RESOURCE-LEAKS.md @@ -0,0 +1,296 @@ +# Resource Leak and Memory Management Issues + +This document catalogs resource leak and memory management issues found during code analysis of matter.js. + +**Status**: For further investigation and prioritization +**Date**: 2025-11-15 + +--- + +## HIGH PRIORITY + +### 4.2 Unbounded Device Record Growth in MDNS Discovery + +**File**: `packages/protocol/src/mdns/MdnsClient.ts:1196-1201` + +**Issue**: Three maps grow unbounded as devices are discovered: +- `#operationalDeviceRecords` +- `#commissionableDeviceRecords` +- `#discoveredIpRecords` + +**Code**: +```typescript +this.#operationalDeviceRecords.set(matterName, device); +this.#commissionableDeviceRecords.set(...); +this.#discoveredIpRecords.set(...); +// No maximum size limits - maps grow unbounded +``` + +**Impact**: In long-running discovery scenarios, memory usage grows without bounds as new devices are discovered. + +**Recommendation**: +- Implement LRU cache with configurable maximum size +- Add periodic cleanup of stale/expired records +- Consider time-based expiration beyond just TTL + +--- + +### 4.3 Transport Listener Cleanup Incomplete + +**File**: `packages/protocol/src/protocol/ExchangeManager.ts:64, 442-483` + +**Issue**: Event listeners are added via `this.#listeners.set(netInterface, listener)` and registered on `transport.added`. During transport deletion, cleanup may race with active message processing. + +**Code**: +```typescript +// Line 444-467: Adding listener +this.#listeners.set( + netInterface, + netInterface.onData((socket, data) => { + // ... event handling ... + }), +); + +// Line 471-483: Deleting listener +#deleteListener(netInterface: ConnectionlessTransport) { + const listener = this.#listeners.get(netInterface); + if (listener === undefined) return; + this.#listeners.delete(netInterface); + + const closer = listener.close() + .catch(e => logger.error("Error closing network listener", e)) + .finally(() => this.#closers.delete(closer)); + this.#closers.add(closer); +} +``` + +**Impact**: If transport is deleted during active message processing, pending message handlers may access deleted resources. + +**Recommendation**: +- Ensure all pending operations complete before deleting transport +- Add reference counting for active message handlers + +--- + +### 4.4 MDNS Waiters Without Timers Not Resolved on Close + +**File**: `packages/protocol/src/mdns/MdnsClient.ts:840-841` + +**Issue**: During `close()`, only waiters with timers are resolved. Waiters without timers remain pending. + +**Code**: +```typescript +async close() { + this.#closing = true; + this.#observers.close(); + this.#periodicTimer.stop(); + this.#queryTimer?.stop(); + + // Only resolves waiters that HAVE timers! + [...this.#recordWaiters.keys()].forEach(queryId => + this.#finishWaiter(queryId, !!this.#recordWaiters.get(queryId)?.timer), + ); + // Missing: clear other collections +} +``` + +**Impact**: Promises that are waiting for MDNS responses never resolve, preventing proper cleanup and potentially blocking shutdown. + +**Recommendation**: +- Resolve/reject all waiters during close, regardless of timer presence +- Add timeout enforcement for all async operations + +--- + +## MEDIUM PRIORITY + +### 4.5 Session-Subscription Circular References + +**File**: `packages/protocol/src/session/SessionManager.ts:334-335` + +**Issue**: Circular references between Session → Observable → Subscription → Session may prevent garbage collection. + +**Code**: +```typescript +const subscriptionsChanged = (subscription: Subscription) => { + if (session.isClosing) return; + this.#subscriptionsChanged.emit(session, subscription); +}; + +session.subscriptions.added.on(subscriptionsChanged); // Session -> Observable +session.subscriptions.deleted.on(subscriptionsChanged); // Session -> Observable +// Observable -> Subscription -> Session (circular) +``` + +**Impact**: Sessions may not be garbage collected even after being closed, if subscription references remain. + +**Recommendation**: +- Use weak references where appropriate +- Explicitly clear all observers during session cleanup +- Add explicit disposal pattern for observables + +--- + +### 4.6 Listener Callback Outside Try Block Protection + +**File**: `packages/protocol/src/interaction/SubscriptionClient.ts:76-95` + +**Issue**: Messenger cleanup is in try-finally, but listener callback is outside, risking resource leak if listener throws. + +**Code**: +```typescript +async onNewExchange(exchange: MessageExchange) { + const messenger = new IncomingInteractionClientMessenger(exchange); + + let dataReport: DecodedDataReport; + try { + dataReport = await messenger.readAggregateDataReport(undefined, [...this.#listeners.keys()]); + } finally { + messenger.close().catch(error => logger.info("Error closing client messenger", error)); + } + + const listener = this.#listeners.get(subscriptionId); + await listener?.(dataReport); // Listener call OUTSIDE try block! +} +``` + +**Impact**: If listener throws an exception, messenger may not be fully cleaned up. + +**Recommendation**: +- Move listener call inside try block or add separate try-catch +- Ensure messenger cleanup happens regardless of listener errors + +--- + +### 4.7 Truncated Query Timer Cache Unbounded + +**File**: `packages/protocol/src/mdns/MdnsServer.ts:293-299` + +**Issue**: `#truncatedQueryCache` grows without size limits as queries are processed. + +**Code**: +```typescript +this.#truncatedQueryCache.set(key, { + message, + timer: Time.getTimer(`Truncated MDNS message ${key}`, Millis(400 + Math.floor(Math.random() * 100)), () => + this.#processTruncatedQuery(key), + ).start(), +}); +``` + +**Impact**: High-frequency MDNS queries could accumulate cache entries faster than they expire. + +**Recommendation**: +- Add maximum cache size with LRU eviction +- Monitor cache size in production + +--- + +### 4.8 Cached Records Not Cleared on Close + +**File**: `packages/protocol/src/mdns/MdnsServer.ts:220-228` + +**Issue**: `#recordsGenerator` is not cleared during close(), keeping service registrations in memory. + +**Code**: +```typescript +async close() { + this.#observers.close(); + await this.#records.close(); + for (const { timer } of this.#truncatedQueryCache.values()) { + timer.stop(); + } + this.#truncatedQueryCache.clear(); + this.#recordLastSentAsMulticastAnswer.clear(); + // Missing: this.#recordsGenerator.clear(); if needed +} +``` + +**Impact**: Memory leak if MDNS server is repeatedly started and stopped. + +**Recommendation**: +- Add explicit cleanup of all internal maps and caches +- Verify all resources are released during close + +--- + +### 4.9 Continuous Discovery Resources Leaked on Exception + +**File**: `packages/protocol/src/mdns/MdnsClient.ts:757-824` + +**Issue**: `criteria` object added to `targetCriteriaProviders` (line 792) may not be deleted if exception occurs before finally block. + +**Code**: +```typescript +async findCommissionableDevicesContinuously(...) { + // ... + this.targetCriteriaProviders.add(criteria); // Line 792: Added but... + + try { + // ... long loop ... + } finally { + this.targetCriteriaProviders.delete(criteria); // Only deleted in finally + } +} +``` + +**Impact**: If method is called repeatedly with exceptions, criteria objects accumulate in the set. + +**Recommendation**: +- Ensure finally block always executes (it should in JavaScript) +- Add defensive checks in finally block + +--- + +## INVESTIGATION NEEDED + +### 4.10 Event Listeners Accumulate on Subscription Observables + +**Files**: Multiple files in `packages/protocol/src/session/` + +**Issue**: Event listeners registered on subscription observables may accumulate if not properly removed. + +**Recommendation**: +- Audit all observer registration patterns +- Ensure observers are explicitly removed when subscriptions end +- Consider using WeakMap for automatic cleanup + +--- + +### 4.11 Untracked Standalone ACK Promise + +**File**: `packages/protocol/src/protocol/MessageExchange.ts:154-157` + +**Issue**: Promise from `sendStandaloneAckForMessage()` is not tracked, only has error handler. + +**Code**: +```typescript +#receivedMessageAckTimer = Time.getTimer("Ack receipt timeout", MRP.STANDALONE_ACK_TIMEOUT, () => { + if (this.#receivedMessageToAck !== undefined) { + const messageToAck = this.#receivedMessageToAck; + this.#receivedMessageToAck = undefined; + // TODO: We need to track this promise later (Line 154 comment!) + this.sendStandaloneAckForMessage(messageToAck).catch(error => + logger.error("An error happened when sending a standalone ack", error), + ); + } +}); +``` + +**Impact**: ACK sending failures are logged but not tracked for retry or cleanup. + +**Recommendation**: +- Add promise tracking as indicated by TODO +- Consider retry logic for failed ACKs + +--- + +## Summary Table + +| Priority | Count | Primary Issues | +|----------|-------|----------------| +| HIGH | 4 | Unbounded growth (4.2, 4.7), incomplete cleanup (4.3, 4.4) | +| MEDIUM | 5 | Circular refs (4.5), resource protection (4.6, 4.8, 4.9) | +| Investigation | 2 | Event listener patterns (4.10, 4.11) | + +**Total**: 11 resource management issues requiring attention