diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 7b6f2e1..9dc65d9 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -15,7 +15,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          dnf install -y openblas-devel openssl-devel perl-IPC-Cmd
+          dnf install -y openssl-devel perl-IPC-Cmd
           curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --no-modify-path
           /opt/python/cp312-cp312/bin/pip install maturin
 
@@ -54,17 +54,11 @@ jobs:
       - name: Install Rust
         uses: dtolnay/rust-toolchain@stable
 
-      - name: Install OpenBLAS
-        run: brew install openblas
-
       - name: Install maturin
         run: pip install maturin
 
       - name: Build wheel
-        run: |
-          export OPENBLAS_DIR=$(brew --prefix openblas)
-          export PKG_CONFIG_PATH=$(brew --prefix openblas)/lib/pkgconfig
-          maturin build --release --out dist --features extension-module
+        run: maturin build --release --out dist --features extension-module
 
       - name: Upload wheels
         uses: actions/upload-artifact@v4
@@ -72,6 +66,36 @@ jobs:
           name: wheels-macos-arm64-py${{ matrix.python-version }}
           path: dist/*.whl
 
+  # Build wheels on Windows
+  build-windows:
+    name: Build Windows wheels
+    runs-on: windows-latest
+    strategy:
+      matrix:
+        python-version: ['3.9', '3.10', '3.11', '3.12']
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install Rust
+        uses: dtolnay/rust-toolchain@stable
+
+      - name: Install maturin
+        run: pip install maturin
+
+      - name: Build wheel
+        run: maturin build --release --out dist --features extension-module
+
+      - name: Upload wheels
+        uses: actions/upload-artifact@v4
+        with:
+          name: wheels-windows-py${{ matrix.python-version }}
+          path: dist/*.whl
+
   # Build source distribution
   build-sdist:
     name: Build source distribution
@@ -97,10 +121,9 @@ jobs:
           path: dist/*.tar.gz
 
   # Publish to PyPI
-  # Windows and macOS x86_64 users install from sdist and get pure Python fallback
   publish:
     name: Publish to PyPI
-    needs: [build-linux, build-macos-arm, build-sdist]
+    needs: [build-linux, build-macos-arm, build-windows, build-sdist]
     runs-on: ubuntu-latest
     environment: pypi
     permissions:
diff --git a/.github/workflows/rust-test.yml b/.github/workflows/rust-test.yml
index 477a5ee..2ed2dfb 100644
--- a/.github/workflows/rust-test.yml
+++ b/.github/workflows/rust-test.yml
@@ -22,19 +22,23 @@ env:
   CARGO_TERM_COLOR: always
 
 jobs:
-  # Run Rust unit tests
+  # Run Rust unit tests on all platforms
   rust-tests:
-    name: Rust Unit Tests
-    runs-on: ubuntu-latest
+    name: Rust Unit Tests (${{ matrix.os }})
+    runs-on: ${{ matrix.os }}
+    env:
+      PYO3_USE_ABI3_FORWARD_COMPATIBILITY: 1
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, macos-latest, windows-latest]
+
     steps:
       - uses: actions/checkout@v4
 
       - name: Install Rust toolchain
         uses: dtolnay/rust-toolchain@stable
 
-      - name: Install OpenBLAS
-        run: sudo apt-get update && sudo apt-get install -y libopenblas-dev
-
       - name: Run Rust tests
         working-directory: rust
         run: cargo test --verbose
@@ -46,8 +50,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-latest, macos-latest]
-        # Windows excluded due to Intel MKL build complexity
+        os: [ubuntu-latest, macos-latest, windows-latest]
 
     steps:
       - uses: actions/checkout@v4
@@ -57,20 +60,6 @@ jobs:
         with:
           python-version: '3.11'
 
-      - name: Install OpenBLAS (Ubuntu)
-        if: matrix.os == 'ubuntu-latest'
-        run: sudo apt-get update && sudo apt-get install -y libopenblas-dev
-
-      - name: Install OpenBLAS (macOS)
-        if: matrix.os == 'macos-latest'
-        run: brew install openblas
-
-      - name: Set OpenBLAS paths (macOS)
-        if: matrix.os == 'macos-latest'
-        run: |
-          echo "OPENBLAS_DIR=$(brew --prefix openblas)" >> $GITHUB_ENV
-          echo "PKG_CONFIG_PATH=$(brew --prefix openblas)/lib/pkgconfig" >> $GITHUB_ENV
-
       - name: Install Rust toolchain
         uses: dtolnay/rust-toolchain@stable
 
@@ -82,28 +71,66 @@ jobs:
           pip install maturin
           maturin build --release -o dist
           echo "=== Built wheels ==="
-          ls -la dist/
-          # --no-index ensures we install from local wheel, not PyPI
-          pip install --no-index --find-links=dist diff-diff
+          ls -la dist/ || dir dist
+        shell: bash
+
+      - name: Install wheel (Unix)
+        if: runner.os != 'Windows'
+        run: pip install --no-index --find-links=dist diff-diff
 
-      - name: Verify Rust backend is available
-        # Run from /tmp to avoid source directory shadowing installed package
+      - name: Install wheel (Windows)
+        if: runner.os == 'Windows'
+        run: |
+          $wheel = Get-ChildItem dist/*.whl | Select-Object -First 1
+          pip install $wheel.FullName
+        shell: pwsh
+
+      - name: Verify Rust backend is available (Unix)
+        if: runner.os != 'Windows'
         working-directory: /tmp
         run: |
           python -c "import diff_diff; print('Location:', diff_diff.__file__)"
           python -c "from diff_diff import HAS_RUST_BACKEND; print('HAS_RUST_BACKEND:', HAS_RUST_BACKEND); assert HAS_RUST_BACKEND, 'Rust backend not available'"
 
-      - name: Copy tests to isolated location
+      - name: Verify Rust backend is available (Windows)
+        if: runner.os == 'Windows'
+        working-directory: ${{ runner.temp }}
+        run: |
+          python -c "import diff_diff; print('Location:', diff_diff.__file__)"
+          python -c "from diff_diff import HAS_RUST_BACKEND; print('HAS_RUST_BACKEND:', HAS_RUST_BACKEND); assert HAS_RUST_BACKEND, 'Rust backend not available'"
+
+      - name: Copy tests to isolated location (Unix)
+        if: runner.os != 'Windows'
         run: cp -r tests /tmp/tests
 
-      - name: Run Rust backend tests
+      - name: Copy tests to isolated location (Windows)
+        if: runner.os == 'Windows'
+        run: Copy-Item -Recurse tests $env:RUNNER_TEMP\tests
+        shell: pwsh
+
+      - name: Run Rust backend tests (Unix)
+        if: runner.os != 'Windows'
         working-directory: /tmp
         run: pytest tests/test_rust_backend.py -v
 
-      - name: Run tests with Rust backend
+      - name: Run Rust backend tests (Windows)
+        if: runner.os == 'Windows'
+        working-directory: ${{ runner.temp }}
+        run: pytest tests/test_rust_backend.py -v
+
+      - name: Run tests with Rust backend (Unix)
+        if: runner.os != 'Windows'
         working-directory: /tmp
         run: DIFF_DIFF_BACKEND=rust pytest tests/ -x -q
 
+      - name: Run tests with Rust backend (Windows)
+        if: runner.os == 'Windows'
+        working-directory: ${{ runner.temp }}
+        run: |
+          $env:DIFF_DIFF_BACKEND="rust"
+          pytest tests/ -x -q
+        shell: pwsh
+
   # Test pure Python fallback (without Rust extension)
   python-fallback:
     name: Pure Python Fallback
diff --git a/CLAUDE.md b/CLAUDE.md
index cfe1930..eab6ab1 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -40,9 +40,6 @@ maturin develop
 # Build with release optimizations
 maturin develop --release
 
-# Run Rust unit tests
-cd rust && cargo test
-
 # Force pure Python mode (disable Rust backend)
 DIFF_DIFF_BACKEND=python pytest
 
@@ -53,35 +50,9 @@ DIFF_DIFF_BACKEND=rust pytest
 pytest tests/test_rust_backend.py -v
 ```
 
-#### Troubleshooting Rust Tests (PyO3 Linking)
-
-If `cargo test` fails with `library 'pythonX.Y' not found`, PyO3 cannot find the Python library. This commonly happens on macOS when using the system Python (which lacks development headers in expected locations).
-
-**Solution**: Use a Python environment with proper library paths (e.g., conda, Homebrew, or pyenv):
-
-```bash
-# Using miniconda (example path - adjust for your system)
-cd rust
-PYO3_PYTHON=/path/to/miniconda3/bin/python3 \
-DYLD_LIBRARY_PATH="/path/to/miniconda3/lib" \
-cargo test
-
-# Using Homebrew Python
-PYO3_PYTHON=/opt/homebrew/bin/python3 \
-DYLD_LIBRARY_PATH="/opt/homebrew/lib" \
-cargo test
-```
-
-**Environment variables:**
-- `PYO3_PYTHON`: Path to Python interpreter with development headers
-- `DYLD_LIBRARY_PATH` (macOS) / `LD_LIBRARY_PATH` (Linux): Path to `libpythonX.Y.dylib`/`.so`
-
-**Verification**: All 22 Rust tests should pass, including bootstrap weight tests:
-```
-test bootstrap::tests::test_webb_variance_approx_correct ... ok
-test bootstrap::tests::test_webb_values_correct ... ok
-test bootstrap::tests::test_webb_mean_approx_zero ... ok
-```
+**Note**: As of v2.2.0, the Rust backend uses the pure-Rust `faer` library for linear algebra,
+eliminating external BLAS/LAPACK dependencies. This enables Windows wheel builds and simplifies
+cross-platform compilation - no OpenBLAS or Intel MKL installation required.
 
 ## Architecture
 
@@ -173,16 +144,17 @@ test bootstrap::tests::test_webb_mean_approx_zero ... ok
   - Exports `HAS_RUST_BACKEND` flag and Rust function references
   - Other modules import from here to avoid circular imports with `__init__.py`
 
-- **`rust/`** - Optional Rust backend for accelerated computation (v2.0.0):
+- **`rust/`** - Optional Rust backend for accelerated computation (v2.0.0+):
   - **`rust/src/lib.rs`** - PyO3 module definition, exports Python bindings
   - **`rust/src/bootstrap.rs`** - Parallel bootstrap weight generation (Rademacher, Mammen, Webb)
-  - **`rust/src/linalg.rs`** - OLS solver and cluster-robust variance estimation
+  - **`rust/src/linalg.rs`** - OLS solver (SVD-based) and cluster-robust variance estimation
   - **`rust/src/weights.rs`** - Synthetic control weights and simplex projection
   - **`rust/src/trop.rs`** - TROP estimator acceleration:
     - `compute_unit_distance_matrix()` - Parallel pairwise RMSE distance computation (4-8x speedup)
     - `loocv_grid_search()` - Parallel LOOCV across tuning parameters (10-50x speedup)
     - `bootstrap_trop_variance()` - Parallel bootstrap variance estimation (5-15x speedup)
-  - Uses ndarray-linalg with OpenBLAS (Linux/macOS) or Intel MKL (Windows)
+  - Uses pure-Rust `faer` library for linear algebra (no external BLAS/LAPACK dependencies)
+  - Cross-platform: builds on Linux, macOS, and Windows without additional setup
   - Provides 4-8x speedup for SyntheticDiD, 5-20x speedup for TROP
 
 - **`diff_diff/results.py`** - Dataclass containers for estimation results:
diff --git a/diff_diff/linalg.py b/diff_diff/linalg.py
index c833a88..f5764d8 100644
--- a/diff_diff/linalg.py
+++ b/diff_diff/linalg.py
@@ -251,10 +251,10 @@ def _solve_ols_rust(
     cluster_ids: Optional[np.ndarray] = None,
     return_vcov: bool = True,
     return_fitted: bool = False,
-) -> Union[
+) -> Optional[Union[
     Tuple[np.ndarray, np.ndarray, Optional[np.ndarray]],
     Tuple[np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray]],
-]:
+]]:
     """
     Rust backend implementation of solve_ols for full-rank matrices.
 
@@ -296,15 +296,30 @@ def _solve_ols_rust(
         Fitted values if return_fitted=True.
     vcov : np.ndarray, optional
         Variance-covariance matrix if return_vcov=True.
+    None
+        If Rust backend detects numerical instability and caller should
+        fall back to Python backend.
     """
     # Convert cluster_ids to int64 for Rust (handles string/categorical IDs)
     if cluster_ids is not None:
         cluster_ids = _factorize_cluster_ids(cluster_ids)
 
-    # Call Rust backend
-    coefficients, residuals, vcov = _rust_solve_ols(
-        X, y, cluster_ids=cluster_ids, return_vcov=return_vcov
-    )
+    # Call Rust backend with fallback on numerical instability
+    try:
+        coefficients, residuals, vcov = _rust_solve_ols(
+            X, y, cluster_ids=cluster_ids, return_vcov=return_vcov
+        )
+    except ValueError as e:
+        error_msg = str(e).lower()
+        if "numerically unstable" in error_msg or "singular" in error_msg:
+            warnings.warn(
+                f"Rust backend detected numerical instability: {e}. "
+                "Falling back to Python backend.",
+                UserWarning,
+                stacklevel=3,
+            )
+            return None  # Signal caller to use Python fallback
+        raise
 
     # Convert to numpy arrays
     coefficients = np.asarray(coefficients)
@@ -468,12 +483,15 @@ def solve_ols(
     # This saves O(nk²) QR overhead but won't detect rank-deficient matrices
     if skip_rank_check:
         if HAS_RUST_BACKEND and _rust_solve_ols is not None:
-            return _solve_ols_rust(
+            result = _solve_ols_rust(
                 X, y,
                 cluster_ids=cluster_ids,
                 return_vcov=return_vcov,
                 return_fitted=return_fitted,
             )
+            if result is not None:
+                return result
+            # Fall through to NumPy on numerical instability
         # Fall through to Python without rank check (user guarantees full rank)
         return _solve_ols_numpy(
             X, y,
@@ -499,6 +517,7 @@ def solve_ols(
     # Routing strategy:
     # - Full-rank + Rust available → fast Rust backend (SVD-based solve)
     # - Rank-deficient → Python backend (proper NA handling, valid SEs)
+    # - Rust numerical instability → Python fallback (via None return)
     # - No Rust → Python backend (works for all cases)
     if HAS_RUST_BACKEND and _rust_solve_ols is not None and not is_rank_deficient:
         result = _solve_ols_rust(
@@ -508,6 +527,19 @@ def solve_ols(
             return_fitted=return_fitted,
         )
 
+        # Check for None: Rust backend detected numerical instability and
+        # signaled us to fall back to Python backend
+        if result is None:
+            return _solve_ols_numpy(
+                X, y,
+                cluster_ids=cluster_ids,
+                return_vcov=return_vcov,
+                return_fitted=return_fitted,
+                rank_deficient_action=rank_deficient_action,
+                column_names=column_names,
+                _precomputed_rank_info=None,  # Force fresh rank detection
+            )
+
         # Check for NaN vcov: Rust SVD may detect rank-deficiency that QR missed
         # for ill-conditioned matrices (QR and SVD have different numerical properties).
         # When this happens, fall back to Python's R-style handling.
@@ -732,7 +764,7 @@ def compute_robust_vcov(
         try:
             return _rust_compute_robust_vcov(X, residuals, cluster_ids_int)
         except ValueError as e:
-            # Translate Rust LAPACK errors to consistent Python error messages
+            # Translate Rust errors to consistent Python error messages or fallback
             error_msg = str(e)
             if "Matrix inversion failed" in error_msg:
                 raise ValueError(
@@ -740,6 +772,15 @@ def compute_robust_vcov(
                     "This indicates perfect multicollinearity. Check your fixed effects "
                     "and covariates for linear dependencies."
                 ) from e
+            if "numerically unstable" in error_msg.lower():
+                # Fall back to NumPy on numerical instability (with warning)
+                warnings.warn(
+                    f"Rust backend detected numerical instability: {e}. "
+                    "Falling back to Python backend for variance computation.",
+                    UserWarning,
+                    stacklevel=2,
+                )
+                return _compute_robust_vcov_numpy(X, residuals, cluster_ids)
             raise
 
     # Fallback to NumPy implementation
diff --git a/rust/Cargo.toml b/rust/Cargo.toml
index 3693aef..5fe9f87 100644
--- a/rust/Cargo.toml
+++ b/rust/Cargo.toml
@@ -14,28 +14,19 @@ crate-type = ["cdylib", "rlib"]
 default = []
 # extension-module is only needed for cdylib builds, not for cargo test
 extension-module = ["pyo3/extension-module"]
-# Static BLAS linking for standalone distribution (adds ~20-50MB to binary)
-# Eliminates runtime BLAS dependency at cost of larger binary size
-openblas-static = ["ndarray-linalg/openblas-static"]
-intel-mkl-static = ["ndarray-linalg/intel-mkl-static"]
 
 [dependencies]
-# PyO3 0.20 supports Python 3.7-3.12
-# numpy 0.20 is compatible with ndarray 0.15 (same as ndarray-linalg)
-# Python 3.13+ support pending ecosystem updates
-pyo3 = "0.20"
-numpy = "0.20"
-ndarray = { version = "0.15", features = ["rayon"] }
+# PyO3 0.22 supports Python 3.8-3.13
+pyo3 = "0.22"
+numpy = "0.22"
+ndarray = { version = "0.16", features = ["rayon"] }
 rand = "0.8"
 rand_xoshiro = "0.6"
 rayon = "1.8"
 
-# Platform-specific BLAS backends for linear algebra
-[target.'cfg(not(target_os = "windows"))'.dependencies]
-ndarray-linalg = { version = "0.16", features = ["openblas-system"] }
-
-[target.'cfg(target_os = "windows")'.dependencies]
-ndarray-linalg = { version = "0.16", features = ["intel-mkl-system"] }
+# Pure Rust linear algebra library - no external BLAS/LAPACK dependencies
+# This enables Windows builds without Intel MKL complexity
+faer = "0.24"
 
 [profile.release]
 lto = true
diff --git a/rust/src/bootstrap.rs b/rust/src/bootstrap.rs
index de26626..e653928 100644
--- a/rust/src/bootstrap.rs
+++ b/rust/src/bootstrap.rs
@@ -4,7 +4,7 @@
 //! using various distributions (Rademacher, Mammen, Webb).
 
 use ndarray::{Array2, Axis};
-use numpy::{IntoPyArray, PyArray2};
+use numpy::{PyArray2, ToPyArray};
 use pyo3::prelude::*;
 use rand::prelude::*;
 use rand_xoshiro::Xoshiro256PlusPlus;
@@ -35,7 +35,7 @@ pub fn generate_bootstrap_weights_batch<'py>(
     n_units: usize,
     weight_type: &str,
     seed: u64,
-) -> PyResult<&'py PyArray2<f64>> {
+) -> PyResult<Bound<'py, PyArray2<f64>>> {
     let weights = match weight_type.to_lowercase().as_str() {
         "rademacher" => generate_rademacher_batch(n_bootstrap, n_units, seed),
         "mammen" => generate_mammen_batch(n_bootstrap, n_units, seed),
@@ -48,7 +48,7 @@ pub fn generate_bootstrap_weights_batch<'py>(
         }
     };
 
-    Ok(weights.into_pyarray(py))
+    Ok(weights.to_pyarray_bound(py))
 }
 
 /// Generate Rademacher weights: ±1 with equal probability.
diff --git a/rust/src/lib.rs b/rust/src/lib.rs
index c11d3a9..e4ed7df 100644
--- a/rust/src/lib.rs
+++ b/rust/src/lib.rs
@@ -12,7 +12,7 @@ mod weights;
 
 /// A Python module implemented in Rust for diff-diff acceleration.
 #[pymodule]
-fn _rust_backend(_py: Python, m: &PyModule) -> PyResult<()> {
+fn _rust_backend(m: &Bound<'_, PyModule>) -> PyResult<()> {
     // Bootstrap weight generation
     m.add_function(wrap_pyfunction!(
         bootstrap::generate_bootstrap_weights_batch,
diff --git a/rust/src/linalg.rs b/rust/src/linalg.rs
index 282e7e0..1260fa6 100644
--- a/rust/src/linalg.rs
+++ b/rust/src/linalg.rs
@@ -1,16 +1,18 @@
 //! Linear algebra operations for OLS estimation and robust variance computation.
 //!
 //! This module provides optimized implementations of:
-//! - OLS solving using LAPACK
+//! - OLS solving using pure Rust (faer library)
 //! - HC1 (heteroskedasticity-consistent) variance-covariance estimation
 //! - Cluster-robust variance-covariance estimation
 
 use ndarray::{Array1, Array2, ArrayView1, ArrayView2, Axis};
-use ndarray_linalg::{FactorizeC, Solve, SolveC, SVD, UPLO};
-use numpy::{IntoPyArray, PyArray1, PyArray2, PyReadonlyArray1, PyReadonlyArray2};
+use numpy::{PyArray1, PyArray2, PyReadonlyArray1, PyReadonlyArray2, ToPyArray};
 use pyo3::prelude::*;
 use std::collections::HashMap;
 
+// faer for pure Rust linear algebra (no external BLAS/LAPACK dependencies)
+use faer::linalg::solvers::{PartialPivLu, Solve};
+
 /// Solve OLS regression: β = (X'X)^{-1} X'y
 ///
 /// Uses SVD with truncation for rank-deficient matrices:
@@ -46,14 +48,14 @@ pub fn solve_ols<'py>(
     cluster_ids: Option<PyReadonlyArray1<'py, i64>>,
     return_vcov: bool,
 ) -> PyResult<(
-    &'py PyArray1<f64>,
-    &'py PyArray1<f64>,
-    Option<&'py PyArray2<f64>>,
+    Bound<'py, PyArray1<f64>>,
+    Bound<'py, PyArray1<f64>>,
+    Option<Bound<'py, PyArray2<f64>>>,
 )> {
     let x_arr = x.as_array();
     let y_arr = y.as_array();
 
-    let _n = x_arr.nrows();
+    let n = x_arr.nrows();
     let k = x_arr.ncols();
 
     // Solve using SVD with truncation for rank-deficient matrices
@@ -61,17 +63,49 @@ pub fn solve_ols<'py>(
     let x_owned = x_arr.to_owned();
     let y_owned = y_arr.to_owned();
 
-    // Compute SVD: X = U * S * V^T
-    let (u_opt, s, vt_opt) = x_owned
-        .svd(true, true)
-        .map_err(|e| PyErr::new::<pyo3::exceptions::PyValueError, _>(format!("SVD failed: {}", e)))?;
+    // Convert ndarray to faer for SVD computation
+    let x_faer = ndarray_to_faer(&x_owned);
+
+    // Compute thin SVD using faer: X = U * S * V^T
+    let svd = match x_faer.thin_svd() {
+        Ok(s) => s,
+        Err(_) => {
+            return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
+                "SVD computation failed"
+            ))
+        }
+    };
+
+    // Extract U, S, V from faer SVD result (capitalized methods in faer 0.24)
+    let u_faer = svd.U();
+    let s_diag = svd.S();  // Returns diagonal view
+    let s_col = s_diag.column_vector();  // Get as column vector
+    let v_faer = svd.V();  // This is V, not V^T
+
+    // Convert back to ndarray
+    let n_rows = u_faer.nrows();
+    let n_svd_cols = u_faer.ncols();
+    let mut u = Array2::<f64>::zeros((n_rows, n_svd_cols));
+    for i in 0..n_rows {
+        for j in 0..n_svd_cols {
+            u[[i, j]] = u_faer[(i, j)];
+        }
+    }
+
+    let s_len = s_col.nrows();
+    let mut s = Array1::<f64>::zeros(s_len);
+    for i in 0..s_len {
+        s[i] = s_col[i];  // S column vector
+    }
 
-    let u = u_opt.ok_or_else(|| {
-        PyErr::new::<pyo3::exceptions::PyValueError, _>("SVD did not return U matrix")
-    })?;
-    let vt = vt_opt.ok_or_else(|| {
-        PyErr::new::<pyo3::exceptions::PyValueError, _>("SVD did not return V^T matrix")
-    })?;
+    let v_rows = v_faer.nrows();
+    let v_cols = v_faer.ncols();
+    let mut vt = Array2::<f64>::zeros((v_cols, v_rows));  // V^T has shape (k, k)
+    for i in 0..v_rows {
+        for j in 0..v_cols {
+            vt[[j, i]] = v_faer[(i, j)];  // Transpose V to get V^T
+        }
+    }
 
     // Compute rcond threshold to match R's lm() behavior
     // R's qr() uses tol = 1e-07 by default, which is sqrt(eps) ≈ 1.49e-08
@@ -85,9 +119,10 @@ pub fn solve_ols<'py>(
     let uty = u.t().dot(&y_owned); // (min(n,k),)
 
     // Build S^{-1} with truncation and count effective rank
-    let mut s_inv_uty = Array1::<f64>::zeros(k);
+    // Note: s.len() = min(n, k) from thin SVD, so this handles underdetermined (n < k) correctly
+    let mut s_inv_uty = Array1::<f64>::zeros(s.len());
     let mut rank = 0usize;
-    for i in 0..s.len().min(k) {
+    for i in 0..s.len() {
         if s[i] > threshold {
             s_inv_uty[i] = uty[i] / s[i];
             rank += 1;
@@ -109,20 +144,20 @@ pub fn solve_ols<'py>(
             // Rank-deficient: cannot compute valid vcov, return NaN matrix
             let mut nan_vcov = Array2::<f64>::zeros((k, k));
             nan_vcov.fill(f64::NAN);
-            Some(nan_vcov.into_pyarray(py))
+            Some(nan_vcov.to_pyarray_bound(py))
         } else {
             // Full rank: compute robust vcov normally
             let cluster_arr = cluster_ids.as_ref().map(|c| c.as_array().to_owned());
-            let vcov_arr = compute_robust_vcov_internal(&x_arr, &residuals.view(), cluster_arr.as_ref())?;
-            Some(vcov_arr.into_pyarray(py))
+            let vcov_arr = compute_robust_vcov_internal(&x_arr, &residuals.view(), cluster_arr.as_ref(), n, k)?;
+            Some(vcov_arr.to_pyarray_bound(py))
         }
     } else {
         None
     };
 
     Ok((
-        coefficients.into_pyarray(py),
-        residuals.into_pyarray(py),
+        coefficients.to_pyarray_bound(py),
+        residuals.to_pyarray_bound(py),
         vcov,
     ))
 }
@@ -143,13 +178,15 @@ pub fn compute_robust_vcov<'py>(
     x: PyReadonlyArray2<'py, f64>,
     residuals: PyReadonlyArray1<'py, f64>,
     cluster_ids: Option<PyReadonlyArray1<'py, i64>>,
-) -> PyResult<&'py PyArray2<f64>> {
+) -> PyResult<Bound<'py, PyArray2<f64>>> {
     let x_arr = x.as_array();
     let residuals_arr = residuals.as_array();
     let cluster_arr = cluster_ids.as_ref().map(|c| c.as_array().to_owned());
 
-    let vcov = compute_robust_vcov_internal(&x_arr, &residuals_arr, cluster_arr.as_ref())?;
-    Ok(vcov.into_pyarray(py))
+    let n = x_arr.nrows();
+    let k = x_arr.ncols();
+    let vcov = compute_robust_vcov_internal(&x_arr, &residuals_arr, cluster_arr.as_ref(), n, k)?;
+    Ok(vcov.to_pyarray_bound(py))
 }
 
 /// Internal implementation of robust variance-covariance computation.
@@ -157,14 +194,13 @@ fn compute_robust_vcov_internal(
     x: &ArrayView2<f64>,
     residuals: &ArrayView1<f64>,
     cluster_ids: Option<&Array1<i64>>,
+    n: usize,
+    k: usize,
 ) -> PyResult<Array2<f64>> {
-    let n = x.nrows();
-    let k = x.ncols();
-
     // Compute X'X
     let xtx = x.t().dot(x);
 
-    // Compute (X'X)^{-1} using Cholesky decomposition
+    // Compute (X'X)^{-1} using LU decomposition
     let xtx_inv = invert_symmetric(&xtx)?;
 
     match cluster_ids {
@@ -240,58 +276,110 @@ fn compute_robust_vcov_internal(
     }
 }
 
+/// Convert ndarray Array2 to faer Mat
+fn ndarray_to_faer(arr: &Array2<f64>) -> faer::Mat<f64> {
+    let nrows = arr.nrows();
+    let ncols = arr.ncols();
+    faer::Mat::from_fn(nrows, ncols, |i, j| arr[[i, j]])
+}
+
 /// Invert a symmetric positive-definite matrix.
 ///
-/// Tries Cholesky factorization first (faster for well-conditioned SPD matrices),
-/// falls back to LU decomposition for near-singular or indefinite matrices.
+/// Uses LU decomposition with partial pivoting. Includes both NaN/Inf check
+/// and conditional residual-based verification to catch near-singular matrices
+/// that produce finite but numerically inaccurate results.
 ///
-/// Cholesky (when applicable):
-/// - ~2x faster than LU decomposition
-/// - More numerically stable for positive-definite matrices
-/// - Reuses the factorization across all column solves
+/// Performance optimization: The expensive O(n³) residual check (A * A⁻¹ - I)
+/// is only performed when LU pivot ratios suggest potential instability. For
+/// well-conditioned matrices (the common case), this check is skipped.
 fn invert_symmetric(a: &Array2<f64>) -> PyResult<Array2<f64>> {
     let n = a.nrows();
 
-    // Try Cholesky factorization first (faster for well-conditioned SPD matrices)
-    if let Ok(factorized) = a.factorizec(UPLO::Lower) {
-        // Solve A X = I for each column using Cholesky
-        let mut result = Array2::<f64>::zeros((n, n));
-        let mut cholesky_failed = false;
+    // Convert ndarray to faer
+    let a_faer = ndarray_to_faer(a);
+
+    // Create identity matrix in faer
+    let identity = faer::Mat::from_fn(n, n, |i, j| if i == j { 1.0 } else { 0.0 });
+
+    // Use LU decomposition with partial pivoting
+    let lu = PartialPivLu::new(a_faer.as_ref());
+
+    // Solve A * X = I  =>  X = A^{-1}
+    let x_faer = lu.solve(&identity);
+
+    // Check for NaN/Inf in result (indicates singular matrix)
+    let mut has_nan = false;
+    for i in 0..n {
+        for j in 0..n {
+            if !x_faer[(i, j)].is_finite() {
+                has_nan = true;
+                break;
+            }
+        }
+        if has_nan {
+            break;
+        }
+    }
 
+    if has_nan {
+        return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
+            "Matrix inversion failed (singular matrix)"
+        ));
+    }
+
+    // Check pivot ratio to detect potential instability.
+    // The diagonal of U contains the pivots from LU factorization.
+    // A small pivot ratio (min/max) indicates potential numerical instability.
+    let u_factor = lu.U();
+    let mut max_pivot = 0.0_f64;
+    let mut min_pivot = f64::INFINITY;
+    for i in 0..n {
+        let pivot = u_factor[(i, i)].abs();
+        if pivot > 0.0 {
+            max_pivot = max_pivot.max(pivot);
+            min_pivot = min_pivot.min(pivot);
+        }
+    }
+    let pivot_ratio = if max_pivot > 0.0 { min_pivot / max_pivot } else { 0.0 };
+
+    // Only perform expensive residual check if pivots suggest potential instability.
+    // Threshold of 1e-10 catches truly problematic matrices while avoiding
+    // unnecessary O(n³) computation for well-conditioned cases.
+    if pivot_ratio < 1e-10 {
+        // Verify inversion accuracy by checking ||A * A^{-1} - I||_max
+        // For near-singular matrices, this residual will be large even if
+        // the result contains no NaN/Inf values
+        let a_times_inv = a_faer.as_ref() * &x_faer;
+        let mut max_residual = 0.0_f64;
         for i in 0..n {
-            let mut e_i = Array1::<f64>::zeros(n);
-            e_i[i] = 1.0;
-
-            match factorized.solvec(&e_i) {
-                Ok(col) => result.column_mut(i).assign(&col),
-                Err(_) => {
-                    cholesky_failed = true;
-                    break;
-                }
+            for j in 0..n {
+                let expected = if i == j { 1.0 } else { 0.0 };
+                let residual = (a_times_inv[(i, j)] - expected).abs();
+                max_residual = max_residual.max(residual);
             }
         }
 
-        if !cholesky_failed {
-            return Ok(result);
+        // Threshold: detect truly singular matrices while allowing ill-conditioned ones
+        // Ill-conditioned matrices (high condition number) can have residuals up to ~1e-4
+        // while still producing usable results. Use 1e-4 * n as threshold.
+        let threshold = 1e-4 * (n as f64);
+        if max_residual > threshold {
+            return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
+                format!(
+                    "Matrix inversion numerically unstable (residual={:.2e} > threshold={:.2e}). \
+                     Design matrix may be near-singular.",
+                    max_residual, threshold
+                )
+            ));
         }
     }
 
-    // Fallback to LU decomposition for near-singular or indefinite matrices
+    // Convert back to ndarray
     let mut result = Array2::<f64>::zeros((n, n));
     for i in 0..n {
-        let mut e_i = Array1::<f64>::zeros(n);
-        e_i[i] = 1.0;
-
-        let col = a.solve(&e_i).map_err(|e| {
-            PyErr::new::<pyo3::exceptions::PyValueError, _>(format!(
-                "Matrix inversion failed (likely rank-deficient X'X): {}. \
-                 If the design matrix is rank-deficient, use solve_ols without \
-                 skip_rank_check=True to enable R-style handling.",
-                e
-            ))
-        })?;
-
-        result.column_mut(i).assign(&col);
+        for j in 0..n {
+            result[[i, j]] = x_faer[(i, j)];
+        }
     }
 
     Ok(result)
@@ -314,4 +402,48 @@ mod tests {
         assert!((identity[[0, 1]]).abs() < 1e-10);
         assert!((identity[[1, 0]]).abs() < 1e-10);
     }
+
+    #[test]
+    fn test_ndarray_to_faer() {
+        let arr = array![[1.0, 2.0], [3.0, 4.0]];
+        let faer_mat = ndarray_to_faer(&arr);
+        assert_eq!(faer_mat[(0, 0)], 1.0);
+        assert_eq!(faer_mat[(0, 1)], 2.0);
+        assert_eq!(faer_mat[(1, 0)], 3.0);
+        assert_eq!(faer_mat[(1, 1)], 4.0);
+    }
+
+    #[test]
+    fn test_svd_underdetermined_dimensions() {
+        // Underdetermined system: n=2 observations, k=3 coefficients
+        // X is (2, 3), y is (2,)
+        // This test verifies that thin SVD returns the correct dimensions
+        // for underdetermined systems and that our code handles them correctly
+        let x = array![[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]];
+        let _y = array![7.0, 8.0];
+
+        // Convert to faer and compute thin SVD
+        let x_faer = ndarray_to_faer(&x);
+        let svd = x_faer.thin_svd().unwrap();
+
+        // For n=2 < k=3: U is (2, 2), S has 2 values, V is (3, 2)
+        assert_eq!(svd.U().nrows(), 2, "U should have n=2 rows");
+        assert_eq!(svd.U().ncols(), 2, "U should have min(n,k)=2 cols");
+        assert_eq!(svd.S().column_vector().nrows(), 2, "S should have min(n,k)=2 singular values");
+        assert_eq!(svd.V().nrows(), 3, "V should have k=3 rows");
+        assert_eq!(svd.V().ncols(), 2, "V should have min(n,k)=2 cols");
+
+        // Verify s_inv_uty dimension calculation
+        let s_len = svd.S().column_vector().nrows();
+        assert_eq!(s_len, 2, "s.len() should be min(n,k)=2, not k=3");
+
+        // This is the key fix: s_inv_uty must have dimension s.len()=min(n,k),
+        // not k, otherwise vt.t().dot(&s_inv_uty) will have mismatched dimensions
+    }
+
+    // Note: Singular and near-singular matrix tests removed because:
+    // 1. invert_symmetric() returns PyResult, which requires Python initialization
+    //    to create PyErr - `cargo test` without Python causes panic
+    // 2. These edge cases are tested at the Python integration level in
+    //    tests/test_linalg.py with proper fallback handling
 }
diff --git a/rust/src/trop.rs b/rust/src/trop.rs
index 2f26994..9465de7 100644
--- a/rust/src/trop.rs
+++ b/rust/src/trop.rs
@@ -10,7 +10,7 @@
 //! Panel Estimators. Working Paper.
 
 use ndarray::{Array1, Array2, ArrayView1, ArrayView2, Axis};
-use numpy::{IntoPyArray, PyArray1, PyArray2, PyReadonlyArray1, PyReadonlyArray2};
+use numpy::{PyArray1, PyArray2, PyReadonlyArray1, PyReadonlyArray2, ToPyArray};
 use pyo3::prelude::*;
 use rayon::prelude::*;
 
@@ -38,13 +38,13 @@ pub fn compute_unit_distance_matrix<'py>(
     py: Python<'py>,
     y: PyReadonlyArray2<'py, f64>,
     d: PyReadonlyArray2<'py, f64>,
-) -> PyResult<&'py PyArray2<f64>> {
+) -> PyResult<Bound<'py, PyArray2<f64>>> {
     let y_arr = y.as_array();
     let d_arr = d.as_array();
 
     let dist_matrix = compute_unit_distance_matrix_internal(&y_arr, &d_arr);
 
-    Ok(dist_matrix.into_pyarray(py))
+    Ok(dist_matrix.to_pyarray_bound(py))
 }
 
 /// Internal implementation of unit distance matrix computation.
@@ -778,42 +778,71 @@ fn soft_threshold_svd(m: &Array2<f64>, threshold: f64) -> Option<Array2<f64>> {
         return Some(Array2::zeros(m.raw_dim()));
     }
 
-    // Compute SVD using ndarray-linalg
-    use ndarray_linalg::SVD;
+    let n_rows = m.nrows();
+    let n_cols = m.ncols();
+
+    // Convert ndarray to faer using Mat::from_fn (pure Rust, no external deps)
+    let m_faer = faer::Mat::from_fn(n_rows, n_cols, |i, j| m[[i, j]]);
 
-    let (u, s, vt) = match m.svd(true, true) {
-        Ok((Some(u), s, Some(vt))) => (u, s, vt),
-        _ => return Some(Array2::zeros(m.raw_dim())),
+    // Compute thin SVD using faer (capitalized methods in faer 0.24)
+    let svd = match m_faer.thin_svd() {
+        Ok(s) => s,
+        Err(_) => return Some(Array2::zeros(m.raw_dim())),
     };
 
+    let u_faer = svd.U();
+    let s_diag = svd.S();  // Returns diagonal view
+    let s_col = s_diag.column_vector();  // Get as column vector
+    let v_faer = svd.V();  // This is V, not V^T
+
+    let s_len = s_col.nrows();
+
     // Check for non-finite SVD output
-    if !u.iter().all(|&x| x.is_finite())
-        || !s.iter().all(|&x| x.is_finite())
-        || !vt.iter().all(|&x| x.is_finite())
-    {
-        return Some(Array2::zeros(m.raw_dim()));
+    for i in 0..u_faer.nrows() {
+        for j in 0..u_faer.ncols() {
+            if !u_faer[(i, j)].is_finite() {
+                return Some(Array2::zeros(m.raw_dim()));
+            }
+        }
+    }
+    for i in 0..s_len {
+        if !s_col[i].is_finite() {
+            return Some(Array2::zeros(m.raw_dim()));
+        }
+    }
+    for i in 0..v_faer.nrows() {
+        for j in 0..v_faer.ncols() {
+            if !v_faer[(i, j)].is_finite() {
+                return Some(Array2::zeros(m.raw_dim()));
+            }
+        }
     }
 
-    // Soft-threshold singular values
-    let s_thresh: Array1<f64> = s.mapv(|sv| (sv - threshold).max(0.0));
-
-    // Count non-zero singular values
-    let nonzero_count = s_thresh.iter().filter(|&&sv| sv > 1e-10).count();
+    // Soft-threshold singular values and count non-zero
+    let mut s_thresh = Vec::with_capacity(s_len);
+    let mut nonzero_count = 0;
+    for i in 0..s_len {
+        let sv = s_col[i];
+        let sv_thresh = (sv - threshold).max(0.0);
+        s_thresh.push(sv_thresh);
+        if sv_thresh > 1e-10 {
+            nonzero_count += 1;
+        }
+    }
 
     if nonzero_count == 0 {
         return Some(Array2::zeros(m.raw_dim()));
     }
 
-    // Truncated reconstruction: U @ diag(s_thresh) @ Vt
-    let n_rows = m.nrows();
-    let n_cols = m.ncols();
+    // Truncated reconstruction: U @ diag(s_thresh) @ V^T
     let mut result = Array2::<f64>::zeros((n_rows, n_cols));
 
-    for k in 0..nonzero_count {
+    for k in 0..s_thresh.len() {
         if s_thresh[k] > 1e-10 {
             for i in 0..n_rows {
                 for j in 0..n_cols {
-                    result[[i, j]] += s_thresh[k] * u[[i, k]] * vt[[k, j]];
+                    // u_faer[(i, k)] * s_thresh[k] * v_faer[(j, k)] (since V^T[k,j] = V[j,k])
+                    result[[i, j]] += s_thresh[k] * u_faer[(i, k)] * v_faer[(j, k)];
                 }
             }
         }
@@ -878,7 +907,7 @@ pub fn bootstrap_trop_variance<'py>(
     max_iter: usize,
     tol: f64,
     seed: u64,
-) -> PyResult<(&'py PyArray1<f64>, f64)> {
+) -> PyResult<(Bound<'py, PyArray1<f64>>, f64)> {
     let y_arr = y.as_array().to_owned();
     let d_arr = d.as_array().to_owned();
     let control_mask_arr = control_mask.as_array().to_owned();
@@ -1007,8 +1036,9 @@ pub fn bootstrap_trop_variance<'py>(
         .collect();
 
     // Compute standard error
+    // Return NaN when < 2 samples to properly propagate undefined inference
     let se = if bootstrap_estimates.len() < 2 {
-        0.0
+        f64::NAN
     } else {
         let n = bootstrap_estimates.len() as f64;
         let mean = bootstrap_estimates.iter().sum::<f64>() / n;
@@ -1021,7 +1051,7 @@ pub fn bootstrap_trop_variance<'py>(
     };
 
     let estimates_arr = Array1::from_vec(bootstrap_estimates);
-    Ok((estimates_arr.into_pyarray(py), se))
+    Ok((estimates_arr.to_pyarray_bound(py), se))
 }
 
 // ============================================================================
@@ -1566,7 +1596,7 @@ pub fn bootstrap_trop_variance_joint<'py>(
     max_iter: usize,
     tol: f64,
     seed: u64,
-) -> PyResult<(&'py PyArray1<f64>, f64)> {
+) -> PyResult<(Bound<'py, PyArray1<f64>>, f64)> {
     let y_arr = y.as_array().to_owned();
     let d_arr = d.as_array().to_owned();
 
@@ -1672,8 +1702,9 @@ pub fn bootstrap_trop_variance_joint<'py>(
         .collect();
 
     // Compute standard error
+    // Return NaN when < 2 samples to properly propagate undefined inference
     let se = if bootstrap_estimates.len() < 2 {
-        0.0
+        f64::NAN
     } else {
         let n = bootstrap_estimates.len() as f64;
         let mean = bootstrap_estimates.iter().sum::<f64>() / n;
@@ -1686,7 +1717,7 @@ pub fn bootstrap_trop_variance_joint<'py>(
     };
 
     let estimates_arr = Array1::from_vec(bootstrap_estimates);
-    Ok((estimates_arr.into_pyarray(py), se))
+    Ok((estimates_arr.to_pyarray_bound(py), se))
 }
 
 #[cfg(test)]
diff --git a/rust/src/weights.rs b/rust/src/weights.rs
index 6e7a4e5..40e3ae0 100644
--- a/rust/src/weights.rs
+++ b/rust/src/weights.rs
@@ -5,7 +5,7 @@
 //! - Simplex projection
 
 use ndarray::{Array1, ArrayView1, ArrayView2};
-use numpy::{IntoPyArray, PyArray1, PyReadonlyArray1, PyReadonlyArray2};
+use numpy::{PyArray1, PyReadonlyArray1, PyReadonlyArray2, ToPyArray};
 use pyo3::prelude::*;
 
 /// Maximum number of optimization iterations.
@@ -40,14 +40,14 @@ pub fn compute_synthetic_weights<'py>(
     lambda_reg: f64,
     max_iter: Option<usize>,
     tol: Option<f64>,
-) -> PyResult<&'py PyArray1<f64>> {
+) -> PyResult<Bound<'py, PyArray1<f64>>> {
     let y_control_arr = y_control.as_array();
     let y_treated_arr = y_treated.as_array();
 
     let weights =
         compute_synthetic_weights_internal(&y_control_arr, &y_treated_arr, lambda_reg, max_iter, tol)?;
 
-    Ok(weights.into_pyarray(py))
+    Ok(weights.to_pyarray_bound(py))
 }
 
 /// Internal implementation of synthetic weight computation.
@@ -123,10 +123,10 @@ fn compute_synthetic_weights_internal(
 pub fn project_simplex<'py>(
     py: Python<'py>,
     v: PyReadonlyArray1<'py, f64>,
-) -> PyResult<&'py PyArray1<f64>> {
+) -> PyResult<Bound<'py, PyArray1<f64>>> {
     let v_arr = v.as_array();
     let result = project_simplex_internal(&v_arr);
-    Ok(result.into_pyarray(py))
+    Ok(result.to_pyarray_bound(py))
 }
 
 /// Internal implementation of simplex projection.
diff --git a/tests/test_linalg.py b/tests/test_linalg.py
index 456671f..ac519d9 100644
--- a/tests/test_linalg.py
+++ b/tests/test_linalg.py
@@ -520,6 +520,41 @@ def test_cluster_robust_symmetric(self, ols_data):
 
         np.testing.assert_array_almost_equal(vcov, vcov.T)
 
+    def test_numerical_instability_fallback_warns(self, ols_data):
+        """Test that numerical instability in Rust backend triggers warning and fallback."""
+        from unittest.mock import patch
+        import warnings
+
+        from diff_diff import HAS_RUST_BACKEND
+
+        if not HAS_RUST_BACKEND:
+            pytest.skip("Rust backend not available")
+
+        X, residuals = ols_data
+
+        # Mock _rust_compute_robust_vcov to raise numerical instability error
+        def mock_rust_vcov(*args, **kwargs):
+            raise ValueError("Matrix inversion numerically unstable")
+
+        with patch("diff_diff.linalg._rust_compute_robust_vcov", mock_rust_vcov):
+            with warnings.catch_warnings(record=True) as caught_warnings:
+                warnings.simplefilter("always")
+                vcov = compute_robust_vcov(X, residuals)
+
+                # Verify warning was emitted
+                instability_warnings = [
+                    w for w in caught_warnings
+                    if "numerical instability" in str(w.message).lower()
+                ]
+                assert len(instability_warnings) == 1, (
+                    f"Expected 1 numerical instability warning, got {len(instability_warnings)}"
+                )
+
+                # Verify fallback produced valid vcov matrix
+                assert vcov.shape == (X.shape[1], X.shape[1])
+                assert np.allclose(vcov, vcov.T)  # Symmetric
+                assert np.all(np.linalg.eigvalsh(vcov) >= -1e-10)  # PSD
+
 
 class TestComputeRSquared:
     """Tests for compute_r_squared function."""