diff --git a/cpp/src/dual_simplex/CMakeLists.txt b/cpp/src/dual_simplex/CMakeLists.txt
index e8a9b5dce..af1415fa9 100644
--- a/cpp/src/dual_simplex/CMakeLists.txt
+++ b/cpp/src/dual_simplex/CMakeLists.txt
@@ -1,5 +1,5 @@
 # cmake-format: off
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 # cmake-format: on
 
@@ -31,6 +31,7 @@ set(DUAL_SIMPLEX_SRC_FILES
   ${CMAKE_CURRENT_SOURCE_DIR}/triangle_solve.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/vector_math.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/pinned_host_allocator.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/diving_heuristics.cpp
   )
 
 # Uncomment to enable debug info
diff --git a/cpp/src/dual_simplex/basis_solves.cpp b/cpp/src/dual_simplex/basis_solves.cpp
index db24f55a2..f5cd54053 100644
--- a/cpp/src/dual_simplex/basis_solves.cpp
+++ b/cpp/src/dual_simplex/basis_solves.cpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -613,6 +613,8 @@ i_t factorize_basis(const csc_matrix_t<i_t, f_t>& A,
 template <typename i_t, typename f_t>
 i_t basis_repair(const csc_matrix_t<i_t, f_t>& A,
                  const simplex_solver_settings_t<i_t, f_t>& settings,
+                 const std::vector<f_t>& lower,
+                 const std::vector<f_t>& upper,
                  const std::vector<i_t>& deficient,
                  const std::vector<i_t>& slacks_needed,
                  std::vector<i_t>& basis_list,
@@ -658,7 +660,15 @@ i_t basis_repair(const csc_matrix_t<i_t, f_t>& A,
     nonbasic_list[nonbasic_map[replace_j]] = bad_j;
     vstatus[replace_j]                     = variable_status_t::BASIC;
     // This is the main issue. What value should bad_j take on.
-    vstatus[bad_j] = variable_status_t::NONBASIC_FREE;
+    if (lower[bad_j] == -inf && upper[bad_j] == inf) {
+      vstatus[bad_j] = variable_status_t::NONBASIC_FREE;
+    } else if (lower[bad_j] > -inf) {
+      vstatus[bad_j] = variable_status_t::NONBASIC_LOWER;
+    } else if (upper[bad_j] < inf) {
+      vstatus[bad_j] = variable_status_t::NONBASIC_UPPER;
+    } else {
+      assert(1 == 0);
+    }
   }
 
   return 0;
@@ -849,6 +859,8 @@ template int factorize_basis<int>(const csc_matrix_t<int, double>& A,
 
 template int basis_repair<int, double>(const csc_matrix_t<int, double>& A,
                                        const simplex_solver_settings_t<int, double>& settings,
+                                       const std::vector<double>& lower,
+                                       const std::vector<double>& upper,
                                        const std::vector<int>& deficient,
                                        const std::vector<int>& slacks_needed,
                                        std::vector<int>& basis_list,
diff --git a/cpp/src/dual_simplex/basis_solves.hpp b/cpp/src/dual_simplex/basis_solves.hpp
index b668c0f46..295bedccd 100644
--- a/cpp/src/dual_simplex/basis_solves.hpp
+++ b/cpp/src/dual_simplex/basis_solves.hpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -42,6 +42,8 @@ i_t factorize_basis(const csc_matrix_t<i_t, f_t>& A,
 template <typename i_t, typename f_t>
 i_t basis_repair(const csc_matrix_t<i_t, f_t>& A,
                  const simplex_solver_settings_t<i_t, f_t>& settings,
+                 const std::vector<f_t>& lower,
+                 const std::vector<f_t>& upper,
                  const std::vector<i_t>& deficient,
                  const std::vector<i_t>& slacks_needed,
                  std::vector<i_t>& basis_list,
diff --git a/cpp/src/dual_simplex/basis_updates.cpp b/cpp/src/dual_simplex/basis_updates.cpp
index 6b79f3c86..2c781a515 100644
--- a/cpp/src/dual_simplex/basis_updates.cpp
+++ b/cpp/src/dual_simplex/basis_updates.cpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -2046,6 +2046,8 @@ template <typename i_t, typename f_t>
 int basis_update_mpf_t<i_t, f_t>::refactor_basis(
   const csc_matrix_t<i_t, f_t>& A,
   const simplex_solver_settings_t<i_t, f_t>& settings,
+  const std::vector<f_t>& lower,
+  const std::vector<f_t>& upper,
   std::vector<i_t>& basic_list,
   std::vector<i_t>& nonbasic_list,
   std::vector<variable_status_t>& vstatus)
@@ -2066,7 +2068,8 @@ int basis_update_mpf_t<i_t, f_t>::refactor_basis(
                       deficient,
                       slacks_needed) == -1) {
     settings.log.debug("Initial factorization failed\n");
-    basis_repair(A, settings, deficient, slacks_needed, basic_list, nonbasic_list, vstatus);
+    basis_repair(
+      A, settings, lower, upper, deficient, slacks_needed, basic_list, nonbasic_list, vstatus);
 
 #ifdef CHECK_BASIS_REPAIR
     const i_t m = A.m;
diff --git a/cpp/src/dual_simplex/basis_updates.hpp b/cpp/src/dual_simplex/basis_updates.hpp
index cea907074..038db59b9 100644
--- a/cpp/src/dual_simplex/basis_updates.hpp
+++ b/cpp/src/dual_simplex/basis_updates.hpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -223,6 +223,9 @@ class basis_update_mpf_t {
     reset_stats();
   }
 
+  basis_update_mpf_t(const basis_update_mpf_t& other)            = default;
+  basis_update_mpf_t& operator=(const basis_update_mpf_t& other) = default;
+
   void print_stats() const
   {
     i_t total_L_transpose_calls = total_sparse_L_transpose_ + total_dense_L_transpose_;
@@ -373,6 +376,8 @@ class basis_update_mpf_t {
   // Compute L*U = A(p, basic_list)
   int refactor_basis(const csc_matrix_t<i_t, f_t>& A,
                      const simplex_solver_settings_t<i_t, f_t>& settings,
+                     const std::vector<f_t>& lower,
+                     const std::vector<f_t>& upper,
                      std::vector<i_t>& basic_list,
                      std::vector<i_t>& nonbasic_list,
                      std::vector<variable_status_t>& vstatus);
diff --git a/cpp/src/dual_simplex/bnb_worker.hpp b/cpp/src/dual_simplex/bnb_worker.hpp
new file mode 100644
index 000000000..e77b64639
--- /dev/null
+++ b/cpp/src/dual_simplex/bnb_worker.hpp
@@ -0,0 +1,280 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/* clang-format on */
+
+#pragma once
+
+#include <dual_simplex/basis_updates.hpp>
+#include <dual_simplex/bounds_strengthening.hpp>
+#include <dual_simplex/mip_node.hpp>
+#include <dual_simplex/phase2.hpp>
+
+#include <array>
+#include <deque>
+#include <mutex>
+#include <vector>
+
+namespace cuopt::linear_programming::dual_simplex {
+
+constexpr int bnb_num_worker_types = 5;
+
+// Indicate the search and variable selection algorithms used by each thread
+// in B&B (See [1]).
+//
+// [1] T. Achterberg, “Constraint Integer Programming,” PhD, Technischen Universität Berlin,
+// Berlin, 2007. doi: 10.14279/depositonce-1634.
+enum bnb_worker_type_t : int {
+  EXPLORATION        = 0,  // Best-First + Plunging.
+  PSEUDOCOST_DIVING  = 1,  // Pseudocost diving (9.2.5)
+  LINE_SEARCH_DIVING = 2,  // Line search diving (9.2.4)
+  GUIDED_DIVING      = 3,  // Guided diving (9.2.3).
+  COEFFICIENT_DIVING = 4   // Coefficient diving (9.2.1)
+};
+
+template <typename i_t, typename f_t>
+struct bnb_stats_t {
+  f_t start_time                         = 0.0;
+  omp_atomic_t<f_t> total_lp_solve_time  = 0.0;
+  omp_atomic_t<i_t> nodes_explored       = 0;
+  omp_atomic_t<i_t> nodes_unexplored     = 0;
+  omp_atomic_t<f_t> total_lp_iters       = 0;
+  omp_atomic_t<i_t> nodes_since_last_log = 0;
+  omp_atomic_t<f_t> last_log             = 0.0;
+};
+
+template <typename i_t, typename f_t>
+class bnb_worker_data_t {
+ public:
+  const i_t worker_id;
+  omp_atomic_t<bnb_worker_type_t> worker_type;
+  omp_atomic_t<bool> is_active;
+  omp_atomic_t<f_t> lower_bound;
+
+  lp_problem_t<i_t, f_t> leaf_problem;
+  lp_solution_t<i_t, f_t> leaf_solution;
+  std::vector<f_t> leaf_edge_norms;
+
+  basis_update_mpf_t<i_t, f_t> basis_factors;
+  std::vector<i_t> basic_list;
+  std::vector<i_t> nonbasic_list;
+
+  bounds_strengthening_t<i_t, f_t> node_presolver;
+  std::vector<bool> bounds_changed;
+
+  std::vector<f_t> start_lower;
+  std::vector<f_t> start_upper;
+  mip_node_t<i_t, f_t>* start_node;
+
+  bool recompute_basis  = true;
+  bool recompute_bounds = true;
+
+  bnb_worker_data_t(i_t worker_id,
+                    const lp_problem_t<i_t, f_t>& original_lp,
+                    const csr_matrix_t<i_t, f_t>& Arow,
+                    const std::vector<variable_type_t>& var_type,
+                    const simplex_solver_settings_t<i_t, f_t>& settings)
+    : worker_id(worker_id),
+      worker_type(EXPLORATION),
+      is_active(false),
+      lower_bound(-std::numeric_limits<f_t>::infinity()),
+      leaf_problem(original_lp),
+      leaf_solution(original_lp.num_rows, original_lp.num_cols),
+      basis_factors(original_lp.num_rows, settings.refactor_frequency),
+      basic_list(original_lp.num_rows),
+      nonbasic_list(),
+      node_presolver(leaf_problem, Arow, {}, var_type),
+      bounds_changed(original_lp.num_cols, false)
+  {
+  }
+
+  // Set the `start_node` for best-first search.
+  void init_best_first(mip_node_t<i_t, f_t>* node, const lp_problem_t<i_t, f_t>& original_lp)
+  {
+    start_node  = node;
+    start_lower = original_lp.lower;
+    start_upper = original_lp.upper;
+    worker_type = EXPLORATION;
+    lower_bound = node->lower_bound;
+    is_active   = true;
+  }
+
+  // Initialize the worker for diving, setting the `start_node`, `start_lower` and
+  // `start_upper`. Returns `true` if the starting node is feasible via
+  // bounds propagation.
+  bool init_diving(mip_node_t<i_t, f_t>* node,
+                   bnb_worker_type_t type,
+                   const lp_problem_t<i_t, f_t>& original_lp,
+                   const simplex_solver_settings_t<i_t, f_t>& settings)
+  {
+    internal_node = node->detach_copy();
+    start_node    = &internal_node;
+
+    start_lower = original_lp.lower;
+    start_upper = original_lp.upper;
+    worker_type = type;
+    lower_bound = node->lower_bound;
+    is_active   = true;
+
+    std::fill(bounds_changed.begin(), bounds_changed.end(), false);
+    node->get_variable_bounds(start_lower, start_upper, bounds_changed);
+
+    return node_presolver.bounds_strengthening(start_lower, start_upper, bounds_changed, settings);
+  }
+
+  // Set the variables bounds for the LP relaxation of the current node.
+  bool set_lp_variable_bounds_for(mip_node_t<i_t, f_t>* node_ptr,
+                                  const simplex_solver_settings_t<i_t, f_t>& settings)
+  {
+    // Reset the bound_changed markers
+    std::fill(bounds_changed.begin(), bounds_changed.end(), false);
+
+    // Set the correct bounds for the leaf problem
+    if (recompute_bounds) {
+      leaf_problem.lower = start_lower;
+      leaf_problem.upper = start_upper;
+      node_ptr->get_variable_bounds(leaf_problem.lower, leaf_problem.upper, bounds_changed);
+
+    } else {
+      node_ptr->update_branched_variable_bounds(
+        leaf_problem.lower, leaf_problem.upper, bounds_changed);
+    }
+
+    return node_presolver.bounds_strengthening(
+      leaf_problem.lower, leaf_problem.upper, bounds_changed, settings);
+  }
+
+ private:
+  // For diving, we need to store the full node instead of
+  // of just a pointer, since it is not store in the tree anymore.
+  // To keep the same interface across all worker types,
+  // this will be used as a temporary storage and
+  // will be pointed by `start_node`.
+  // For exploration, this will not be used.
+  mip_node_t<i_t, f_t> internal_node;
+};
+
+template <typename i_t, typename f_t>
+class bnb_worker_pool_t {
+ public:
+  void init(i_t num_workers,
+            const lp_problem_t<i_t, f_t>& original_lp,
+            const csr_matrix_t<i_t, f_t>& Arow,
+            const std::vector<variable_type_t>& var_type,
+            const simplex_solver_settings_t<i_t, f_t>& settings)
+  {
+    workers_.resize(num_workers);
+    num_idle_workers_ = num_workers;
+    for (i_t i = 0; i < num_workers; ++i) {
+      workers_[i] =
+        std::make_unique<bnb_worker_data_t<i_t, f_t>>(i, original_lp, Arow, var_type, settings);
+      idle_workers_.push_front(i);
+    }
+  }
+
+  bnb_worker_data_t<i_t, f_t>* get_idle_worker()
+  {
+    std::lock_guard<omp_mutex_t> lock(mutex_);
+
+    if (idle_workers_.empty()) {
+      return nullptr;
+    } else {
+      i_t idx = idle_workers_.front();
+      return workers_[idx].get();
+    }
+  }
+
+  void pop_idle_worker()
+  {
+    std::lock_guard<omp_mutex_t> lock(mutex_);
+    if (!idle_workers_.empty()) {
+      idle_workers_.pop_front();
+      num_idle_workers_--;
+    }
+  }
+
+  bnb_worker_data_t<i_t, f_t>* get_and_pop_idle_worker()
+  {
+    std::lock_guard<omp_mutex_t> lock(mutex_);
+
+    if (idle_workers_.empty()) {
+      return nullptr;
+    } else {
+      i_t idx = idle_workers_.front();
+      idle_workers_.pop_front();
+      num_idle_workers_--;
+      return workers_[idx].get();
+    }
+  }
+
+  void return_worker_to_pool(bnb_worker_data_t<i_t, f_t>* worker)
+  {
+    worker->is_active = false;
+    std::lock_guard<omp_mutex_t> lock(mutex_);
+    idle_workers_.push_back(worker->worker_id);
+    num_idle_workers_++;
+  }
+
+  f_t get_lower_bounds()
+  {
+    f_t lower_bound = std::numeric_limits<f_t>::infinity();
+
+    for (i_t i = 0; i < workers_.size(); ++i) {
+      if (workers_[i]->worker_type == EXPLORATION && workers_[i]->is_active) {
+        lower_bound = std::min(workers_[i]->lower_bound.load(), lower_bound);
+      }
+    }
+
+    return lower_bound;
+  }
+
+  i_t num_idle_workers() { return num_idle_workers_; }
+
+ private:
+  // Worker pool
+  std::vector<std::unique_ptr<bnb_worker_data_t<i_t, f_t>>> workers_;
+
+  omp_mutex_t mutex_;
+  std::deque<i_t> idle_workers_;
+  omp_atomic_t<i_t> num_idle_workers_;
+};
+
+template <typename f_t, typename i_t>
+std::vector<bnb_worker_type_t> bnb_get_worker_types(diving_heuristics_settings_t<i_t, f_t> settings)
+{
+  std::vector<bnb_worker_type_t> types;
+  types.reserve(bnb_num_worker_types);
+  types.push_back(EXPLORATION);
+  if (!settings.disable_pseudocost_diving) { types.push_back(PSEUDOCOST_DIVING); }
+  if (!settings.disable_line_search_diving) { types.push_back(LINE_SEARCH_DIVING); }
+  if (!settings.disable_guided_diving) { types.push_back(GUIDED_DIVING); }
+  if (!settings.disable_coefficient_diving) { types.push_back(COEFFICIENT_DIVING); }
+  return types;
+}
+
+template <typename f_t, typename i_t>
+std::array<i_t, bnb_num_worker_types> bnb_get_num_workers_round_robin(
+  i_t num_threads, diving_heuristics_settings_t<i_t, f_t> settings)
+{
+  std::array<i_t, bnb_num_worker_types> max_num_workers;
+  auto worker_types = bnb_get_worker_types(settings);
+
+  max_num_workers.fill(0);
+  max_num_workers[EXPLORATION] = std::max(1, num_threads / 2);
+
+  i_t diving_workers = 2 * settings.num_diving_workers;
+  i_t m              = worker_types.size() - 1;
+
+  for (size_t i = 1, k = 0; i < worker_types.size(); ++i) {
+    i_t start                        = (double)k * diving_workers / m;
+    i_t end                          = (double)(k + 1) * diving_workers / m;
+    max_num_workers[worker_types[i]] = end - start;
+    ++k;
+  }
+
+  return max_num_workers;
+}
+
+}  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/bounds_strengthening.cpp b/cpp/src/dual_simplex/bounds_strengthening.cpp
index f1bf52c1e..712f25837 100644
--- a/cpp/src/dual_simplex/bounds_strengthening.cpp
+++ b/cpp/src/dual_simplex/bounds_strengthening.cpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -59,8 +59,7 @@ bounds_strengthening_t<i_t, f_t>::bounds_strengthening_t(
   const csr_matrix_t<i_t, f_t>& Arow,
   const std::vector<char>& row_sense,
   const std::vector<variable_type_t>& var_types)
-  : bounds_changed(problem.num_cols, false),
-    A(problem.A),
+  : A(problem.A),
     Arow(Arow),
     var_types(var_types),
     delta_min_activity(problem.num_rows),
@@ -93,6 +92,7 @@ template <typename i_t, typename f_t>
 bool bounds_strengthening_t<i_t, f_t>::bounds_strengthening(
   std::vector<f_t>& lower_bounds,
   std::vector<f_t>& upper_bounds,
+  const std::vector<bool>& bounds_changed,
   const simplex_solver_settings_t<i_t, f_t>& settings)
 {
   const i_t m = A.m;
@@ -154,7 +154,7 @@ bool bounds_strengthening_t<i_t, f_t>::bounds_strengthening(
       bool is_infeasible =
         check_infeasibility<i_t, f_t>(min_a, max_a, cnst_lb, cnst_ub, settings.primal_tol);
       if (is_infeasible) {
-        settings.log.printf(
+        settings.log.debug(
           "Iter:: %d, Infeasible constraint %d, cnst_lb %e, cnst_ub %e, min_a %e, max_a %e\n",
           iter,
           i,
@@ -211,7 +211,7 @@ bool bounds_strengthening_t<i_t, f_t>::bounds_strengthening(
       new_ub = std::min(new_ub, upper_bounds[k]);
 
       if (new_lb > new_ub + 1e-6) {
-        settings.log.printf(
+        settings.log.debug(
           "Iter:: %d, Infeasible variable after update %d, %e > %e\n", iter, k, new_lb, new_ub);
         return false;
       }
diff --git a/cpp/src/dual_simplex/bounds_strengthening.hpp b/cpp/src/dual_simplex/bounds_strengthening.hpp
index e7e218b82..6a43ef44c 100644
--- a/cpp/src/dual_simplex/bounds_strengthening.hpp
+++ b/cpp/src/dual_simplex/bounds_strengthening.hpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -22,10 +22,9 @@ class bounds_strengthening_t {
 
   bool bounds_strengthening(std::vector<f_t>& lower_bounds,
                             std::vector<f_t>& upper_bounds,
+                            const std::vector<bool>& bounds_changed,
                             const simplex_solver_settings_t<i_t, f_t>& settings);
 
-  std::vector<bool> bounds_changed;
-
  private:
   const csc_matrix_t<i_t, f_t>& A;
   const csr_matrix_t<i_t, f_t>& Arow;
diff --git a/cpp/src/dual_simplex/branch_and_bound.cpp b/cpp/src/dual_simplex/branch_and_bound.cpp
index 6161f4d3f..d6a95e82e 100644
--- a/cpp/src/dual_simplex/branch_and_bound.cpp
+++ b/cpp/src/dual_simplex/branch_and_bound.cpp
@@ -1,14 +1,13 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
 
-#include <omp.h>
-#include <algorithm>
-#include <dual_simplex/bounds_strengthening.hpp>
 #include <dual_simplex/branch_and_bound.hpp>
+
+#include <dual_simplex/bounds_strengthening.hpp>
 #include <dual_simplex/crossover.hpp>
 #include <dual_simplex/initial_basis.hpp>
 #include <dual_simplex/logger.hpp>
@@ -20,6 +19,9 @@
 #include <dual_simplex/tic_toc.hpp>
 #include <dual_simplex/user_problem.hpp>
 
+#include <omp.h>
+
+#include <algorithm>
 #include <cmath>
 #include <cstdio>
 #include <cstdlib>
@@ -183,30 +185,27 @@ std::string user_mip_gap(f_t obj_value, f_t lower_bound)
 {
   const f_t user_mip_gap = relative_gap(obj_value, lower_bound);
   if (user_mip_gap == std::numeric_limits<f_t>::infinity()) {
-    return "  -  ";
+    return "   -  ";
   } else {
     constexpr int BUFFER_LEN = 32;
     char buffer[BUFFER_LEN];
-    snprintf(buffer, BUFFER_LEN - 1, "%4.1f%%", user_mip_gap * 100);
+    snprintf(buffer, BUFFER_LEN - 1, "%5.1f%%", user_mip_gap * 100);
     return std::string(buffer);
   }
 }
 
-inline const char* feasible_solution_symbol(thread_type_t type)
+inline const char* feasible_solution_symbol(bnb_worker_type_t type)
 {
   switch (type) {
-    case thread_type_t::EXPLORATION: return "B";
-    case thread_type_t::DIVING: return "D";
-    default: return "U";
+    case bnb_worker_type_t::EXPLORATION: return "B ";
+    case bnb_worker_type_t::COEFFICIENT_DIVING: return "D ";
+    case bnb_worker_type_t::LINE_SEARCH_DIVING: return "D ";
+    case bnb_worker_type_t::PSEUDOCOST_DIVING: return "D ";
+    case bnb_worker_type_t::GUIDED_DIVING: return "D ";
+    default: return "U ";
   }
 }
 
-inline bool has_children(node_solve_info_t status)
-{
-  return status == node_solve_info_t::UP_CHILD_FIRST ||
-         status == node_solve_info_t::DOWN_CHILD_FIRST;
-}
-
 }  // namespace
 
 template <typename i_t, typename f_t>
@@ -216,53 +215,74 @@ branch_and_bound_t<i_t, f_t>::branch_and_bound_t(
   : original_problem_(user_problem),
     settings_(solver_settings),
     original_lp_(user_problem.handle_ptr, 1, 1, 1),
+    Arow_(1, 1, 0),
     incumbent_(1),
     root_relax_soln_(1, 1),
     root_crossover_soln_(1, 1),
     pc_(1),
-    solver_status_(mip_exploration_status_t::UNSET)
+    solver_status_(mip_status_t::UNSET)
 {
   exploration_stats_.start_time = tic();
   dualize_info_t<i_t, f_t> dualize_info;
   convert_user_problem(original_problem_, settings_, original_lp_, new_slacks_, dualize_info);
   full_variable_types(original_problem_, original_lp_, var_types_);
 
-  mutex_upper_.lock();
   upper_bound_ = inf;
-  mutex_upper_.unlock();
 }
 
 template <typename i_t, typename f_t>
-f_t branch_and_bound_t<i_t, f_t>::get_upper_bound()
+f_t branch_and_bound_t<i_t, f_t>::get_lower_bound()
 {
-  mutex_upper_.lock();
-  const f_t upper_bound = upper_bound_;
-  mutex_upper_.unlock();
-  return upper_bound;
+  f_t lower_bound      = lower_bound_ceiling_.load();
+  f_t heap_lower_bound = node_queue_.get_lower_bound();
+  lower_bound          = std::min(heap_lower_bound, lower_bound);
+  lower_bound          = std::min(worker_pool_.get_lower_bounds(), lower_bound);
+  return std::isfinite(lower_bound) ? lower_bound : -inf;
 }
 
 template <typename i_t, typename f_t>
-f_t branch_and_bound_t<i_t, f_t>::get_lower_bound()
+void branch_and_bound_t<i_t, f_t>::report_heuristic(f_t obj)
 {
-  f_t lower_bound = lower_bound_ceiling_.load();
-  mutex_heap_.lock();
-  if (heap_.size() > 0) { lower_bound = std::min(heap_.top()->lower_bound, lower_bound); }
-  mutex_heap_.unlock();
+  if (is_running) {
+    f_t user_obj         = compute_user_objective(original_lp_, obj);
+    f_t user_lower       = compute_user_objective(original_lp_, get_lower_bound());
+    std::string user_gap = user_mip_gap<f_t>(user_obj, user_lower);
 
-  for (i_t i = 0; i < local_lower_bounds_.size(); ++i) {
-    lower_bound = std::min(local_lower_bounds_[i].load(), lower_bound);
+    settings_.log.printf(
+      "H                            %+13.6e    %+10.6e                        %s %9.2f\n",
+      user_obj,
+      user_lower,
+      user_gap.c_str(),
+      toc(exploration_stats_.start_time));
+  } else {
+    settings_.log.printf("New solution from primal heuristics. Objective %+.6e. Time %.2f\n",
+                         compute_user_objective(original_lp_, obj),
+                         toc(exploration_stats_.start_time));
   }
-
-  return lower_bound;
 }
 
 template <typename i_t, typename f_t>
-i_t branch_and_bound_t<i_t, f_t>::get_heap_size()
+void branch_and_bound_t<i_t, f_t>::report(std::string symbol,
+                                          f_t obj,
+                                          f_t lower_bound,
+                                          i_t node_depth)
 {
-  mutex_heap_.lock();
-  i_t size = heap_.size();
-  mutex_heap_.unlock();
-  return size;
+  i_t nodes_explored   = exploration_stats_.nodes_explored;
+  i_t nodes_unexplored = exploration_stats_.nodes_unexplored;
+  f_t user_obj         = compute_user_objective(original_lp_, obj);
+  f_t user_lower       = compute_user_objective(original_lp_, lower_bound);
+  f_t iter_node        = exploration_stats_.total_lp_iters / nodes_explored;
+  std::string user_gap = user_mip_gap<f_t>(user_obj, user_lower);
+  settings_.log.printf("%s%10d   %10lu    %+13.6e    %+10.6e  %6d    %7.1e     %s %9.2f\n",
+                       symbol.c_str(),
+                       nodes_explored,
+                       nodes_unexplored,
+                       user_obj,
+                       user_lower,
+                       node_depth,
+                       iter_node,
+                       user_gap.c_str(),
+                       toc(exploration_stats_.start_time));
 }
 
 template <typename i_t, typename f_t>
@@ -303,25 +323,7 @@ void branch_and_bound_t<i_t, f_t>::set_new_solution(const std::vector<f_t>& solu
   }
   mutex_upper_.unlock();
 
-  if (is_feasible) {
-    if (solver_status_ == mip_exploration_status_t::RUNNING) {
-      f_t user_obj    = compute_user_objective(original_lp_, obj);
-      f_t user_lower  = compute_user_objective(original_lp_, get_lower_bound());
-      std::string gap = user_mip_gap<f_t>(user_obj, user_lower);
-
-      settings_.log.printf(
-        "H                           %+13.6e    %+10.6e                        %s %9.2f\n",
-        user_obj,
-        user_lower,
-        gap.c_str(),
-        toc(exploration_stats_.start_time));
-    } else {
-      settings_.log.printf("New solution from primal heuristics. Objective %+.6e. Time %.2f\n",
-                           compute_user_objective(original_lp_, obj),
-                           toc(exploration_stats_.start_time));
-    }
-  }
-
+  if (is_feasible) { report_heuristic(obj); }
   if (attempt_repair) {
     mutex_repair_.lock();
     repair_queue_.push_back(crushed_solution);
@@ -417,17 +419,7 @@ void branch_and_bound_t<i_t, f_t>::repair_heuristic_solutions()
         if (repaired_obj < upper_bound_) {
           upper_bound_ = repaired_obj;
           incumbent_.set_incumbent_solution(repaired_obj, repaired_solution);
-
-          f_t obj              = compute_user_objective(original_lp_, repaired_obj);
-          f_t lower            = compute_user_objective(original_lp_, get_lower_bound());
-          std::string user_gap = user_mip_gap<f_t>(obj, lower);
-
-          settings_.log.printf(
-            "H                           %+13.6e    %+10.6e                        %s %9.2f\n",
-            obj,
-            lower,
-            user_gap.c_str(),
-            toc(exploration_stats_.start_time));
+          report_heuristic(repaired_obj);
 
           if (settings_.solution_callback != nullptr) {
             std::vector<f_t> original_x;
@@ -443,30 +435,24 @@ void branch_and_bound_t<i_t, f_t>::repair_heuristic_solutions()
 }
 
 template <typename i_t, typename f_t>
-mip_status_t branch_and_bound_t<i_t, f_t>::set_final_solution(mip_solution_t<i_t, f_t>& solution,
-                                                              f_t lower_bound)
+void branch_and_bound_t<i_t, f_t>::set_final_solution(mip_solution_t<i_t, f_t>& solution,
+                                                      f_t lower_bound)
 {
-  mip_status_t mip_status = mip_status_t::UNSET;
-
-  if (solver_status_ == mip_exploration_status_t::NUMERICAL) {
+  if (solver_status_ == mip_status_t::NUMERICAL) {
     settings_.log.printf("Numerical issue encountered. Stopping the solver...\n");
-    mip_status = mip_status_t::NUMERICAL;
   }
 
-  if (solver_status_ == mip_exploration_status_t::TIME_LIMIT) {
+  if (solver_status_ == mip_status_t::TIME_LIMIT) {
     settings_.log.printf("Time limit reached. Stopping the solver...\n");
-    mip_status = mip_status_t::TIME_LIMIT;
   }
-  if (solver_status_ == mip_exploration_status_t::NODE_LIMIT) {
+  if (solver_status_ == mip_status_t::NODE_LIMIT) {
     settings_.log.printf("Node limit reached. Stopping the solver...\n");
-    mip_status = mip_status_t::NODE_LIMIT;
   }
 
-  f_t upper_bound      = get_upper_bound();
-  f_t gap              = upper_bound - lower_bound;
-  f_t obj              = compute_user_objective(original_lp_, upper_bound);
+  f_t gap              = upper_bound_ - lower_bound;
+  f_t obj              = compute_user_objective(original_lp_, upper_bound_.load());
   f_t user_bound       = compute_user_objective(original_lp_, lower_bound);
-  f_t gap_rel          = user_relative_gap(original_lp_, upper_bound, lower_bound);
+  f_t gap_rel          = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound);
   bool is_maximization = original_lp_.obj_scale < 0.0;
 
   settings_.log.printf("Explored %d nodes in %.2fs.\n",
@@ -479,7 +465,7 @@ mip_status_t branch_and_bound_t<i_t, f_t>::set_final_solution(mip_solution_t<i_t
                        user_bound);
 
   if (gap <= settings_.absolute_mip_gap_tol || gap_rel <= settings_.relative_mip_gap_tol) {
-    mip_status = mip_status_t::OPTIMAL;
+    solver_status_ = mip_status_t::OPTIMAL;
     if (gap > 0 && gap <= settings_.absolute_mip_gap_tol) {
       settings_.log.printf("Optimal solution found within absolute MIP gap tolerance (%.1e)\n",
                            settings_.absolute_mip_gap_tol);
@@ -494,18 +480,18 @@ mip_status_t branch_and_bound_t<i_t, f_t>::set_final_solution(mip_solution_t<i_t
     }
   }
 
-  if (solver_status_ == mip_exploration_status_t::COMPLETED) {
+  if (solver_status_ == mip_status_t::UNSET) {
     if (exploration_stats_.nodes_explored > 0 && exploration_stats_.nodes_unexplored == 0 &&
-        upper_bound == inf) {
+        upper_bound_ == inf) {
       settings_.log.printf("Integer infeasible.\n");
-      mip_status = mip_status_t::INFEASIBLE;
+      solver_status_ = mip_status_t::INFEASIBLE;
       if (settings_.heuristic_preemption_callback != nullptr) {
         settings_.heuristic_preemption_callback();
       }
     }
   }
 
-  if (upper_bound != inf) {
+  if (upper_bound_ != inf) {
     assert(incumbent_.has_incumbent);
     uncrush_primal_solution(original_problem_, original_lp_, incumbent_.x, solution.x);
   }
@@ -513,39 +499,25 @@ mip_status_t branch_and_bound_t<i_t, f_t>::set_final_solution(mip_solution_t<i_t
   solution.lower_bound        = lower_bound;
   solution.nodes_explored     = exploration_stats_.nodes_explored;
   solution.simplex_iterations = exploration_stats_.total_lp_iters;
-
-  return mip_status;
 }
 
 template <typename i_t, typename f_t>
 void branch_and_bound_t<i_t, f_t>::add_feasible_solution(f_t leaf_objective,
                                                          const std::vector<f_t>& leaf_solution,
                                                          i_t leaf_depth,
-                                                         thread_type_t thread_type)
+                                                         bnb_worker_type_t thread_type)
 {
-  bool send_solution   = false;
-  i_t nodes_explored   = exploration_stats_.nodes_explored;
-  i_t nodes_unexplored = exploration_stats_.nodes_unexplored;
+  bool send_solution = false;
+
+  settings_.log.debug("%s found a feasible solution with obj=%.10e.\n",
+                      feasible_solution_symbol(thread_type),
+                      compute_user_objective(original_lp_, leaf_objective));
 
   mutex_upper_.lock();
   if (leaf_objective < upper_bound_) {
     incumbent_.set_incumbent_solution(leaf_objective, leaf_solution);
-    upper_bound_    = leaf_objective;
-    f_t lower_bound = get_lower_bound();
-    f_t obj         = compute_user_objective(original_lp_, upper_bound_);
-    f_t lower       = compute_user_objective(original_lp_, lower_bound);
-    settings_.log.printf(
-      "%s%10d   %10lu    %+13.6e    %+10.6e   %6d   %7.1e     %s %9.2f\n",
-      feasible_solution_symbol(thread_type),
-      nodes_explored,
-      nodes_unexplored,
-      obj,
-      lower,
-      leaf_depth,
-      nodes_explored > 0 ? exploration_stats_.total_lp_iters / nodes_explored : 0,
-      user_mip_gap<f_t>(obj, lower).c_str(),
-      toc(exploration_stats_.start_time));
-
+    upper_bound_ = leaf_objective;
+    report(feasible_solution_symbol(thread_type), leaf_objective, get_lower_bound(), leaf_depth);
     send_solution = true;
   }
 
@@ -557,16 +529,18 @@ void branch_and_bound_t<i_t, f_t>::add_feasible_solution(f_t leaf_objective,
   mutex_upper_.unlock();
 }
 
-template <typename i_t, typename f_t>
-rounding_direction_t branch_and_bound_t<i_t, f_t>::child_selection(mip_node_t<i_t, f_t>* node_ptr)
+// Martin's criteria for the preferred rounding direction (see [1])
+// [1] A. Martin, “Integer Programs with Block Structure,”
+// Technische Universit¨at Berlin, Berlin, 1999. Accessed: Aug. 08, 2025.
+// [Online]. Available: https://opus4.kobv.de/opus4-zib/frontdoor/index/index/docId/391
+template <typename f_t>
+rounding_direction_t martin_criteria(f_t val, f_t root_val)
 {
-  const i_t branch_var     = node_ptr->get_down_child()->branch_var;
-  const f_t branch_var_val = node_ptr->get_down_child()->fractional_val;
-  const f_t down_val       = std::floor(root_relax_soln_.x[branch_var]);
-  const f_t up_val         = std::ceil(root_relax_soln_.x[branch_var]);
-  const f_t down_dist      = branch_var_val - down_val;
-  const f_t up_dist        = up_val - branch_var_val;
-  constexpr f_t eps        = 1e-6;
+  const f_t down_val  = std::floor(root_val);
+  const f_t up_val    = std::ceil(root_val);
+  const f_t down_dist = val - down_val;
+  const f_t up_dist   = up_val - val;
+  constexpr f_t eps   = 1e-6;
 
   if (down_dist < up_dist + eps) {
     return rounding_direction_t::DOWN;
@@ -577,128 +551,193 @@ rounding_direction_t branch_and_bound_t<i_t, f_t>::child_selection(mip_node_t<i_
 }
 
 template <typename i_t, typename f_t>
-node_solve_info_t branch_and_bound_t<i_t, f_t>::solve_node(
+branch_variable_t<i_t> branch_and_bound_t<i_t, f_t>::variable_selection(
   mip_node_t<i_t, f_t>* node_ptr,
-  search_tree_t<i_t, f_t>& search_tree,
-  lp_problem_t<i_t, f_t>& leaf_problem,
-  basis_update_mpf_t<i_t, f_t>& basis_factors,
-  std::vector<i_t>& basic_list,
-  std::vector<i_t>& nonbasic_list,
-  bounds_strengthening_t<i_t, f_t>& node_presolver,
-  thread_type_t thread_type,
-  bool recompute_bounds_and_basis,
-  const std::vector<f_t>& root_lower,
-  const std::vector<f_t>& root_upper,
-  logger_t& log)
+  const std::vector<i_t>& fractional,
+  bnb_worker_data_t<i_t, f_t>* worker_data)
 {
-  const f_t abs_fathom_tol = settings_.absolute_mip_gap_tol / 10;
-  const f_t upper_bound    = get_upper_bound();
+  logger_t log;
+  log.log                        = false;
+  i_t branch_var                 = -1;
+  rounding_direction_t round_dir = rounding_direction_t::NONE;
+  std::vector<f_t> current_incumbent;
+  std::vector<f_t>& solution = worker_data->leaf_solution.x;
+
+  switch (worker_data->worker_type) {
+    case bnb_worker_type_t::EXPLORATION:
+
+      // RINS/SubMIP path
+      if (!enable_concurrent_lp_root_solve()) {
+        branch_var = pc_.variable_selection(fractional, solution, log);
+      } else {
+        branch_var = pc_.reliable_variable_selection(worker_data->leaf_problem,
+                                                     settings_,
+                                                     var_types_,
+                                                     node_ptr->vstatus,
+                                                     worker_data->leaf_edge_norms,
+                                                     fractional,
+                                                     solution,
+                                                     worker_data->basis_factors,
+                                                     worker_data->basic_list,
+                                                     worker_data->nonbasic_list,
+                                                     node_ptr->lower_bound,
+                                                     upper_bound_,
+                                                     exploration_stats_.total_lp_iters,
+                                                     exploration_stats_.nodes_explored,
+                                                     log);
+      }
+
+      round_dir = martin_criteria(solution[branch_var], root_relax_soln_.x[branch_var]);
 
-  lp_solution_t<i_t, f_t> leaf_solution(leaf_problem.num_rows, leaf_problem.num_cols);
+      // Note that the exploration thread is the only one that can insert new nodes into the heap,
+      // and thus, we only need to calculate the objective estimate here (it is used for
+      // sorting the nodes for diving).
+      node_ptr->objective_estimate =
+        pc_.obj_estimate(fractional, solution, node_ptr->lower_bound, log);
+      return {branch_var, round_dir};
+
+    case bnb_worker_type_t::COEFFICIENT_DIVING:
+      return coefficient_diving(original_lp_, fractional, solution, log);
+
+    case bnb_worker_type_t::LINE_SEARCH_DIVING:
+      return line_search_diving(fractional, solution, root_relax_soln_.x, log);
+
+    case bnb_worker_type_t::PSEUDOCOST_DIVING:
+      return pseudocost_diving(pc_, fractional, solution, root_relax_soln_.x, log);
+
+    case bnb_worker_type_t::GUIDED_DIVING:
+      mutex_upper_.lock();
+      current_incumbent = incumbent_.x;
+      mutex_upper_.unlock();
+      return guided_diving(pc_, fractional, solution, current_incumbent, log);
+
+    default:
+      log.debug("Unknown variable selection method: %d\n", worker_data->worker_type);
+      return {-1, rounding_direction_t::NONE};
+  }
+}
+
+template <typename i_t, typename f_t>
+dual::status_t branch_and_bound_t<i_t, f_t>::solve_node_lp(mip_node_t<i_t, f_t>* node_ptr,
+                                                           bnb_worker_data_t<i_t, f_t>* worker_data,
+                                                           bnb_stats_t<i_t, f_t>& stats,
+                                                           logger_t& log)
+{
   std::vector<variable_status_t>& leaf_vstatus = node_ptr->vstatus;
-  assert(leaf_vstatus.size() == leaf_problem.num_cols);
+  assert(leaf_vstatus.size() == worker_data->leaf_problem.num_cols);
 
   simplex_solver_settings_t lp_settings = settings_;
   lp_settings.set_log(false);
-  lp_settings.cut_off       = upper_bound + settings_.dual_tol;
+  lp_settings.cut_off       = upper_bound_ + settings_.dual_tol;
   lp_settings.inside_mip    = 2;
   lp_settings.time_limit    = settings_.time_limit - toc(exploration_stats_.start_time);
   lp_settings.scale_columns = false;
 
+  if (worker_data->worker_type != bnb_worker_type_t::EXPLORATION) {
+    i_t bnb_lp_iters            = exploration_stats_.total_lp_iters;
+    f_t factor                  = settings_.diving_settings.iteration_limit_factor;
+    f_t max_iter                = factor * bnb_lp_iters;
+    lp_settings.iteration_limit = max_iter - stats.total_lp_iters;
+    if (lp_settings.iteration_limit <= 0) { return dual::status_t::ITERATION_LIMIT; }
+  }
+
 #ifdef LOG_NODE_SIMPLEX
   lp_settings.set_log(true);
   std::stringstream ss;
   ss << "simplex-" << std::this_thread::get_id() << ".log";
   std::string logname;
   ss >> logname;
-  lp_settings.set_log_filename(logname);
-  lp_settings.log.enable_log_to_file("a+");
+  lp_settings.log.set_log_file(logname, "a");
   lp_settings.log.log_to_console = false;
   lp_settings.log.printf(
-    "%s node id = %d, branch var = %d, fractional val = %f, variable lower bound = %f, variable "
-    "upper bound = %f\n",
+    "%scurrent node: id = %d, depth = %d, branch var = %d, branch dir = %s, fractional val = "
+    "%f, variable lower bound = %f, variable upper bound = %f, branch vstatus = %d\n\n",
     settings_.log.log_prefix.c_str(),
     node_ptr->node_id,
+    node_ptr->depth,
     node_ptr->branch_var,
+    node_ptr->branch_dir == rounding_direction_t::DOWN ? "DOWN" : "UP",
     node_ptr->fractional_val,
     node_ptr->branch_var_lower,
-    node_ptr->branch_var_upper);
+    node_ptr->branch_var_upper,
+    node_ptr->vstatus[node_ptr->branch_var]);
 #endif
 
-  // Reset the bound_changed markers
-  std::fill(node_presolver.bounds_changed.begin(), node_presolver.bounds_changed.end(), false);
-
-  // Set the correct bounds for the leaf problem
-  if (recompute_bounds_and_basis) {
-    leaf_problem.lower = root_lower;
-    leaf_problem.upper = root_upper;
-    node_ptr->get_variable_bounds(
-      leaf_problem.lower, leaf_problem.upper, node_presolver.bounds_changed);
-
-  } else {
-    node_ptr->update_branched_variable_bounds(
-      leaf_problem.lower, leaf_problem.upper, node_presolver.bounds_changed);
-  }
-
-  bool feasible =
-    node_presolver.bounds_strengthening(leaf_problem.lower, leaf_problem.upper, lp_settings);
+  bool is_feasible             = worker_data->set_lp_variable_bounds_for(node_ptr, settings_);
+  dual::status_t lp_status     = dual::status_t::DUAL_UNBOUNDED;
+  worker_data->leaf_edge_norms = edge_norms_;  // = node.steepest_edge_norms;
 
-  dual::status_t lp_status = dual::status_t::DUAL_UNBOUNDED;
-
-  if (feasible) {
-    i_t node_iter                    = 0;
-    f_t lp_start_time                = tic();
-    std::vector<f_t> leaf_edge_norms = edge_norms_;  // = node.steepest_edge_norms;
+  if (is_feasible) {
+    i_t node_iter     = 0;
+    f_t lp_start_time = tic();
 
     lp_status = dual_phase2_with_advanced_basis(2,
                                                 0,
-                                                recompute_bounds_and_basis,
+                                                worker_data->recompute_basis,
                                                 lp_start_time,
-                                                leaf_problem,
+                                                worker_data->leaf_problem,
                                                 lp_settings,
                                                 leaf_vstatus,
-                                                basis_factors,
-                                                basic_list,
-                                                nonbasic_list,
-                                                leaf_solution,
+                                                worker_data->basis_factors,
+                                                worker_data->basic_list,
+                                                worker_data->nonbasic_list,
+                                                worker_data->leaf_solution,
                                                 node_iter,
-                                                leaf_edge_norms);
+                                                worker_data->leaf_edge_norms);
 
     if (lp_status == dual::status_t::NUMERICAL) {
       log.printf("Numerical issue node %d. Resolving from scratch.\n", node_ptr->node_id);
-      lp_status_t second_status = solve_linear_program_with_advanced_basis(leaf_problem,
-                                                                           lp_start_time,
-                                                                           lp_settings,
-                                                                           leaf_solution,
-                                                                           basis_factors,
-                                                                           basic_list,
-                                                                           nonbasic_list,
-                                                                           leaf_vstatus,
-                                                                           leaf_edge_norms);
+      lp_status_t second_status =
+        solve_linear_program_with_advanced_basis(worker_data->leaf_problem,
+                                                 lp_start_time,
+                                                 lp_settings,
+                                                 worker_data->leaf_solution,
+                                                 worker_data->basis_factors,
+                                                 worker_data->basic_list,
+                                                 worker_data->nonbasic_list,
+                                                 leaf_vstatus,
+                                                 worker_data->leaf_edge_norms);
 
       lp_status = convert_lp_status_to_dual_status(second_status);
     }
 
-    if (thread_type == thread_type_t::EXPLORATION) {
-      exploration_stats_.total_lp_solve_time += toc(lp_start_time);
-      exploration_stats_.total_lp_iters += node_iter;
-    }
+    stats.total_lp_solve_time += toc(lp_start_time);
+    stats.total_lp_iters += node_iter;
   }
 
+#ifdef LOG_NODE_SIMPLEX
+  lp_settings.log.printf("\nLP status: %d\n\n", lp_status);
+#endif
+
+  return lp_status;
+}
+template <typename i_t, typename f_t>
+std::pair<node_status_t, rounding_direction_t> branch_and_bound_t<i_t, f_t>::update_tree(
+  mip_node_t<i_t, f_t>* node_ptr,
+  search_tree_t<i_t, f_t>& search_tree,
+  bnb_worker_data_t<i_t, f_t>* worker_data,
+  dual::status_t lp_status,
+  logger_t& log)
+{
+  const f_t abs_fathom_tol                     = settings_.absolute_mip_gap_tol / 10;
+  std::vector<variable_status_t>& leaf_vstatus = node_ptr->vstatus;
+  lp_problem_t<i_t, f_t>& leaf_problem         = worker_data->leaf_problem;
+  lp_solution_t<i_t, f_t>& leaf_solution       = worker_data->leaf_solution;
+
   if (lp_status == dual::status_t::DUAL_UNBOUNDED) {
     // Node was infeasible. Do not branch
     node_ptr->lower_bound = inf;
     search_tree.graphviz_node(log, node_ptr, "infeasible", 0.0);
     search_tree.update(node_ptr, node_status_t::INFEASIBLE);
-    return node_solve_info_t::NO_CHILDREN;
+    return {node_status_t::INFEASIBLE, rounding_direction_t::NONE};
 
   } else if (lp_status == dual::status_t::CUTOFF) {
     // Node was cut off. Do not branch
-    node_ptr->lower_bound = upper_bound;
+    node_ptr->lower_bound = upper_bound_;
     f_t leaf_objective    = compute_objective(leaf_problem, leaf_solution.x);
     search_tree.graphviz_node(log, node_ptr, "cut off", leaf_objective);
     search_tree.update(node_ptr, node_status_t::FATHOMED);
-    return node_solve_info_t::NO_CHILDREN;
+    return {node_status_t::FATHOMED, rounding_direction_t::NONE};
 
   } else if (lp_status == dual::status_t::OPTIMAL) {
     // LP was feasible
@@ -711,48 +750,42 @@ node_solve_info_t branch_and_bound_t<i_t, f_t>::solve_node(
     search_tree.graphviz_node(log, node_ptr, "lower bound", leaf_objective);
     pc_.update_pseudo_costs(node_ptr, leaf_objective);
 
-    if (settings_.node_processed_callback != nullptr) {
-      std::vector<f_t> original_x;
-      uncrush_primal_solution(original_problem_, original_lp_, leaf_solution.x, original_x);
-      settings_.node_processed_callback(original_x, leaf_objective);
+    if (worker_data->worker_type == bnb_worker_type_t::EXPLORATION) {
+      if (settings_.node_processed_callback != nullptr) {
+        std::vector<f_t> original_x;
+        uncrush_primal_solution(original_problem_, original_lp_, leaf_solution.x, original_x);
+        settings_.node_processed_callback(original_x, leaf_objective);
+      }
     }
 
     if (leaf_num_fractional == 0) {
       // Found a integer feasible solution
-      add_feasible_solution(leaf_objective, leaf_solution.x, node_ptr->depth, thread_type);
+      add_feasible_solution(
+        leaf_objective, leaf_solution.x, node_ptr->depth, worker_data->worker_type);
       search_tree.graphviz_node(log, node_ptr, "integer feasible", leaf_objective);
       search_tree.update(node_ptr, node_status_t::INTEGER_FEASIBLE);
-      return node_solve_info_t::NO_CHILDREN;
+      return {node_status_t::INTEGER_FEASIBLE, rounding_direction_t::NONE};
 
-    } else if (leaf_objective <= upper_bound + abs_fathom_tol) {
+    } else if (leaf_objective <= upper_bound_ + abs_fathom_tol) {
       // Choose fractional variable to branch on
-      const i_t branch_var =
-        pc_.variable_selection(leaf_fractional, leaf_solution.x, lp_settings.log);
+      auto [branch_var, round_dir] = variable_selection(node_ptr, leaf_fractional, worker_data);
 
       assert(leaf_vstatus.size() == leaf_problem.num_cols);
+      assert(branch_var >= 0);
+      assert(round_dir != rounding_direction_t::NONE);
+
       search_tree.branch(
         node_ptr, branch_var, leaf_solution.x[branch_var], leaf_vstatus, leaf_problem, log);
       search_tree.update(node_ptr, node_status_t::HAS_CHILDREN);
-
-      rounding_direction_t round_dir = child_selection(node_ptr);
-
-      if (round_dir == rounding_direction_t::UP) {
-        return node_solve_info_t::UP_CHILD_FIRST;
-      } else {
-        return node_solve_info_t::DOWN_CHILD_FIRST;
-      }
+      return {node_status_t::HAS_CHILDREN, round_dir};
 
     } else {
       search_tree.graphviz_node(log, node_ptr, "fathomed", leaf_objective);
       search_tree.update(node_ptr, node_status_t::FATHOMED);
-      return node_solve_info_t::NO_CHILDREN;
+      return {node_status_t::FATHOMED, rounding_direction_t::NONE};
     }
-  } else if (lp_status == dual::status_t::TIME_LIMIT) {
-    search_tree.graphviz_node(log, node_ptr, "timeout", 0.0);
-    return node_solve_info_t::TIME_LIMIT;
-
   } else {
-    if (thread_type == thread_type_t::EXPLORATION) {
+    if (worker_data->worker_type == bnb_worker_type_t::EXPLORATION) {
       fetch_min(lower_bound_ceiling_, node_ptr->lower_bound);
       log.printf(
         "LP returned status %d on node %d. This indicates a numerical issue. The best bound is set "
@@ -765,149 +798,24 @@ node_solve_info_t branch_and_bound_t<i_t, f_t>::solve_node(
 
     search_tree.graphviz_node(log, node_ptr, "numerical", 0.0);
     search_tree.update(node_ptr, node_status_t::NUMERICAL);
-    return node_solve_info_t::NUMERICAL;
-  }
-}
-
-template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::exploration_ramp_up(mip_node_t<i_t, f_t>* node,
-                                                       search_tree_t<i_t, f_t>* search_tree,
-                                                       const csr_matrix_t<i_t, f_t>& Arow,
-                                                       i_t initial_heap_size)
-{
-  if (solver_status_ != mip_exploration_status_t::RUNNING) { return; }
-
-  // Note that we do not know which thread will execute the
-  // `exploration_ramp_up` task, so we allow to any thread
-  // to repair the heuristic solution.
-  repair_heuristic_solutions();
-
-  f_t lower_bound = node->lower_bound;
-  f_t upper_bound = get_upper_bound();
-  f_t rel_gap     = user_relative_gap(original_lp_, upper_bound, lower_bound);
-  f_t abs_gap     = upper_bound - lower_bound;
-
-  if (lower_bound > upper_bound || rel_gap < settings_.relative_mip_gap_tol) {
-    search_tree->graphviz_node(settings_.log, node, "cutoff", node->lower_bound);
-    search_tree->update(node, node_status_t::FATHOMED);
-    --exploration_stats_.nodes_unexplored;
-    return;
-  }
-
-  f_t now = toc(exploration_stats_.start_time);
-  f_t time_since_last_log =
-    exploration_stats_.last_log == 0 ? 1.0 : toc(exploration_stats_.last_log);
-
-  if (((exploration_stats_.nodes_since_last_log >= 10 ||
-        abs_gap < 10 * settings_.absolute_mip_gap_tol) &&
-       (time_since_last_log >= 1)) ||
-      (time_since_last_log > 30) || now > settings_.time_limit) {
-    bool should_report = should_report_.exchange(false);
-
-    if (should_report) {
-      f_t obj              = compute_user_objective(original_lp_, upper_bound);
-      f_t user_lower       = compute_user_objective(original_lp_, root_objective_);
-      std::string gap_user = user_mip_gap<f_t>(obj, user_lower);
-      i_t nodes_explored   = exploration_stats_.nodes_explored;
-      i_t nodes_unexplored = exploration_stats_.nodes_unexplored;
-
-      settings_.log.printf(
-        " %10d   %10lu    %+13.6e    %+10.6e   %6d   %7.1e     %s %9.2f\n",
-        nodes_explored,
-        nodes_unexplored,
-        obj,
-        user_lower,
-        node->depth,
-        nodes_explored > 0 ? exploration_stats_.total_lp_iters / nodes_explored : 0,
-        gap_user.c_str(),
-        now);
-
-      exploration_stats_.nodes_since_last_log = 0;
-      exploration_stats_.last_log             = tic();
-      should_report_                          = true;
-    }
-  }
-
-  if (now > settings_.time_limit) {
-    solver_status_ = mip_exploration_status_t::TIME_LIMIT;
-    return;
-  }
-
-  // Make a copy of the original LP. We will modify its bounds at each leaf
-  lp_problem_t<i_t, f_t> leaf_problem = original_lp_;
-  std::vector<char> row_sense;
-  bounds_strengthening_t<i_t, f_t> node_presolver(leaf_problem, Arow, row_sense, var_types_);
-
-  const i_t m = leaf_problem.num_rows;
-  basis_update_mpf_t<i_t, f_t> basis_factors(m, settings_.refactor_frequency);
-  std::vector<i_t> basic_list(m);
-  std::vector<i_t> nonbasic_list;
-
-  node_solve_info_t status = solve_node(node,
-                                        *search_tree,
-                                        leaf_problem,
-                                        basis_factors,
-                                        basic_list,
-                                        nonbasic_list,
-                                        node_presolver,
-                                        thread_type_t::EXPLORATION,
-                                        true,
-                                        original_lp_.lower,
-                                        original_lp_.upper,
-                                        settings_.log);
-
-  ++exploration_stats_.nodes_since_last_log;
-  ++exploration_stats_.nodes_explored;
-  --exploration_stats_.nodes_unexplored;
-
-  if (status == node_solve_info_t::TIME_LIMIT) {
-    solver_status_ = mip_exploration_status_t::TIME_LIMIT;
-    return;
-
-  } else if (has_children(status)) {
-    exploration_stats_.nodes_unexplored += 2;
-
-    // If we haven't generated enough nodes to keep the threads busy, continue the ramp up phase
-    if (exploration_stats_.nodes_unexplored < initial_heap_size) {
-#pragma omp task
-      exploration_ramp_up(node->get_down_child(), search_tree, Arow, initial_heap_size);
-
-#pragma omp task
-      exploration_ramp_up(node->get_up_child(), search_tree, Arow, initial_heap_size);
-
-    } else {
-      // We've generated enough nodes, push further nodes onto the heap
-      mutex_heap_.lock();
-      heap_.push(node->get_down_child());
-      heap_.push(node->get_up_child());
-      mutex_heap_.unlock();
-    }
+    return {node_status_t::NUMERICAL, rounding_direction_t::NONE};
   }
 }
 
 template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::explore_subtree(i_t task_id,
-                                                   mip_node_t<i_t, f_t>* start_node,
-                                                   search_tree_t<i_t, f_t>& search_tree,
-                                                   lp_problem_t<i_t, f_t>& leaf_problem,
-                                                   bounds_strengthening_t<i_t, f_t>& node_presolver,
-                                                   basis_update_mpf_t<i_t, f_t>& basis_factors,
-                                                   std::vector<i_t>& basic_list,
-                                                   std::vector<i_t>& nonbasic_list)
+void branch_and_bound_t<i_t, f_t>::plunge_with(bnb_worker_data_t<i_t, f_t>* worker_data)
 {
-  bool recompute_bounds_and_basis = true;
   std::deque<mip_node_t<i_t, f_t>*> stack;
-  stack.push_front(start_node);
-
-  while (stack.size() > 0 && solver_status_ == mip_exploration_status_t::RUNNING) {
-    if (task_id == 0) { repair_heuristic_solutions(); }
+  stack.push_front(worker_data->start_node);
+  worker_data->recompute_basis  = true;
+  worker_data->recompute_bounds = true;
 
+  while (stack.size() > 0 && solver_status_ == mip_status_t::UNSET) {
     mip_node_t<i_t, f_t>* node_ptr = stack.front();
     stack.pop_front();
 
     f_t lower_bound = node_ptr->lower_bound;
-    f_t upper_bound = get_upper_bound();
-    f_t abs_gap     = upper_bound - lower_bound;
+    f_t upper_bound = upper_bound_;
     f_t rel_gap     = user_relative_gap(original_lp_, upper_bound, lower_bound);
 
     // This is based on three assumptions:
@@ -916,80 +824,48 @@ void branch_and_bound_t<i_t, f_t>::explore_subtree(i_t task_id,
     // - The current node and its siblings uses the lower bound of the parent before solving the LP
     // relaxation
     // - The lower bound of the parent is lower or equal to its children
-    assert(task_id < local_lower_bounds_.size());
-    local_lower_bounds_[task_id] = lower_bound;
+    worker_data->lower_bound = lower_bound;
 
     if (lower_bound > upper_bound || rel_gap < settings_.relative_mip_gap_tol) {
-      search_tree.graphviz_node(settings_.log, node_ptr, "cutoff", node_ptr->lower_bound);
-      search_tree.update(node_ptr, node_status_t::FATHOMED);
+      search_tree_.graphviz_node(settings_.log, node_ptr, "cutoff", node_ptr->lower_bound);
+      search_tree_.update(node_ptr, node_status_t::FATHOMED);
+      worker_data->recompute_basis  = true;
+      worker_data->recompute_bounds = true;
       --exploration_stats_.nodes_unexplored;
       continue;
     }
 
-    f_t now = toc(exploration_stats_.start_time);
-
-    if (task_id == 0) {
-      f_t time_since_last_log =
-        exploration_stats_.last_log == 0 ? 1.0 : toc(exploration_stats_.last_log);
-
-      if (((exploration_stats_.nodes_since_last_log >= 1000 ||
-            abs_gap < 10 * settings_.absolute_mip_gap_tol) &&
-           time_since_last_log >= 1) ||
-          (time_since_last_log > 30) || now > settings_.time_limit) {
-        f_t obj              = compute_user_objective(original_lp_, upper_bound);
-        f_t user_lower       = compute_user_objective(original_lp_, get_lower_bound());
-        std::string gap_user = user_mip_gap<f_t>(obj, user_lower);
-        i_t nodes_explored   = exploration_stats_.nodes_explored;
-        i_t nodes_unexplored = exploration_stats_.nodes_unexplored;
-
-        settings_.log.printf(
-          " %10d   %10lu    %+13.6e    %+10.6e   %6d   %7.1e     %s %9.2f\n",
-          nodes_explored,
-          nodes_unexplored,
-          obj,
-          user_lower,
-          node_ptr->depth,
-          nodes_explored > 0 ? exploration_stats_.total_lp_iters / nodes_explored : 0,
-          gap_user.c_str(),
-          now);
-        exploration_stats_.last_log             = tic();
-        exploration_stats_.nodes_since_last_log = 0;
-      }
+    if (toc(exploration_stats_.start_time) > settings_.time_limit) {
+      solver_status_ = mip_status_t::TIME_LIMIT;
+      break;
     }
 
-    if (now > settings_.time_limit) {
-      solver_status_ = mip_exploration_status_t::TIME_LIMIT;
-      return;
-    }
     if (exploration_stats_.nodes_explored >= settings_.node_limit) {
-      solver_status_ = mip_exploration_status_t::NODE_LIMIT;
-      return;
+      solver_status_ = mip_status_t::NODE_LIMIT;
+      break;
     }
 
-    node_solve_info_t status = solve_node(node_ptr,
-                                          search_tree,
-                                          leaf_problem,
-                                          basis_factors,
-                                          basic_list,
-                                          nonbasic_list,
-                                          node_presolver,
-                                          thread_type_t::EXPLORATION,
-                                          recompute_bounds_and_basis,
-                                          original_lp_.lower,
-                                          original_lp_.upper,
-                                          settings_.log);
-
-    recompute_bounds_and_basis = !has_children(status);
+    dual::status_t lp_status =
+      solve_node_lp(node_ptr, worker_data, exploration_stats_, settings_.log);
+
+    if (lp_status == dual::status_t::TIME_LIMIT) {
+      solver_status_ = mip_status_t::TIME_LIMIT;
+      break;
+    } else if (lp_status == dual::status_t::ITERATION_LIMIT) {
+      break;
+    }
 
     ++exploration_stats_.nodes_since_last_log;
     ++exploration_stats_.nodes_explored;
     --exploration_stats_.nodes_unexplored;
 
-    if (status == node_solve_info_t::TIME_LIMIT) {
-      solver_status_ = mip_exploration_status_t::TIME_LIMIT;
-      return;
+    auto [node_status, round_dir] =
+      update_tree(node_ptr, search_tree_, worker_data, lp_status, settings_.log);
+
+    worker_data->recompute_basis  = node_status != node_status_t::HAS_CHILDREN;
+    worker_data->recompute_bounds = node_status != node_status_t::HAS_CHILDREN;
 
-    } else if (has_children(status)) {
+    if (node_status == node_status_t::HAS_CHILDREN) {
       // The stack should only contain the children of the current parent.
       // If the stack size is greater than 0,
       // we pop the current node from the stack and place it in the global heap,
@@ -997,217 +873,109 @@ void branch_and_bound_t<i_t, f_t>::explore_subtree(i_t task_id,
       if (stack.size() > 0) {
         mip_node_t<i_t, f_t>* node = stack.back();
         stack.pop_back();
-
-        // The order here matters. We want to create a copy of the node
-        // before adding to the global heap. Otherwise,
-        // some thread may consume the node (possibly fathoming it)
-        // before we had the chance to add to the diving queue.
-        // This lead to a SIGSEGV. Although, in this case, it
-        // would be better if we discard the node instead.
-        if (get_heap_size() > settings_.num_bfs_threads) {
-          std::vector<f_t> lower = original_lp_.lower;
-          std::vector<f_t> upper = original_lp_.upper;
-          std::fill(
-            node_presolver.bounds_changed.begin(), node_presolver.bounds_changed.end(), false);
-          node->get_variable_bounds(lower, upper, node_presolver.bounds_changed);
-
-          mutex_dive_queue_.lock();
-          diving_queue_.emplace(node->detach_copy(), std::move(lower), std::move(upper));
-          mutex_dive_queue_.unlock();
-        }
-
-        mutex_heap_.lock();
-        heap_.push(node);
-        mutex_heap_.unlock();
+        node_queue_.push(node);
       }
 
       exploration_stats_.nodes_unexplored += 2;
 
-      if (status == node_solve_info_t::UP_CHILD_FIRST) {
-        stack.push_front(node_ptr->get_down_child());
+      if (round_dir == rounding_direction_t::UP) {
+        if (node_queue_.best_first_queue_size() < min_node_queue_size_) {
+          node_queue_.push(node_ptr->get_down_child());
+        } else {
+          stack.push_front(node_ptr->get_down_child());
+        }
+
         stack.push_front(node_ptr->get_up_child());
       } else {
-        stack.push_front(node_ptr->get_up_child());
+        if (node_queue_.best_first_queue_size() < min_node_queue_size_) {
+          node_queue_.push(node_ptr->get_up_child());
+        } else {
+          stack.push_front(node_ptr->get_up_child());
+        }
+
         stack.push_front(node_ptr->get_down_child());
       }
     }
   }
+
+  worker_pool_.return_worker_to_pool(worker_data);
+  active_workers_per_type[EXPLORATION]--;
 }
 
 template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::best_first_thread(i_t task_id,
-                                                     search_tree_t<i_t, f_t>& search_tree,
-                                                     const csr_matrix_t<i_t, f_t>& Arow)
+void branch_and_bound_t<i_t, f_t>::dive_with(bnb_worker_data_t<i_t, f_t>* worker_data)
 {
-  f_t lower_bound = -inf;
-  f_t upper_bound = inf;
-  f_t abs_gap     = inf;
-  f_t rel_gap     = inf;
-
-  // Make a copy of the original LP. We will modify its bounds at each leaf
-  lp_problem_t<i_t, f_t> leaf_problem = original_lp_;
-  std::vector<char> row_sense;
-  bounds_strengthening_t<i_t, f_t> node_presolver(leaf_problem, Arow, row_sense, var_types_);
-
-  const i_t m = leaf_problem.num_rows;
-  basis_update_mpf_t<i_t, f_t> basis_factors(m, settings_.refactor_frequency);
-  std::vector<i_t> basic_list(m);
-  std::vector<i_t> nonbasic_list;
-
-  while (solver_status_ == mip_exploration_status_t::RUNNING &&
-         abs_gap > settings_.absolute_mip_gap_tol && rel_gap > settings_.relative_mip_gap_tol &&
-         (active_subtrees_ > 0 || get_heap_size() > 0)) {
-    mip_node_t<i_t, f_t>* start_node = nullptr;
-
-    // If there any node left in the heap, we pop the top node and explore it.
-    mutex_heap_.lock();
-    if (heap_.size() > 0) {
-      start_node = heap_.top();
-      heap_.pop();
-      active_subtrees_++;
-    }
-    mutex_heap_.unlock();
-
-    if (start_node != nullptr) {
-      if (get_upper_bound() < start_node->lower_bound) {
-        // This node was put on the heap earlier but its lower bound is now greater than the
-        // current upper bound
-        search_tree.graphviz_node(settings_.log, start_node, "cutoff", start_node->lower_bound);
-        search_tree.update(start_node, node_status_t::FATHOMED);
-        active_subtrees_--;
-        continue;
-      }
+  logger_t log;
+  log.log = false;
 
-      // Best-first search with plunging
-      explore_subtree(task_id,
-                      start_node,
-                      search_tree,
-                      leaf_problem,
-                      node_presolver,
-                      basis_factors,
-                      basic_list,
-                      nonbasic_list);
-
-      active_subtrees_--;
-    }
+  bnb_worker_type_t diving_type = worker_data->worker_type;
+  const i_t node_limit          = settings_.diving_settings.node_limit;
+  const i_t backtrack           = settings_.diving_settings.backtrack;
 
-    lower_bound = get_lower_bound();
-    upper_bound = get_upper_bound();
-    abs_gap     = upper_bound - lower_bound;
-    rel_gap     = user_relative_gap(original_lp_, upper_bound, lower_bound);
-  }
+  worker_data->recompute_basis  = true;
+  worker_data->recompute_bounds = true;
 
-  // Check if it is the last thread that exited the loop and no
-  // timeout or numerical error has happen.
-  if (solver_status_ == mip_exploration_status_t::RUNNING) {
-    if (active_subtrees_ == 0) {
-      solver_status_ = mip_exploration_status_t::COMPLETED;
-    } else {
-      local_lower_bounds_[task_id] = inf;
-    }
-  }
-}
+  search_tree_t<i_t, f_t> dive_tree(std::move(*worker_data->start_node));
+  std::deque<mip_node_t<i_t, f_t>*> stack;
+  stack.push_front(&dive_tree.root);
 
-template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::diving_thread(const csr_matrix_t<i_t, f_t>& Arow)
-{
-  logger_t log;
-  log.log = false;
-  // Make a copy of the original LP. We will modify its bounds at each leaf
-  lp_problem_t<i_t, f_t> leaf_problem = original_lp_;
-  std::vector<char> row_sense;
-  bounds_strengthening_t<i_t, f_t> node_presolver(leaf_problem, Arow, row_sense, var_types_);
-
-  const i_t m = leaf_problem.num_rows;
-  basis_update_mpf_t<i_t, f_t> basis_factors(m, settings_.refactor_frequency);
-  std::vector<i_t> basic_list(m);
-  std::vector<i_t> nonbasic_list;
-
-  while (solver_status_ == mip_exploration_status_t::RUNNING &&
-         (active_subtrees_ > 0 || get_heap_size() > 0)) {
-    std::optional<diving_root_t<i_t, f_t>> start_node;
-
-    mutex_dive_queue_.lock();
-    if (diving_queue_.size() > 0) { start_node = diving_queue_.pop(); }
-    mutex_dive_queue_.unlock();
-
-    if (start_node.has_value()) {
-      if (get_upper_bound() < start_node->node.lower_bound) { continue; }
-
-      bool recompute_bounds_and_basis = true;
-      i_t nodes_explored              = 0;
-      search_tree_t<i_t, f_t> subtree(std::move(start_node->node));
-      std::deque<mip_node_t<i_t, f_t>*> stack;
-      stack.push_front(&subtree.root);
-
-      while (stack.size() > 0 && solver_status_ == mip_exploration_status_t::RUNNING) {
-        mip_node_t<i_t, f_t>* node_ptr = stack.front();
-        stack.pop_front();
-        f_t upper_bound = get_upper_bound();
-        f_t rel_gap     = user_relative_gap(original_lp_, upper_bound, node_ptr->lower_bound);
-
-        if (node_ptr->lower_bound > upper_bound || rel_gap < settings_.relative_mip_gap_tol) {
-          recompute_bounds_and_basis = true;
-          continue;
-        }
+  bnb_stats_t<i_t, f_t> dive_stats;
+  dive_stats.total_lp_iters      = 0;
+  dive_stats.total_lp_solve_time = 0;
+  dive_stats.nodes_explored      = 0;
+  dive_stats.nodes_unexplored    = 0;
 
-        if (toc(exploration_stats_.start_time) > settings_.time_limit) { return; }
+  while (stack.size() > 0 && solver_status_ == mip_status_t::UNSET) {
+    mip_node_t<i_t, f_t>* node_ptr = stack.front();
+    stack.pop_front();
 
-        if (nodes_explored >= 1000) { break; }
+    f_t lower_bound          = node_ptr->lower_bound;
+    f_t upper_bound          = upper_bound_;
+    f_t rel_gap              = user_relative_gap(original_lp_, upper_bound, lower_bound);
+    worker_data->lower_bound = lower_bound;
 
-        node_solve_info_t status = solve_node(node_ptr,
-                                              subtree,
-                                              leaf_problem,
-                                              basis_factors,
-                                              basic_list,
-                                              nonbasic_list,
-                                              node_presolver,
-                                              thread_type_t::DIVING,
-                                              recompute_bounds_and_basis,
-                                              start_node->lower,
-                                              start_node->upper,
-                                              log);
+    if (node_ptr->lower_bound > upper_bound || rel_gap < settings_.relative_mip_gap_tol) {
+      worker_data->recompute_basis  = true;
+      worker_data->recompute_bounds = true;
+      continue;
+    }
 
-        nodes_explored++;
+    if (toc(exploration_stats_.start_time) > settings_.time_limit) { break; }
+    if (dive_stats.nodes_explored > node_limit) { break; }
 
-        recompute_bounds_and_basis = !has_children(status);
+    dual::status_t lp_status = solve_node_lp(node_ptr, worker_data, dive_stats, log);
 
-        if (status == node_solve_info_t::TIME_LIMIT) {
-          solver_status_ = mip_exploration_status_t::TIME_LIMIT;
-          return;
+    if (lp_status == dual::status_t::TIME_LIMIT) {
+      solver_status_ = mip_status_t::TIME_LIMIT;
+      break;
+    } else if (lp_status == dual::status_t::ITERATION_LIMIT) {
+      break;
+    }
 
-        } else if (has_children(status)) {
-          if (status == node_solve_info_t::UP_CHILD_FIRST) {
-            stack.push_front(node_ptr->get_down_child());
-            stack.push_front(node_ptr->get_up_child());
-          } else {
-            stack.push_front(node_ptr->get_up_child());
-            stack.push_front(node_ptr->get_down_child());
-          }
-        }
+    ++dive_stats.nodes_explored;
 
-        if (stack.size() > 1) {
-          // If the diving thread is consuming the nodes faster than the
-          // best first search, then we split the current subtree at the
-          // lowest possible point and move to the queue, so it can
-          // be picked by another thread.
-          if (std::lock_guard<omp_mutex_t> lock(mutex_dive_queue_);
-              diving_queue_.size() < min_diving_queue_size_) {
-            mip_node_t<i_t, f_t>* new_node = stack.back();
-            stack.pop_back();
-
-            std::vector<f_t> lower = start_node->lower;
-            std::vector<f_t> upper = start_node->upper;
-            std::fill(
-              node_presolver.bounds_changed.begin(), node_presolver.bounds_changed.end(), false);
-            new_node->get_variable_bounds(lower, upper, node_presolver.bounds_changed);
-
-            diving_queue_.emplace(new_node->detach_copy(), std::move(lower), std::move(upper));
-          }
-        }
+    auto [node_status, round_dir] = update_tree(node_ptr, dive_tree, worker_data, lp_status, log);
+
+    worker_data->recompute_basis  = node_status != node_status_t::HAS_CHILDREN;
+    worker_data->recompute_bounds = node_status != node_status_t::HAS_CHILDREN;
+
+    if (node_status == node_status_t::HAS_CHILDREN) {
+      if (round_dir == rounding_direction_t::UP) {
+        stack.push_front(node_ptr->get_down_child());
+        stack.push_front(node_ptr->get_up_child());
+      } else {
+        stack.push_front(node_ptr->get_up_child());
+        stack.push_front(node_ptr->get_down_child());
       }
     }
+
+    if (stack.size() > 1 && stack.front()->depth - stack.back()->depth > backtrack) {
+      stack.pop_back();
+    }
   }
+
+  worker_pool_.return_worker_to_pool(worker_data);
+  active_workers_per_type[diving_type]--;
 }
 
 template <typename i_t, typename f_t>
@@ -1291,9 +1059,11 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
   logger_t log;
   log.log                             = false;
   log.log_prefix                      = settings_.log.log_prefix;
-  solver_status_                      = mip_exploration_status_t::UNSET;
+  solver_status_                      = mip_status_t::UNSET;
+  is_running                          = false;
   exploration_stats_.nodes_unexplored = 0;
   exploration_stats_.nodes_explored   = 0;
+  original_lp_.A.to_compressed_row(Arow_);
 
   if (guess_.size() != 0) {
     std::vector<f_t> crushed_guess;
@@ -1355,15 +1125,15 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
   }
 
   if (root_status == lp_status_t::TIME_LIMIT) {
-    solver_status_ = mip_exploration_status_t::TIME_LIMIT;
-    return set_final_solution(solution, -inf);
+    solver_status_ = mip_status_t::TIME_LIMIT;
+    set_final_solution(solution, -inf);
+    return solver_status_;
   }
 
   assert(root_vstatus_.size() == original_lp_.num_cols);
   set_uninitialized_steepest_edge_norms<i_t, f_t>(edge_norms_);
 
   root_objective_ = compute_objective(original_lp_, root_relax_soln_.x);
-  local_lower_bounds_.assign(settings_.num_bfs_threads, root_objective_);
 
   if (settings_.set_simplex_solution_callback != nullptr) {
     std::vector<f_t> original_x;
@@ -1421,8 +1191,9 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
                              pc_);
 
   if (toc(exploration_stats_.start_time) > settings_.time_limit) {
-    solver_status_ = mip_exploration_status_t::TIME_LIMIT;
-    return set_final_solution(solution, root_objective_);
+    solver_status_ = mip_status_t::TIME_LIMIT;
+    set_final_solution(solution, root_objective_);
+    return solver_status_;
   }
 
   // Choose variable to branch on
@@ -1437,62 +1208,185 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
                       root_vstatus_,
                       original_lp_,
                       log);
+  node_queue_.push(search_tree_.root.get_down_child());
+  node_queue_.push(search_tree_.root.get_up_child());
+
+  diving_heuristics_settings_t<i_t, f_t> diving_settings = settings_.diving_settings;
+  bool is_ramp_up_finished                               = false;
 
-  csr_matrix_t<i_t, f_t> Arow(1, 1, 0);
-  original_lp_.A.to_compressed_row(Arow);
+  std::vector<bnb_worker_type_t> worker_types = {EXPLORATION};
+  std::array<i_t, bnb_num_worker_types> max_num_workers_per_type;
+  max_num_workers_per_type.fill(0);
+  max_num_workers_per_type[EXPLORATION] = settings_.num_threads;
+  worker_pool_.init(2 * settings_.num_threads, original_lp_, Arow_, var_types_, settings_);
+  active_workers_per_type.fill(0);
 
   settings_.log.printf("Exploring the B&B tree using %d threads (best-first = %d, diving = %d)\n",
                        settings_.num_threads,
-                       settings_.num_bfs_threads,
-                       settings_.num_diving_threads);
-
-  settings_.log.printf(
-    " | Explored | Unexplored |    Objective    |     Bound     | Depth | Iter/Node |   Gap    "
-    "|  Time  |\n");
+                       settings_.num_bfs_workers,
+                       settings_.num_threads - settings_.num_bfs_workers);
 
-  exploration_stats_.nodes_explored       = 0;
+  exploration_stats_.nodes_explored       = 1;
   exploration_stats_.nodes_unexplored     = 2;
   exploration_stats_.nodes_since_last_log = 0;
-  exploration_stats_.last_log             = tic();
-  active_subtrees_                        = 0;
-  min_diving_queue_size_                  = 4 * settings_.num_diving_threads;
-  solver_status_                          = mip_exploration_status_t::RUNNING;
+  exploration_stats_.last_log             = 0.0;
+  is_running                              = true;
   lower_bound_ceiling_                    = inf;
-  should_report_                          = true;
+  min_node_queue_size_                    = 2 * settings_.num_threads;
+
+  settings_.log.printf(
+    "  | Explored | Unexplored |    Objective    |     Bound     | Depth | Iter/Node |   Gap    "
+    "|  Time  |\n");
 
 #pragma omp parallel num_threads(settings_.num_threads)
   {
 #pragma omp master
     {
-      auto down_child  = search_tree_.root.get_down_child();
-      auto up_child    = search_tree_.root.get_up_child();
-      i_t initial_size = 2 * settings_.num_threads;
+      f_t lower_bound     = get_lower_bound();
+      f_t abs_gap         = upper_bound_ - lower_bound;
+      f_t rel_gap         = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound);
+      i_t last_node_depth = 0;
+
+      while (solver_status_ == mip_status_t::UNSET && abs_gap > settings_.absolute_mip_gap_tol &&
+             rel_gap > settings_.relative_mip_gap_tol &&
+             (active_workers_per_type[0] > 0 || node_queue_.best_first_queue_size() > 0)) {
+        bool launched_any_task = false;
+        lower_bound            = get_lower_bound();
+        abs_gap                = upper_bound_ - lower_bound;
+        rel_gap                = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound);
+
+        repair_heuristic_solutions();
+
+        if (!is_ramp_up_finished) {
+          if (node_queue_.best_first_queue_size() >= min_node_queue_size_) {
+            if (!std::isfinite(upper_bound_)) { diving_settings.disable_guided_diving = true; }
+            max_num_workers_per_type =
+              bnb_get_num_workers_round_robin(settings_.num_threads, diving_settings);
+            worker_types        = bnb_get_worker_types(diving_settings);
+            is_ramp_up_finished = true;
+
+#ifdef CUOPT_LOG_DEBUG
+            settings_.log.debug(
+              "Ramp-up phase is finished. num active workers = %d, heap size = %d\n",
+              active_workers_per_type[EXPLORATION],
+              node_queue_.best_first_queue_size());
+
+            for (auto type : worker_types) {
+              settings_.log.debug("%s: max num of workers = %d",
+                                  feasible_solution_symbol(type),
+                                  max_num_workers_per_type[type]);
+            }
+#endif
+          }
+        }
 
-#pragma omp task
-      exploration_ramp_up(down_child, &search_tree_, Arow, initial_size);
+        // If the guided diving was disabled previously due to the lack of an incumbent solution,
+        // re-enable as soon as a new incumbent is found.
+        if (settings_.diving_settings.disable_guided_diving !=
+            diving_settings.disable_guided_diving) {
+          if (std::isfinite(upper_bound_)) {
+            diving_settings.disable_guided_diving = settings_.diving_settings.disable_guided_diving;
+            max_num_workers_per_type =
+              bnb_get_num_workers_round_robin(settings_.num_threads, diving_settings);
+            worker_types = bnb_get_worker_types(diving_settings);
+
+#ifdef CUOPT_LOG_DEBUG
+            for (auto type : worker_types) {
+              settings_.log.debug("%s: max num of workers = %d",
+                                  feasible_solution_symbol(type),
+                                  max_num_workers_per_type[type]);
+            }
+#endif
+          }
+        }
 
-#pragma omp task
-      exploration_ramp_up(up_child, &search_tree_, Arow, initial_size);
-    }
+        f_t now = toc(exploration_stats_.start_time);
+        f_t time_since_last_log =
+          exploration_stats_.last_log == 0 ? 1.0 : toc(exploration_stats_.last_log);
+        i_t nodes_since_last_log = exploration_stats_.nodes_since_last_log;
+
+        if (((nodes_since_last_log >= 1000 || abs_gap < 10 * settings_.absolute_mip_gap_tol) &&
+             time_since_last_log >= 1) ||
+            (time_since_last_log > 30) || now > settings_.time_limit) {
+          i_t depth = node_queue_.best_first_queue_size() > 0 ? node_queue_.bfs_top()->depth
+                                                              : last_node_depth;
+          report("  ", upper_bound_, lower_bound, depth);
+          exploration_stats_.last_log             = tic();
+          exploration_stats_.nodes_since_last_log = 0;
+        }
 
-#pragma omp barrier
+        if (now > settings_.time_limit) {
+          solver_status_ = mip_status_t::TIME_LIMIT;
+          break;
+        }
 
-#pragma omp master
-    {
-      for (i_t i = 0; i < settings_.num_bfs_threads; i++) {
-#pragma omp task
-        best_first_thread(i, search_tree_, Arow);
-      }
+        for (auto type : worker_types) {
+          if (active_workers_per_type[type] >= max_num_workers_per_type[type]) { continue; }
+
+          // Get an idle worker.
+          bnb_worker_data_t<i_t, f_t>* worker = worker_pool_.get_idle_worker();
+          if (worker == nullptr) { break; }
+
+          if (type == EXPLORATION) {
+            // If there any node left in the heap, we pop the top node and explore it.
+            std::optional<mip_node_t<i_t, f_t>*> start_node = node_queue_.pop_best_first();
+
+            if (!start_node.has_value()) { continue; }
+            if (upper_bound_ < start_node.value()->lower_bound) {
+              // This node was put on the heap earlier but its lower bound is now greater than the
+              // current upper bound
+              search_tree_.graphviz_node(
+                settings_.log, start_node.value(), "cutoff", start_node.value()->lower_bound);
+              search_tree_.update(start_node.value(), node_status_t::FATHOMED);
+              continue;
+            }
+
+            // Remove the worker from the idle list.
+            worker_pool_.pop_idle_worker();
+            worker->init_best_first(start_node.value(), original_lp_);
+            last_node_depth = start_node.value()->depth;
+            active_workers_per_type[type]++;
+            launched_any_task = true;
+
+#pragma omp task affinity(worker)
+            plunge_with(worker);
+
+          } else {
+            std::optional<mip_node_t<i_t, f_t>*> start_node = node_queue_.pop_diving();
+
+            if (!start_node.has_value()) { continue; }
+            if (upper_bound_ < start_node.value()->lower_bound ||
+                start_node.value()->depth < diving_settings.min_node_depth) {
+              continue;
+            }
+
+            bool is_feasible =
+              worker->init_diving(start_node.value(), type, original_lp_, settings_);
+            if (!is_feasible) { continue; }
+
+            // Remove the worker from the idle list.
+            worker_pool_.pop_idle_worker();
+            active_workers_per_type[type]++;
+            launched_any_task = true;
+
+#pragma omp task affinity(worker)
+            dive_with(worker);
+          }
+        }
 
-      for (i_t i = 0; i < settings_.num_diving_threads; i++) {
-#pragma omp task
-        diving_thread(Arow);
+        // If no new task was launched in this iteration, suspend temporarily the
+        // execution of the master. As of 8/Jan/2026, GCC does not
+        // implement taskyield, but LLVM does.
+        if (!launched_any_task) { std::this_thread::sleep_for(std::chrono::milliseconds(1)); }
       }
     }
   }
 
-  f_t lower_bound = heap_.size() > 0 ? heap_.top()->lower_bound : search_tree_.root.lower_bound;
-  return set_final_solution(solution, lower_bound);
+  is_running      = false;
+  f_t lower_bound = node_queue_.best_first_queue_size() > 0 ? node_queue_.get_lower_bound()
+                                                            : search_tree_.root.lower_bound;
+  set_final_solution(solution, lower_bound);
+  return solver_status_;
 }
 
 #ifdef DUAL_SIMPLEX_INSTANTIATE_DOUBLE
diff --git a/cpp/src/dual_simplex/branch_and_bound.hpp b/cpp/src/dual_simplex/branch_and_bound.hpp
index 38438cc9e..c4836b356 100644
--- a/cpp/src/dual_simplex/branch_and_bound.hpp
+++ b/cpp/src/dual_simplex/branch_and_bound.hpp
@@ -1,15 +1,17 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
 
 #pragma once
 
-#include <dual_simplex/diving_queue.hpp>
+#include <dual_simplex/bnb_worker.hpp>
+#include <dual_simplex/diving_heuristics.hpp>
 #include <dual_simplex/initial_basis.hpp>
 #include <dual_simplex/mip_node.hpp>
+#include <dual_simplex/node_queue.hpp>
 #include <dual_simplex/phase2.hpp>
 #include <dual_simplex/pseudo_costs.hpp>
 #include <dual_simplex/simplex_solver_settings.hpp>
@@ -20,7 +22,6 @@
 #include <utilities/omp_helpers.hpp>
 
 #include <omp.h>
-#include <queue>
 #include <vector>
 
 namespace cuopt::linear_programming::dual_simplex {
@@ -35,45 +36,12 @@ enum class mip_status_t {
   UNSET      = 6,  // The status is not set
 };
 
-enum class mip_exploration_status_t {
-  UNSET      = 0,  // The status is not set
-  TIME_LIMIT = 1,  // The solver reached a time limit
-  NODE_LIMIT = 2,  // The maximum number of nodes was reached (not implemented)
-  NUMERICAL  = 3,  // The solver encountered a numerical error
-  RUNNING    = 4,  // The solver is currently exploring the tree
-  COMPLETED  = 5,  // The solver finished exploring the tree
-};
-
-enum class node_solve_info_t {
-  NO_CHILDREN      = 0,  // The node does not produced children
-  UP_CHILD_FIRST   = 1,  // The up child should be explored first
-  DOWN_CHILD_FIRST = 2,  // The down child should be explored first
-  TIME_LIMIT       = 3,  // The solver reached a time limit
-  ITERATION_LIMIT  = 4,  // The solver reached a iteration limit
-  NUMERICAL        = 5   // The solver encounter a numerical error when solving the node
-};
-
-// Indicate the search and variable selection algorithms used by the thread (See [1]).
-//
-// [1] T. Achterberg, “Constraint Integer Programming,” PhD, Technischen Universität Berlin,
-// Berlin, 2007. doi: 10.14279/depositonce-1634.
-enum class thread_type_t {
-  EXPLORATION = 0,  // Best-First + Plunging. Pseudocost branching + Martin's criteria.
-  DIVING      = 1,
-};
-
-template <typename i_t, typename f_t>
-class bounds_strengthening_t;
-
 template <typename i_t, typename f_t>
 void upper_bound_callback(f_t upper_bound);
 
 template <typename i_t, typename f_t>
 class branch_and_bound_t {
  public:
-  template <typename T>
-  using mip_node_heap_t = std::priority_queue<T, std::vector<T>, node_compare_t<i_t, f_t>>;
-
   branch_and_bound_t(const user_problem_t<i_t, f_t>& user_problem,
                      const simplex_solver_settings_t<i_t, f_t>& solver_settings);
 
@@ -109,9 +77,7 @@ class branch_and_bound_t {
                        f_t& repaired_obj,
                        std::vector<f_t>& repaired_solution) const;
 
-  f_t get_upper_bound();
   f_t get_lower_bound();
-  i_t get_heap_size();
   bool enable_concurrent_lp_root_solve() const { return enable_concurrent_lp_root_solve_; }
   std::atomic<int>* get_root_concurrent_halt() { return &root_concurrent_halt_; }
   void set_root_concurrent_halt(int value) { root_concurrent_halt_ = value; }
@@ -128,34 +94,22 @@ class branch_and_bound_t {
   std::vector<f_t> guess_;
 
   // LP relaxation
+  csr_matrix_t<i_t, f_t> Arow_;
   lp_problem_t<i_t, f_t> original_lp_;
   std::vector<i_t> new_slacks_;
   std::vector<variable_type_t> var_types_;
 
-  // Local lower bounds for each thread
-  std::vector<omp_atomic_t<f_t>> local_lower_bounds_;
-
   // Mutex for upper bound
   omp_mutex_t mutex_upper_;
 
   // Global variable for upper bound
-  f_t upper_bound_;
+  omp_atomic_t<f_t> upper_bound_;
 
   // Global variable for incumbent. The incumbent should be updated with the upper bound
   mip_solution_t<i_t, f_t> incumbent_;
 
   // Structure with the general info of the solver.
-  struct stats_t {
-    f_t start_time                        = 0.0;
-    omp_atomic_t<f_t> total_lp_solve_time = 0.0;
-    omp_atomic_t<i_t> nodes_explored      = 0;
-    omp_atomic_t<i_t> nodes_unexplored    = 0;
-    omp_atomic_t<f_t> total_lp_iters      = 0;
-
-    // This should only be used by the main thread
-    omp_atomic_t<f_t> last_log             = 0.0;
-    omp_atomic_t<i_t> nodes_since_last_log = 0;
-  } exploration_stats_;
+  bnb_stats_t<i_t, f_t> exploration_stats_;
 
   // Mutex for repair
   omp_mutex_t mutex_repair_;
@@ -175,86 +129,74 @@ class branch_and_bound_t {
   // Pseudocosts
   pseudo_costs_t<i_t, f_t> pc_;
 
-  // Heap storing the nodes to be explored.
-  omp_mutex_t mutex_heap_;
-  mip_node_heap_t<mip_node_t<i_t, f_t>*> heap_;
+  // Heap storing the nodes waiting to be explored.
+  node_queue_t<i_t, f_t> node_queue_;
 
   // Search tree
   search_tree_t<i_t, f_t> search_tree_;
 
-  // Count the number of subtrees that are currently being explored.
-  omp_atomic_t<i_t> active_subtrees_;
+  // Count the number of workers per type that either are being executed or
+  // are waiting to be executed.
+  std::array<omp_atomic_t<i_t>, bnb_num_worker_types> active_workers_per_type;
 
-  // Queue for storing the promising node for performing dives.
-  omp_mutex_t mutex_dive_queue_;
-  diving_queue_t<i_t, f_t> diving_queue_;
-  i_t min_diving_queue_size_;
+  // Worker pool
+  bnb_worker_pool_t<i_t, f_t> worker_pool_;
 
   // Global status of the solver.
-  omp_atomic_t<mip_exploration_status_t> solver_status_;
+  omp_atomic_t<mip_status_t> solver_status_;
+  omp_atomic_t<bool> is_running{false};
 
-  omp_atomic_t<bool> should_report_;
+  // Minimum number of node in the queue. When the queue size is less than
+  // this variable, the nodes are added directly to the queue instead of
+  // the local stack. This also determines the end of the ramp-up phase.
+  i_t min_node_queue_size_;
 
   // In case, a best-first thread encounters a numerical issue when solving a node,
   // its blocks the progression of the lower bound.
   omp_atomic_t<f_t> lower_bound_ceiling_;
 
+  void report_heuristic(f_t obj);
+  void report(std::string symbol, f_t obj, f_t lower_bound, i_t node_depth);
+
   // Set the final solution.
-  mip_status_t set_final_solution(mip_solution_t<i_t, f_t>& solution, f_t lower_bound);
+  void set_final_solution(mip_solution_t<i_t, f_t>& solution, f_t lower_bound);
 
   // Update the incumbent solution with the new feasible solution
   // found during branch and bound.
   void add_feasible_solution(f_t leaf_objective,
                              const std::vector<f_t>& leaf_solution,
                              i_t leaf_depth,
-                             thread_type_t thread_type);
+                             bnb_worker_type_t thread_type);
 
   // Repairs low-quality solutions from the heuristics, if it is applicable.
   void repair_heuristic_solutions();
 
-  // Ramp-up phase of the solver, where we greedily expand the tree until
-  // there is enough unexplored nodes. This is done recursively using OpenMP tasks.
-  void exploration_ramp_up(mip_node_t<i_t, f_t>* node,
-                           search_tree_t<i_t, f_t>* search_tree,
-                           const csr_matrix_t<i_t, f_t>& Arow,
-                           i_t initial_heap_size);
-
-  // Explore the search tree using the best-first search with plunging strategy.
-  void explore_subtree(i_t task_id,
-                       mip_node_t<i_t, f_t>* start_node,
-                       search_tree_t<i_t, f_t>& search_tree,
-                       lp_problem_t<i_t, f_t>& leaf_problem,
-                       bounds_strengthening_t<i_t, f_t>& node_presolver,
-                       basis_update_mpf_t<i_t, f_t>& basis_update,
-                       std::vector<i_t>& basic_list,
-                       std::vector<i_t>& nonbasic_list);
-
-  // Each "main" thread pops a node from the global heap and then performs a plunge
-  // (i.e., a shallow dive) into the subtree determined by the node.
-  void best_first_thread(i_t task_id,
-                         search_tree_t<i_t, f_t>& search_tree,
-                         const csr_matrix_t<i_t, f_t>& Arow);
-
-  // Each diving thread pops the first node from the dive queue and then performs
-  // a deep dive into the subtree determined by the node.
-  void diving_thread(const csr_matrix_t<i_t, f_t>& Arow);
-
-  // Solve the LP relaxation of a leaf node and update the tree.
-  node_solve_info_t solve_node(mip_node_t<i_t, f_t>* node_ptr,
-                               search_tree_t<i_t, f_t>& search_tree,
-                               lp_problem_t<i_t, f_t>& leaf_problem,
-                               basis_update_mpf_t<i_t, f_t>& basis_factors,
-                               std::vector<i_t>& basic_list,
-                               std::vector<i_t>& nonbasic_list,
-                               bounds_strengthening_t<i_t, f_t>& node_presolver,
-                               thread_type_t thread_type,
-                               bool recompute_basis_and_bounds,
-                               const std::vector<f_t>& root_lower,
-                               const std::vector<f_t>& root_upper,
+  // Perform a plunge over a subtree using a given worker.
+  void plunge_with(bnb_worker_data_t<i_t, f_t>* worker_data);
+
+  // Perform a deep dive over a subtree using a given worker.
+  void dive_with(bnb_worker_data_t<i_t, f_t>* worker_data);
+
+  // Solve the LP relaxation of a leaf node
+  dual::status_t solve_node_lp(mip_node_t<i_t, f_t>* node_ptr,
+                               bnb_worker_data_t<i_t, f_t>* worker_data,
+                               bnb_stats_t<i_t, f_t>& stats,
                                logger_t& log);
 
-  // Sort the children based on the Martin's criteria.
-  rounding_direction_t child_selection(mip_node_t<i_t, f_t>* node_ptr);
+  // Update the tree based on the LP relaxation. Returns the status
+  // of the node and, if appropriated, the preferred rounding direction
+  // when visiting the children.
+  std::pair<node_status_t, rounding_direction_t> update_tree(
+    mip_node_t<i_t, f_t>* node_ptr,
+    search_tree_t<i_t, f_t>& search_tree,
+    bnb_worker_data_t<i_t, f_t>* worker_data,
+    dual::status_t lp_status,
+    logger_t& log);
+
+  // Selects the variable to branch on.
+  branch_variable_t<i_t> variable_selection(mip_node_t<i_t, f_t>* node_ptr,
+                                            const std::vector<i_t>& fractional,
+                                            bnb_worker_data_t<i_t, f_t>* worker_data);
 };
 
 }  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/crossover.cpp b/cpp/src/dual_simplex/crossover.cpp
index 23d9a0e8e..8ee3fb0ce 100644
--- a/cpp/src/dual_simplex/crossover.cpp
+++ b/cpp/src/dual_simplex/crossover.cpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -785,8 +785,15 @@ i_t primal_push(const lp_problem_t<i_t, f_t>& lp,
           factorize_basis(lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed);
         if (rank != m) {
           settings.log.debug("Failed to factorize basis. rank %d m %d\n", rank, m);
-          basis_repair(
-            lp.A, settings, deficient, slacks_needed, basic_list, nonbasic_list, vstatus);
+          basis_repair(lp.A,
+                       settings,
+                       lp.lower,
+                       lp.upper,
+                       deficient,
+                       slacks_needed,
+                       basic_list,
+                       nonbasic_list,
+                       vstatus);
           if (factorize_basis(
                 lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed) == -1) {
             settings.log.printf("Failed to factorize basis after repair. rank %d m %d\n", rank, m);
@@ -1132,7 +1139,15 @@ crossover_status_t crossover(const lp_problem_t<i_t, f_t>& lp,
   rank = factorize_basis(lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed);
   if (rank != m) {
     settings.log.debug("Failed to factorize basis. rank %d m %d\n", rank, m);
-    basis_repair(lp.A, settings, deficient, slacks_needed, basic_list, nonbasic_list, vstatus);
+    basis_repair(lp.A,
+                 settings,
+                 lp.lower,
+                 lp.upper,
+                 deficient,
+                 slacks_needed,
+                 basic_list,
+                 nonbasic_list,
+                 vstatus);
     if (factorize_basis(lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed) ==
         -1) {
       settings.log.printf("Failed to factorize basis after repair. rank %d m %d\n", rank, m);
@@ -1323,7 +1338,15 @@ crossover_status_t crossover(const lp_problem_t<i_t, f_t>& lp,
         factorize_basis(lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed);
       if (rank != m) {
         settings.log.debug("Failed to factorize basis. rank %d m %d\n", rank, m);
-        basis_repair(lp.A, settings, deficient, slacks_needed, basic_list, nonbasic_list, vstatus);
+        basis_repair(lp.A,
+                     settings,
+                     lp.lower,
+                     lp.upper,
+                     deficient,
+                     slacks_needed,
+                     basic_list,
+                     nonbasic_list,
+                     vstatus);
         if (factorize_basis(
               lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed) == -1) {
           settings.log.printf("Failed to factorize basis after repair. rank %d m %d\n", rank, m);
diff --git a/cpp/src/dual_simplex/diving_heuristics.cpp b/cpp/src/dual_simplex/diving_heuristics.cpp
new file mode 100644
index 000000000..b5e58a097
--- /dev/null
+++ b/cpp/src/dual_simplex/diving_heuristics.cpp
@@ -0,0 +1,302 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/* clang-format on */
+
+#include <dual_simplex/diving_heuristics.hpp>
+#include <tuple>
+
+namespace cuopt::linear_programming::dual_simplex {
+
+template <typename i_t, typename f_t>
+branch_variable_t<i_t> line_search_diving(const std::vector<i_t>& fractional,
+                                          const std::vector<f_t>& solution,
+                                          const std::vector<f_t>& root_solution,
+                                          logger_t& log)
+{
+  constexpr f_t eps              = 1e-6;
+  i_t branch_var                 = -1;
+  f_t min_score                  = std::numeric_limits<f_t>::max();
+  rounding_direction_t round_dir = rounding_direction_t::NONE;
+
+  for (auto j : fractional) {
+    f_t score                = inf;
+    rounding_direction_t dir = rounding_direction_t::NONE;
+
+    if (solution[j] < root_solution[j] - eps) {
+      f_t f = solution[j] - std::floor(solution[j]);
+      f_t d = root_solution[j] - solution[j];
+      score = f / d;
+      dir   = rounding_direction_t::DOWN;
+
+    } else if (solution[j] > root_solution[j] + eps) {
+      f_t f = std::ceil(solution[j]) - solution[j];
+      f_t d = solution[j] - root_solution[j];
+      score = f / d;
+      dir   = rounding_direction_t::UP;
+    }
+
+    if (min_score > score) {
+      min_score  = score;
+      branch_var = j;
+      round_dir  = dir;
+    }
+  }
+
+  // If the current solution is equal to the root solution, arbitrarily
+  // set the branch variable to the first fractional variable and round it down
+  if (round_dir == rounding_direction_t::NONE) {
+    branch_var = fractional[0];
+    round_dir  = rounding_direction_t::DOWN;
+  }
+
+  assert(round_dir != rounding_direction_t::NONE);
+  assert(branch_var >= 0);
+
+  log.debug("Line search diving: selected var %d with val = %e, round dir = %d and score = %e\n",
+            branch_var,
+            solution[branch_var],
+            round_dir,
+            min_score);
+
+  return {branch_var, round_dir};
+}
+
+template <typename i_t, typename f_t>
+branch_variable_t<i_t> pseudocost_diving(pseudo_costs_t<i_t, f_t>& pc,
+                                         const std::vector<i_t>& fractional,
+                                         const std::vector<f_t>& solution,
+                                         const std::vector<f_t>& root_solution,
+                                         logger_t& log)
+{
+  i_t branch_var                 = -1;
+  f_t max_score                  = std::numeric_limits<f_t>::lowest();
+  rounding_direction_t round_dir = rounding_direction_t::NONE;
+  constexpr f_t eps              = 1e-6;
+
+  i_t num_initialized_down;
+  i_t num_initialized_up;
+  f_t pseudo_cost_down_avg;
+  f_t pseudo_cost_up_avg;
+  pc.initialized(
+    num_initialized_down, num_initialized_up, pseudo_cost_down_avg, pseudo_cost_up_avg);
+
+  for (auto j : fractional) {
+    rounding_direction_t dir = rounding_direction_t::NONE;
+    f_t f_down               = solution[j] - std::floor(solution[j]);
+    f_t f_up                 = std::ceil(solution[j]) - solution[j];
+
+    pc.pseudo_cost_mutex[j].lock();
+    f_t pc_down = pc.pseudo_cost_num_down[j] != 0
+                    ? pc.pseudo_cost_sum_down[j] / pc.pseudo_cost_num_down[j]
+                    : pseudo_cost_down_avg;
+
+    f_t pc_up = pc.pseudo_cost_num_up[j] != 0 ? pc.pseudo_cost_sum_up[j] / pc.pseudo_cost_num_up[j]
+                                              : pseudo_cost_up_avg;
+    pc.pseudo_cost_mutex[j].unlock();
+
+    f_t score_down = std::sqrt(f_up) * (1 + pc_up) / (1 + pc_down);
+    f_t score_up   = std::sqrt(f_down) * (1 + pc_down) / (1 + pc_up);
+    f_t score      = 0;
+
+    if (solution[j] < root_solution[j] - 0.4) {
+      score = score_down;
+      dir   = rounding_direction_t::DOWN;
+    } else if (solution[j] > root_solution[j] + 0.4) {
+      score = score_up;
+      dir   = rounding_direction_t::UP;
+    } else if (f_down < 0.3) {
+      score = score_down;
+      dir   = rounding_direction_t::DOWN;
+    } else if (f_down > 0.7) {
+      score = score_up;
+      dir   = rounding_direction_t::UP;
+    } else if (pc_down < pc_up + eps) {
+      score = score_down;
+      dir   = rounding_direction_t::DOWN;
+    } else {
+      score = score_up;
+      dir   = rounding_direction_t::UP;
+    }
+
+    if (score > max_score) {
+      max_score  = score;
+      branch_var = j;
+      round_dir  = dir;
+    }
+  }
+
+  assert(round_dir != rounding_direction_t::NONE);
+  assert(branch_var >= 0);
+
+  log.debug("Pseudocost diving: selected var %d with val = %e, round dir = %d and score = %e\n",
+            branch_var,
+            solution[branch_var],
+            round_dir,
+            max_score);
+
+  return {branch_var, round_dir};
+}
+
+template <typename i_t, typename f_t>
+branch_variable_t<i_t> guided_diving(pseudo_costs_t<i_t, f_t>& pc,
+                                     const std::vector<i_t>& fractional,
+                                     const std::vector<f_t>& solution,
+                                     const std::vector<f_t>& incumbent,
+                                     logger_t& log)
+{
+  i_t branch_var                 = -1;
+  f_t max_score                  = std::numeric_limits<f_t>::lowest();
+  rounding_direction_t round_dir = rounding_direction_t::NONE;
+  constexpr f_t eps              = 1e-6;
+
+  i_t num_initialized_down;
+  i_t num_initialized_up;
+  f_t pseudo_cost_down_avg;
+  f_t pseudo_cost_up_avg;
+  pc.initialized(
+    num_initialized_down, num_initialized_up, pseudo_cost_down_avg, pseudo_cost_up_avg);
+
+  for (auto j : fractional) {
+    f_t f_down    = solution[j] - std::floor(solution[j]);
+    f_t f_up      = std::ceil(solution[j]) - solution[j];
+    f_t down_dist = std::abs(incumbent[j] - std::floor(solution[j]));
+    f_t up_dist   = std::abs(std::ceil(solution[j]) - incumbent[j]);
+    rounding_direction_t dir =
+      down_dist < up_dist + eps ? rounding_direction_t::DOWN : rounding_direction_t::UP;
+
+    pc.pseudo_cost_mutex[j].lock();
+    f_t pc_down = pc.pseudo_cost_num_down[j] != 0
+                    ? pc.pseudo_cost_sum_down[j] / pc.pseudo_cost_num_down[j]
+                    : pseudo_cost_down_avg;
+
+    f_t pc_up = pc.pseudo_cost_num_up[j] != 0 ? pc.pseudo_cost_sum_up[j] / pc.pseudo_cost_num_up[j]
+                                              : pseudo_cost_up_avg;
+    pc.pseudo_cost_mutex[j].unlock();
+
+    f_t score1 = dir == rounding_direction_t::DOWN ? 5 * pc_down * f_down : 5 * pc_up * f_up;
+    f_t score2 = dir == rounding_direction_t::DOWN ? pc_up * f_up : pc_down * f_down;
+    f_t score  = (score1 + score2) / 6;
+
+    if (score > max_score) {
+      max_score  = score;
+      branch_var = j;
+      round_dir  = dir;
+    }
+  }
+
+  assert(round_dir != rounding_direction_t::NONE);
+  assert(branch_var >= 0);
+
+  log.debug("Guided diving: selected var %d with val = %e, round dir = %d and score = %e\n",
+            branch_var,
+            solution[branch_var],
+            round_dir,
+            max_score);
+
+  return {branch_var, round_dir};
+}
+
+template <typename i_t, typename f_t>
+std::tuple<i_t, i_t> calculate_variable_locks(const lp_problem_t<i_t, f_t>& lp_problem, i_t var_idx)
+{
+  i_t up_lock   = 0;
+  i_t down_lock = 0;
+  i_t start     = lp_problem.A.col_start[var_idx];
+  i_t end       = lp_problem.A.col_start[var_idx + 1];
+
+  for (i_t k = start; k < end; ++k) {
+    f_t nz_val = lp_problem.A.x[k];
+    i_t nz_row = lp_problem.A.i[k];
+
+    if (std::isfinite(lp_problem.upper[nz_row]) && std::isfinite(lp_problem.lower[nz_row])) {
+      down_lock += 1;
+      up_lock += 1;
+      continue;
+    }
+
+    f_t sign = std::isfinite(lp_problem.upper[nz_row]) ? 1 : -1;
+
+    if (nz_val * sign > 0) {
+      up_lock += 1;
+    } else {
+      down_lock += 1;
+    }
+  }
+
+  return {up_lock, down_lock};
+}
+
+template <typename i_t, typename f_t>
+branch_variable_t<i_t> coefficient_diving(const lp_problem_t<i_t, f_t>& lp_problem,
+                                          const std::vector<i_t>& fractional,
+                                          const std::vector<f_t>& solution,
+                                          logger_t& log)
+{
+  i_t branch_var                 = -1;
+  i_t min_locks                  = std::numeric_limits<i_t>::max();
+  rounding_direction_t round_dir = rounding_direction_t::NONE;
+  constexpr f_t eps              = 1e-6;
+
+  for (auto j : fractional) {
+    f_t f_down                = solution[j] - std::floor(solution[j]);
+    f_t f_up                  = std::ceil(solution[j]) - solution[j];
+    auto [up_lock, down_lock] = calculate_variable_locks(lp_problem, j);
+    i_t locks                 = std::min(up_lock, down_lock);
+
+    if (min_locks > locks) {
+      min_locks  = locks;
+      branch_var = j;
+
+      if (up_lock < down_lock) {
+        round_dir = rounding_direction_t::UP;
+      } else if (up_lock > down_lock) {
+        round_dir = rounding_direction_t::DOWN;
+      } else if (f_down < f_up + eps) {
+        round_dir = rounding_direction_t::DOWN;
+      } else {
+        round_dir = rounding_direction_t::UP;
+      }
+    }
+  }
+
+  assert(round_dir != rounding_direction_t::NONE);
+  assert(branch_var >= 0);
+
+  log.debug(
+    "Coefficient diving: selected var %d with val = %e, round dir = %d and min locks = %d\n",
+    branch_var,
+    solution[branch_var],
+    round_dir,
+    min_locks);
+
+  return {branch_var, round_dir};
+}
+
+#ifdef DUAL_SIMPLEX_INSTANTIATE_DOUBLE
+template branch_variable_t<int> line_search_diving(const std::vector<int>& fractional,
+                                                   const std::vector<double>& solution,
+                                                   const std::vector<double>& root_solution,
+                                                   logger_t& log);
+
+template branch_variable_t<int> pseudocost_diving(pseudo_costs_t<int, double>& pc,
+                                                  const std::vector<int>& fractional,
+                                                  const std::vector<double>& solution,
+                                                  const std::vector<double>& root_solution,
+                                                  logger_t& log);
+
+template branch_variable_t<int> guided_diving(pseudo_costs_t<int, double>& pc,
+                                              const std::vector<int>& fractional,
+                                              const std::vector<double>& solution,
+                                              const std::vector<double>& incumbent,
+                                              logger_t& log);
+
+template branch_variable_t<int> coefficient_diving(const lp_problem_t<int, double>& lp_problem,
+                                                   const std::vector<int>& fractional,
+                                                   const std::vector<double>& solution,
+                                                   logger_t& log);
+#endif
+
+}  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/diving_heuristics.hpp b/cpp/src/dual_simplex/diving_heuristics.hpp
new file mode 100644
index 000000000..1f44fee31
--- /dev/null
+++ b/cpp/src/dual_simplex/diving_heuristics.hpp
@@ -0,0 +1,49 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/* clang-format on */
+
+#pragma once
+
+#include <dual_simplex/basis_updates.hpp>
+#include <dual_simplex/bounds_strengthening.hpp>
+#include <dual_simplex/pseudo_costs.hpp>
+#include <vector>
+
+namespace cuopt::linear_programming::dual_simplex {
+
+template <typename i_t>
+struct branch_variable_t {
+  i_t variable;
+  rounding_direction_t direction;
+};
+
+template <typename i_t, typename f_t>
+branch_variable_t<i_t> line_search_diving(const std::vector<i_t>& fractional,
+                                          const std::vector<f_t>& solution,
+                                          const std::vector<f_t>& root_solution,
+                                          logger_t& log);
+
+template <typename i_t, typename f_t>
+branch_variable_t<i_t> pseudocost_diving(pseudo_costs_t<i_t, f_t>& pc,
+                                         const std::vector<i_t>& fractional,
+                                         const std::vector<f_t>& solution,
+                                         const std::vector<f_t>& root_solution,
+                                         logger_t& log);
+
+template <typename i_t, typename f_t>
+branch_variable_t<i_t> guided_diving(pseudo_costs_t<i_t, f_t>& pc,
+                                     const std::vector<i_t>& fractional,
+                                     const std::vector<f_t>& solution,
+                                     const std::vector<f_t>& incumbent,
+                                     logger_t& log);
+
+template <typename i_t, typename f_t>
+branch_variable_t<i_t> coefficient_diving(const lp_problem_t<i_t, f_t>& lp_problem,
+                                          const std::vector<i_t>& fractional,
+                                          const std::vector<f_t>& solution,
+                                          logger_t& log);
+
+}  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/diving_queue.hpp b/cpp/src/dual_simplex/diving_queue.hpp
deleted file mode 100644
index f7035109e..000000000
--- a/cpp/src/dual_simplex/diving_queue.hpp
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-#pragma once
-
-#include <algorithm>
-#include <climits>
-#include <vector>
-
-#include <dual_simplex/mip_node.hpp>
-
-namespace cuopt::linear_programming::dual_simplex {
-
-template <typename i_t, typename f_t>
-struct diving_root_t {
-  mip_node_t<i_t, f_t> node;
-  std::vector<f_t> lower;
-  std::vector<f_t> upper;
-
-  diving_root_t(mip_node_t<i_t, f_t>&& node, std::vector<f_t>&& lower, std::vector<f_t>&& upper)
-    : node(std::move(node)), lower(std::move(lower)), upper(std::move(upper))
-  {
-  }
-
-  friend bool operator>(const diving_root_t<i_t, f_t>& a, const diving_root_t<i_t, f_t>& b)
-  {
-    return a.node.lower_bound > b.node.lower_bound;
-  }
-};
-
-// A min-heap for storing the starting nodes for the dives.
-// This has a maximum size of 1024, such that the container
-// will discard the least promising node if the queue is full.
-template <typename i_t, typename f_t>
-class diving_queue_t {
- private:
-  std::vector<diving_root_t<i_t, f_t>> buffer;
-  static constexpr i_t max_size_ = 1024;
-
- public:
-  diving_queue_t() { buffer.reserve(max_size_); }
-
-  void push(diving_root_t<i_t, f_t>&& node)
-  {
-    buffer.push_back(std::move(node));
-    std::push_heap(buffer.begin(), buffer.end(), std::greater<>());
-    if (buffer.size() > max_size() - 1) { buffer.pop_back(); }
-  }
-
-  void emplace(mip_node_t<i_t, f_t>&& node, std::vector<f_t>&& lower, std::vector<f_t>&& upper)
-  {
-    buffer.emplace_back(std::move(node), std::move(lower), std::move(upper));
-    std::push_heap(buffer.begin(), buffer.end(), std::greater<>());
-    if (buffer.size() > max_size() - 1) { buffer.pop_back(); }
-  }
-
-  diving_root_t<i_t, f_t> pop()
-  {
-    std::pop_heap(buffer.begin(), buffer.end(), std::greater<>());
-    diving_root_t<i_t, f_t> node = std::move(buffer.back());
-    buffer.pop_back();
-    return node;
-  }
-
-  i_t size() const { return buffer.size(); }
-  constexpr i_t max_size() const { return max_size_; }
-  const diving_root_t<i_t, f_t>& top() const { return buffer.front(); }
-  void clear() { buffer.clear(); }
-};
-
-}  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/logger.hpp b/cpp/src/dual_simplex/logger.hpp
index ac5e394f9..f81308670 100644
--- a/cpp/src/dual_simplex/logger.hpp
+++ b/cpp/src/dual_simplex/logger.hpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -30,22 +30,24 @@ class logger_t {
   {
   }
 
-  void enable_log_to_file(std::string mode = "w")
+  void enable_log_to_file(const char* mode = "w")
   {
     if (log_file != nullptr) { std::fclose(log_file); }
-    log_file    = std::fopen(log_filename.c_str(), mode.c_str());
+    log_file    = std::fopen(log_filename.c_str(), mode);
     log_to_file = true;
   }
 
-  void set_log_file(const std::string& filename)
+  void set_log_file(const std::string& filename, const char* mode = "w")
   {
     log_filename = filename;
-    enable_log_to_file();
+    enable_log_to_file(mode);
   }
 
   void close_log_file()
   {
     if (log_file != nullptr) { std::fclose(log_file); }
+    log_file    = nullptr;
+    log_to_file = false;
   }
 
   void printf(const char* fmt, ...)
diff --git a/cpp/src/dual_simplex/mip_node.hpp b/cpp/src/dual_simplex/mip_node.hpp
index 1d66a21f7..de147132a 100644
--- a/cpp/src/dual_simplex/mip_node.hpp
+++ b/cpp/src/dual_simplex/mip_node.hpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -45,6 +45,7 @@ class mip_node_t {
       branch_var_lower(-std::numeric_limits<f_t>::infinity()),
       branch_var_upper(std::numeric_limits<f_t>::infinity()),
       fractional_val(std::numeric_limits<f_t>::infinity()),
+      objective_estimate(std::numeric_limits<f_t>::infinity()),
       vstatus(0)
   {
     children[0] = nullptr;
@@ -59,6 +60,7 @@ class mip_node_t {
       node_id(0),
       branch_var(-1),
       branch_dir(rounding_direction_t::NONE),
+      objective_estimate(std::numeric_limits<f_t>::infinity()),
       vstatus(basis)
   {
     children[0] = nullptr;
@@ -80,6 +82,7 @@ class mip_node_t {
       branch_var(branch_variable),
       branch_dir(branch_direction),
       fractional_val(branch_var_value),
+      objective_estimate(parent_node->objective_estimate),
       vstatus(basis)
 
   {
@@ -227,17 +230,19 @@ class mip_node_t {
   mip_node_t<i_t, f_t> detach_copy() const
   {
     mip_node_t<i_t, f_t> copy(lower_bound, vstatus);
-    copy.branch_var       = branch_var;
-    copy.branch_dir       = branch_dir;
-    copy.branch_var_lower = branch_var_lower;
-    copy.branch_var_upper = branch_var_upper;
-    copy.fractional_val   = fractional_val;
-    copy.node_id          = node_id;
+    copy.branch_var         = branch_var;
+    copy.branch_dir         = branch_dir;
+    copy.branch_var_lower   = branch_var_lower;
+    copy.branch_var_upper   = branch_var_upper;
+    copy.fractional_val     = fractional_val;
+    copy.objective_estimate = objective_estimate;
+    copy.node_id            = node_id;
     return copy;
   }
 
   node_status_t status;
   f_t lower_bound;
+  f_t objective_estimate;
   i_t depth;
   i_t node_id;
   i_t branch_var;
@@ -262,22 +267,6 @@ void remove_fathomed_nodes(std::vector<mip_node_t<i_t, f_t>*>& stack)
   }
 }
 
-template <typename i_t, typename f_t>
-class node_compare_t {
- public:
-  bool operator()(const mip_node_t<i_t, f_t>& a, const mip_node_t<i_t, f_t>& b) const
-  {
-    return a.lower_bound >
-           b.lower_bound;  // True if a comes before b, elements that come before are output last
-  }
-
-  bool operator()(const mip_node_t<i_t, f_t>* a, const mip_node_t<i_t, f_t>* b) const
-  {
-    return a->lower_bound >
-           b->lower_bound;  // True if a comes before b, elements that come before are output last
-  }
-};
-
 template <typename i_t, typename f_t>
 class search_tree_t {
  public:
diff --git a/cpp/src/dual_simplex/node_queue.hpp b/cpp/src/dual_simplex/node_queue.hpp
new file mode 100644
index 000000000..b5c9a9ea7
--- /dev/null
+++ b/cpp/src/dual_simplex/node_queue.hpp
@@ -0,0 +1,165 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights
+ * reserved. SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <memory>
+#include <mutex>
+#include <optional>
+#include <utility>
+#include <vector>
+
+#include <dual_simplex/mip_node.hpp>
+
+namespace cuopt::linear_programming::dual_simplex {
+
+// This is a generic heap implementation based
+// on the STL functions. The main benefit here is
+// that we access the underlying container.
+template <typename T, typename Comp>
+class heap_t {
+ public:
+  heap_t()          = default;
+  virtual ~heap_t() = default;
+
+  void push(const T& node)
+  {
+    buffer.push_back(node);
+    std::push_heap(buffer.begin(), buffer.end(), comp);
+  }
+
+  void push(T&& node)
+  {
+    buffer.push_back(std::move(node));
+    std::push_heap(buffer.begin(), buffer.end(), comp);
+  }
+
+  template <typename... Args>
+  void emplace(Args&&... args)
+  {
+    buffer.emplace_back(std::forward<Args>(args)...);
+    std::push_heap(buffer.begin(), buffer.end(), comp);
+  }
+
+  std::optional<T> pop()
+  {
+    if (buffer.empty()) return std::nullopt;
+
+    std::pop_heap(buffer.begin(), buffer.end(), comp);
+    T node = std::move(buffer.back());
+    buffer.pop_back();
+    return node;
+  }
+
+  size_t size() const { return buffer.size(); }
+  T& top() { return buffer.front(); }
+  void clear() { buffer.clear(); }
+  bool empty() const { return buffer.empty(); }
+
+ private:
+  std::vector<T> buffer;
+  Comp comp;
+};
+
+// A queue storing the nodes waiting to be explored/dived from.
+template <typename i_t, typename f_t>
+class node_queue_t {
+ private:
+  struct heap_entry_t {
+    mip_node_t<i_t, f_t>* node = nullptr;
+    f_t lower_bound            = -inf;
+    f_t score                  = inf;
+
+    heap_entry_t(mip_node_t<i_t, f_t>* new_node)
+      : node(new_node), lower_bound(new_node->lower_bound), score(new_node->objective_estimate)
+    {
+    }
+  };
+
+  // Comparision function for ordering the nodes based on their lower bound with
+  // lowest one being explored first.
+  struct lower_bound_comp {
+    bool operator()(const std::shared_ptr<heap_entry_t>& a, const std::shared_ptr<heap_entry_t>& b)
+    {
+      // `a` will be placed after `b`
+      return a->lower_bound > b->lower_bound;
+    }
+  };
+
+  // Comparision function for ordering the nodes based on some score (currently the pseudocost
+  // estimate) with the lowest being explored first.
+  struct score_comp {
+    bool operator()(const std::shared_ptr<heap_entry_t>& a, const std::shared_ptr<heap_entry_t>& b)
+    {
+      // `a` will be placed after `b`
+      return a->score > b->score;
+    }
+  };
+
+  heap_t<std::shared_ptr<heap_entry_t>, lower_bound_comp> best_first_heap;
+  heap_t<std::shared_ptr<heap_entry_t>, score_comp> diving_heap;
+  omp_mutex_t mutex;
+
+ public:
+  void push(mip_node_t<i_t, f_t>* new_node)
+  {
+    std::lock_guard<omp_mutex_t> lock(mutex);
+    auto entry = std::make_shared<heap_entry_t>(new_node);
+    best_first_heap.push(entry);
+    diving_heap.push(entry);
+  }
+
+  std::optional<mip_node_t<i_t, f_t>*> pop_best_first()
+  {
+    std::lock_guard<omp_mutex_t> lock(mutex);
+    auto entry = best_first_heap.pop();
+
+    if (entry.has_value()) { return std::exchange(entry.value()->node, nullptr); }
+
+    return std::nullopt;
+  }
+
+  std::optional<mip_node_t<i_t, f_t>*> pop_diving()
+  {
+    std::lock_guard<omp_mutex_t> lock(mutex);
+
+    while (!diving_heap.empty()) {
+      auto entry = diving_heap.pop();
+
+      if (entry.has_value()) {
+        if (auto node_ptr = entry.value()->node; node_ptr != nullptr) { return node_ptr; }
+      }
+    }
+
+    return std::nullopt;
+  }
+
+  i_t diving_queue_size()
+  {
+    std::lock_guard<omp_mutex_t> lock(mutex);
+    return diving_heap.size();
+  }
+
+  i_t best_first_queue_size()
+  {
+    std::lock_guard<omp_mutex_t> lock(mutex);
+    return best_first_heap.size();
+  }
+
+  f_t get_lower_bound()
+  {
+    std::lock_guard<omp_mutex_t> lock(mutex);
+    return best_first_heap.empty() ? inf : best_first_heap.top()->lower_bound;
+  }
+
+  mip_node_t<i_t, f_t>* bfs_top()
+  {
+    std::lock_guard<omp_mutex_t> lock(mutex);
+    return best_first_heap.empty() ? nullptr : best_first_heap.top()->node;
+  }
+};
+
+}  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/phase2.cpp b/cpp/src/dual_simplex/phase2.cpp
index 56298ef4d..2bc00f636 100644
--- a/cpp/src/dual_simplex/phase2.cpp
+++ b/cpp/src/dual_simplex/phase2.cpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -623,14 +623,17 @@ f_t compute_initial_primal_infeasibilities(const lp_problem_t<i_t, f_t>& lp,
                                            const std::vector<i_t>& basic_list,
                                            const std::vector<f_t>& x,
                                            std::vector<f_t>& squared_infeasibilities,
-                                           std::vector<i_t>& infeasibility_indices)
+                                           std::vector<i_t>& infeasibility_indices,
+                                           f_t& primal_inf)
 {
   const i_t m = lp.num_rows;
   const i_t n = lp.num_cols;
-  squared_infeasibilities.resize(n, 0.0);
+  squared_infeasibilities.resize(n);
+  std::fill(squared_infeasibilities.begin(), squared_infeasibilities.end(), 0.0);
   infeasibility_indices.reserve(n);
   infeasibility_indices.clear();
-  f_t primal_inf = 0.0;
+  f_t primal_inf_squared = 0.0;
+  primal_inf             = 0.0;
   for (i_t k = 0; k < m; ++k) {
     const i_t j            = basic_list[k];
     const f_t lower_infeas = lp.lower[j] - x[j];
@@ -640,10 +643,11 @@ f_t compute_initial_primal_infeasibilities(const lp_problem_t<i_t, f_t>& lp,
       const f_t square_infeas    = infeas * infeas;
       squared_infeasibilities[j] = square_infeas;
       infeasibility_indices.push_back(j);
-      primal_inf += square_infeas;
+      primal_inf_squared += square_infeas;
+      primal_inf += infeas;
     }
   }
-  return primal_inf;
+  return primal_inf_squared;
 }
 
 template <typename i_t, typename f_t>
@@ -2241,7 +2245,8 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
     assert(superbasic_list.size() == 0);
     assert(nonbasic_list.size() == n - m);
 
-    if (ft.refactor_basis(lp.A, settings, basic_list, nonbasic_list, vstatus) > 0) {
+    if (ft.refactor_basis(lp.A, settings, lp.lower, lp.upper, basic_list, nonbasic_list, vstatus) >
+        0) {
       return dual::status_t::NUMERICAL;
     }
 
@@ -2268,7 +2273,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
 
 #ifdef COMPUTE_DUAL_RESIDUAL
   std::vector<f_t> dual_res1;
-  compute_dual_residual(lp.A, objective, y, z, dual_res1);
+  phase2::compute_dual_residual(lp.A, objective, y, z, dual_res1);
   f_t dual_res_norm = vector_norm_inf<i_t, f_t>(dual_res1);
   if (dual_res_norm > settings.tight_tol) {
     settings.log.printf("|| A'*y + z - c || %e\n", dual_res_norm);
@@ -2357,8 +2362,15 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
   std::vector<uint8_t> bounded_variables(n, 0);
   phase2::compute_bounded_info(lp.lower, lp.upper, bounded_variables);
 
-  f_t primal_infeasibility = phase2::compute_initial_primal_infeasibilities(
-    lp, settings, basic_list, x, squared_infeasibilities, infeasibility_indices);
+  f_t primal_infeasibility;
+  f_t primal_infeasibility_squared =
+    phase2::compute_initial_primal_infeasibilities(lp,
+                                                   settings,
+                                                   basic_list,
+                                                   x,
+                                                   squared_infeasibilities,
+                                                   infeasibility_indices,
+                                                   primal_infeasibility);
 
 #ifdef CHECK_BASIC_INFEASIBILITIES
   phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 0);
@@ -2556,9 +2568,15 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
             std::vector<f_t> unperturbed_x(n);
             phase2::compute_primal_solution_from_basis(
               lp, ft, basic_list, nonbasic_list, vstatus, unperturbed_x);
-            x                    = unperturbed_x;
-            primal_infeasibility = phase2::compute_initial_primal_infeasibilities(
-              lp, settings, basic_list, x, squared_infeasibilities, infeasibility_indices);
+            x = unperturbed_x;
+            primal_infeasibility_squared =
+              phase2::compute_initial_primal_infeasibilities(lp,
+                                                             settings,
+                                                             basic_list,
+                                                             x,
+                                                             squared_infeasibilities,
+                                                             infeasibility_indices,
+                                                             primal_infeasibility);
             settings.log.printf("Updated primal infeasibility: %e\n", primal_infeasibility);
 
             objective = lp.objective;
@@ -2593,9 +2611,15 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
             std::vector<f_t> unperturbed_x(n);
             phase2::compute_primal_solution_from_basis(
               lp, ft, basic_list, nonbasic_list, vstatus, unperturbed_x);
-            x                    = unperturbed_x;
-            primal_infeasibility = phase2::compute_initial_primal_infeasibilities(
-              lp, settings, basic_list, x, squared_infeasibilities, infeasibility_indices);
+            x = unperturbed_x;
+            primal_infeasibility_squared =
+              phase2::compute_initial_primal_infeasibilities(lp,
+                                                             settings,
+                                                             basic_list,
+                                                             x,
+                                                             squared_infeasibilities,
+                                                             infeasibility_indices,
+                                                             primal_infeasibility);
 
             const f_t orig_dual_infeas = phase2::dual_infeasibility(
               lp, settings, vstatus, z, settings.tight_tol, settings.dual_tol);
@@ -2810,7 +2834,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
                                           delta_xB_0_sparse.i,
                                           squared_infeasibilities,
                                           infeasibility_indices,
-                                          primal_infeasibility);
+                                          primal_infeasibility_squared);
     // Update primal infeasibilities due to changes in basic variables
     // from the leaving and entering variables
     phase2::update_primal_infeasibilities(lp,
@@ -2822,7 +2846,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
                                           scaled_delta_xB_sparse.i,
                                           squared_infeasibilities,
                                           infeasibility_indices,
-                                          primal_infeasibility);
+                                          primal_infeasibility_squared);
     // Update the entering variable
     phase2::update_single_primal_infeasibility(lp.lower,
                                                lp.upper,
@@ -2883,14 +2907,15 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
 #endif
     if (should_refactor) {
       bool should_recompute_x = false;
-      if (ft.refactor_basis(lp.A, settings, basic_list, nonbasic_list, vstatus) > 0) {
+      if (ft.refactor_basis(
+            lp.A, settings, lp.lower, lp.upper, basic_list, nonbasic_list, vstatus) > 0) {
         should_recompute_x = true;
         settings.log.printf("Failed to factorize basis. Iteration %d\n", iter);
         if (toc(start_time) > settings.time_limit) { return dual::status_t::TIME_LIMIT; }
         i_t count = 0;
         i_t deficient_size;
-        while ((deficient_size =
-                  ft.refactor_basis(lp.A, settings, basic_list, nonbasic_list, vstatus)) > 0) {
+        while ((deficient_size = ft.refactor_basis(
+                  lp.A, settings, lp.lower, lp.upper, basic_list, nonbasic_list, vstatus)) > 0) {
           settings.log.printf("Failed to repair basis. Iteration %d. %d deficient columns.\n",
                               iter,
                               static_cast<int>(deficient_size));
@@ -2912,8 +2937,14 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
           lp, ft, basic_list, nonbasic_list, vstatus, unperturbed_x);
         x = unperturbed_x;
       }
-      phase2::compute_initial_primal_infeasibilities(
-        lp, settings, basic_list, x, squared_infeasibilities, infeasibility_indices);
+      primal_infeasibility_squared =
+        phase2::compute_initial_primal_infeasibilities(lp,
+                                                       settings,
+                                                       basic_list,
+                                                       x,
+                                                       squared_infeasibilities,
+                                                       infeasibility_indices,
+                                                       primal_infeasibility);
     }
 #ifdef CHECK_BASIC_INFEASIBILITIES
     phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 7);
@@ -2951,7 +2982,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
                           iter,
                           compute_user_objective(lp, obj),
                           infeasibility_indices.size(),
-                          primal_infeasibility,
+                          primal_infeasibility_squared,
                           sum_perturb,
                           now);
     }
diff --git a/cpp/src/dual_simplex/presolve.cpp b/cpp/src/dual_simplex/presolve.cpp
index d247fbf67..6c8c3ae4a 100644
--- a/cpp/src/dual_simplex/presolve.cpp
+++ b/cpp/src/dual_simplex/presolve.cpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -621,7 +621,7 @@ void convert_user_problem(const user_problem_t<i_t, f_t>& user_problem,
   }
 
   constexpr bool run_bounds_strengthening = false;
-  if (run_bounds_strengthening) {
+  if constexpr (run_bounds_strengthening) {
     csr_matrix_t<i_t, f_t> Arow(1, 1, 1);
     problem.A.to_compressed_row(Arow);
 
@@ -629,8 +629,8 @@ void convert_user_problem(const user_problem_t<i_t, f_t>& user_problem,
 
     // Empty var_types means that all variables are continuous
     bounds_strengthening_t<i_t, f_t> strengthening(problem, Arow, row_sense, {});
-    std::fill(strengthening.bounds_changed.begin(), strengthening.bounds_changed.end(), true);
-    strengthening.bounds_strengthening(problem.lower, problem.upper, settings);
+    std::vector<bool> bounds_changed(problem.num_cols, true);
+    strengthening.bounds_strengthening(problem.lower, problem.upper, bounds_changed, settings);
   }
 
   settings.log.debug(
diff --git a/cpp/src/dual_simplex/primal.cpp b/cpp/src/dual_simplex/primal.cpp
index 80406dcf0..69f15ba18 100644
--- a/cpp/src/dual_simplex/primal.cpp
+++ b/cpp/src/dual_simplex/primal.cpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -298,7 +298,15 @@ primal::status_t primal_phase2(i_t phase,
     factorize_basis(lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed);
   if (rank != m) {
     settings.log.debug("Failed to factorize basis. rank %d m %d\n", rank, m);
-    basis_repair(lp.A, settings, deficient, slacks_needed, basic_list, nonbasic_list, vstatus);
+    basis_repair(lp.A,
+                 settings,
+                 lp.lower,
+                 lp.upper,
+                 deficient,
+                 slacks_needed,
+                 basic_list,
+                 nonbasic_list,
+                 vstatus);
     if (factorize_basis(lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed) ==
         -1) {
       settings.log.printf("Failed to factorize basis after repair. rank %d m %d\n", rank, m);
diff --git a/cpp/src/dual_simplex/pseudo_costs.cpp b/cpp/src/dual_simplex/pseudo_costs.cpp
index 9f84e108d..e30471f9a 100644
--- a/cpp/src/dual_simplex/pseudo_costs.cpp
+++ b/cpp/src/dual_simplex/pseudo_costs.cpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -133,6 +133,75 @@ void strong_branch_helper(i_t start,
   }
 }
 
+template <typename i_t, typename f_t>
+f_t trial_branching(const lp_problem_t<i_t, f_t>& original_lp,
+                    const simplex_solver_settings_t<i_t, f_t>& settings,
+                    const std::vector<variable_type_t>& var_types,
+                    const std::vector<variable_status_t>& vstatus,
+                    const std::vector<f_t>& edge_norms,
+                    const basis_update_mpf_t<i_t, f_t>& basis_factors,
+                    const std::vector<i_t>& basic_list,
+                    const std::vector<i_t>& nonbasic_list,
+                    i_t branch_var,
+                    f_t branch_var_lower,
+                    f_t branch_var_upper,
+                    f_t upper_bound,
+                    i_t bnb_lp_iter_per_node,
+                    omp_atomic_t<i_t>& total_lp_iter)
+{
+  lp_problem_t child_problem      = original_lp;
+  child_problem.lower[branch_var] = branch_var_lower;
+  child_problem.upper[branch_var] = branch_var_upper;
+
+  simplex_solver_settings_t<i_t, f_t> child_settings = settings;
+  child_settings.set_log(false);
+  f_t lp_start_time              = tic();
+  child_settings.iteration_limit = std::clamp(bnb_lp_iter_per_node, 10, 100);
+  child_settings.cut_off         = upper_bound + settings.dual_tol;
+  child_settings.inside_mip      = 2;
+  child_settings.scale_columns   = false;
+
+  lp_solution_t<i_t, f_t> solution(original_lp.num_rows, original_lp.num_cols);
+  i_t iter                                         = 0;
+  std::vector<variable_status_t> child_vstatus     = vstatus;
+  std::vector<f_t> child_edge_norms                = edge_norms;
+  std::vector<i_t> child_basic_list                = basic_list;
+  std::vector<i_t> child_nonbasic_list             = nonbasic_list;
+  basis_update_mpf_t<i_t, f_t> child_basis_factors = basis_factors;
+
+  dual::status_t status = dual_phase2_with_advanced_basis(2,
+                                                          0,
+                                                          false,
+                                                          lp_start_time,
+                                                          child_problem,
+                                                          child_settings,
+                                                          child_vstatus,
+                                                          child_basis_factors,
+                                                          child_basic_list,
+                                                          child_nonbasic_list,
+                                                          solution,
+                                                          iter,
+                                                          child_edge_norms);
+  total_lp_iter += iter;
+  settings.log.debug("Trial branching on variable %d. Lo: %e Up: %e. Iter %d. Status %s. Obj %e\n",
+                     branch_var,
+                     child_problem.lower[branch_var],
+                     child_problem.upper[branch_var],
+                     iter,
+                     dual::status_to_string(status).c_str(),
+                     compute_objective(child_problem, solution.x));
+
+  if (status == dual::status_t::DUAL_UNBOUNDED) {
+    // LP was infeasible
+    return std::numeric_limits<f_t>::infinity();
+  } else if (status == dual::status_t::OPTIMAL || status == dual::status_t::ITERATION_LIMIT ||
+             status == dual::status_t::CUTOFF) {
+    return compute_objective(child_problem, solution.x);
+  } else {
+    return std::numeric_limits<f_t>::quiet_NaN();
+  }
+}
+
 }  // namespace
 
 template <typename i_t, typename f_t>
@@ -148,8 +217,8 @@ void strong_branching(const lp_problem_t<i_t, f_t>& original_lp,
                       pseudo_costs_t<i_t, f_t>& pc)
 {
   pc.resize(original_lp.num_cols);
-  pc.strong_branch_down.resize(fractional.size());
-  pc.strong_branch_up.resize(fractional.size());
+  pc.strong_branch_down.assign(fractional.size(), 0);
+  pc.strong_branch_up.assign(fractional.size(), 0);
   pc.num_strong_branches_completed = 0;
 
   settings.log.printf("Strong branching using %d threads and %ld fractional variables\n",
@@ -199,7 +268,7 @@ template <typename i_t, typename f_t>
 void pseudo_costs_t<i_t, f_t>::update_pseudo_costs(mip_node_t<i_t, f_t>* node_ptr,
                                                    f_t leaf_objective)
 {
-  mutex.lock();
+  std::lock_guard<omp_mutex_t> lock(pseudo_cost_mutex[node_ptr->branch_var]);
   const f_t change_in_obj = leaf_objective - node_ptr->lower_bound;
   const f_t frac          = node_ptr->branch_dir == rounding_direction_t::DOWN
                               ? node_ptr->fractional_val - std::floor(node_ptr->fractional_val)
@@ -211,14 +280,13 @@ void pseudo_costs_t<i_t, f_t>::update_pseudo_costs(mip_node_t<i_t, f_t>* node_pt
     pseudo_cost_sum_up[node_ptr->branch_var] += change_in_obj / frac;
     pseudo_cost_num_up[node_ptr->branch_var]++;
   }
-  mutex.unlock();
 }
 
 template <typename i_t, typename f_t>
 void pseudo_costs_t<i_t, f_t>::initialized(i_t& num_initialized_down,
                                            i_t& num_initialized_up,
                                            f_t& pseudo_cost_down_avg,
-                                           f_t& pseudo_cost_up_avg) const
+                                           f_t& pseudo_cost_up_avg)
 {
   num_initialized_down = 0;
   num_initialized_up   = 0;
@@ -226,6 +294,8 @@ void pseudo_costs_t<i_t, f_t>::initialized(i_t& num_initialized_down,
   pseudo_cost_up_avg   = 0;
   const i_t n          = pseudo_cost_sum_down.size();
   for (i_t j = 0; j < n; j++) {
+    std::lock_guard<omp_mutex_t> lock(pseudo_cost_mutex[j]);
+
     if (pseudo_cost_num_down[j] > 0) {
       num_initialized_down++;
       if (std::isfinite(pseudo_cost_sum_down[j])) {
@@ -258,8 +328,6 @@ i_t pseudo_costs_t<i_t, f_t>::variable_selection(const std::vector<i_t>& fractio
                                                  const std::vector<f_t>& solution,
                                                  logger_t& log)
 {
-  mutex.lock();
-
   const i_t num_fractional = fractional.size();
   std::vector<f_t> pseudo_cost_up(num_fractional);
   std::vector<f_t> pseudo_cost_down(num_fractional);
@@ -272,14 +340,16 @@ i_t pseudo_costs_t<i_t, f_t>::variable_selection(const std::vector<i_t>& fractio
 
   initialized(num_initialized_down, num_initialized_up, pseudo_cost_down_avg, pseudo_cost_up_avg);
 
-  log.printf("PC: num initialized down %d up %d avg down %e up %e\n",
-             num_initialized_down,
-             num_initialized_up,
-             pseudo_cost_down_avg,
-             pseudo_cost_up_avg);
+  log.debug("PC: num initialized down %d up %d avg down %e up %e\n",
+            num_initialized_down,
+            num_initialized_up,
+            pseudo_cost_down_avg,
+            pseudo_cost_up_avg);
 
   for (i_t k = 0; k < num_fractional; k++) {
     const i_t j = fractional[k];
+
+    pseudo_cost_mutex[j].lock();
     if (pseudo_cost_num_down[j] != 0) {
       pseudo_cost_down[k] = pseudo_cost_sum_down[j] / pseudo_cost_num_down[j];
     } else {
@@ -291,6 +361,8 @@ i_t pseudo_costs_t<i_t, f_t>::variable_selection(const std::vector<i_t>& fractio
     } else {
       pseudo_cost_up[k] = pseudo_cost_up_avg;
     }
+    pseudo_cost_mutex[j].unlock();
+
     constexpr f_t eps = 1e-6;
     const f_t f_down  = solution[j] - std::floor(solution[j]);
     const f_t f_up    = std::ceil(solution[j]) - solution[j];
@@ -309,14 +381,196 @@ i_t pseudo_costs_t<i_t, f_t>::variable_selection(const std::vector<i_t>& fractio
     }
   }
 
-  log.printf(
-    "pc branching on %d. Value %e. Score %e\n", branch_var, solution[branch_var], score[select]);
+  log.debug("Pseudocost branching on %d. Value %e. Score %e.\n",
+            branch_var,
+            solution[branch_var],
+            score[select]);
+
+  return branch_var;
+}
+
+template <typename i_t, typename f_t>
+i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
+  const lp_problem_t<i_t, f_t>& lp,
+  const simplex_solver_settings_t<i_t, f_t>& settings,
+  const std::vector<variable_type_t>& var_types,
+  const std::vector<variable_status_t>& vstatus,
+  const std::vector<f_t>& edge_norms,
+  const std::vector<i_t>& fractional,
+  const std::vector<f_t>& solution,
+  const basis_update_mpf_t<i_t, f_t>& basis_factors,
+  const std::vector<i_t>& basic_list,
+  const std::vector<i_t>& nonbasic_list,
+  f_t current_obj,
+  f_t upper_bound,
+  i_t bnb_lp_iter,
+  i_t bnb_explored_nodes,
+  logger_t& log)
+{
+  i_t branch_var = fractional[0];
+  f_t max_score  = -1;
+  i_t num_initialized_down;
+  i_t num_initialized_up;
+  f_t pseudo_cost_down_avg;
+  f_t pseudo_cost_up_avg;
 
-  mutex.unlock();
+  initialized(num_initialized_down, num_initialized_up, pseudo_cost_down_avg, pseudo_cost_up_avg);
+
+  log.printf("PC: num initialized down %d up %d avg down %e up %e\n",
+             num_initialized_down,
+             num_initialized_up,
+             pseudo_cost_down_avg,
+             pseudo_cost_up_avg);
+
+  // const i_t max_iter     = 0.5 * bnb_lp_iter;
+  // const i_t gamma        = (max_iter - total_lp_iter) / (total_lp_iter + 1);
+  // const i_t max_v        = 5;
+  // const i_t min_v        = 1;
+  // i_t reliable_threshold = std::clamp((1 - gamma) * min_v + gamma * max_v, min_v, max_v);
+  // reliable_threshold     = total_lp_iter < max_iter ? reliable_threshold : 0;
+  i_t reliable_threshold = 1;
+
+  settings.log.debug("RB LP iterations = %d, B&B LP iterations = %d reliable_threshold = %d\n",
+                     total_lp_iter.load(),
+                     bnb_lp_iter,
+                     reliable_threshold);
+
+  std::vector<i_t> pending = fractional;
+  std::vector<i_t> conflict;
+  conflict.reserve(fractional.size());
+
+  while (!pending.empty()) {
+    for (auto j : pending) {
+      bool is_locked = pseudo_cost_mutex[j].try_lock();
+
+      if (!is_locked) {
+        conflict.push_back(j);
+        continue;
+      }
+
+      if (pseudo_cost_num_down[j] < reliable_threshold) {
+        // Do trial branching on the down branch
+        f_t obj = trial_branching(lp,
+                                  settings,
+                                  var_types,
+                                  vstatus,
+                                  edge_norms,
+                                  basis_factors,
+                                  basic_list,
+                                  nonbasic_list,
+                                  j,
+                                  lp.lower[j],
+                                  std::floor(solution[j]),
+                                  upper_bound,
+                                  bnb_lp_iter / bnb_explored_nodes,
+                                  total_lp_iter);
+        if (!std::isnan(obj)) {
+          f_t change_in_obj = obj - current_obj;
+          f_t change_in_x   = solution[j] - std::floor(solution[j]);
+          pseudo_cost_sum_down[j] += change_in_obj / change_in_x;
+          pseudo_cost_num_down[j]++;
+        }
+      }
+
+      if (pseudo_cost_num_up[j] < reliable_threshold) {
+        f_t obj = trial_branching(lp,
+                                  settings,
+                                  var_types,
+                                  vstatus,
+                                  edge_norms,
+                                  basis_factors,
+                                  basic_list,
+                                  nonbasic_list,
+                                  j,
+                                  std::ceil(solution[j]),
+                                  lp.upper[j],
+                                  upper_bound,
+                                  bnb_lp_iter / bnb_explored_nodes,
+                                  total_lp_iter);
+
+        if (!std::isnan(obj)) {
+          f_t change_in_obj = obj - current_obj;
+          f_t change_in_x   = std::ceil(solution[j]) - solution[j];
+          pseudo_cost_sum_up[j] += change_in_obj / change_in_x;
+          pseudo_cost_num_up[j]++;
+        }
+      }
+
+      f_t pc_up   = pseudo_cost_num_up[j] > 0 ? pseudo_cost_sum_up[j] / pseudo_cost_num_up[j]
+                                              : pseudo_cost_up_avg;
+      f_t pc_down = pseudo_cost_sum_down[j] > 0 ? pseudo_cost_sum_down[j] / pseudo_cost_num_down[j]
+                                                : pseudo_cost_down_avg;
+
+      pseudo_cost_mutex[j].unlock();
+
+      constexpr f_t eps = 1e-6;
+      const f_t f_down  = solution[j] - std::floor(solution[j]);
+      const f_t f_up    = std::ceil(solution[j]) - solution[j];
+      f_t score         = std::max(f_down * pc_down, eps) * std::max(f_up * pc_up, eps);
+
+      if (score > max_score) {
+        max_score  = score;
+        branch_var = j;
+      }
+    }
+
+    std::swap(pending, conflict);
+    conflict.clear();
+  }
+
+  log.printf(
+    "pc branching on %d. Value %e. Score %e\n", branch_var, solution[branch_var], max_score);
 
   return branch_var;
 }
 
+template <typename i_t, typename f_t>
+f_t pseudo_costs_t<i_t, f_t>::obj_estimate(const std::vector<i_t>& fractional,
+                                           const std::vector<f_t>& solution,
+                                           f_t lower_bound,
+                                           logger_t& log)
+{
+  const i_t num_fractional = fractional.size();
+  f_t estimate             = lower_bound;
+
+  i_t num_initialized_down;
+  i_t num_initialized_up;
+  f_t pseudo_cost_down_avg;
+  f_t pseudo_cost_up_avg;
+
+  initialized(num_initialized_down, num_initialized_up, pseudo_cost_down_avg, pseudo_cost_up_avg);
+
+  for (i_t k = 0; k < num_fractional; k++) {
+    const i_t j = fractional[k];
+
+    f_t pseudo_cost_down = 0;
+    f_t pseudo_cost_up   = 0;
+
+    pseudo_cost_mutex[j].lock();
+    if (pseudo_cost_num_down[j] != 0) {
+      pseudo_cost_down = pseudo_cost_sum_down[j] / pseudo_cost_num_down[j];
+    } else {
+      pseudo_cost_down = pseudo_cost_down_avg;
+    }
+
+    if (pseudo_cost_num_up[j] != 0) {
+      pseudo_cost_up = pseudo_cost_sum_up[j] / pseudo_cost_num_up[j];
+    } else {
+      pseudo_cost_up = pseudo_cost_up_avg;
+    }
+    pseudo_cost_mutex[j].unlock();
+
+    constexpr f_t eps = 1e-6;
+    const f_t f_down  = solution[j] - std::floor(solution[j]);
+    const f_t f_up    = std::ceil(solution[j]) - solution[j];
+    estimate +=
+      std::min(std::max(pseudo_cost_down * f_down, eps), std::max(pseudo_cost_up * f_up, eps));
+  }
+
+  log.debug("pseudocost estimate = %e\n", estimate);
+  return estimate;
+}
+
 template <typename i_t, typename f_t>
 void pseudo_costs_t<i_t, f_t>::update_pseudo_costs_from_strong_branching(
   const std::vector<i_t>& fractional, const std::vector<f_t>& root_soln)
diff --git a/cpp/src/dual_simplex/pseudo_costs.hpp b/cpp/src/dual_simplex/pseudo_costs.hpp
index 799cdc3ff..8e079d0da 100644
--- a/cpp/src/dual_simplex/pseudo_costs.hpp
+++ b/cpp/src/dual_simplex/pseudo_costs.hpp
@@ -1,12 +1,13 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
 
 #pragma once
 
+#include <dual_simplex/basis_updates.hpp>
 #include <dual_simplex/logger.hpp>
 #include <dual_simplex/mip_node.hpp>
 #include <dual_simplex/simplex_solver_settings.hpp>
@@ -24,7 +25,8 @@ class pseudo_costs_t {
     : pseudo_cost_sum_down(num_variables),
       pseudo_cost_sum_up(num_variables),
       pseudo_cost_num_down(num_variables),
-      pseudo_cost_num_up(num_variables)
+      pseudo_cost_num_up(num_variables),
+      pseudo_cost_mutex(num_variables)
   {
   }
 
@@ -32,21 +34,43 @@ class pseudo_costs_t {
 
   void resize(i_t num_variables)
   {
-    pseudo_cost_sum_down.resize(num_variables);
-    pseudo_cost_sum_up.resize(num_variables);
-    pseudo_cost_num_down.resize(num_variables);
-    pseudo_cost_num_up.resize(num_variables);
+    pseudo_cost_sum_down.assign(num_variables, 0);
+    pseudo_cost_sum_up.assign(num_variables, 0);
+    pseudo_cost_num_down.assign(num_variables, 0);
+    pseudo_cost_num_up.assign(num_variables, 0);
+    pseudo_cost_mutex.resize(num_variables);
   }
 
   void initialized(i_t& num_initialized_down,
                    i_t& num_initialized_up,
                    f_t& pseudo_cost_down_avg,
-                   f_t& pseudo_cost_up_avg) const;
+                   f_t& pseudo_cost_up_avg);
 
   i_t variable_selection(const std::vector<i_t>& fractional,
                          const std::vector<f_t>& solution,
                          logger_t& log);
 
+  i_t reliable_variable_selection(const lp_problem_t<i_t, f_t>& lp,
+                                  const simplex_solver_settings_t<i_t, f_t>& settings,
+                                  const std::vector<variable_type_t>& var_types,
+                                  const std::vector<variable_status_t>& vstatus,
+                                  const std::vector<f_t>& edge_norms,
+                                  const std::vector<i_t>& fractional,
+                                  const std::vector<f_t>& solution,
+                                  const basis_update_mpf_t<i_t, f_t>& basis_factors,
+                                  const std::vector<i_t>& basic_list,
+                                  const std::vector<i_t>& nonbasic_list,
+                                  f_t current_obj,
+                                  f_t upper_bound,
+                                  i_t bnb_lp_iter,
+                                  i_t bnb_explored_nodes,
+                                  logger_t& log);
+
+  f_t obj_estimate(const std::vector<i_t>& fractional,
+                   const std::vector<f_t>& solution,
+                   f_t lower_bound,
+                   logger_t& log);
+
   void update_pseudo_costs_from_strong_branching(const std::vector<i_t>& fractional,
                                                  const std::vector<f_t>& root_soln);
   std::vector<f_t> pseudo_cost_sum_up;
@@ -55,9 +79,9 @@ class pseudo_costs_t {
   std::vector<i_t> pseudo_cost_num_down;
   std::vector<f_t> strong_branch_down;
   std::vector<f_t> strong_branch_up;
-
-  omp_mutex_t mutex;
+  std::vector<omp_mutex_t> pseudo_cost_mutex;
   omp_atomic_t<i_t> num_strong_branches_completed = 0;
+  omp_atomic_t<i_t> total_lp_iter                 = 0;
 };
 
 template <typename i_t, typename f_t>
diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp
index a1cc049e7..c46eda085 100644
--- a/cpp/src/dual_simplex/simplex_solver_settings.hpp
+++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -13,12 +13,28 @@
 #include <omp.h>
 #include <algorithm>
 #include <atomic>
+#include <cmath>
 #include <functional>
 #include <limits>
 #include <vector>
 
 namespace cuopt::linear_programming::dual_simplex {
 
+template <typename i_t, typename f_t>
+struct diving_heuristics_settings_t {
+  i_t num_diving_workers = -1;
+
+  bool disable_line_search_diving = false;
+  bool disable_pseudocost_diving  = false;
+  bool disable_guided_diving      = false;
+  bool disable_coefficient_diving = false;
+
+  i_t min_node_depth         = 10;
+  i_t node_limit             = 500;
+  f_t iteration_limit_factor = 0.05;
+  i_t backtrack              = 5;
+};
+
 template <typename i_t, typename f_t>
 struct simplex_solver_settings_t {
  public:
@@ -70,14 +86,14 @@ struct simplex_solver_settings_t {
       iteration_log_frequency(1000),
       first_iteration_log(2),
       num_threads(omp_get_max_threads() - 1),
-      num_bfs_threads(std::min(num_threads / 4, 1)),
-      num_diving_threads(std::min(num_threads - num_bfs_threads, 1)),
+      num_bfs_workers(std::max(num_threads / 4, 1)),
       random_seed(0),
       inside_mip(0),
       solution_callback(nullptr),
       heuristic_preemption_callback(nullptr),
       concurrent_halt(nullptr)
   {
+    diving_settings.num_diving_workers = std::max(num_threads - num_bfs_workers, 1);
   }
 
   void set_log(bool logging) const { log.log = logging; }
@@ -137,8 +153,10 @@ struct simplex_solver_settings_t {
   i_t first_iteration_log;         // number of iterations to log at beginning of solve
   i_t num_threads;                 // number of threads to use
   i_t random_seed;                 // random seed
-  i_t num_bfs_threads;             // number of threads dedicated to the best-first search
-  i_t num_diving_threads;          // number of threads dedicated to diving
+  i_t num_bfs_workers;             // number of threads dedicated to the best-first search
+
+  diving_heuristics_settings_t<i_t, f_t> diving_settings;  // Settings for the diving heuristics
+
   i_t inside_mip;  // 0 if outside MIP, 1 if inside MIP at root node, 2 if inside MIP at leaf node
   std::function<void(std::vector<f_t>&, f_t)> solution_callback;
   std::function<void(const std::vector<f_t>&, f_t)> node_processed_callback;
diff --git a/cpp/src/mip/diversity/lns/rins.cu b/cpp/src/mip/diversity/lns/rins.cu
index ba648f30e..7394e2db6 100644
--- a/cpp/src/mip/diversity/lns/rins.cu
+++ b/cpp/src/mip/diversity/lns/rins.cu
@@ -256,13 +256,19 @@ void rins_t<i_t, f_t>::run_rins()
   branch_and_bound_settings.absolute_mip_gap_tol = context.settings.tolerances.absolute_mip_gap;
   branch_and_bound_settings.relative_mip_gap_tol =
     std::min(current_mip_gap, (f_t)settings.target_mip_gap);
-  branch_and_bound_settings.integer_tol        = context.settings.tolerances.integrality_tolerance;
-  branch_and_bound_settings.num_threads        = 2;
-  branch_and_bound_settings.num_bfs_threads    = 1;
-  branch_and_bound_settings.num_diving_threads = 1;
-  branch_and_bound_settings.log.log            = false;
-  branch_and_bound_settings.log.log_prefix     = "[RINS] ";
-  branch_and_bound_settings.solution_callback  = [this, &rins_solution_queue](
+  branch_and_bound_settings.integer_tol     = context.settings.tolerances.integrality_tolerance;
+  branch_and_bound_settings.num_threads     = 2;
+  branch_and_bound_settings.num_bfs_workers = 1;
+
+  // In the future, let RINS use all the diving heuristics. For now,
+  // restricting to guided diving.
+  branch_and_bound_settings.diving_settings.num_diving_workers         = 1;
+  branch_and_bound_settings.diving_settings.disable_line_search_diving = true;
+  branch_and_bound_settings.diving_settings.disable_coefficient_diving = true;
+  branch_and_bound_settings.diving_settings.disable_pseudocost_diving  = true;
+  branch_and_bound_settings.log.log                                    = false;
+  branch_and_bound_settings.log.log_prefix                             = "[RINS] ";
+  branch_and_bound_settings.solution_callback = [this, &rins_solution_queue](
                                                   std::vector<f_t>& solution, f_t objective) {
     rins_solution_queue.push_back(solution);
   };
diff --git a/cpp/src/mip/diversity/recombiners/sub_mip.cuh b/cpp/src/mip/diversity/recombiners/sub_mip.cuh
index 5be807372..d46c6b31a 100644
--- a/cpp/src/mip/diversity/recombiners/sub_mip.cuh
+++ b/cpp/src/mip/diversity/recombiners/sub_mip.cuh
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -103,9 +103,15 @@ class sub_mip_recombiner_t : public recombiner_t<i_t, f_t> {
       branch_and_bound_settings.relative_mip_gap_tol = context.settings.tolerances.relative_mip_gap;
       branch_and_bound_settings.integer_tol     = context.settings.tolerances.integrality_tolerance;
       branch_and_bound_settings.num_threads     = 2;
-      branch_and_bound_settings.num_bfs_threads = 1;
-      branch_and_bound_settings.num_diving_threads = 1;
-      branch_and_bound_settings.solution_callback  = [this](std::vector<f_t>& solution,
+      branch_and_bound_settings.num_bfs_workers = 1;
+
+      // In the future, let SubMIP use all the diving heuristics. For now,
+      // restricting to guided diving.
+      branch_and_bound_settings.diving_settings.num_diving_workers         = 1;
+      branch_and_bound_settings.diving_settings.disable_line_search_diving = true;
+      branch_and_bound_settings.diving_settings.disable_coefficient_diving = true;
+      branch_and_bound_settings.diving_settings.disable_pseudocost_diving  = true;
+      branch_and_bound_settings.solution_callback = [this](std::vector<f_t>& solution,
                                                            f_t objective) {
         this->solution_callback(solution, objective);
       };
diff --git a/cpp/src/mip/solver.cu b/cpp/src/mip/solver.cu
index 7311a26fd..08e1806b9 100644
--- a/cpp/src/mip/solver.cu
+++ b/cpp/src/mip/solver.cu
@@ -173,13 +173,12 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
     } else {
       branch_and_bound_settings.num_threads = std::max(1, context.settings.num_cpu_threads);
     }
-    CUOPT_LOG_INFO("Using %d CPU threads for B&B", branch_and_bound_settings.num_threads);
 
-    i_t num_threads                              = branch_and_bound_settings.num_threads;
-    i_t num_bfs_threads                          = std::max(1, num_threads / 4);
-    i_t num_diving_threads                       = std::max(1, num_threads - num_bfs_threads);
-    branch_and_bound_settings.num_bfs_threads    = num_bfs_threads;
-    branch_and_bound_settings.num_diving_threads = num_diving_threads;
+    i_t num_threads                           = branch_and_bound_settings.num_threads;
+    i_t num_bfs_workers                       = std::max(1, num_threads / 4);
+    i_t num_diving_workers                    = std::max(1, num_threads - num_bfs_workers);
+    branch_and_bound_settings.num_bfs_workers = num_bfs_workers;
+    branch_and_bound_settings.diving_settings.num_diving_workers = num_diving_workers;
 
     // Set the branch and bound -> primal heuristics callback
     branch_and_bound_settings.solution_callback =
diff --git a/cpp/src/utilities/omp_helpers.hpp b/cpp/src/utilities/omp_helpers.hpp
index 33eda66cb..e1b68bf88 100644
--- a/cpp/src/utilities/omp_helpers.hpp
+++ b/cpp/src/utilities/omp_helpers.hpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -53,7 +53,7 @@ class omp_atomic_t {
   T operator--() { return fetch_sub(T(1)) - 1; }
   T operator--(int) { return fetch_sub(T(1)); }
 
-  T load()
+  T load() const
   {
     T res;
 #pragma omp atomic read