diff --git a/cpp/src/dual_simplex/CMakeLists.txt b/cpp/src/dual_simplex/CMakeLists.txt
index e8a9b5dce..af1415fa9 100644
--- a/cpp/src/dual_simplex/CMakeLists.txt
+++ b/cpp/src/dual_simplex/CMakeLists.txt
@@ -1,5 +1,5 @@
 # cmake-format: off
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 # cmake-format: on
 
@@ -31,6 +31,7 @@ set(DUAL_SIMPLEX_SRC_FILES
   ${CMAKE_CURRENT_SOURCE_DIR}/triangle_solve.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/vector_math.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/pinned_host_allocator.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/diving_heuristics.cpp
   )
 
 # Uncomment to enable debug info
diff --git a/cpp/src/dual_simplex/basis_solves.cpp b/cpp/src/dual_simplex/basis_solves.cpp
index db24f55a2..f5cd54053 100644
--- a/cpp/src/dual_simplex/basis_solves.cpp
+++ b/cpp/src/dual_simplex/basis_solves.cpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -613,6 +613,8 @@ i_t factorize_basis(const csc_matrix_t<i_t, f_t>& A,
 template <typename i_t, typename f_t>
 i_t basis_repair(const csc_matrix_t<i_t, f_t>& A,
                  const simplex_solver_settings_t<i_t, f_t>& settings,
+                 const std::vector<f_t>& lower,
+                 const std::vector<f_t>& upper,
                  const std::vector<i_t>& deficient,
                  const std::vector<i_t>& slacks_needed,
                  std::vector<i_t>& basis_list,
@@ -658,7 +660,15 @@ i_t basis_repair(const csc_matrix_t<i_t, f_t>& A,
     nonbasic_list[nonbasic_map[replace_j]] = bad_j;
     vstatus[replace_j]                     = variable_status_t::BASIC;
     // This is the main issue. What value should bad_j take on.
-    vstatus[bad_j] = variable_status_t::NONBASIC_FREE;
+    if (lower[bad_j] == -inf && upper[bad_j] == inf) {
+      vstatus[bad_j] = variable_status_t::NONBASIC_FREE;
+    } else if (lower[bad_j] > -inf) {
+      vstatus[bad_j] = variable_status_t::NONBASIC_LOWER;
+    } else if (upper[bad_j] < inf) {
+      vstatus[bad_j] = variable_status_t::NONBASIC_UPPER;
+    } else {
+      assert(1 == 0);
+    }
   }
 
   return 0;
@@ -849,6 +859,8 @@ template int factorize_basis<int>(const csc_matrix_t<int, double>& A,
 
 template int basis_repair<int, double>(const csc_matrix_t<int, double>& A,
                                        const simplex_solver_settings_t<int, double>& settings,
+                                       const std::vector<double>& lower,
+                                       const std::vector<double>& upper,
                                        const std::vector<int>& deficient,
                                        const std::vector<int>& slacks_needed,
                                        std::vector<int>& basis_list,
diff --git a/cpp/src/dual_simplex/basis_solves.hpp b/cpp/src/dual_simplex/basis_solves.hpp
index b668c0f46..295bedccd 100644
--- a/cpp/src/dual_simplex/basis_solves.hpp
+++ b/cpp/src/dual_simplex/basis_solves.hpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -42,6 +42,8 @@ i_t factorize_basis(const csc_matrix_t<i_t, f_t>& A,
 template <typename i_t, typename f_t>
 i_t basis_repair(const csc_matrix_t<i_t, f_t>& A,
                  const simplex_solver_settings_t<i_t, f_t>& settings,
+                 const std::vector<f_t>& lower,
+                 const std::vector<f_t>& upper,
                  const std::vector<i_t>& deficient,
                  const std::vector<i_t>& slacks_needed,
                  std::vector<i_t>& basis_list,
diff --git a/cpp/src/dual_simplex/basis_updates.cpp b/cpp/src/dual_simplex/basis_updates.cpp
index 6b79f3c86..2c781a515 100644
--- a/cpp/src/dual_simplex/basis_updates.cpp
+++ b/cpp/src/dual_simplex/basis_updates.cpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -2046,6 +2046,8 @@ template <typename i_t, typename f_t>
 int basis_update_mpf_t<i_t, f_t>::refactor_basis(
   const csc_matrix_t<i_t, f_t>& A,
   const simplex_solver_settings_t<i_t, f_t>& settings,
+  const std::vector<f_t>& lower,
+  const std::vector<f_t>& upper,
   std::vector<i_t>& basic_list,
   std::vector<i_t>& nonbasic_list,
   std::vector<variable_status_t>& vstatus)
@@ -2066,7 +2068,8 @@ int basis_update_mpf_t<i_t, f_t>::refactor_basis(
                       deficient,
                       slacks_needed) == -1) {
     settings.log.debug("Initial factorization failed\n");
-    basis_repair(A, settings, deficient, slacks_needed, basic_list, nonbasic_list, vstatus);
+    basis_repair(
+      A, settings, lower, upper, deficient, slacks_needed, basic_list, nonbasic_list, vstatus);
 
 #ifdef CHECK_BASIS_REPAIR
     const i_t m = A.m;
diff --git a/cpp/src/dual_simplex/basis_updates.hpp b/cpp/src/dual_simplex/basis_updates.hpp
index cea907074..afd4f4c9a 100644
--- a/cpp/src/dual_simplex/basis_updates.hpp
+++ b/cpp/src/dual_simplex/basis_updates.hpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -373,6 +373,8 @@ class basis_update_mpf_t {
   // Compute L*U = A(p, basic_list)
   int refactor_basis(const csc_matrix_t<i_t, f_t>& A,
                      const simplex_solver_settings_t<i_t, f_t>& settings,
+                     const std::vector<f_t>& lower,
+                     const std::vector<f_t>& upper,
                      std::vector<i_t>& basic_list,
                      std::vector<i_t>& nonbasic_list,
                      std::vector<variable_status_t>& vstatus);
diff --git a/cpp/src/dual_simplex/bounds_strengthening.cpp b/cpp/src/dual_simplex/bounds_strengthening.cpp
index f1bf52c1e..4114e7e09 100644
--- a/cpp/src/dual_simplex/bounds_strengthening.cpp
+++ b/cpp/src/dual_simplex/bounds_strengthening.cpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -154,7 +154,7 @@ bool bounds_strengthening_t<i_t, f_t>::bounds_strengthening(
       bool is_infeasible =
         check_infeasibility<i_t, f_t>(min_a, max_a, cnst_lb, cnst_ub, settings.primal_tol);
       if (is_infeasible) {
-        settings.log.printf(
+        settings.log.debug(
           "Iter:: %d, Infeasible constraint %d, cnst_lb %e, cnst_ub %e, min_a %e, max_a %e\n",
           iter,
           i,
@@ -211,7 +211,7 @@ bool bounds_strengthening_t<i_t, f_t>::bounds_strengthening(
       new_ub = std::min(new_ub, upper_bounds[k]);
 
       if (new_lb > new_ub + 1e-6) {
-        settings.log.printf(
+        settings.log.debug(
           "Iter:: %d, Infeasible variable after update %d, %e > %e\n", iter, k, new_lb, new_ub);
         return false;
       }
diff --git a/cpp/src/dual_simplex/branch_and_bound.cpp b/cpp/src/dual_simplex/branch_and_bound.cpp
index 81db8a341..b2c9f85d2 100644
--- a/cpp/src/dual_simplex/branch_and_bound.cpp
+++ b/cpp/src/dual_simplex/branch_and_bound.cpp
@@ -5,10 +5,9 @@
  */
 /* clang-format on */
 
-#include <omp.h>
-#include <algorithm>
-#include <dual_simplex/bounds_strengthening.hpp>
 #include <dual_simplex/branch_and_bound.hpp>
+
+#include <dual_simplex/bounds_strengthening.hpp>
 #include <dual_simplex/crossover.hpp>
 #include <dual_simplex/initial_basis.hpp>
 #include <dual_simplex/logger.hpp>
@@ -20,6 +19,9 @@
 #include <dual_simplex/tic_toc.hpp>
 #include <dual_simplex/user_problem.hpp>
 
+#include <omp.h>
+
+#include <algorithm>
 #include <cmath>
 #include <cstdio>
 #include <cstdlib>
@@ -192,20 +194,31 @@ std::string user_mip_gap(f_t obj_value, f_t lower_bound)
   }
 }
 
-inline const char* feasible_solution_symbol(thread_type_t type)
+#ifdef SHOW_DIVING_TYPE
+inline char feasible_solution_symbol(bnb_worker_type_t type)
 {
   switch (type) {
-    case thread_type_t::EXPLORATION: return "B ";
-    case thread_type_t::DIVING: return "D ";
-    default: return "U ";
+    case bnb_worker_type_t::BEST_FIRST: return 'B';
+    case bnb_worker_type_t::COEFFICIENT_DIVING: return 'C';
+    case bnb_worker_type_t::LINE_SEARCH_DIVING: return 'L';
+    case bnb_worker_type_t::PSEUDOCOST_DIVING: return 'P';
+    case bnb_worker_type_t::GUIDED_DIVING: return 'G';
+    default: return 'U';
   }
 }
-
-inline bool has_children(node_solve_info_t status)
+#else
+inline char feasible_solution_symbol(bnb_worker_type_t type)
 {
-  return status == node_solve_info_t::UP_CHILD_FIRST ||
-         status == node_solve_info_t::DOWN_CHILD_FIRST;
+  switch (type) {
+    case bnb_worker_type_t::BEST_FIRST: return 'B';
+    case bnb_worker_type_t::COEFFICIENT_DIVING: return 'D';
+    case bnb_worker_type_t::LINE_SEARCH_DIVING: return 'D';
+    case bnb_worker_type_t::PSEUDOCOST_DIVING: return 'D';
+    case bnb_worker_type_t::GUIDED_DIVING: return 'D';
+    default: return 'U';
+  }
 }
+#endif
 
 }  // namespace
 
@@ -216,59 +229,39 @@ branch_and_bound_t<i_t, f_t>::branch_and_bound_t(
   : original_problem_(user_problem),
     settings_(solver_settings),
     original_lp_(user_problem.handle_ptr, 1, 1, 1),
+    Arow_(1, 1, 0),
     incumbent_(1),
     root_relax_soln_(1, 1),
     root_crossover_soln_(1, 1),
     pc_(1),
-    solver_status_(mip_exploration_status_t::UNSET)
+    solver_status_(mip_status_t::UNSET)
 {
   exploration_stats_.start_time = tic();
   dualize_info_t<i_t, f_t> dualize_info;
   convert_user_problem(original_problem_, settings_, original_lp_, new_slacks_, dualize_info);
   full_variable_types(original_problem_, original_lp_, var_types_);
 
-  mutex_upper_.lock();
   upper_bound_ = inf;
-  mutex_upper_.unlock();
-}
-
-template <typename i_t, typename f_t>
-f_t branch_and_bound_t<i_t, f_t>::get_upper_bound()
-{
-  mutex_upper_.lock();
-  const f_t upper_bound = upper_bound_;
-  mutex_upper_.unlock();
-  return upper_bound;
 }
 
 template <typename i_t, typename f_t>
 f_t branch_and_bound_t<i_t, f_t>::get_lower_bound()
 {
-  f_t lower_bound = lower_bound_ceiling_.load();
-  mutex_heap_.lock();
-  if (heap_.size() > 0) { lower_bound = std::min(heap_.top()->lower_bound, lower_bound); }
-  mutex_heap_.unlock();
+  f_t lower_bound      = lower_bound_ceiling_.load();
+  f_t heap_lower_bound = node_queue_.get_lower_bound();
+  lower_bound          = std::min(heap_lower_bound, lower_bound);
 
   for (i_t i = 0; i < local_lower_bounds_.size(); ++i) {
     lower_bound = std::min(local_lower_bounds_[i].load(), lower_bound);
   }
 
-  return lower_bound;
-}
-
-template <typename i_t, typename f_t>
-i_t branch_and_bound_t<i_t, f_t>::get_heap_size()
-{
-  mutex_heap_.lock();
-  i_t size = heap_.size();
-  mutex_heap_.unlock();
-  return size;
+  return std::isfinite(lower_bound) ? lower_bound : -inf;
 }
 
 template <typename i_t, typename f_t>
 void branch_and_bound_t<i_t, f_t>::report_heuristic(f_t obj)
 {
-  if (solver_status_ == mip_exploration_status_t::RUNNING) {
+  if (is_running) {
     f_t user_obj         = compute_user_objective(original_lp_, obj);
     f_t user_lower       = compute_user_objective(original_lp_, get_lower_bound());
     std::string user_gap = user_mip_gap<f_t>(user_obj, user_lower);
@@ -287,10 +280,7 @@ void branch_and_bound_t<i_t, f_t>::report_heuristic(f_t obj)
 }
 
 template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::report(std::string symbol,
-                                          f_t obj,
-                                          f_t lower_bound,
-                                          i_t node_depth)
+void branch_and_bound_t<i_t, f_t>::report(char symbol, f_t obj, f_t lower_bound, i_t node_depth)
 {
   i_t nodes_explored   = exploration_stats_.nodes_explored;
   i_t nodes_unexplored = exploration_stats_.nodes_unexplored;
@@ -298,8 +288,8 @@ void branch_and_bound_t<i_t, f_t>::report(std::string symbol,
   f_t user_lower       = compute_user_objective(original_lp_, lower_bound);
   f_t iter_node        = exploration_stats_.total_lp_iters / nodes_explored;
   std::string user_gap = user_mip_gap<f_t>(user_obj, user_lower);
-  settings_.log.printf("%s%10d   %10lu    %+13.6e    %+10.6e  %6d    %7.1e     %s %9.2f\n",
-                       symbol.c_str(),
+  settings_.log.printf("%c %10d   %10lu    %+13.6e    %+10.6e  %6d    %7.1e     %s %9.2f\n",
+                       symbol,
                        nodes_explored,
                        nodes_unexplored,
                        user_obj,
@@ -460,30 +450,24 @@ void branch_and_bound_t<i_t, f_t>::repair_heuristic_solutions()
 }
 
 template <typename i_t, typename f_t>
-mip_status_t branch_and_bound_t<i_t, f_t>::set_final_solution(mip_solution_t<i_t, f_t>& solution,
-                                                              f_t lower_bound)
+void branch_and_bound_t<i_t, f_t>::set_final_solution(mip_solution_t<i_t, f_t>& solution,
+                                                      f_t lower_bound)
 {
-  mip_status_t mip_status = mip_status_t::UNSET;
-
-  if (solver_status_ == mip_exploration_status_t::NUMERICAL) {
+  if (solver_status_ == mip_status_t::NUMERICAL) {
     settings_.log.printf("Numerical issue encountered. Stopping the solver...\n");
-    mip_status = mip_status_t::NUMERICAL;
   }
 
-  if (solver_status_ == mip_exploration_status_t::TIME_LIMIT) {
+  if (solver_status_ == mip_status_t::TIME_LIMIT) {
     settings_.log.printf("Time limit reached. Stopping the solver...\n");
-    mip_status = mip_status_t::TIME_LIMIT;
   }
-  if (solver_status_ == mip_exploration_status_t::NODE_LIMIT) {
+  if (solver_status_ == mip_status_t::NODE_LIMIT) {
     settings_.log.printf("Node limit reached. Stopping the solver...\n");
-    mip_status = mip_status_t::NODE_LIMIT;
   }
 
-  f_t upper_bound      = get_upper_bound();
-  f_t gap              = upper_bound - lower_bound;
-  f_t obj              = compute_user_objective(original_lp_, upper_bound);
+  f_t gap              = upper_bound_ - lower_bound;
+  f_t obj              = compute_user_objective(original_lp_, upper_bound_.load());
   f_t user_bound       = compute_user_objective(original_lp_, lower_bound);
-  f_t gap_rel          = user_relative_gap(original_lp_, upper_bound, lower_bound);
+  f_t gap_rel          = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound);
   bool is_maximization = original_lp_.obj_scale < 0.0;
 
   settings_.log.printf("Explored %d nodes in %.2fs.\n",
@@ -496,7 +480,7 @@ mip_status_t branch_and_bound_t<i_t, f_t>::set_final_solution(mip_solution_t<i_t
                        user_bound);
 
   if (gap <= settings_.absolute_mip_gap_tol || gap_rel <= settings_.relative_mip_gap_tol) {
-    mip_status = mip_status_t::OPTIMAL;
+    solver_status_ = mip_status_t::OPTIMAL;
     if (gap > 0 && gap <= settings_.absolute_mip_gap_tol) {
       settings_.log.printf("Optimal solution found within absolute MIP gap tolerance (%.1e)\n",
                            settings_.absolute_mip_gap_tol);
@@ -511,18 +495,18 @@ mip_status_t branch_and_bound_t<i_t, f_t>::set_final_solution(mip_solution_t<i_t
     }
   }
 
-  if (solver_status_ == mip_exploration_status_t::COMPLETED) {
+  if (solver_status_ == mip_status_t::UNSET) {
     if (exploration_stats_.nodes_explored > 0 && exploration_stats_.nodes_unexplored == 0 &&
-        upper_bound == inf) {
+        upper_bound_ == inf) {
       settings_.log.printf("Integer infeasible.\n");
-      mip_status = mip_status_t::INFEASIBLE;
+      solver_status_ = mip_status_t::INFEASIBLE;
       if (settings_.heuristic_preemption_callback != nullptr) {
         settings_.heuristic_preemption_callback();
       }
     }
   }
 
-  if (upper_bound != inf) {
+  if (upper_bound_ != inf) {
     assert(incumbent_.has_incumbent);
     uncrush_primal_solution(original_problem_, original_lp_, incumbent_.x, solution.x);
   }
@@ -530,18 +514,20 @@ mip_status_t branch_and_bound_t<i_t, f_t>::set_final_solution(mip_solution_t<i_t
   solution.lower_bound        = lower_bound;
   solution.nodes_explored     = exploration_stats_.nodes_explored;
   solution.simplex_iterations = exploration_stats_.total_lp_iters;
-
-  return mip_status;
 }
 
 template <typename i_t, typename f_t>
 void branch_and_bound_t<i_t, f_t>::add_feasible_solution(f_t leaf_objective,
                                                          const std::vector<f_t>& leaf_solution,
                                                          i_t leaf_depth,
-                                                         thread_type_t thread_type)
+                                                         bnb_worker_type_t thread_type)
 {
   bool send_solution = false;
 
+  settings_.log.debug("%c found a feasible solution with obj=%.10e.\n",
+                      feasible_solution_symbol(thread_type),
+                      compute_user_objective(original_lp_, leaf_objective));
+
   mutex_upper_.lock();
   if (leaf_objective < upper_bound_) {
     incumbent_.set_incumbent_solution(leaf_objective, leaf_solution);
@@ -558,16 +544,18 @@ void branch_and_bound_t<i_t, f_t>::add_feasible_solution(f_t leaf_objective,
   mutex_upper_.unlock();
 }
 
-template <typename i_t, typename f_t>
-rounding_direction_t branch_and_bound_t<i_t, f_t>::child_selection(mip_node_t<i_t, f_t>* node_ptr)
+// Martin's criteria for the preferred rounding direction (see [1])
+// [1] A. Martin, “Integer Programs with Block Structure,”
+// Technische Universit¨at Berlin, Berlin, 1999. Accessed: Aug. 08, 2025.
+// [Online]. Available: https://opus4.kobv.de/opus4-zib/frontdoor/index/index/docId/391
+template <typename f_t>
+rounding_direction_t martin_criteria(f_t val, f_t root_val)
 {
-  const i_t branch_var     = node_ptr->get_down_child()->branch_var;
-  const f_t branch_var_val = node_ptr->get_down_child()->fractional_val;
-  const f_t down_val       = std::floor(root_relax_soln_.x[branch_var]);
-  const f_t up_val         = std::ceil(root_relax_soln_.x[branch_var]);
-  const f_t down_dist      = branch_var_val - down_val;
-  const f_t up_dist        = up_val - branch_var_val;
-  constexpr f_t eps        = 1e-6;
+  const f_t down_val  = std::floor(root_val);
+  const f_t up_val    = std::ceil(root_val);
+  const f_t down_dist = val - down_val;
+  const f_t up_dist   = up_val - val;
+  constexpr f_t eps   = 1e-6;
 
   if (down_dist < up_dist + eps) {
     return rounding_direction_t::DOWN;
@@ -578,34 +566,85 @@ rounding_direction_t branch_and_bound_t<i_t, f_t>::child_selection(mip_node_t<i_
 }
 
 template <typename i_t, typename f_t>
-node_solve_info_t branch_and_bound_t<i_t, f_t>::solve_node(
+branch_variable_t<i_t> branch_and_bound_t<i_t, f_t>::variable_selection(
+  mip_node_t<i_t, f_t>* node_ptr,
+  const std::vector<i_t>& fractional,
+  const std::vector<f_t>& solution,
+  bnb_worker_type_t type)
+{
+  logger_t log;
+  log.log                        = false;
+  i_t branch_var                 = -1;
+  rounding_direction_t round_dir = rounding_direction_t::NONE;
+  std::vector<f_t> current_incumbent;
+
+  // If there is no incumbent, use pseudocost diving instead of guided diving
+  if (upper_bound_ == inf && type == bnb_worker_type_t::GUIDED_DIVING) {
+    type = bnb_worker_type_t::PSEUDOCOST_DIVING;
+  }
+
+  switch (type) {
+    case bnb_worker_type_t::BEST_FIRST:
+      branch_var = pc_.variable_selection(fractional, solution, log);
+      round_dir  = martin_criteria(solution[branch_var], root_relax_soln_.x[branch_var]);
+      return {branch_var, round_dir};
+
+    case bnb_worker_type_t::COEFFICIENT_DIVING:
+      return coefficient_diving(
+        original_lp_, fractional, solution, var_up_locks_, var_down_locks_, log);
+
+    case bnb_worker_type_t::LINE_SEARCH_DIVING:
+      return line_search_diving(fractional, solution, root_relax_soln_.x, log);
+
+    case bnb_worker_type_t::PSEUDOCOST_DIVING:
+      return pseudocost_diving(pc_, fractional, solution, root_relax_soln_.x, log);
+
+    case bnb_worker_type_t::GUIDED_DIVING:
+      mutex_upper_.lock();
+      current_incumbent = incumbent_.x;
+      mutex_upper_.unlock();
+      return guided_diving(pc_, fractional, solution, current_incumbent, log);
+
+    default:
+      log.debug("Unknown variable selection method: %d\n", type);
+      return {-1, rounding_direction_t::NONE};
+  }
+}
+
+template <typename i_t, typename f_t>
+dual::status_t branch_and_bound_t<i_t, f_t>::solve_node_lp(
   mip_node_t<i_t, f_t>* node_ptr,
-  search_tree_t<i_t, f_t>& search_tree,
   lp_problem_t<i_t, f_t>& leaf_problem,
+  lp_solution_t<i_t, f_t>& leaf_solution,
   basis_update_mpf_t<i_t, f_t>& basis_factors,
   std::vector<i_t>& basic_list,
   std::vector<i_t>& nonbasic_list,
   bounds_strengthening_t<i_t, f_t>& node_presolver,
-  thread_type_t thread_type,
+  bnb_worker_type_t thread_type,
   bool recompute_bounds_and_basis,
   const std::vector<f_t>& root_lower,
   const std::vector<f_t>& root_upper,
+  bnb_stats_t<i_t, f_t>& stats,
   logger_t& log)
 {
-  const f_t abs_fathom_tol = settings_.absolute_mip_gap_tol / 10;
-  const f_t upper_bound    = get_upper_bound();
-
-  lp_solution_t<i_t, f_t> leaf_solution(leaf_problem.num_rows, leaf_problem.num_cols);
   std::vector<variable_status_t>& leaf_vstatus = node_ptr->vstatus;
   assert(leaf_vstatus.size() == leaf_problem.num_cols);
 
   simplex_solver_settings_t lp_settings = settings_;
   lp_settings.set_log(false);
-  lp_settings.cut_off       = upper_bound + settings_.dual_tol;
+  lp_settings.cut_off       = upper_bound_ + settings_.dual_tol;
   lp_settings.inside_mip    = 2;
   lp_settings.time_limit    = settings_.time_limit - toc(exploration_stats_.start_time);
   lp_settings.scale_columns = false;
 
+  if (thread_type != bnb_worker_type_t::BEST_FIRST) {
+    i_t bnb_lp_iters            = exploration_stats_.total_lp_iters;
+    f_t factor                  = settings_.diving_settings.iteration_limit_factor;
+    i_t max_iter                = factor * bnb_lp_iters;
+    lp_settings.iteration_limit = max_iter - stats.total_lp_iters;
+    if (lp_settings.iteration_limit <= 0) { return dual::status_t::ITERATION_LIMIT; }
+  }
+
 #ifdef LOG_NODE_SIMPLEX
   lp_settings.set_log(true);
   std::stringstream ss;
@@ -682,30 +721,44 @@ node_solve_info_t branch_and_bound_t<i_t, f_t>::solve_node(
       lp_status = convert_lp_status_to_dual_status(second_status);
     }
 
-    if (thread_type == thread_type_t::EXPLORATION) {
-      exploration_stats_.total_lp_solve_time += toc(lp_start_time);
-      exploration_stats_.total_lp_iters += node_iter;
-    }
+    stats.total_lp_solve_time += toc(lp_start_time);
+    stats.total_lp_iters += node_iter;
   }
 
 #ifdef LOG_NODE_SIMPLEX
   lp_settings.log.printf("\nLP status: %d\n\n", lp_status);
 #endif
 
+  return lp_status;
+}
+
+template <typename i_t, typename f_t>
+std::pair<node_status_t, rounding_direction_t> branch_and_bound_t<i_t, f_t>::update_tree(
+  mip_node_t<i_t, f_t>* node_ptr,
+  search_tree_t<i_t, f_t>& search_tree,
+  lp_problem_t<i_t, f_t>& leaf_problem,
+  lp_solution_t<i_t, f_t>& leaf_solution,
+  bnb_worker_type_t thread_type,
+  dual::status_t lp_status,
+  logger_t& log)
+{
+  const f_t abs_fathom_tol                     = settings_.absolute_mip_gap_tol / 10;
+  std::vector<variable_status_t>& leaf_vstatus = node_ptr->vstatus;
+
   if (lp_status == dual::status_t::DUAL_UNBOUNDED) {
     // Node was infeasible. Do not branch
     node_ptr->lower_bound = inf;
     search_tree.graphviz_node(log, node_ptr, "infeasible", 0.0);
     search_tree.update(node_ptr, node_status_t::INFEASIBLE);
-    return node_solve_info_t::NO_CHILDREN;
+    return {node_status_t::INFEASIBLE, rounding_direction_t::NONE};
 
   } else if (lp_status == dual::status_t::CUTOFF) {
     // Node was cut off. Do not branch
-    node_ptr->lower_bound = upper_bound;
+    node_ptr->lower_bound = upper_bound_;
     f_t leaf_objective    = compute_objective(leaf_problem, leaf_solution.x);
     search_tree.graphviz_node(log, node_ptr, "cut off", leaf_objective);
     search_tree.update(node_ptr, node_status_t::FATHOMED);
-    return node_solve_info_t::NO_CHILDREN;
+    return {node_status_t::FATHOMED, rounding_direction_t::NONE};
 
   } else if (lp_status == dual::status_t::OPTIMAL) {
     // LP was feasible
@@ -718,10 +771,12 @@ node_solve_info_t branch_and_bound_t<i_t, f_t>::solve_node(
     search_tree.graphviz_node(log, node_ptr, "lower bound", leaf_objective);
     pc_.update_pseudo_costs(node_ptr, leaf_objective);
 
-    if (settings_.node_processed_callback != nullptr) {
-      std::vector<f_t> original_x;
-      uncrush_primal_solution(original_problem_, original_lp_, leaf_solution.x, original_x);
-      settings_.node_processed_callback(original_x, leaf_objective);
+    if (thread_type == bnb_worker_type_t::BEST_FIRST) {
+      if (settings_.node_processed_callback != nullptr) {
+        std::vector<f_t> original_x;
+        uncrush_primal_solution(original_problem_, original_lp_, leaf_solution.x, original_x);
+        settings_.node_processed_callback(original_x, leaf_objective);
+      }
     }
 
     if (leaf_num_fractional == 0) {
@@ -729,37 +784,39 @@ node_solve_info_t branch_and_bound_t<i_t, f_t>::solve_node(
       add_feasible_solution(leaf_objective, leaf_solution.x, node_ptr->depth, thread_type);
       search_tree.graphviz_node(log, node_ptr, "integer feasible", leaf_objective);
       search_tree.update(node_ptr, node_status_t::INTEGER_FEASIBLE);
-      return node_solve_info_t::NO_CHILDREN;
+      return {node_status_t::INTEGER_FEASIBLE, rounding_direction_t::NONE};
 
-    } else if (leaf_objective <= upper_bound + abs_fathom_tol) {
+    } else if (leaf_objective <= upper_bound_ + abs_fathom_tol) {
       // Choose fractional variable to branch on
-      const i_t branch_var =
-        pc_.variable_selection(leaf_fractional, leaf_solution.x, lp_settings.log);
+      auto [branch_var, round_dir] =
+        variable_selection(node_ptr, leaf_fractional, leaf_solution.x, thread_type);
 
       assert(leaf_vstatus.size() == leaf_problem.num_cols);
+      assert(branch_var >= 0);
+      assert(round_dir != rounding_direction_t::NONE);
+
+      // Note that the exploration thread is the only one that can insert new nodes into the heap,
+      // and thus, we only need to calculate the objective estimate here (it is used for
+      // sorting the nodes for diving).
+      if (thread_type == bnb_worker_type_t::BEST_FIRST) {
+        logger_t pc_log;
+        pc_log.log = false;
+        node_ptr->objective_estimate =
+          pc_.obj_estimate(leaf_fractional, leaf_solution.x, node_ptr->lower_bound, pc_log);
+      }
+
       search_tree.branch(
         node_ptr, branch_var, leaf_solution.x[branch_var], leaf_vstatus, leaf_problem, log);
       search_tree.update(node_ptr, node_status_t::HAS_CHILDREN);
-
-      rounding_direction_t round_dir = child_selection(node_ptr);
-
-      if (round_dir == rounding_direction_t::UP) {
-        return node_solve_info_t::UP_CHILD_FIRST;
-      } else {
-        return node_solve_info_t::DOWN_CHILD_FIRST;
-      }
+      return {node_status_t::HAS_CHILDREN, round_dir};
 
     } else {
       search_tree.graphviz_node(log, node_ptr, "fathomed", leaf_objective);
       search_tree.update(node_ptr, node_status_t::FATHOMED);
-      return node_solve_info_t::NO_CHILDREN;
+      return {node_status_t::FATHOMED, rounding_direction_t::NONE};
     }
-  } else if (lp_status == dual::status_t::TIME_LIMIT) {
-    search_tree.graphviz_node(log, node_ptr, "timeout", 0.0);
-    return node_solve_info_t::TIME_LIMIT;
-
   } else {
-    if (thread_type == thread_type_t::EXPLORATION) {
+    if (thread_type == bnb_worker_type_t::BEST_FIRST) {
       fetch_min(lower_bound_ceiling_, node_ptr->lower_bound);
       log.printf(
         "LP returned status %d on node %d. This indicates a numerical issue. The best bound is set "
@@ -772,17 +829,15 @@ node_solve_info_t branch_and_bound_t<i_t, f_t>::solve_node(
 
     search_tree.graphviz_node(log, node_ptr, "numerical", 0.0);
     search_tree.update(node_ptr, node_status_t::NUMERICAL);
-    return node_solve_info_t::NUMERICAL;
+    return {node_status_t::NUMERICAL, rounding_direction_t::NONE};
   }
 }
 
 template <typename i_t, typename f_t>
 void branch_and_bound_t<i_t, f_t>::exploration_ramp_up(mip_node_t<i_t, f_t>* node,
-                                                       search_tree_t<i_t, f_t>* search_tree,
-                                                       const csr_matrix_t<i_t, f_t>& Arow,
                                                        i_t initial_heap_size)
 {
-  if (solver_status_ != mip_exploration_status_t::RUNNING) { return; }
+  if (solver_status_ != mip_status_t::UNSET) { return; }
 
   // Note that we do not know which thread will execute the
   // `exploration_ramp_up` task, so we allow to any thread
@@ -790,13 +845,13 @@ void branch_and_bound_t<i_t, f_t>::exploration_ramp_up(mip_node_t<i_t, f_t>* nod
   repair_heuristic_solutions();
 
   f_t lower_bound = node->lower_bound;
-  f_t upper_bound = get_upper_bound();
+  f_t upper_bound = upper_bound_;
   f_t rel_gap     = user_relative_gap(original_lp_, upper_bound, lower_bound);
   f_t abs_gap     = upper_bound - lower_bound;
 
   if (lower_bound > upper_bound || rel_gap < settings_.relative_mip_gap_tol) {
-    search_tree->graphviz_node(settings_.log, node, "cutoff", node->lower_bound);
-    search_tree->update(node, node_status_t::FATHOMED);
+    search_tree_.graphviz_node(settings_.log, node, "cutoff", node->lower_bound);
+    search_tree_.update(node, node_status_t::FATHOMED);
     --exploration_stats_.nodes_unexplored;
     return;
   }
@@ -812,7 +867,7 @@ void branch_and_bound_t<i_t, f_t>::exploration_ramp_up(mip_node_t<i_t, f_t>* nod
     bool should_report = should_report_.exchange(false);
 
     if (should_report) {
-      report("  ", upper_bound, root_objective_, node->depth);
+      report(' ', upper_bound, root_objective_, node->depth);
       exploration_stats_.nodes_since_last_log = 0;
       exploration_stats_.last_log             = tic();
       should_report_                          = true;
@@ -820,84 +875,91 @@ void branch_and_bound_t<i_t, f_t>::exploration_ramp_up(mip_node_t<i_t, f_t>* nod
   }
 
   if (now > settings_.time_limit) {
-    solver_status_ = mip_exploration_status_t::TIME_LIMIT;
+    solver_status_ = mip_status_t::TIME_LIMIT;
     return;
   }
 
   // Make a copy of the original LP. We will modify its bounds at each leaf
   lp_problem_t<i_t, f_t> leaf_problem = original_lp_;
   std::vector<char> row_sense;
-  bounds_strengthening_t<i_t, f_t> node_presolver(leaf_problem, Arow, row_sense, var_types_);
+  bounds_strengthening_t<i_t, f_t> node_presolver(leaf_problem, Arow_, row_sense, var_types_);
 
   const i_t m = leaf_problem.num_rows;
   basis_update_mpf_t<i_t, f_t> basis_factors(m, settings_.refactor_frequency);
   std::vector<i_t> basic_list(m);
   std::vector<i_t> nonbasic_list;
 
-  node_solve_info_t status = solve_node(node,
-                                        *search_tree,
-                                        leaf_problem,
-                                        basis_factors,
-                                        basic_list,
-                                        nonbasic_list,
-                                        node_presolver,
-                                        thread_type_t::EXPLORATION,
-                                        true,
-                                        original_lp_.lower,
-                                        original_lp_.upper,
-                                        settings_.log);
+  lp_solution_t<i_t, f_t> leaf_solution(leaf_problem.num_rows, leaf_problem.num_cols);
+  dual::status_t lp_status = solve_node_lp(node,
+                                           leaf_problem,
+                                           leaf_solution,
+                                           basis_factors,
+                                           basic_list,
+                                           nonbasic_list,
+                                           node_presolver,
+                                           bnb_worker_type_t::BEST_FIRST,
+                                           true,
+                                           original_lp_.lower,
+                                           original_lp_.upper,
+                                           exploration_stats_,
+                                           settings_.log);
+  if (lp_status == dual::status_t::TIME_LIMIT) {
+    solver_status_ = mip_status_t::TIME_LIMIT;
+    return;
+  }
 
   ++exploration_stats_.nodes_since_last_log;
   ++exploration_stats_.nodes_explored;
   --exploration_stats_.nodes_unexplored;
 
-  if (status == node_solve_info_t::TIME_LIMIT) {
-    solver_status_ = mip_exploration_status_t::TIME_LIMIT;
-    return;
+  auto [node_status, round_dir] = update_tree(node,
+                                              search_tree_,
+                                              leaf_problem,
+                                              leaf_solution,
+                                              bnb_worker_type_t::BEST_FIRST,
+                                              lp_status,
+                                              settings_.log);
 
-  } else if (has_children(status)) {
+  if (node_status == node_status_t::HAS_CHILDREN) {
     exploration_stats_.nodes_unexplored += 2;
 
     // If we haven't generated enough nodes to keep the threads busy, continue the ramp up phase
     if (exploration_stats_.nodes_unexplored < initial_heap_size) {
 #pragma omp task
-      exploration_ramp_up(node->get_down_child(), search_tree, Arow, initial_heap_size);
+      exploration_ramp_up(node->get_down_child(), initial_heap_size);
 
 #pragma omp task
-      exploration_ramp_up(node->get_up_child(), search_tree, Arow, initial_heap_size);
+      exploration_ramp_up(node->get_up_child(), initial_heap_size);
 
     } else {
       // We've generated enough nodes, push further nodes onto the heap
-      mutex_heap_.lock();
-      heap_.push(node->get_down_child());
-      heap_.push(node->get_up_child());
-      mutex_heap_.unlock();
+      node_queue_.push(node->get_down_child());
+      node_queue_.push(node->get_up_child());
     }
   }
 }
 
 template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::explore_subtree(i_t task_id,
-                                                   mip_node_t<i_t, f_t>* start_node,
-                                                   search_tree_t<i_t, f_t>& search_tree,
-                                                   lp_problem_t<i_t, f_t>& leaf_problem,
-                                                   bounds_strengthening_t<i_t, f_t>& node_presolver,
-                                                   basis_update_mpf_t<i_t, f_t>& basis_factors,
-                                                   std::vector<i_t>& basic_list,
-                                                   std::vector<i_t>& nonbasic_list)
+void branch_and_bound_t<i_t, f_t>::plunge_from(i_t task_id,
+                                               mip_node_t<i_t, f_t>* start_node,
+                                               lp_problem_t<i_t, f_t>& leaf_problem,
+                                               bounds_strengthening_t<i_t, f_t>& node_presolver,
+                                               basis_update_mpf_t<i_t, f_t>& basis_factors,
+                                               std::vector<i_t>& basic_list,
+                                               std::vector<i_t>& nonbasic_list)
 {
   bool recompute_bounds_and_basis = true;
   std::deque<mip_node_t<i_t, f_t>*> stack;
   stack.push_front(start_node);
 
-  while (stack.size() > 0 && solver_status_ == mip_exploration_status_t::RUNNING) {
+  while (stack.size() > 0 && solver_status_ == mip_status_t::UNSET && is_running) {
     if (task_id == 0) { repair_heuristic_solutions(); }
 
     mip_node_t<i_t, f_t>* node_ptr = stack.front();
     stack.pop_front();
 
     f_t lower_bound = node_ptr->lower_bound;
-    f_t upper_bound = get_upper_bound();
+    f_t upper_bound = upper_bound_;
     f_t abs_gap     = upper_bound - lower_bound;
     f_t rel_gap     = user_relative_gap(original_lp_, upper_bound, lower_bound);
 
@@ -911,8 +973,9 @@ void branch_and_bound_t<i_t, f_t>::explore_subtree(i_t task_id,
     local_lower_bounds_[task_id] = lower_bound;
 
     if (lower_bound > upper_bound || rel_gap < settings_.relative_mip_gap_tol) {
-      search_tree.graphviz_node(settings_.log, node_ptr, "cutoff", node_ptr->lower_bound);
-      search_tree.update(node_ptr, node_status_t::FATHOMED);
+      search_tree_.graphviz_node(settings_.log, node_ptr, "cutoff", node_ptr->lower_bound);
+      search_tree_.update(node_ptr, node_status_t::FATHOMED);
+      recompute_bounds_and_basis = true;
       --exploration_stats_.nodes_unexplored;
       continue;
     }
@@ -927,45 +990,58 @@ void branch_and_bound_t<i_t, f_t>::explore_subtree(i_t task_id,
             abs_gap < 10 * settings_.absolute_mip_gap_tol) &&
            time_since_last_log >= 1) ||
           (time_since_last_log > 30) || now > settings_.time_limit) {
-        report("  ", upper_bound, get_lower_bound(), node_ptr->depth);
+        report(' ', upper_bound, get_lower_bound(), node_ptr->depth);
         exploration_stats_.last_log             = tic();
         exploration_stats_.nodes_since_last_log = 0;
       }
     }
 
     if (now > settings_.time_limit) {
-      solver_status_ = mip_exploration_status_t::TIME_LIMIT;
-      return;
+      solver_status_ = mip_status_t::TIME_LIMIT;
+      break;
     }
     if (exploration_stats_.nodes_explored >= settings_.node_limit) {
-      solver_status_ = mip_exploration_status_t::NODE_LIMIT;
-      return;
+      solver_status_ = mip_status_t::NODE_LIMIT;
+      break;
     }
 
-    node_solve_info_t status = solve_node(node_ptr,
-                                          search_tree,
-                                          leaf_problem,
-                                          basis_factors,
-                                          basic_list,
-                                          nonbasic_list,
-                                          node_presolver,
-                                          thread_type_t::EXPLORATION,
-                                          recompute_bounds_and_basis,
-                                          original_lp_.lower,
-                                          original_lp_.upper,
-                                          settings_.log);
-
-    recompute_bounds_and_basis = !has_children(status);
+    lp_solution_t<i_t, f_t> leaf_solution(leaf_problem.num_rows, leaf_problem.num_cols);
+    dual::status_t lp_status = solve_node_lp(node_ptr,
+                                             leaf_problem,
+                                             leaf_solution,
+                                             basis_factors,
+                                             basic_list,
+                                             nonbasic_list,
+                                             node_presolver,
+                                             bnb_worker_type_t::BEST_FIRST,
+                                             recompute_bounds_and_basis,
+                                             original_lp_.lower,
+                                             original_lp_.upper,
+                                             exploration_stats_,
+                                             settings_.log);
+
+    if (lp_status == dual::status_t::TIME_LIMIT) {
+      solver_status_ = mip_status_t::TIME_LIMIT;
+      break;
+    } else if (lp_status == dual::status_t::ITERATION_LIMIT) {
+      break;
+    }
 
     ++exploration_stats_.nodes_since_last_log;
     ++exploration_stats_.nodes_explored;
     --exploration_stats_.nodes_unexplored;
 
-    if (status == node_solve_info_t::TIME_LIMIT) {
-      solver_status_ = mip_exploration_status_t::TIME_LIMIT;
-      return;
+    auto [node_status, round_dir] = update_tree(node_ptr,
+                                                search_tree_,
+                                                leaf_problem,
+                                                leaf_solution,
+                                                bnb_worker_type_t::BEST_FIRST,
+                                                lp_status,
+                                                settings_.log);
+
+    recompute_bounds_and_basis = node_status != node_status_t::HAS_CHILDREN;
 
-    } else if (has_children(status)) {
+    if (node_status == node_status_t::HAS_CHILDREN) {
       // The stack should only contain the children of the current parent.
       // If the stack size is greater than 0,
       // we pop the current node from the stack and place it in the global heap,
@@ -973,33 +1049,12 @@ void branch_and_bound_t<i_t, f_t>::explore_subtree(i_t task_id,
       if (stack.size() > 0) {
         mip_node_t<i_t, f_t>* node = stack.back();
         stack.pop_back();
-
-        // The order here matters. We want to create a copy of the node
-        // before adding to the global heap. Otherwise,
-        // some thread may consume the node (possibly fathoming it)
-        // before we had the chance to add to the diving queue.
-        // This lead to a SIGSEGV. Although, in this case, it
-        // would be better if we discard the node instead.
-        if (get_heap_size() > settings_.num_bfs_threads) {
-          std::vector<f_t> lower = original_lp_.lower;
-          std::vector<f_t> upper = original_lp_.upper;
-          std::fill(
-            node_presolver.bounds_changed.begin(), node_presolver.bounds_changed.end(), false);
-          node->get_variable_bounds(lower, upper, node_presolver.bounds_changed);
-
-          mutex_dive_queue_.lock();
-          diving_queue_.emplace(node->detach_copy(), std::move(lower), std::move(upper));
-          mutex_dive_queue_.unlock();
-        }
-
-        mutex_heap_.lock();
-        heap_.push(node);
-        mutex_heap_.unlock();
+        node_queue_.push(node);
       }
 
       exploration_stats_.nodes_unexplored += 2;
 
-      if (status == node_solve_info_t::UP_CHILD_FIRST) {
+      if (round_dir == rounding_direction_t::UP) {
         stack.push_front(node_ptr->get_down_child());
         stack.push_front(node_ptr->get_up_child());
       } else {
@@ -1011,177 +1066,220 @@ void branch_and_bound_t<i_t, f_t>::explore_subtree(i_t task_id,
 }
 
 template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::best_first_thread(i_t task_id,
-                                                     search_tree_t<i_t, f_t>& search_tree,
-                                                     const csr_matrix_t<i_t, f_t>& Arow)
+void branch_and_bound_t<i_t, f_t>::best_first_thread(i_t task_id)
 {
   f_t lower_bound = -inf;
-  f_t upper_bound = inf;
   f_t abs_gap     = inf;
   f_t rel_gap     = inf;
 
   // Make a copy of the original LP. We will modify its bounds at each leaf
   lp_problem_t<i_t, f_t> leaf_problem = original_lp_;
   std::vector<char> row_sense;
-  bounds_strengthening_t<i_t, f_t> node_presolver(leaf_problem, Arow, row_sense, var_types_);
+  bounds_strengthening_t<i_t, f_t> node_presolver(leaf_problem, Arow_, row_sense, var_types_);
 
   const i_t m = leaf_problem.num_rows;
   basis_update_mpf_t<i_t, f_t> basis_factors(m, settings_.refactor_frequency);
   std::vector<i_t> basic_list(m);
   std::vector<i_t> nonbasic_list;
 
-  while (solver_status_ == mip_exploration_status_t::RUNNING &&
-         abs_gap > settings_.absolute_mip_gap_tol && rel_gap > settings_.relative_mip_gap_tol &&
-         (active_subtrees_ > 0 || get_heap_size() > 0)) {
-    mip_node_t<i_t, f_t>* start_node = nullptr;
-
+  while (solver_status_ == mip_status_t::UNSET && abs_gap > settings_.absolute_mip_gap_tol &&
+         rel_gap > settings_.relative_mip_gap_tol &&
+         (active_subtrees_ > 0 || node_queue_.best_first_queue_size() > 0)) {
+    // In the current implementation, we are use the active number of subtree to decide
+    // when to stop the execution. We need to increment the counter at the same
+    // time as we pop a node from the queue to avoid some threads exiting
+    // the main loop thinking that the solver has already finished.
+    // This will be not needed in the master-worker model.
+    node_queue_.lock();
     // If there any node left in the heap, we pop the top node and explore it.
-    mutex_heap_.lock();
-    if (heap_.size() > 0) {
-      start_node = heap_.top();
-      heap_.pop();
-      active_subtrees_++;
-    }
-    mutex_heap_.unlock();
+    std::optional<mip_node_t<i_t, f_t>*> start_node = node_queue_.pop_best_first();
+    if (start_node.has_value()) { active_subtrees_++; };
+    node_queue_.unlock();
 
-    if (start_node != nullptr) {
-      if (get_upper_bound() < start_node->lower_bound) {
+    if (start_node.has_value()) {
+      if (upper_bound_ < start_node.value()->lower_bound) {
         // This node was put on the heap earlier but its lower bound is now greater than the
         // current upper bound
-        search_tree.graphviz_node(settings_.log, start_node, "cutoff", start_node->lower_bound);
-        search_tree.update(start_node, node_status_t::FATHOMED);
+        search_tree_.graphviz_node(
+          settings_.log, start_node.value(), "cutoff", start_node.value()->lower_bound);
+        search_tree_.update(start_node.value(), node_status_t::FATHOMED);
         active_subtrees_--;
         continue;
       }
 
       // Best-first search with plunging
-      explore_subtree(task_id,
-                      start_node,
-                      search_tree,
-                      leaf_problem,
-                      node_presolver,
-                      basis_factors,
-                      basic_list,
-                      nonbasic_list);
+      plunge_from(task_id,
+                  start_node.value(),
+                  leaf_problem,
+                  node_presolver,
+                  basis_factors,
+                  basic_list,
+                  nonbasic_list);
 
       active_subtrees_--;
     }
 
     lower_bound = get_lower_bound();
-    upper_bound = get_upper_bound();
-    abs_gap     = upper_bound - lower_bound;
-    rel_gap     = user_relative_gap(original_lp_, upper_bound, lower_bound);
+    abs_gap     = upper_bound_ - lower_bound;
+    rel_gap     = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound);
   }
 
+  is_running = false;
+
   // Check if it is the last thread that exited the loop and no
   // timeout or numerical error has happen.
-  if (solver_status_ == mip_exploration_status_t::RUNNING) {
-    if (active_subtrees_ == 0) {
-      solver_status_ = mip_exploration_status_t::COMPLETED;
-    } else {
-      local_lower_bounds_[task_id] = inf;
-    }
+  if (solver_status_ == mip_status_t::UNSET) {
+    if (active_subtrees_ > 0) { local_lower_bounds_[task_id] = inf; }
   }
 }
 
 template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::diving_thread(const csr_matrix_t<i_t, f_t>& Arow)
+void branch_and_bound_t<i_t, f_t>::dive_from(mip_node_t<i_t, f_t>& start_node,
+                                             const std::vector<f_t>& start_lower,
+                                             const std::vector<f_t>& start_upper,
+                                             lp_problem_t<i_t, f_t>& leaf_problem,
+                                             bounds_strengthening_t<i_t, f_t>& node_presolver,
+                                             basis_update_mpf_t<i_t, f_t>& basis_factors,
+                                             std::vector<i_t>& basic_list,
+                                             std::vector<i_t>& nonbasic_list,
+                                             bnb_worker_type_t diving_type)
 {
   logger_t log;
   log.log = false;
+
+  const i_t diving_node_limit      = settings_.diving_settings.node_limit;
+  const i_t diving_backtrack_limit = settings_.diving_settings.backtrack_limit;
+  bool recompute_bounds_and_basis  = true;
+  search_tree_t<i_t, f_t> dive_tree(std::move(start_node));
+  std::deque<mip_node_t<i_t, f_t>*> stack;
+  stack.push_front(&dive_tree.root);
+
+  bnb_stats_t<i_t, f_t> dive_stats;
+  dive_stats.total_lp_iters      = 0;
+  dive_stats.total_lp_solve_time = 0;
+  dive_stats.nodes_explored      = 0;
+  dive_stats.nodes_unexplored    = 1;
+
+  while (stack.size() > 0 && solver_status_ == mip_status_t::UNSET && is_running) {
+    mip_node_t<i_t, f_t>* node_ptr = stack.front();
+    stack.pop_front();
+
+    f_t lower_bound = node_ptr->lower_bound;
+    f_t upper_bound = upper_bound_;
+    f_t rel_gap     = user_relative_gap(original_lp_, upper_bound, lower_bound);
+
+    if (node_ptr->lower_bound > upper_bound || rel_gap < settings_.relative_mip_gap_tol) {
+      recompute_bounds_and_basis = true;
+      continue;
+    }
+
+    if (toc(exploration_stats_.start_time) > settings_.time_limit) { break; }
+    if (dive_stats.nodes_explored > diving_node_limit) { break; }
+
+    lp_solution_t<i_t, f_t> leaf_solution(leaf_problem.num_rows, leaf_problem.num_cols);
+    dual::status_t lp_status = solve_node_lp(node_ptr,
+                                             leaf_problem,
+                                             leaf_solution,
+                                             basis_factors,
+                                             basic_list,
+                                             nonbasic_list,
+                                             node_presolver,
+                                             diving_type,
+                                             recompute_bounds_and_basis,
+                                             start_lower,
+                                             start_upper,
+                                             dive_stats,
+                                             log);
+
+    if (lp_status == dual::status_t::TIME_LIMIT) {
+      solver_status_ = mip_status_t::TIME_LIMIT;
+      break;
+    } else if (lp_status == dual::status_t::ITERATION_LIMIT) {
+      break;
+    }
+
+    ++dive_stats.nodes_explored;
+
+    auto [node_status, round_dir] =
+      update_tree(node_ptr, dive_tree, leaf_problem, leaf_solution, diving_type, lp_status, log);
+    recompute_bounds_and_basis = node_status != node_status_t::HAS_CHILDREN;
+
+    if (node_status == node_status_t::HAS_CHILDREN) {
+      if (round_dir == rounding_direction_t::UP) {
+        stack.push_front(node_ptr->get_down_child());
+        stack.push_front(node_ptr->get_up_child());
+      } else {
+        stack.push_front(node_ptr->get_up_child());
+        stack.push_front(node_ptr->get_down_child());
+      }
+    }
+
+    // Remove nodes that we no longer can backtrack to (i.e., from the current node, we can only
+    // backtrack to a node that is has a depth of at most 5 levels lower than the current node).
+    if (stack.size() > 1 && stack.front()->depth - stack.back()->depth > diving_backtrack_limit) {
+      stack.pop_back();
+    }
+  }
+}
+
+template <typename i_t, typename f_t>
+void branch_and_bound_t<i_t, f_t>::diving_thread(bnb_worker_type_t diving_type)
+{
   // Make a copy of the original LP. We will modify its bounds at each leaf
   lp_problem_t<i_t, f_t> leaf_problem = original_lp_;
   std::vector<char> row_sense;
-  bounds_strengthening_t<i_t, f_t> node_presolver(leaf_problem, Arow, row_sense, var_types_);
+  bounds_strengthening_t<i_t, f_t> node_presolver(leaf_problem, Arow_, row_sense, var_types_);
 
   const i_t m = leaf_problem.num_rows;
   basis_update_mpf_t<i_t, f_t> basis_factors(m, settings_.refactor_frequency);
   std::vector<i_t> basic_list(m);
   std::vector<i_t> nonbasic_list;
 
-  while (solver_status_ == mip_exploration_status_t::RUNNING &&
-         (active_subtrees_ > 0 || get_heap_size() > 0)) {
-    std::optional<diving_root_t<i_t, f_t>> start_node;
+  std::vector<f_t> start_lower;
+  std::vector<f_t> start_upper;
+  bool reset_starting_bounds = true;
+
+  while (solver_status_ == mip_status_t::UNSET && is_running &&
+         (active_subtrees_ > 0 || node_queue_.best_first_queue_size() > 0)) {
+    if (reset_starting_bounds) {
+      start_lower = original_lp_.lower;
+      start_upper = original_lp_.upper;
+      std::fill(node_presolver.bounds_changed.begin(), node_presolver.bounds_changed.end(), false);
+      reset_starting_bounds = false;
+    }
 
-    mutex_dive_queue_.lock();
-    if (diving_queue_.size() > 0) { start_node = diving_queue_.pop(); }
-    mutex_dive_queue_.unlock();
+    // In the current implementation, multiple threads can pop the nodes
+    // from the queue, so we need to initialize the lower and upper bound here
+    // to avoid other thread fathoming the node (i.e., deleting) before we can read
+    // the variable bounds from the tree.
+    // This will be not needed in the master-worker model.
+    node_queue_.lock();
+    std::optional<mip_node_t<i_t, f_t>*> node_ptr  = node_queue_.pop_diving();
+    std::optional<mip_node_t<i_t, f_t>> start_node = std::nullopt;
+
+    if (node_ptr.has_value()) {
+      node_ptr.value()->get_variable_bounds(
+        start_lower, start_upper, node_presolver.bounds_changed);
+      start_node = node_ptr.value()->detach_copy();
+    }
+    node_queue_.unlock();
 
     if (start_node.has_value()) {
-      if (get_upper_bound() < start_node->node.lower_bound) { continue; }
-
-      bool recompute_bounds_and_basis = true;
-      i_t nodes_explored              = 0;
-      search_tree_t<i_t, f_t> subtree(std::move(start_node->node));
-      std::deque<mip_node_t<i_t, f_t>*> stack;
-      stack.push_front(&subtree.root);
-
-      while (stack.size() > 0 && solver_status_ == mip_exploration_status_t::RUNNING) {
-        mip_node_t<i_t, f_t>* node_ptr = stack.front();
-        stack.pop_front();
-        f_t upper_bound = get_upper_bound();
-        f_t rel_gap     = user_relative_gap(original_lp_, upper_bound, node_ptr->lower_bound);
-
-        if (node_ptr->lower_bound > upper_bound || rel_gap < settings_.relative_mip_gap_tol) {
-          recompute_bounds_and_basis = true;
-          continue;
-        }
-
-        if (toc(exploration_stats_.start_time) > settings_.time_limit) { return; }
-
-        if (nodes_explored >= 1000) { break; }
-
-        node_solve_info_t status = solve_node(node_ptr,
-                                              subtree,
-                                              leaf_problem,
-                                              basis_factors,
-                                              basic_list,
-                                              nonbasic_list,
-                                              node_presolver,
-                                              thread_type_t::DIVING,
-                                              recompute_bounds_and_basis,
-                                              start_node->lower,
-                                              start_node->upper,
-                                              log);
-
-        nodes_explored++;
-
-        recompute_bounds_and_basis = !has_children(status);
-
-        if (status == node_solve_info_t::TIME_LIMIT) {
-          solver_status_ = mip_exploration_status_t::TIME_LIMIT;
-          return;
-
-        } else if (has_children(status)) {
-          if (status == node_solve_info_t::UP_CHILD_FIRST) {
-            stack.push_front(node_ptr->get_down_child());
-            stack.push_front(node_ptr->get_up_child());
-          } else {
-            stack.push_front(node_ptr->get_up_child());
-            stack.push_front(node_ptr->get_down_child());
-          }
-        }
-
-        if (stack.size() > 1) {
-          // If the diving thread is consuming the nodes faster than the
-          // best first search, then we split the current subtree at the
-          // lowest possible point and move to the queue, so it can
-          // be picked by another thread.
-          if (std::lock_guard<omp_mutex_t> lock(mutex_dive_queue_);
-              diving_queue_.size() < min_diving_queue_size_) {
-            mip_node_t<i_t, f_t>* new_node = stack.back();
-            stack.pop_back();
-
-            std::vector<f_t> lower = start_node->lower;
-            std::vector<f_t> upper = start_node->upper;
-            std::fill(
-              node_presolver.bounds_changed.begin(), node_presolver.bounds_changed.end(), false);
-            new_node->get_variable_bounds(lower, upper, node_presolver.bounds_changed);
-
-            diving_queue_.emplace(new_node->detach_copy(), std::move(lower), std::move(upper));
-          }
-        }
-      }
+      reset_starting_bounds = true;
+
+      if (upper_bound_ < start_node->lower_bound) { continue; }
+      bool is_feasible = node_presolver.bounds_strengthening(start_lower, start_upper, settings_);
+      if (!is_feasible) { continue; }
+
+      dive_from(start_node.value(),
+                start_lower,
+                start_upper,
+                leaf_problem,
+                node_presolver,
+                basis_factors,
+                basic_list,
+                nonbasic_list,
+                diving_type);
     }
   }
 }
@@ -1267,9 +1365,35 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
   logger_t log;
   log.log                             = false;
   log.log_prefix                      = settings_.log.log_prefix;
-  solver_status_                      = mip_exploration_status_t::UNSET;
+  solver_status_                      = mip_status_t::UNSET;
+  is_running                          = false;
   exploration_stats_.nodes_unexplored = 0;
   exploration_stats_.nodes_explored   = 0;
+  original_lp_.A.to_compressed_row(Arow_);
+
+  std::vector<bnb_worker_type_t> diving_strategies;
+  diving_strategies.reserve(4);
+
+  if (settings_.diving_settings.pseudocost_diving != 0) {
+    diving_strategies.push_back(bnb_worker_type_t::PSEUDOCOST_DIVING);
+  }
+
+  if (settings_.diving_settings.line_search_diving != 0) {
+    diving_strategies.push_back(bnb_worker_type_t::LINE_SEARCH_DIVING);
+  }
+
+  if (settings_.diving_settings.guided_diving != 0) {
+    diving_strategies.push_back(bnb_worker_type_t::GUIDED_DIVING);
+  }
+
+  if (settings_.diving_settings.coefficient_diving != 0) {
+    diving_strategies.push_back(bnb_worker_type_t::COEFFICIENT_DIVING);
+    calculate_variable_locks(original_lp_, var_up_locks_, var_down_locks_);
+  }
+
+  if (diving_strategies.empty()) {
+    settings_.log.printf("Warning: All diving heuristics are disabled!\n");
+  }
 
   if (guess_.size() != 0) {
     std::vector<f_t> crushed_guess;
@@ -1331,15 +1455,16 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
   }
 
   if (root_status == lp_status_t::TIME_LIMIT) {
-    solver_status_ = mip_exploration_status_t::TIME_LIMIT;
-    return set_final_solution(solution, -inf);
+    solver_status_ = mip_status_t::TIME_LIMIT;
+    set_final_solution(solution, -inf);
+    return solver_status_;
   }
 
   assert(root_vstatus_.size() == original_lp_.num_cols);
   set_uninitialized_steepest_edge_norms<i_t, f_t>(edge_norms_);
 
   root_objective_ = compute_objective(original_lp_, root_relax_soln_.x);
-  local_lower_bounds_.assign(settings_.num_bfs_threads, root_objective_);
+  local_lower_bounds_.assign(settings_.num_bfs_workers, root_objective_);
 
   if (settings_.set_simplex_solution_callback != nullptr) {
     std::vector<f_t> original_x;
@@ -1397,8 +1522,9 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
                              pc_);
 
   if (toc(exploration_stats_.start_time) > settings_.time_limit) {
-    solver_status_ = mip_exploration_status_t::TIME_LIMIT;
-    return set_final_solution(solution, root_objective_);
+    solver_status_ = mip_status_t::TIME_LIMIT;
+    set_final_solution(solution, root_objective_);
+    return solver_status_;
   }
 
   // Choose variable to branch on
@@ -1414,61 +1540,62 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
                       original_lp_,
                       log);
 
-  csr_matrix_t<i_t, f_t> Arow(1, 1, 0);
-  original_lp_.A.to_compressed_row(Arow);
-
   settings_.log.printf("Exploring the B&B tree using %d threads (best-first = %d, diving = %d)\n",
                        settings_.num_threads,
-                       settings_.num_bfs_threads,
-                       settings_.num_diving_threads);
-
-  settings_.log.printf(
-    "  | Explored | Unexplored |    Objective    |     Bound     | Depth | Iter/Node |   Gap    "
-    "|  Time  |\n");
+                       settings_.num_bfs_workers,
+                       settings_.num_threads - settings_.num_bfs_workers);
 
-  exploration_stats_.nodes_explored       = 1;
+  exploration_stats_.nodes_explored       = 0;
   exploration_stats_.nodes_unexplored     = 2;
   exploration_stats_.nodes_since_last_log = 0;
   exploration_stats_.last_log             = tic();
   active_subtrees_                        = 0;
-  min_diving_queue_size_                  = 4 * settings_.num_diving_threads;
-  solver_status_                          = mip_exploration_status_t::RUNNING;
+  is_running                              = true;
   lower_bound_ceiling_                    = inf;
   should_report_                          = true;
 
+  settings_.log.printf(
+    "  | Explored | Unexplored |    Objective    |     Bound     | Depth | Iter/Node |   Gap    "
+    "|  Time  |\n");
+
 #pragma omp parallel num_threads(settings_.num_threads)
   {
 #pragma omp master
     {
-      auto down_child  = search_tree_.root.get_down_child();
-      auto up_child    = search_tree_.root.get_up_child();
-      i_t initial_size = 2 * settings_.num_threads;
+      auto down_child          = search_tree_.root.get_down_child();
+      auto up_child            = search_tree_.root.get_up_child();
+      i_t initial_size         = 2 * settings_.num_threads;
+      const i_t num_strategies = diving_strategies.size();
 
+#pragma omp taskgroup
+      {
 #pragma omp task
-      exploration_ramp_up(down_child, &search_tree_, Arow, initial_size);
+        exploration_ramp_up(down_child, initial_size);
 
 #pragma omp task
-      exploration_ramp_up(up_child, &search_tree_, Arow, initial_size);
-    }
-
-#pragma omp barrier
+        exploration_ramp_up(up_child, initial_size);
+      }
 
-#pragma omp master
-    {
-      for (i_t i = 0; i < settings_.num_bfs_threads; i++) {
+      for (i_t i = 0; i < settings_.num_bfs_workers; i++) {
 #pragma omp task
-        best_first_thread(i, search_tree_, Arow);
+        best_first_thread(i);
       }
 
-      for (i_t i = 0; i < settings_.num_diving_threads; i++) {
+      if (!diving_strategies.empty()) {
+        for (i_t k = 0; k < settings_.diving_settings.num_diving_workers; k++) {
+          const bnb_worker_type_t diving_type = diving_strategies[k % num_strategies];
 #pragma omp task
-        diving_thread(Arow);
+          diving_thread(diving_type);
+        }
       }
     }
   }
 
-  f_t lower_bound = heap_.size() > 0 ? heap_.top()->lower_bound : search_tree_.root.lower_bound;
-  return set_final_solution(solution, lower_bound);
+  is_running      = false;
+  f_t lower_bound = node_queue_.best_first_queue_size() > 0 ? node_queue_.get_lower_bound()
+                                                            : search_tree_.root.lower_bound;
+  set_final_solution(solution, lower_bound);
+  return solver_status_;
 }
 
 #ifdef DUAL_SIMPLEX_INSTANTIATE_DOUBLE
diff --git a/cpp/src/dual_simplex/branch_and_bound.hpp b/cpp/src/dual_simplex/branch_and_bound.hpp
index c71265729..dac1ab393 100644
--- a/cpp/src/dual_simplex/branch_and_bound.hpp
+++ b/cpp/src/dual_simplex/branch_and_bound.hpp
@@ -7,9 +7,10 @@
 
 #pragma once
 
-#include <dual_simplex/diving_queue.hpp>
+#include <dual_simplex/diving_heuristics.hpp>
 #include <dual_simplex/initial_basis.hpp>
 #include <dual_simplex/mip_node.hpp>
+#include <dual_simplex/node_queue.hpp>
 #include <dual_simplex/phase2.hpp>
 #include <dual_simplex/pseudo_costs.hpp>
 #include <dual_simplex/simplex_solver_settings.hpp>
@@ -20,7 +21,6 @@
 #include <utilities/omp_helpers.hpp>
 
 #include <omp.h>
-#include <queue>
 #include <vector>
 
 namespace cuopt::linear_programming::dual_simplex {
@@ -35,31 +35,17 @@ enum class mip_status_t {
   UNSET      = 6,  // The status is not set
 };
 
-enum class mip_exploration_status_t {
-  UNSET      = 0,  // The status is not set
-  TIME_LIMIT = 1,  // The solver reached a time limit
-  NODE_LIMIT = 2,  // The maximum number of nodes was reached (not implemented)
-  NUMERICAL  = 3,  // The solver encountered a numerical error
-  RUNNING    = 4,  // The solver is currently exploring the tree
-  COMPLETED  = 5,  // The solver finished exploring the tree
-};
-
-enum class node_solve_info_t {
-  NO_CHILDREN      = 0,  // The node does not produced children
-  UP_CHILD_FIRST   = 1,  // The up child should be explored first
-  DOWN_CHILD_FIRST = 2,  // The down child should be explored first
-  TIME_LIMIT       = 3,  // The solver reached a time limit
-  ITERATION_LIMIT  = 4,  // The solver reached a iteration limit
-  NUMERICAL        = 5   // The solver encounter a numerical error when solving the node
-};
-
-// Indicate the search and variable selection algorithms used by the thread (See [1]).
+// Indicate the search and variable selection algorithms used by each thread
+// in B&B (See [1]).
 //
 // [1] T. Achterberg, “Constraint Integer Programming,” PhD, Technischen Universität Berlin,
 // Berlin, 2007. doi: 10.14279/depositonce-1634.
-enum class thread_type_t {
-  EXPLORATION = 0,  // Best-First + Plunging. Pseudocost branching + Martin's criteria.
-  DIVING      = 1,
+enum class bnb_worker_type_t {
+  BEST_FIRST         = 0,  // Best-First + Plunging.
+  PSEUDOCOST_DIVING  = 1,  // Pseudocost diving (9.2.5)
+  LINE_SEARCH_DIVING = 2,  // Line search diving (9.2.4)
+  GUIDED_DIVING = 3,  // Guided diving (9.2.3). If no incumbent is found yet, use pseudocost diving.
+  COEFFICIENT_DIVING = 4  // Coefficient diving (9.2.1)
 };
 
 template <typename i_t, typename f_t>
@@ -68,12 +54,22 @@ class bounds_strengthening_t;
 template <typename i_t, typename f_t>
 void upper_bound_callback(f_t upper_bound);
 
+template <typename i_t, typename f_t>
+struct bnb_stats_t {
+  f_t start_time                        = 0.0;
+  omp_atomic_t<f_t> total_lp_solve_time = 0.0;
+  omp_atomic_t<i_t> nodes_explored      = 0;
+  omp_atomic_t<i_t> nodes_unexplored    = 0;
+  omp_atomic_t<f_t> total_lp_iters      = 0;
+
+  // This should only be used by the main thread
+  omp_atomic_t<f_t> last_log             = 0.0;
+  omp_atomic_t<i_t> nodes_since_last_log = 0;
+};
+
 template <typename i_t, typename f_t>
 class branch_and_bound_t {
  public:
-  template <typename T>
-  using mip_node_heap_t = std::priority_queue<T, std::vector<T>, node_compare_t<i_t, f_t>>;
-
   branch_and_bound_t(const user_problem_t<i_t, f_t>& user_problem,
                      const simplex_solver_settings_t<i_t, f_t>& solver_settings);
 
@@ -109,9 +105,7 @@ class branch_and_bound_t {
                        f_t& repaired_obj,
                        std::vector<f_t>& repaired_solution) const;
 
-  f_t get_upper_bound();
   f_t get_lower_bound();
-  i_t get_heap_size();
   bool enable_concurrent_lp_root_solve() const { return enable_concurrent_lp_root_solve_; }
   std::atomic<int>* get_root_concurrent_halt() { return &root_concurrent_halt_; }
   void set_root_concurrent_halt(int value) { root_concurrent_halt_ = value; }
@@ -128,10 +122,17 @@ class branch_and_bound_t {
   std::vector<f_t> guess_;
 
   // LP relaxation
+  csr_matrix_t<i_t, f_t> Arow_;
   lp_problem_t<i_t, f_t> original_lp_;
   std::vector<i_t> new_slacks_;
   std::vector<variable_type_t> var_types_;
 
+  // Variable locks (see definition 3.3 from T. Achterberg, “Constraint Integer Programming,”
+  // PhD, Technischen Universität Berlin, Berlin, 2007. doi: 10.14279/depositonce-1634).
+  // Here we assume that the constraints are in the form `Ax = b, l <= x <= u`.
+  std::vector<i_t> var_up_locks_;
+  std::vector<i_t> var_down_locks_;
+
   // Local lower bounds for each thread
   std::vector<omp_atomic_t<f_t>> local_lower_bounds_;
 
@@ -139,23 +140,13 @@ class branch_and_bound_t {
   omp_mutex_t mutex_upper_;
 
   // Global variable for upper bound
-  f_t upper_bound_;
+  omp_atomic_t<f_t> upper_bound_;
 
   // Global variable for incumbent. The incumbent should be updated with the upper bound
   mip_solution_t<i_t, f_t> incumbent_;
 
   // Structure with the general info of the solver.
-  struct stats_t {
-    f_t start_time                        = 0.0;
-    omp_atomic_t<f_t> total_lp_solve_time = 0.0;
-    omp_atomic_t<i_t> nodes_explored      = 0;
-    omp_atomic_t<i_t> nodes_unexplored    = 0;
-    omp_atomic_t<f_t> total_lp_iters      = 0;
-
-    // This should only be used by the main thread
-    omp_atomic_t<f_t> last_log             = 0.0;
-    omp_atomic_t<i_t> nodes_since_last_log = 0;
-  } exploration_stats_;
+  bnb_stats_t<i_t, f_t> exploration_stats_;
 
   // Mutex for repair
   omp_mutex_t mutex_repair_;
@@ -175,9 +166,8 @@ class branch_and_bound_t {
   // Pseudocosts
   pseudo_costs_t<i_t, f_t> pc_;
 
-  // Heap storing the nodes to be explored.
-  omp_mutex_t mutex_heap_;
-  mip_node_heap_t<mip_node_t<i_t, f_t>*> heap_;
+  // Heap storing the nodes waiting to be explored.
+  node_queue_t<i_t, f_t> node_queue_;
 
   // Search tree
   search_tree_t<i_t, f_t> search_tree_;
@@ -185,13 +175,9 @@ class branch_and_bound_t {
   // Count the number of subtrees that are currently being explored.
   omp_atomic_t<i_t> active_subtrees_;
 
-  // Queue for storing the promising node for performing dives.
-  omp_mutex_t mutex_dive_queue_;
-  diving_queue_t<i_t, f_t> diving_queue_;
-  i_t min_diving_queue_size_;
-
   // Global status of the solver.
-  omp_atomic_t<mip_exploration_status_t> solver_status_;
+  omp_atomic_t<mip_status_t> solver_status_;
+  omp_atomic_t<bool> is_running{false};
 
   omp_atomic_t<bool> should_report_;
 
@@ -200,64 +186,87 @@ class branch_and_bound_t {
   omp_atomic_t<f_t> lower_bound_ceiling_;
 
   void report_heuristic(f_t obj);
-  void report(std::string symbol, f_t obj, f_t lower_bound, i_t node_depth);
+  void report(char symbol, f_t obj, f_t lower_bound, i_t node_depth);
 
   // Set the final solution.
-  mip_status_t set_final_solution(mip_solution_t<i_t, f_t>& solution, f_t lower_bound);
+  void set_final_solution(mip_solution_t<i_t, f_t>& solution, f_t lower_bound);
 
   // Update the incumbent solution with the new feasible solution
   // found during branch and bound.
   void add_feasible_solution(f_t leaf_objective,
                              const std::vector<f_t>& leaf_solution,
                              i_t leaf_depth,
-                             thread_type_t thread_type);
+                             bnb_worker_type_t thread_type);
 
   // Repairs low-quality solutions from the heuristics, if it is applicable.
   void repair_heuristic_solutions();
 
   // Ramp-up phase of the solver, where we greedily expand the tree until
   // there is enough unexplored nodes. This is done recursively using OpenMP tasks.
-  void exploration_ramp_up(mip_node_t<i_t, f_t>* node,
-                           search_tree_t<i_t, f_t>* search_tree,
-                           const csr_matrix_t<i_t, f_t>& Arow,
-                           i_t initial_heap_size);
-
-  // Explore the search tree using the best-first search with plunging strategy.
-  void explore_subtree(i_t task_id,
-                       mip_node_t<i_t, f_t>* start_node,
-                       search_tree_t<i_t, f_t>& search_tree,
-                       lp_problem_t<i_t, f_t>& leaf_problem,
-                       bounds_strengthening_t<i_t, f_t>& node_presolver,
-                       basis_update_mpf_t<i_t, f_t>& basis_update,
-                       std::vector<i_t>& basic_list,
-                       std::vector<i_t>& nonbasic_list);
+  void exploration_ramp_up(mip_node_t<i_t, f_t>* node, i_t initial_heap_size);
+
+  // We use best-first to pick the `start_node` and then perform a depth-first search
+  // from this node (i.e., a plunge). It can only backtrack to a sibling node.
+  // Unexplored nodes in the subtree are inserted back into the global heap.
+  void plunge_from(i_t task_id,
+                   mip_node_t<i_t, f_t>* start_node,
+                   lp_problem_t<i_t, f_t>& leaf_problem,
+                   bounds_strengthening_t<i_t, f_t>& node_presolver,
+                   basis_update_mpf_t<i_t, f_t>& basis_update,
+                   std::vector<i_t>& basic_list,
+                   std::vector<i_t>& nonbasic_list);
 
   // Each "main" thread pops a node from the global heap and then performs a plunge
   // (i.e., a shallow dive) into the subtree determined by the node.
-  void best_first_thread(i_t task_id,
-                         search_tree_t<i_t, f_t>& search_tree,
-                         const csr_matrix_t<i_t, f_t>& Arow);
+  void best_first_thread(i_t task_id);
+
+  // Perform a deep dive in the subtree determined by the `start_node` in order
+  // to find integer feasible solutions.
+  void dive_from(mip_node_t<i_t, f_t>& start_node,
+                 const std::vector<f_t>& start_lower,
+                 const std::vector<f_t>& start_upper,
+                 lp_problem_t<i_t, f_t>& leaf_problem,
+                 bounds_strengthening_t<i_t, f_t>& node_presolver,
+                 basis_update_mpf_t<i_t, f_t>& basis_update,
+                 std::vector<i_t>& basic_list,
+                 std::vector<i_t>& nonbasic_list,
+                 bnb_worker_type_t diving_type);
 
   // Each diving thread pops the first node from the dive queue and then performs
   // a deep dive into the subtree determined by the node.
-  void diving_thread(const csr_matrix_t<i_t, f_t>& Arow);
+  void diving_thread(bnb_worker_type_t diving_type);
 
-  // Solve the LP relaxation of a leaf node and update the tree.
-  node_solve_info_t solve_node(mip_node_t<i_t, f_t>* node_ptr,
-                               search_tree_t<i_t, f_t>& search_tree,
+  // Solve the LP relaxation of a leaf node
+  dual::status_t solve_node_lp(mip_node_t<i_t, f_t>* node_ptr,
                                lp_problem_t<i_t, f_t>& leaf_problem,
+                               lp_solution_t<i_t, f_t>& leaf_solution,
                                basis_update_mpf_t<i_t, f_t>& basis_factors,
                                std::vector<i_t>& basic_list,
                                std::vector<i_t>& nonbasic_list,
                                bounds_strengthening_t<i_t, f_t>& node_presolver,
-                               thread_type_t thread_type,
-                               bool recompute_basis_and_bounds,
+                               bnb_worker_type_t thread_type,
+                               bool recompute_bounds_and_basis,
                                const std::vector<f_t>& root_lower,
                                const std::vector<f_t>& root_upper,
+                               bnb_stats_t<i_t, f_t>& stats,
                                logger_t& log);
 
-  // Sort the children based on the Martin's criteria.
-  rounding_direction_t child_selection(mip_node_t<i_t, f_t>* node_ptr);
+  // Update the tree based on the LP relaxation. Returns the status
+  // of the node and, if appropriated, the preferred rounding direction
+  // when visiting the children.
+  std::pair<node_status_t, rounding_direction_t> update_tree(mip_node_t<i_t, f_t>* node_ptr,
+                                                             search_tree_t<i_t, f_t>& search_tree,
+                                                             lp_problem_t<i_t, f_t>& leaf_problem,
+                                                             lp_solution_t<i_t, f_t>& leaf_solution,
+                                                             bnb_worker_type_t thread_type,
+                                                             dual::status_t lp_status,
+                                                             logger_t& log);
+
+  // Selects the variable to branch on.
+  branch_variable_t<i_t> variable_selection(mip_node_t<i_t, f_t>* node_ptr,
+                                            const std::vector<i_t>& fractional,
+                                            const std::vector<f_t>& solution,
+                                            bnb_worker_type_t type);
 };
 
 }  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/crossover.cpp b/cpp/src/dual_simplex/crossover.cpp
index 23d9a0e8e..8ee3fb0ce 100644
--- a/cpp/src/dual_simplex/crossover.cpp
+++ b/cpp/src/dual_simplex/crossover.cpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -785,8 +785,15 @@ i_t primal_push(const lp_problem_t<i_t, f_t>& lp,
           factorize_basis(lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed);
         if (rank != m) {
           settings.log.debug("Failed to factorize basis. rank %d m %d\n", rank, m);
-          basis_repair(
-            lp.A, settings, deficient, slacks_needed, basic_list, nonbasic_list, vstatus);
+          basis_repair(lp.A,
+                       settings,
+                       lp.lower,
+                       lp.upper,
+                       deficient,
+                       slacks_needed,
+                       basic_list,
+                       nonbasic_list,
+                       vstatus);
           if (factorize_basis(
                 lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed) == -1) {
             settings.log.printf("Failed to factorize basis after repair. rank %d m %d\n", rank, m);
@@ -1132,7 +1139,15 @@ crossover_status_t crossover(const lp_problem_t<i_t, f_t>& lp,
   rank = factorize_basis(lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed);
   if (rank != m) {
     settings.log.debug("Failed to factorize basis. rank %d m %d\n", rank, m);
-    basis_repair(lp.A, settings, deficient, slacks_needed, basic_list, nonbasic_list, vstatus);
+    basis_repair(lp.A,
+                 settings,
+                 lp.lower,
+                 lp.upper,
+                 deficient,
+                 slacks_needed,
+                 basic_list,
+                 nonbasic_list,
+                 vstatus);
     if (factorize_basis(lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed) ==
         -1) {
       settings.log.printf("Failed to factorize basis after repair. rank %d m %d\n", rank, m);
@@ -1323,7 +1338,15 @@ crossover_status_t crossover(const lp_problem_t<i_t, f_t>& lp,
         factorize_basis(lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed);
       if (rank != m) {
         settings.log.debug("Failed to factorize basis. rank %d m %d\n", rank, m);
-        basis_repair(lp.A, settings, deficient, slacks_needed, basic_list, nonbasic_list, vstatus);
+        basis_repair(lp.A,
+                     settings,
+                     lp.lower,
+                     lp.upper,
+                     deficient,
+                     slacks_needed,
+                     basic_list,
+                     nonbasic_list,
+                     vstatus);
         if (factorize_basis(
               lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed) == -1) {
           settings.log.printf("Failed to factorize basis after repair. rank %d m %d\n", rank, m);
diff --git a/cpp/src/dual_simplex/diving_heuristics.cpp b/cpp/src/dual_simplex/diving_heuristics.cpp
new file mode 100644
index 000000000..a56b4cce3
--- /dev/null
+++ b/cpp/src/dual_simplex/diving_heuristics.cpp
@@ -0,0 +1,306 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/* clang-format on */
+
+#include <dual_simplex/diving_heuristics.hpp>
+#include <tuple>
+
+namespace cuopt::linear_programming::dual_simplex {
+
+template <typename i_t, typename f_t>
+branch_variable_t<i_t> line_search_diving(const std::vector<i_t>& fractional,
+                                          const std::vector<f_t>& solution,
+                                          const std::vector<f_t>& root_solution,
+                                          logger_t& log)
+{
+  constexpr f_t eps              = 1e-6;
+  i_t branch_var                 = -1;
+  f_t min_score                  = std::numeric_limits<f_t>::max();
+  rounding_direction_t round_dir = rounding_direction_t::NONE;
+
+  for (i_t j : fractional) {
+    f_t score                = inf;
+    rounding_direction_t dir = rounding_direction_t::NONE;
+
+    if (solution[j] < root_solution[j] - eps) {
+      f_t f = solution[j] - std::floor(solution[j]);
+      f_t d = root_solution[j] - solution[j];
+      score = f / d;
+      dir   = rounding_direction_t::DOWN;
+
+    } else if (solution[j] > root_solution[j] + eps) {
+      f_t f = std::ceil(solution[j]) - solution[j];
+      f_t d = solution[j] - root_solution[j];
+      score = f / d;
+      dir   = rounding_direction_t::UP;
+    }
+
+    if (min_score > score) {
+      min_score  = score;
+      branch_var = j;
+      round_dir  = dir;
+    }
+  }
+
+  // If the current solution is equal to the root solution, arbitrarily
+  // set the branch variable to the first fractional variable and round it down
+  if (round_dir == rounding_direction_t::NONE) {
+    branch_var = fractional[0];
+    round_dir  = rounding_direction_t::DOWN;
+  }
+
+  assert(round_dir != rounding_direction_t::NONE);
+  assert(branch_var >= 0);
+
+  log.debug("Line search diving: selected var %d with val = %e, round dir = %d and score = %e\n",
+            branch_var,
+            solution[branch_var],
+            round_dir,
+            min_score);
+
+  return {branch_var, round_dir};
+}
+
+template <typename i_t, typename f_t>
+branch_variable_t<i_t> pseudocost_diving(pseudo_costs_t<i_t, f_t>& pc,
+                                         const std::vector<i_t>& fractional,
+                                         const std::vector<f_t>& solution,
+                                         const std::vector<f_t>& root_solution,
+                                         logger_t& log)
+{
+  std::lock_guard<omp_mutex_t> lock(pc.mutex);
+  i_t branch_var                 = -1;
+  f_t max_score                  = std::numeric_limits<f_t>::lowest();
+  rounding_direction_t round_dir = rounding_direction_t::NONE;
+  constexpr f_t eps              = 1e-6;
+
+  i_t num_initialized_down;
+  i_t num_initialized_up;
+  f_t pseudo_cost_down_avg;
+  f_t pseudo_cost_up_avg;
+  pc.initialized(
+    num_initialized_down, num_initialized_up, pseudo_cost_down_avg, pseudo_cost_up_avg);
+
+  for (i_t j : fractional) {
+    rounding_direction_t dir = rounding_direction_t::NONE;
+    f_t f_down               = solution[j] - std::floor(solution[j]);
+    f_t f_up                 = std::ceil(solution[j]) - solution[j];
+
+    f_t pc_down = pc.pseudo_cost_num_down[j] != 0
+                    ? pc.pseudo_cost_sum_down[j] / pc.pseudo_cost_num_down[j]
+                    : pseudo_cost_down_avg;
+
+    f_t pc_up = pc.pseudo_cost_num_up[j] != 0 ? pc.pseudo_cost_sum_up[j] / pc.pseudo_cost_num_up[j]
+                                              : pseudo_cost_up_avg;
+
+    f_t score_down = std::sqrt(f_up) * (1 + pc_up) / (1 + pc_down);
+    f_t score_up   = std::sqrt(f_down) * (1 + pc_down) / (1 + pc_up);
+    f_t score      = 0;
+
+    if (solution[j] < root_solution[j] - 0.4) {
+      score = score_down;
+      dir   = rounding_direction_t::DOWN;
+    } else if (solution[j] > root_solution[j] + 0.4) {
+      score = score_up;
+      dir   = rounding_direction_t::UP;
+    } else if (f_down < 0.3) {
+      score = score_down;
+      dir   = rounding_direction_t::DOWN;
+    } else if (f_down > 0.7) {
+      score = score_up;
+      dir   = rounding_direction_t::UP;
+    } else if (pc_down < pc_up + eps) {
+      score = score_down;
+      dir   = rounding_direction_t::DOWN;
+    } else {
+      score = score_up;
+      dir   = rounding_direction_t::UP;
+    }
+
+    if (score > max_score) {
+      max_score  = score;
+      branch_var = j;
+      round_dir  = dir;
+    }
+  }
+
+  assert(round_dir != rounding_direction_t::NONE);
+  assert(branch_var >= 0);
+
+  log.debug("Pseudocost diving: selected var %d with val = %e, round dir = %d and score = %e\n",
+            branch_var,
+            solution[branch_var],
+            round_dir,
+            max_score);
+
+  return {branch_var, round_dir};
+}
+
+template <typename i_t, typename f_t>
+branch_variable_t<i_t> guided_diving(pseudo_costs_t<i_t, f_t>& pc,
+                                     const std::vector<i_t>& fractional,
+                                     const std::vector<f_t>& solution,
+                                     const std::vector<f_t>& incumbent,
+                                     logger_t& log)
+{
+  std::lock_guard<omp_mutex_t> lock(pc.mutex);
+  i_t branch_var                 = -1;
+  f_t max_score                  = std::numeric_limits<f_t>::lowest();
+  rounding_direction_t round_dir = rounding_direction_t::NONE;
+  constexpr f_t eps              = 1e-6;
+
+  i_t num_initialized_down;
+  i_t num_initialized_up;
+  f_t pseudo_cost_down_avg;
+  f_t pseudo_cost_up_avg;
+  pc.initialized(
+    num_initialized_down, num_initialized_up, pseudo_cost_down_avg, pseudo_cost_up_avg);
+
+  for (i_t j : fractional) {
+    f_t f_down    = solution[j] - std::floor(solution[j]);
+    f_t f_up      = std::ceil(solution[j]) - solution[j];
+    f_t down_dist = std::abs(incumbent[j] - std::floor(solution[j]));
+    f_t up_dist   = std::abs(std::ceil(solution[j]) - incumbent[j]);
+    rounding_direction_t dir =
+      down_dist < up_dist + eps ? rounding_direction_t::DOWN : rounding_direction_t::UP;
+
+    f_t pc_down = pc.pseudo_cost_num_down[j] != 0
+                    ? pc.pseudo_cost_sum_down[j] / pc.pseudo_cost_num_down[j]
+                    : pseudo_cost_down_avg;
+
+    f_t pc_up = pc.pseudo_cost_num_up[j] != 0 ? pc.pseudo_cost_sum_up[j] / pc.pseudo_cost_num_up[j]
+                                              : pseudo_cost_up_avg;
+
+    f_t score1 = dir == rounding_direction_t::DOWN ? 5 * pc_down * f_down : 5 * pc_up * f_up;
+    f_t score2 = dir == rounding_direction_t::DOWN ? pc_up * f_up : pc_down * f_down;
+    f_t score  = (score1 + score2) / 6;
+
+    if (score > max_score) {
+      max_score  = score;
+      branch_var = j;
+      round_dir  = dir;
+    }
+  }
+
+  assert(round_dir != rounding_direction_t::NONE);
+  assert(branch_var >= 0);
+
+  log.debug("Guided diving: selected var %d with val = %e, round dir = %d and score = %e\n",
+            branch_var,
+            solution[branch_var],
+            round_dir,
+            max_score);
+
+  return {branch_var, round_dir};
+}
+
+template <typename i_t, typename f_t>
+void calculate_variable_locks(const lp_problem_t<i_t, f_t>& lp_problem,
+                              std::vector<i_t>& up_locks,
+                              std::vector<i_t>& down_locks)
+{
+  constexpr f_t eps = 1E-6;
+  up_locks.assign(lp_problem.num_cols, 0);
+  down_locks.assign(lp_problem.num_cols, 0);
+
+  for (i_t j = 0; j < lp_problem.num_cols; ++j) {
+    i_t start = lp_problem.A.col_start[j];
+    i_t end   = lp_problem.A.col_start[j + 1];
+
+    for (i_t p = start; p < end; ++p) {
+      f_t val = lp_problem.A.x[p];
+      if (std::abs(val) > eps) {
+        up_locks[j]++;
+        down_locks[j]++;
+      }
+    }
+  }
+}
+
+template <typename i_t, typename f_t>
+branch_variable_t<i_t> coefficient_diving(const lp_problem_t<i_t, f_t>& lp_problem,
+                                          const std::vector<i_t>& fractional,
+                                          const std::vector<f_t>& solution,
+                                          const std::vector<i_t>& up_locks,
+                                          const std::vector<i_t>& down_locks,
+                                          logger_t& log)
+{
+  i_t branch_var                 = -1;
+  i_t min_locks                  = std::numeric_limits<i_t>::max();
+  rounding_direction_t round_dir = rounding_direction_t::NONE;
+  constexpr f_t eps              = 1e-6;
+
+  for (i_t j : fractional) {
+    f_t f_down    = solution[j] - std::floor(solution[j]);
+    f_t f_up      = std::ceil(solution[j]) - solution[j];
+    i_t up_lock   = up_locks[j];
+    i_t down_lock = down_locks[j];
+    f_t upper     = lp_problem.upper[j];
+    f_t lower     = lp_problem.lower[j];
+    if (std::isfinite(upper)) { up_lock++; }
+    if (std::isfinite(lower)) { down_lock++; }
+    i_t alpha = std::min(up_lock, down_lock);
+
+    if (min_locks > alpha) {
+      min_locks  = alpha;
+      branch_var = j;
+
+      if (up_lock < down_lock) {
+        round_dir = rounding_direction_t::UP;
+      } else if (up_lock > down_lock) {
+        round_dir = rounding_direction_t::DOWN;
+      } else if (f_down < f_up + eps) {
+        round_dir = rounding_direction_t::DOWN;
+      } else {
+        round_dir = rounding_direction_t::UP;
+      }
+    }
+  }
+
+  assert(round_dir != rounding_direction_t::NONE);
+  assert(branch_var >= 0);
+
+  log.debug(
+    "Coefficient diving: selected var %d with val = %e, round dir = %d and min locks = %d\n",
+    branch_var,
+    solution[branch_var],
+    round_dir,
+    min_locks);
+
+  return {branch_var, round_dir};
+}
+
+#ifdef DUAL_SIMPLEX_INSTANTIATE_DOUBLE
+template branch_variable_t<int> line_search_diving(const std::vector<int>& fractional,
+                                                   const std::vector<double>& solution,
+                                                   const std::vector<double>& root_solution,
+                                                   logger_t& log);
+
+template branch_variable_t<int> pseudocost_diving(pseudo_costs_t<int, double>& pc,
+                                                  const std::vector<int>& fractional,
+                                                  const std::vector<double>& solution,
+                                                  const std::vector<double>& root_solution,
+                                                  logger_t& log);
+
+template branch_variable_t<int> guided_diving(pseudo_costs_t<int, double>& pc,
+                                              const std::vector<int>& fractional,
+                                              const std::vector<double>& solution,
+                                              const std::vector<double>& incumbent,
+                                              logger_t& log);
+
+template void calculate_variable_locks(const lp_problem_t<int, double>& lp_problem,
+                                       std::vector<int>& up_locks,
+                                       std::vector<int>& down_locks);
+
+template branch_variable_t<int> coefficient_diving(const lp_problem_t<int, double>& lp_problem,
+                                                   const std::vector<int>& fractional,
+                                                   const std::vector<double>& solution,
+                                                   const std::vector<int>& up_locks,
+                                                   const std::vector<int>& down_locks,
+                                                   logger_t& log);
+#endif
+
+}  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/diving_heuristics.hpp b/cpp/src/dual_simplex/diving_heuristics.hpp
new file mode 100644
index 000000000..3c6d77c04
--- /dev/null
+++ b/cpp/src/dual_simplex/diving_heuristics.hpp
@@ -0,0 +1,58 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/* clang-format on */
+
+#pragma once
+
+#include <dual_simplex/basis_updates.hpp>
+#include <dual_simplex/bounds_strengthening.hpp>
+#include <dual_simplex/pseudo_costs.hpp>
+#include <vector>
+
+namespace cuopt::linear_programming::dual_simplex {
+
+template <typename i_t>
+struct branch_variable_t {
+  i_t variable;
+  rounding_direction_t direction;
+};
+
+template <typename i_t, typename f_t>
+branch_variable_t<i_t> line_search_diving(const std::vector<i_t>& fractional,
+                                          const std::vector<f_t>& solution,
+                                          const std::vector<f_t>& root_solution,
+                                          logger_t& log);
+
+template <typename i_t, typename f_t>
+branch_variable_t<i_t> pseudocost_diving(pseudo_costs_t<i_t, f_t>& pc,
+                                         const std::vector<i_t>& fractional,
+                                         const std::vector<f_t>& solution,
+                                         const std::vector<f_t>& root_solution,
+                                         logger_t& log);
+
+template <typename i_t, typename f_t>
+branch_variable_t<i_t> guided_diving(pseudo_costs_t<i_t, f_t>& pc,
+                                     const std::vector<i_t>& fractional,
+                                     const std::vector<f_t>& solution,
+                                     const std::vector<f_t>& incumbent,
+                                     logger_t& log);
+
+// Calculate the variable locks assuming that the constraints
+// has the following format: `Ax = b`.
+template <typename i_t, typename f_t>
+void calculate_variable_locks(const lp_problem_t<i_t, f_t>& lp_problem,
+                              std::vector<i_t>& up_locks,
+                              std::vector<i_t>& down_locks);
+
+template <typename i_t, typename f_t>
+branch_variable_t<i_t> coefficient_diving(const lp_problem_t<i_t, f_t>& lp_problem,
+                                          const std::vector<i_t>& fractional,
+                                          const std::vector<f_t>& solution,
+                                          const std::vector<i_t>& up_locks,
+                                          const std::vector<i_t>& down_locks,
+                                          logger_t& log);
+
+}  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/diving_queue.hpp b/cpp/src/dual_simplex/diving_queue.hpp
deleted file mode 100644
index f7035109e..000000000
--- a/cpp/src/dual_simplex/diving_queue.hpp
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-#pragma once
-
-#include <algorithm>
-#include <climits>
-#include <vector>
-
-#include <dual_simplex/mip_node.hpp>
-
-namespace cuopt::linear_programming::dual_simplex {
-
-template <typename i_t, typename f_t>
-struct diving_root_t {
-  mip_node_t<i_t, f_t> node;
-  std::vector<f_t> lower;
-  std::vector<f_t> upper;
-
-  diving_root_t(mip_node_t<i_t, f_t>&& node, std::vector<f_t>&& lower, std::vector<f_t>&& upper)
-    : node(std::move(node)), lower(std::move(lower)), upper(std::move(upper))
-  {
-  }
-
-  friend bool operator>(const diving_root_t<i_t, f_t>& a, const diving_root_t<i_t, f_t>& b)
-  {
-    return a.node.lower_bound > b.node.lower_bound;
-  }
-};
-
-// A min-heap for storing the starting nodes for the dives.
-// This has a maximum size of 1024, such that the container
-// will discard the least promising node if the queue is full.
-template <typename i_t, typename f_t>
-class diving_queue_t {
- private:
-  std::vector<diving_root_t<i_t, f_t>> buffer;
-  static constexpr i_t max_size_ = 1024;
-
- public:
-  diving_queue_t() { buffer.reserve(max_size_); }
-
-  void push(diving_root_t<i_t, f_t>&& node)
-  {
-    buffer.push_back(std::move(node));
-    std::push_heap(buffer.begin(), buffer.end(), std::greater<>());
-    if (buffer.size() > max_size() - 1) { buffer.pop_back(); }
-  }
-
-  void emplace(mip_node_t<i_t, f_t>&& node, std::vector<f_t>&& lower, std::vector<f_t>&& upper)
-  {
-    buffer.emplace_back(std::move(node), std::move(lower), std::move(upper));
-    std::push_heap(buffer.begin(), buffer.end(), std::greater<>());
-    if (buffer.size() > max_size() - 1) { buffer.pop_back(); }
-  }
-
-  diving_root_t<i_t, f_t> pop()
-  {
-    std::pop_heap(buffer.begin(), buffer.end(), std::greater<>());
-    diving_root_t<i_t, f_t> node = std::move(buffer.back());
-    buffer.pop_back();
-    return node;
-  }
-
-  i_t size() const { return buffer.size(); }
-  constexpr i_t max_size() const { return max_size_; }
-  const diving_root_t<i_t, f_t>& top() const { return buffer.front(); }
-  void clear() { buffer.clear(); }
-};
-
-}  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/mip_node.hpp b/cpp/src/dual_simplex/mip_node.hpp
index 1d66a21f7..de147132a 100644
--- a/cpp/src/dual_simplex/mip_node.hpp
+++ b/cpp/src/dual_simplex/mip_node.hpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -45,6 +45,7 @@ class mip_node_t {
       branch_var_lower(-std::numeric_limits<f_t>::infinity()),
       branch_var_upper(std::numeric_limits<f_t>::infinity()),
       fractional_val(std::numeric_limits<f_t>::infinity()),
+      objective_estimate(std::numeric_limits<f_t>::infinity()),
       vstatus(0)
   {
     children[0] = nullptr;
@@ -59,6 +60,7 @@ class mip_node_t {
       node_id(0),
       branch_var(-1),
       branch_dir(rounding_direction_t::NONE),
+      objective_estimate(std::numeric_limits<f_t>::infinity()),
       vstatus(basis)
   {
     children[0] = nullptr;
@@ -80,6 +82,7 @@ class mip_node_t {
       branch_var(branch_variable),
       branch_dir(branch_direction),
       fractional_val(branch_var_value),
+      objective_estimate(parent_node->objective_estimate),
       vstatus(basis)
 
   {
@@ -227,17 +230,19 @@ class mip_node_t {
   mip_node_t<i_t, f_t> detach_copy() const
   {
     mip_node_t<i_t, f_t> copy(lower_bound, vstatus);
-    copy.branch_var       = branch_var;
-    copy.branch_dir       = branch_dir;
-    copy.branch_var_lower = branch_var_lower;
-    copy.branch_var_upper = branch_var_upper;
-    copy.fractional_val   = fractional_val;
-    copy.node_id          = node_id;
+    copy.branch_var         = branch_var;
+    copy.branch_dir         = branch_dir;
+    copy.branch_var_lower   = branch_var_lower;
+    copy.branch_var_upper   = branch_var_upper;
+    copy.fractional_val     = fractional_val;
+    copy.objective_estimate = objective_estimate;
+    copy.node_id            = node_id;
     return copy;
   }
 
   node_status_t status;
   f_t lower_bound;
+  f_t objective_estimate;
   i_t depth;
   i_t node_id;
   i_t branch_var;
@@ -262,22 +267,6 @@ void remove_fathomed_nodes(std::vector<mip_node_t<i_t, f_t>*>& stack)
   }
 }
 
-template <typename i_t, typename f_t>
-class node_compare_t {
- public:
-  bool operator()(const mip_node_t<i_t, f_t>& a, const mip_node_t<i_t, f_t>& b) const
-  {
-    return a.lower_bound >
-           b.lower_bound;  // True if a comes before b, elements that come before are output last
-  }
-
-  bool operator()(const mip_node_t<i_t, f_t>* a, const mip_node_t<i_t, f_t>* b) const
-  {
-    return a->lower_bound >
-           b->lower_bound;  // True if a comes before b, elements that come before are output last
-  }
-};
-
 template <typename i_t, typename f_t>
 class search_tree_t {
  public:
diff --git a/cpp/src/dual_simplex/node_queue.hpp b/cpp/src/dual_simplex/node_queue.hpp
new file mode 100644
index 000000000..28072795a
--- /dev/null
+++ b/cpp/src/dual_simplex/node_queue.hpp
@@ -0,0 +1,162 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights
+ * reserved. SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <memory>
+#include <mutex>
+#include <optional>
+#include <utility>
+#include <vector>
+
+#include <dual_simplex/mip_node.hpp>
+
+namespace cuopt::linear_programming::dual_simplex {
+
+// This is a generic heap implementation based
+// on the STL functions. The main benefit here is
+// that we access the underlying container.
+template <typename T, typename Comp>
+class heap_t {
+ public:
+  heap_t()          = default;
+  virtual ~heap_t() = default;
+
+  void push(const T& node)
+  {
+    buffer.push_back(node);
+    std::push_heap(buffer.begin(), buffer.end(), comp);
+  }
+
+  void push(T&& node)
+  {
+    buffer.push_back(std::move(node));
+    std::push_heap(buffer.begin(), buffer.end(), comp);
+  }
+
+  template <typename... Args>
+  void emplace(Args&&... args)
+  {
+    buffer.emplace_back(std::forward<Args>(args)...);
+    std::push_heap(buffer.begin(), buffer.end(), comp);
+  }
+
+  std::optional<T> pop()
+  {
+    if (buffer.empty()) return std::nullopt;
+
+    std::pop_heap(buffer.begin(), buffer.end(), comp);
+    T node = std::move(buffer.back());
+    buffer.pop_back();
+    return node;
+  }
+
+  size_t size() const { return buffer.size(); }
+  T& top() { return buffer.front(); }
+  void clear() { buffer.clear(); }
+  bool empty() const { return buffer.empty(); }
+
+ private:
+  std::vector<T> buffer;
+  Comp comp;
+};
+
+// A queue storing the nodes waiting to be explored/dived from.
+template <typename i_t, typename f_t>
+class node_queue_t {
+ private:
+  struct heap_entry_t {
+    mip_node_t<i_t, f_t>* node = nullptr;
+    f_t lower_bound            = -inf;
+    f_t score                  = inf;
+
+    heap_entry_t(mip_node_t<i_t, f_t>* new_node)
+      : node(new_node), lower_bound(new_node->lower_bound), score(new_node->objective_estimate)
+    {
+    }
+  };
+
+  // Comparision function for ordering the nodes based on their lower bound with
+  // lowest one being explored first.
+  struct lower_bound_comp {
+    bool operator()(const std::shared_ptr<heap_entry_t>& a, const std::shared_ptr<heap_entry_t>& b)
+    {
+      // `a` will be placed after `b`
+      return a->lower_bound > b->lower_bound;
+    }
+  };
+
+  // Comparision function for ordering the nodes based on some score (currently the pseudocost
+  // estimate) with the lowest being explored first.
+  struct score_comp {
+    bool operator()(const std::shared_ptr<heap_entry_t>& a, const std::shared_ptr<heap_entry_t>& b)
+    {
+      // `a` will be placed after `b`
+      return a->score > b->score;
+    }
+  };
+
+  heap_t<std::shared_ptr<heap_entry_t>, lower_bound_comp> best_first_heap;
+  heap_t<std::shared_ptr<heap_entry_t>, score_comp> diving_heap;
+  omp_mutex_t mutex;
+
+ public:
+  void push(mip_node_t<i_t, f_t>* new_node)
+  {
+    std::lock_guard<omp_mutex_t> lock(mutex);
+    auto entry = std::make_shared<heap_entry_t>(new_node);
+    best_first_heap.push(entry);
+    diving_heap.push(entry);
+  }
+
+  std::optional<mip_node_t<i_t, f_t>*> pop_best_first()
+  {
+    auto entry = best_first_heap.pop();
+    if (entry.has_value()) { return std::exchange(entry.value()->node, nullptr); }
+    return std::nullopt;
+  }
+
+  std::optional<mip_node_t<i_t, f_t>*> pop_diving()
+  {
+    while (!diving_heap.empty()) {
+      auto entry = diving_heap.pop();
+      if (entry.has_value()) {
+        if (auto node_ptr = entry.value()->node; node_ptr != nullptr) { return node_ptr; }
+      }
+    }
+    return std::nullopt;
+  }
+
+  void lock() { mutex.lock(); }
+
+  void unlock() { mutex.unlock(); }
+
+  i_t diving_queue_size()
+  {
+    std::lock_guard<omp_mutex_t> lock(mutex);
+    return diving_heap.size();
+  }
+
+  i_t best_first_queue_size()
+  {
+    std::lock_guard<omp_mutex_t> lock(mutex);
+    return best_first_heap.size();
+  }
+
+  f_t get_lower_bound()
+  {
+    std::lock_guard<omp_mutex_t> lock(mutex);
+    return best_first_heap.empty() ? inf : best_first_heap.top()->lower_bound;
+  }
+
+  mip_node_t<i_t, f_t>* bfs_top()
+  {
+    std::lock_guard<omp_mutex_t> lock(mutex);
+    return best_first_heap.empty() ? nullptr : best_first_heap.top()->node;
+  }
+};
+
+}  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/phase2.cpp b/cpp/src/dual_simplex/phase2.cpp
index 56298ef4d..2bc00f636 100644
--- a/cpp/src/dual_simplex/phase2.cpp
+++ b/cpp/src/dual_simplex/phase2.cpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -623,14 +623,17 @@ f_t compute_initial_primal_infeasibilities(const lp_problem_t<i_t, f_t>& lp,
                                            const std::vector<i_t>& basic_list,
                                            const std::vector<f_t>& x,
                                            std::vector<f_t>& squared_infeasibilities,
-                                           std::vector<i_t>& infeasibility_indices)
+                                           std::vector<i_t>& infeasibility_indices,
+                                           f_t& primal_inf)
 {
   const i_t m = lp.num_rows;
   const i_t n = lp.num_cols;
-  squared_infeasibilities.resize(n, 0.0);
+  squared_infeasibilities.resize(n);
+  std::fill(squared_infeasibilities.begin(), squared_infeasibilities.end(), 0.0);
   infeasibility_indices.reserve(n);
   infeasibility_indices.clear();
-  f_t primal_inf = 0.0;
+  f_t primal_inf_squared = 0.0;
+  primal_inf             = 0.0;
   for (i_t k = 0; k < m; ++k) {
     const i_t j            = basic_list[k];
     const f_t lower_infeas = lp.lower[j] - x[j];
@@ -640,10 +643,11 @@ f_t compute_initial_primal_infeasibilities(const lp_problem_t<i_t, f_t>& lp,
       const f_t square_infeas    = infeas * infeas;
       squared_infeasibilities[j] = square_infeas;
       infeasibility_indices.push_back(j);
-      primal_inf += square_infeas;
+      primal_inf_squared += square_infeas;
+      primal_inf += infeas;
     }
   }
-  return primal_inf;
+  return primal_inf_squared;
 }
 
 template <typename i_t, typename f_t>
@@ -2241,7 +2245,8 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
     assert(superbasic_list.size() == 0);
     assert(nonbasic_list.size() == n - m);
 
-    if (ft.refactor_basis(lp.A, settings, basic_list, nonbasic_list, vstatus) > 0) {
+    if (ft.refactor_basis(lp.A, settings, lp.lower, lp.upper, basic_list, nonbasic_list, vstatus) >
+        0) {
       return dual::status_t::NUMERICAL;
     }
 
@@ -2268,7 +2273,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
 
 #ifdef COMPUTE_DUAL_RESIDUAL
   std::vector<f_t> dual_res1;
-  compute_dual_residual(lp.A, objective, y, z, dual_res1);
+  phase2::compute_dual_residual(lp.A, objective, y, z, dual_res1);
   f_t dual_res_norm = vector_norm_inf<i_t, f_t>(dual_res1);
   if (dual_res_norm > settings.tight_tol) {
     settings.log.printf("|| A'*y + z - c || %e\n", dual_res_norm);
@@ -2357,8 +2362,15 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
   std::vector<uint8_t> bounded_variables(n, 0);
   phase2::compute_bounded_info(lp.lower, lp.upper, bounded_variables);
 
-  f_t primal_infeasibility = phase2::compute_initial_primal_infeasibilities(
-    lp, settings, basic_list, x, squared_infeasibilities, infeasibility_indices);
+  f_t primal_infeasibility;
+  f_t primal_infeasibility_squared =
+    phase2::compute_initial_primal_infeasibilities(lp,
+                                                   settings,
+                                                   basic_list,
+                                                   x,
+                                                   squared_infeasibilities,
+                                                   infeasibility_indices,
+                                                   primal_infeasibility);
 
 #ifdef CHECK_BASIC_INFEASIBILITIES
   phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 0);
@@ -2556,9 +2568,15 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
             std::vector<f_t> unperturbed_x(n);
             phase2::compute_primal_solution_from_basis(
               lp, ft, basic_list, nonbasic_list, vstatus, unperturbed_x);
-            x                    = unperturbed_x;
-            primal_infeasibility = phase2::compute_initial_primal_infeasibilities(
-              lp, settings, basic_list, x, squared_infeasibilities, infeasibility_indices);
+            x = unperturbed_x;
+            primal_infeasibility_squared =
+              phase2::compute_initial_primal_infeasibilities(lp,
+                                                             settings,
+                                                             basic_list,
+                                                             x,
+                                                             squared_infeasibilities,
+                                                             infeasibility_indices,
+                                                             primal_infeasibility);
             settings.log.printf("Updated primal infeasibility: %e\n", primal_infeasibility);
 
             objective = lp.objective;
@@ -2593,9 +2611,15 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
             std::vector<f_t> unperturbed_x(n);
             phase2::compute_primal_solution_from_basis(
               lp, ft, basic_list, nonbasic_list, vstatus, unperturbed_x);
-            x                    = unperturbed_x;
-            primal_infeasibility = phase2::compute_initial_primal_infeasibilities(
-              lp, settings, basic_list, x, squared_infeasibilities, infeasibility_indices);
+            x = unperturbed_x;
+            primal_infeasibility_squared =
+              phase2::compute_initial_primal_infeasibilities(lp,
+                                                             settings,
+                                                             basic_list,
+                                                             x,
+                                                             squared_infeasibilities,
+                                                             infeasibility_indices,
+                                                             primal_infeasibility);
 
             const f_t orig_dual_infeas = phase2::dual_infeasibility(
               lp, settings, vstatus, z, settings.tight_tol, settings.dual_tol);
@@ -2810,7 +2834,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
                                           delta_xB_0_sparse.i,
                                           squared_infeasibilities,
                                           infeasibility_indices,
-                                          primal_infeasibility);
+                                          primal_infeasibility_squared);
     // Update primal infeasibilities due to changes in basic variables
     // from the leaving and entering variables
     phase2::update_primal_infeasibilities(lp,
@@ -2822,7 +2846,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
                                           scaled_delta_xB_sparse.i,
                                           squared_infeasibilities,
                                           infeasibility_indices,
-                                          primal_infeasibility);
+                                          primal_infeasibility_squared);
     // Update the entering variable
     phase2::update_single_primal_infeasibility(lp.lower,
                                                lp.upper,
@@ -2883,14 +2907,15 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
 #endif
     if (should_refactor) {
       bool should_recompute_x = false;
-      if (ft.refactor_basis(lp.A, settings, basic_list, nonbasic_list, vstatus) > 0) {
+      if (ft.refactor_basis(
+            lp.A, settings, lp.lower, lp.upper, basic_list, nonbasic_list, vstatus) > 0) {
         should_recompute_x = true;
         settings.log.printf("Failed to factorize basis. Iteration %d\n", iter);
         if (toc(start_time) > settings.time_limit) { return dual::status_t::TIME_LIMIT; }
         i_t count = 0;
         i_t deficient_size;
-        while ((deficient_size =
-                  ft.refactor_basis(lp.A, settings, basic_list, nonbasic_list, vstatus)) > 0) {
+        while ((deficient_size = ft.refactor_basis(
+                  lp.A, settings, lp.lower, lp.upper, basic_list, nonbasic_list, vstatus)) > 0) {
           settings.log.printf("Failed to repair basis. Iteration %d. %d deficient columns.\n",
                               iter,
                               static_cast<int>(deficient_size));
@@ -2912,8 +2937,14 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
           lp, ft, basic_list, nonbasic_list, vstatus, unperturbed_x);
         x = unperturbed_x;
       }
-      phase2::compute_initial_primal_infeasibilities(
-        lp, settings, basic_list, x, squared_infeasibilities, infeasibility_indices);
+      primal_infeasibility_squared =
+        phase2::compute_initial_primal_infeasibilities(lp,
+                                                       settings,
+                                                       basic_list,
+                                                       x,
+                                                       squared_infeasibilities,
+                                                       infeasibility_indices,
+                                                       primal_infeasibility);
     }
 #ifdef CHECK_BASIC_INFEASIBILITIES
     phase2::check_basic_infeasibilities(basic_list, basic_mark, infeasibility_indices, 7);
@@ -2951,7 +2982,7 @@ dual::status_t dual_phase2_with_advanced_basis(i_t phase,
                           iter,
                           compute_user_objective(lp, obj),
                           infeasibility_indices.size(),
-                          primal_infeasibility,
+                          primal_infeasibility_squared,
                           sum_perturb,
                           now);
     }
diff --git a/cpp/src/dual_simplex/primal.cpp b/cpp/src/dual_simplex/primal.cpp
index 80406dcf0..69f15ba18 100644
--- a/cpp/src/dual_simplex/primal.cpp
+++ b/cpp/src/dual_simplex/primal.cpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -298,7 +298,15 @@ primal::status_t primal_phase2(i_t phase,
     factorize_basis(lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed);
   if (rank != m) {
     settings.log.debug("Failed to factorize basis. rank %d m %d\n", rank, m);
-    basis_repair(lp.A, settings, deficient, slacks_needed, basic_list, nonbasic_list, vstatus);
+    basis_repair(lp.A,
+                 settings,
+                 lp.lower,
+                 lp.upper,
+                 deficient,
+                 slacks_needed,
+                 basic_list,
+                 nonbasic_list,
+                 vstatus);
     if (factorize_basis(lp.A, settings, basic_list, L, U, p, pinv, q, deficient, slacks_needed) ==
         -1) {
       settings.log.printf("Failed to factorize basis after repair. rank %d m %d\n", rank, m);
diff --git a/cpp/src/dual_simplex/pseudo_costs.cpp b/cpp/src/dual_simplex/pseudo_costs.cpp
index 9f84e108d..0a8f660e9 100644
--- a/cpp/src/dual_simplex/pseudo_costs.cpp
+++ b/cpp/src/dual_simplex/pseudo_costs.cpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -148,8 +148,8 @@ void strong_branching(const lp_problem_t<i_t, f_t>& original_lp,
                       pseudo_costs_t<i_t, f_t>& pc)
 {
   pc.resize(original_lp.num_cols);
-  pc.strong_branch_down.resize(fractional.size());
-  pc.strong_branch_up.resize(fractional.size());
+  pc.strong_branch_down.assign(fractional.size(), 0);
+  pc.strong_branch_up.assign(fractional.size(), 0);
   pc.num_strong_branches_completed = 0;
 
   settings.log.printf("Strong branching using %d threads and %ld fractional variables\n",
@@ -199,7 +199,7 @@ template <typename i_t, typename f_t>
 void pseudo_costs_t<i_t, f_t>::update_pseudo_costs(mip_node_t<i_t, f_t>* node_ptr,
                                                    f_t leaf_objective)
 {
-  mutex.lock();
+  std::lock_guard<omp_mutex_t> lock(mutex);
   const f_t change_in_obj = leaf_objective - node_ptr->lower_bound;
   const f_t frac          = node_ptr->branch_dir == rounding_direction_t::DOWN
                               ? node_ptr->fractional_val - std::floor(node_ptr->fractional_val)
@@ -211,7 +211,6 @@ void pseudo_costs_t<i_t, f_t>::update_pseudo_costs(mip_node_t<i_t, f_t>* node_pt
     pseudo_cost_sum_up[node_ptr->branch_var] += change_in_obj / frac;
     pseudo_cost_num_up[node_ptr->branch_var]++;
   }
-  mutex.unlock();
 }
 
 template <typename i_t, typename f_t>
@@ -258,7 +257,7 @@ i_t pseudo_costs_t<i_t, f_t>::variable_selection(const std::vector<i_t>& fractio
                                                  const std::vector<f_t>& solution,
                                                  logger_t& log)
 {
-  mutex.lock();
+  std::lock_guard<omp_mutex_t> lock(mutex);
 
   const i_t num_fractional = fractional.size();
   std::vector<f_t> pseudo_cost_up(num_fractional);
@@ -312,11 +311,53 @@ i_t pseudo_costs_t<i_t, f_t>::variable_selection(const std::vector<i_t>& fractio
   log.printf(
     "pc branching on %d. Value %e. Score %e\n", branch_var, solution[branch_var], score[select]);
 
-  mutex.unlock();
-
   return branch_var;
 }
 
+template <typename i_t, typename f_t>
+f_t pseudo_costs_t<i_t, f_t>::obj_estimate(const std::vector<i_t>& fractional,
+                                           const std::vector<f_t>& solution,
+                                           f_t lower_bound,
+                                           logger_t& log)
+{
+  std::lock_guard<omp_mutex_t> lock(mutex);
+
+  const i_t num_fractional = fractional.size();
+  f_t estimate             = lower_bound;
+
+  i_t num_initialized_down;
+  i_t num_initialized_up;
+  f_t pseudo_cost_down_avg;
+  f_t pseudo_cost_up_avg;
+
+  initialized(num_initialized_down, num_initialized_up, pseudo_cost_down_avg, pseudo_cost_up_avg);
+
+  for (i_t k = 0; k < num_fractional; k++) {
+    const i_t j          = fractional[k];
+    f_t pseudo_cost_down = 0;
+    f_t pseudo_cost_up   = 0;
+
+    if (pseudo_cost_num_down[j] != 0) {
+      pseudo_cost_down = pseudo_cost_sum_down[j] / pseudo_cost_num_down[j];
+    } else {
+      pseudo_cost_down = pseudo_cost_down_avg;
+    }
+
+    if (pseudo_cost_num_up[j] != 0) {
+      pseudo_cost_up = pseudo_cost_sum_up[j] / pseudo_cost_num_up[j];
+    } else {
+      pseudo_cost_up = pseudo_cost_up_avg;
+    }
+    constexpr f_t eps = 1e-6;
+    const f_t f_down  = solution[j] - std::floor(solution[j]);
+    const f_t f_up    = std::ceil(solution[j]) - solution[j];
+    estimate += std::min(pseudo_cost_down * f_down, pseudo_cost_up * f_up);
+  }
+
+  log.printf("pseudocost estimate = %e\n", estimate);
+  return estimate;
+}
+
 template <typename i_t, typename f_t>
 void pseudo_costs_t<i_t, f_t>::update_pseudo_costs_from_strong_branching(
   const std::vector<i_t>& fractional, const std::vector<f_t>& root_soln)
diff --git a/cpp/src/dual_simplex/pseudo_costs.hpp b/cpp/src/dual_simplex/pseudo_costs.hpp
index 799cdc3ff..4bab438fa 100644
--- a/cpp/src/dual_simplex/pseudo_costs.hpp
+++ b/cpp/src/dual_simplex/pseudo_costs.hpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -32,10 +32,10 @@ class pseudo_costs_t {
 
   void resize(i_t num_variables)
   {
-    pseudo_cost_sum_down.resize(num_variables);
-    pseudo_cost_sum_up.resize(num_variables);
-    pseudo_cost_num_down.resize(num_variables);
-    pseudo_cost_num_up.resize(num_variables);
+    pseudo_cost_sum_down.assign(num_variables, 0);
+    pseudo_cost_sum_up.assign(num_variables, 0);
+    pseudo_cost_num_down.assign(num_variables, 0);
+    pseudo_cost_num_up.assign(num_variables, 0);
   }
 
   void initialized(i_t& num_initialized_down,
@@ -43,6 +43,11 @@ class pseudo_costs_t {
                    f_t& pseudo_cost_down_avg,
                    f_t& pseudo_cost_up_avg) const;
 
+  f_t obj_estimate(const std::vector<i_t>& fractional,
+                   const std::vector<f_t>& solution,
+                   f_t lower_bound,
+                   logger_t& log);
+
   i_t variable_selection(const std::vector<i_t>& fractional,
                          const std::vector<f_t>& solution,
                          logger_t& log);
diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp
index a1cc049e7..d86f84c39 100644
--- a/cpp/src/dual_simplex/simplex_solver_settings.hpp
+++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -13,12 +13,29 @@
 #include <omp.h>
 #include <algorithm>
 #include <atomic>
+#include <cmath>
 #include <functional>
 #include <limits>
 #include <vector>
 
 namespace cuopt::linear_programming::dual_simplex {
 
+template <typename i_t, typename f_t>
+struct diving_heuristics_settings_t {
+  i_t num_diving_workers = -1;
+
+  // -1 automatic, 0 disabled, 1 enabled
+  i_t line_search_diving = -1;
+  i_t pseudocost_diving  = -1;
+  i_t guided_diving      = -1;
+  i_t coefficient_diving = -1;
+
+  i_t min_node_depth         = 10;
+  i_t node_limit             = 500;
+  f_t iteration_limit_factor = 0.05;
+  i_t backtrack_limit        = 5;
+};
+
 template <typename i_t, typename f_t>
 struct simplex_solver_settings_t {
  public:
@@ -70,14 +87,14 @@ struct simplex_solver_settings_t {
       iteration_log_frequency(1000),
       first_iteration_log(2),
       num_threads(omp_get_max_threads() - 1),
-      num_bfs_threads(std::min(num_threads / 4, 1)),
-      num_diving_threads(std::min(num_threads - num_bfs_threads, 1)),
+      num_bfs_workers(std::max(num_threads / 4, 1)),
       random_seed(0),
       inside_mip(0),
       solution_callback(nullptr),
       heuristic_preemption_callback(nullptr),
       concurrent_halt(nullptr)
   {
+    diving_settings.num_diving_workers = std::max(num_threads - num_bfs_workers, 1);
   }
 
   void set_log(bool logging) const { log.log = logging; }
@@ -137,8 +154,10 @@ struct simplex_solver_settings_t {
   i_t first_iteration_log;         // number of iterations to log at beginning of solve
   i_t num_threads;                 // number of threads to use
   i_t random_seed;                 // random seed
-  i_t num_bfs_threads;             // number of threads dedicated to the best-first search
-  i_t num_diving_threads;          // number of threads dedicated to diving
+  i_t num_bfs_workers;             // number of threads dedicated to the best-first search
+
+  diving_heuristics_settings_t<i_t, f_t> diving_settings;  // Settings for the diving heuristics
+
   i_t inside_mip;  // 0 if outside MIP, 1 if inside MIP at root node, 2 if inside MIP at leaf node
   std::function<void(std::vector<f_t>&, f_t)> solution_callback;
   std::function<void(const std::vector<f_t>&, f_t)> node_processed_callback;
diff --git a/cpp/src/mip/diversity/lns/rins.cu b/cpp/src/mip/diversity/lns/rins.cu
index 537f49cea..7456b59ed 100644
--- a/cpp/src/mip/diversity/lns/rins.cu
+++ b/cpp/src/mip/diversity/lns/rins.cu
@@ -256,13 +256,19 @@ void rins_t<i_t, f_t>::run_rins()
   branch_and_bound_settings.absolute_mip_gap_tol = context.settings.tolerances.absolute_mip_gap;
   branch_and_bound_settings.relative_mip_gap_tol =
     std::min(current_mip_gap, (f_t)settings.target_mip_gap);
-  branch_and_bound_settings.integer_tol        = context.settings.tolerances.integrality_tolerance;
-  branch_and_bound_settings.num_threads        = 2;
-  branch_and_bound_settings.num_bfs_threads    = 1;
-  branch_and_bound_settings.num_diving_threads = 1;
-  branch_and_bound_settings.log.log            = false;
-  branch_and_bound_settings.log.log_prefix     = "[RINS] ";
-  branch_and_bound_settings.solution_callback  = [&rins_solution_queue](std::vector<f_t>& solution,
+  branch_and_bound_settings.integer_tol     = context.settings.tolerances.integrality_tolerance;
+  branch_and_bound_settings.num_threads     = 2;
+  branch_and_bound_settings.num_bfs_workers = 1;
+
+  // In the future, let RINS use all the diving heuristics. For now,
+  // restricting to guided diving.
+  branch_and_bound_settings.diving_settings.num_diving_workers = 1;
+  branch_and_bound_settings.diving_settings.line_search_diving = 0;
+  branch_and_bound_settings.diving_settings.coefficient_diving = 0;
+  branch_and_bound_settings.diving_settings.pseudocost_diving  = 0;
+  branch_and_bound_settings.log.log                            = false;
+  branch_and_bound_settings.log.log_prefix                     = "[RINS] ";
+  branch_and_bound_settings.solution_callback = [&rins_solution_queue](std::vector<f_t>& solution,
                                                                        f_t objective) {
     rins_solution_queue.push_back(solution);
   };
diff --git a/cpp/src/mip/diversity/recombiners/sub_mip.cuh b/cpp/src/mip/diversity/recombiners/sub_mip.cuh
index 5be807372..82670437a 100644
--- a/cpp/src/mip/diversity/recombiners/sub_mip.cuh
+++ b/cpp/src/mip/diversity/recombiners/sub_mip.cuh
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -103,9 +103,15 @@ class sub_mip_recombiner_t : public recombiner_t<i_t, f_t> {
       branch_and_bound_settings.relative_mip_gap_tol = context.settings.tolerances.relative_mip_gap;
       branch_and_bound_settings.integer_tol     = context.settings.tolerances.integrality_tolerance;
       branch_and_bound_settings.num_threads     = 2;
-      branch_and_bound_settings.num_bfs_threads = 1;
-      branch_and_bound_settings.num_diving_threads = 1;
-      branch_and_bound_settings.solution_callback  = [this](std::vector<f_t>& solution,
+      branch_and_bound_settings.num_bfs_workers = 1;
+
+      // In the future, let SubMIP use all the diving heuristics. For now,
+      // restricting to guided diving.
+      branch_and_bound_settings.diving_settings.num_diving_workers = 1;
+      branch_and_bound_settings.diving_settings.line_search_diving = 0;
+      branch_and_bound_settings.diving_settings.coefficient_diving = 0;
+      branch_and_bound_settings.diving_settings.pseudocost_diving  = 0;
+      branch_and_bound_settings.solution_callback = [this](std::vector<f_t>& solution,
                                                            f_t objective) {
         this->solution_callback(solution, objective);
       };
diff --git a/cpp/src/mip/local_search/rounding/simple_rounding_kernels.cuh b/cpp/src/mip/local_search/rounding/simple_rounding_kernels.cuh
index 2906e648f..5cd219ec3 100644
--- a/cpp/src/mip/local_search/rounding/simple_rounding_kernels.cuh
+++ b/cpp/src/mip/local_search/rounding/simple_rounding_kernels.cuh
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -32,6 +32,7 @@ __global__ void simple_rounding_kernel(typename solution_t<i_t, f_t>::view_t sol
       auto cstr_idx   = solution.problem.reverse_constraints[i];
       auto cstr_coeff = solution.problem.reverse_coefficients[i];
 
+      // Here, we are storing the constraints in the following format: u <= Ax <= l
       // boxed constraint. can't be rounded safely
       if (std::isfinite(solution.problem.constraint_lower_bounds[cstr_idx]) &&
           std::isfinite(solution.problem.constraint_upper_bounds[cstr_idx])) {
diff --git a/cpp/src/mip/solver.cu b/cpp/src/mip/solver.cu
index 7311a26fd..08e1806b9 100644
--- a/cpp/src/mip/solver.cu
+++ b/cpp/src/mip/solver.cu
@@ -173,13 +173,12 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
     } else {
       branch_and_bound_settings.num_threads = std::max(1, context.settings.num_cpu_threads);
     }
-    CUOPT_LOG_INFO("Using %d CPU threads for B&B", branch_and_bound_settings.num_threads);
 
-    i_t num_threads                              = branch_and_bound_settings.num_threads;
-    i_t num_bfs_threads                          = std::max(1, num_threads / 4);
-    i_t num_diving_threads                       = std::max(1, num_threads - num_bfs_threads);
-    branch_and_bound_settings.num_bfs_threads    = num_bfs_threads;
-    branch_and_bound_settings.num_diving_threads = num_diving_threads;
+    i_t num_threads                           = branch_and_bound_settings.num_threads;
+    i_t num_bfs_workers                       = std::max(1, num_threads / 4);
+    i_t num_diving_workers                    = std::max(1, num_threads - num_bfs_workers);
+    branch_and_bound_settings.num_bfs_workers = num_bfs_workers;
+    branch_and_bound_settings.diving_settings.num_diving_workers = num_diving_workers;
 
     // Set the branch and bound -> primal heuristics callback
     branch_and_bound_settings.solution_callback =
diff --git a/cpp/src/utilities/omp_helpers.hpp b/cpp/src/utilities/omp_helpers.hpp
index 33eda66cb..e1b68bf88 100644
--- a/cpp/src/utilities/omp_helpers.hpp
+++ b/cpp/src/utilities/omp_helpers.hpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -53,7 +53,7 @@ class omp_atomic_t {
   T operator--() { return fetch_sub(T(1)) - 1; }
   T operator--(int) { return fetch_sub(T(1)); }
 
-  T load()
+  T load() const
   {
     T res;
 #pragma omp atomic read