Skip to content

Commit 079799c

Browse files
authored
Run sampler as a method if available (#16888)
With PR huggingface/optimum-executorch#207 we are adding a new method, "sampler", to ASR models, alongside "encoder" and "text_decoder". The flow becomes: if the temperature is 0 and the sampler method is available, run that method; otherwise, fall back to the old path. This change should significantly improve performance on CUDA, since we no longer have to copy logits from device to CPU for sampling purposes. Benchmark result on RTX 5080: ``` ====================================================================== BENCHMARK SUMMARY ====================================================================== Total runs: 30 Generated tokens per run: 104 THROUGHPUT (tokens/sec): Min: 793.89 t/s Max: 845.53 t/s Mean: 820.35 t/s Stdev: 11.86 t/s MODEL LOAD TIME (ms): Min: 620 ms Max: 2170 ms Mean: 700 ms Stdev: 279 ms ENCODE TIME (ms, inference_start to prompt_eval_end): Min: 36 ms Max: 38 ms Mean: 37 ms Stdev: 1 ms DECODE TIME (ms, prompt_eval_end to inference_end): Min: 123 ms Max: 131 ms Mean: 127 ms Stdev: 2 ms ====================================================================== ```
1 parent 429f014 commit 079799c

File tree

4 files changed

+94
-21
lines changed

4 files changed

+94
-21
lines changed
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
732b11313b2006b4d8649500eaf5567ec6ac1e49
1+
f8aa919593cc51301ade73a2ee5491582521ab80

backends/cuda/runtime/cuda_backend.cpp

Lines changed: 43 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,15 @@
1212
#include <executorch/runtime/core/error.h>
1313
#include <executorch/runtime/core/evalue.h>
1414
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
15+
#include <cctype>
1516
#include <cstdio>
1617

1718
#include <array>
1819
#include <filesystem>
1920
#include <fstream>
2021
#include <mutex>
2122
#include <string>
23+
#include <string_view>
2224
#include <vector>
2325

2426
// Include our shim layer headers
@@ -60,6 +62,41 @@ constexpr char kSkipCopyOutputToCpuForMethod[] =
6062
class ET_EXPERIMENTAL CudaBackend final
6163
: public ::executorch::runtime::BackendInterface {
6264
private:
65+
// Trim leading/trailing whitespace from a view of the string.
static std::string_view trim(std::string_view s) {
  // Cast to unsigned char before std::isspace to avoid UB on negative chars.
  const auto is_space = [](char c) {
    return std::isspace(static_cast<unsigned char>(c)) != 0;
  };
  while (!s.empty() && is_space(s.front())) {
    s.remove_prefix(1);
  }
  while (!s.empty() && is_space(s.back())) {
    s.remove_suffix(1);
  }
  return s;
}
79+
80+
// Check if method_name appears in a comma-separated list.
static bool method_in_csv(
    const std::string& method_name,
    const std::string& csv) {
  // Same character set std::isspace matches in the "C" locale.
  constexpr std::string_view kWhitespace = " \t\n\v\f\r";
  std::string_view remaining(csv);
  while (true) {
    const size_t comma = remaining.find(',');
    // substr clamps the count, so npos (no comma) means "rest of string".
    std::string_view token = remaining.substr(0, comma);
    const size_t first = token.find_first_not_of(kWhitespace);
    if (first != std::string_view::npos) {
      const size_t last = token.find_last_not_of(kWhitespace);
      token = token.substr(first, last - first + 1);
      if (token == method_name) {
        return true;
      }
    }
    if (comma == std::string_view::npos) {
      return false;
    }
    remaining.remove_prefix(comma + 1);
  }
}
99+
63100
void set_skip_copy_method(
64101
const std::array<char, kMaxOptionValueLength>& raw) {
65102
std::lock_guard<std::mutex> guard(skip_copy_method_mutex_);
@@ -83,7 +120,7 @@ class ET_EXPERIMENTAL CudaBackend final
83120
return false;
84121
}
85122
std::lock_guard<std::mutex> guard(skip_copy_method_mutex_);
86-
return method_name == skip_copy_method_;
123+
return method_in_csv(method_name, skip_copy_method_);
87124
}
88125

89126
Error load_function_pointers_into_handle(
@@ -316,7 +353,7 @@ class ET_EXPERIMENTAL CudaBackend final
316353
ET_CHECK_OR_RETURN_ERROR(
317354
create_err == Error::Ok,
318355
Internal,
319-
"Failed to create GPU tensor for input %d",
356+
"Failed to create GPU tensor for input %" ET_PRIsize_t,
320357
i);
321358

322359
gpu_inputs[i] = gpu_input_handle;
@@ -325,7 +362,7 @@ class ET_EXPERIMENTAL CudaBackend final
325362
ET_CHECK_OR_RETURN_ERROR(
326363
aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0) == Error::Ok,
327364
Internal,
328-
"Failed to copy input %d from CPU to GPU",
365+
"Failed to copy input %" ET_PRIsize_t " from CPU to GPU",
329366
i);
330367
}
331368
// Process output tensors: create GPU counterparts for ExecuTorch CPU
@@ -352,7 +389,7 @@ class ET_EXPERIMENTAL CudaBackend final
352389
ET_CHECK_OR_RETURN_ERROR(
353390
create_err == Error::Ok,
354391
Internal,
355-
"Failed to create GPU tensor for output %d",
392+
"Failed to create GPU tensor for output %" ET_PRIsize_t,
356393
i);
357394

358395
gpu_outputs[i] = gpu_output_handle;
@@ -382,11 +419,11 @@ class ET_EXPERIMENTAL CudaBackend final
382419
// For DYNAMIC_BOUND tensors we try to resize
383420
ET_CHECK_OK_OR_RETURN_ERROR(
384421
resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()),
385-
"Error resizing tensor at output index %d",
422+
"Error resizing tensor at output index %" ET_PRIsize_t,
386423
i);
387424
ET_CHECK_OK_OR_RETURN_ERROR(
388425
aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0),
389-
"Failed to copy GPU output %d back to CPU",
426+
"Failed to copy GPU output %" ET_PRIsize_t " back to CPU",
390427
i);
391428
}
392429
} else {

extension/asr/runner/runner.cpp

Lines changed: 48 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ namespace {
2727

2828
constexpr const char* kEncoderMethodName = "encoder";
2929
constexpr const char* kDecoderMethodName = "text_decoder";
30+
constexpr const char* kSamplerMethodName = "sampler";
3031

3132
} // namespace
3233

@@ -47,7 +48,8 @@ AsrRunner::AsrRunner(
4748

4849
bool AsrRunner::is_loaded() const {
4950
return module_ && encoder_method_loaded_ && decoder_method_loaded_ &&
50-
tokenizer_ && tokenizer_->is_loaded() && !eos_token_ids_.empty();
51+
(!sampler_method_present_ || sampler_method_loaded_) && tokenizer_ &&
52+
tokenizer_->is_loaded() && !eos_token_ids_.empty();
5153
}
5254

5355
Error AsrRunner::load_tokenizer() {
@@ -96,6 +98,8 @@ Error AsrRunner::load() {
9698
ET_CHECK_OK_OR_RETURN_ERROR(method_names_result.error());
9799
const auto& method_names = method_names_result.get();
98100

101+
sampler_method_present_ = method_names.count(kSamplerMethodName);
102+
99103
ET_CHECK_OR_RETURN_ERROR(
100104
method_names.count(kEncoderMethodName) &&
101105
method_names.count(kDecoderMethodName),
@@ -109,13 +113,21 @@ Error AsrRunner::load() {
109113

110114
ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kDecoderMethodName));
111115
decoder_method_loaded_ = true;
116+
117+
if (sampler_method_present_) {
118+
ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kSamplerMethodName));
119+
sampler_method_loaded_ = true;
120+
}
112121
#ifdef CUDA_AVAILABLE
122+
// Skip copying outputs to CPU. When a sampler exists, keep both encoder and
123+
// decoder outputs on device and pass decoder logits directly into sampler.
113124
executorch::runtime::BackendOptions<1> backend_options;
114-
// For decoder still copy output from GPU to CPU for sampling.
115-
// TODO: change sampler to use a CUDA kernel to sample and then skip copying
116-
// decoder output as well
125+
std::string skip_methods = kEncoderMethodName;
126+
if (sampler_method_present_) {
127+
skip_methods.append(",").append(kDecoderMethodName);
128+
}
117129
ET_CHECK_OK_OR_RETURN_ERROR(backend_options.set_option(
118-
"skip_copy_output_to_cpu_for_method", kEncoderMethodName));
130+
"skip_copy_output_to_cpu_for_method", skip_methods.c_str()));
119131
const auto opt_err =
120132
executorch::runtime::set_option("CudaBackend", backend_options.view());
121133
if (opt_err != ::executorch::runtime::Error::Ok) {
@@ -264,6 +276,7 @@ Result<std::vector<int64_t>> AsrRunner::transcribe(
264276
decoder_inputs.emplace_back(cache_position_ptr);
265277
// Add some green coloring for the first generated token
266278
// token_callback("\033[1;32m");
279+
const bool use_sampler_method = sampler_method_loaded_;
267280
while (generated_tokens < config.max_new_tokens) {
268281
input_id = tokens.back();
269282
auto decoder_result = module_->execute(kDecoderMethodName, decoder_inputs);
@@ -276,15 +289,36 @@ Result<std::vector<int64_t>> AsrRunner::transcribe(
276289
"Decoder returned %zu outputs; expected a single tensor.",
277290
decoder_outputs.size());
278291

279-
::executorch::aten::Tensor logits_tensor =
280-
std::move(decoder_outputs[0]).toTensor();
281-
const int64_t vocab_size = logits_tensor.numel();
282-
ET_CHECK_OR_RETURN_ERROR(
283-
vocab_size > 0, Internal, "Decoder logits tensor is empty.");
284-
285-
const int64_t next_token =
286-
static_cast<int64_t>(::executorch::extension::llm::logits_to_token(
287-
logits_tensor, config.temperature));
292+
int64_t next_token = 0;
293+
if (!use_sampler_method || config.temperature != 0.0f) {
294+
::executorch::aten::Tensor logits_tensor =
295+
std::move(decoder_outputs[0]).toTensor();
296+
const int64_t vocab_size = logits_tensor.numel();
297+
ET_CHECK_OR_RETURN_ERROR(
298+
vocab_size > 0, Internal, "Decoder logits tensor is empty.");
299+
next_token =
300+
static_cast<int64_t>(::executorch::extension::llm::logits_to_token(
301+
logits_tensor, config.temperature));
302+
} else {
303+
auto sampler_result =
304+
module_->execute(kSamplerMethodName, decoder_outputs);
305+
ET_CHECK_OK_OR_RETURN_ERROR(sampler_result.error());
306+
307+
auto sampler_outputs = std::move(*sampler_result);
308+
ET_CHECK_OR_RETURN_ERROR(
309+
sampler_outputs.size() == 1 && sampler_outputs[0].isTensor(),
310+
Internal,
311+
"Sampler returned %zu outputs; expected a single tensor.",
312+
sampler_outputs.size());
313+
314+
::executorch::aten::Tensor token_tensor =
315+
std::move(sampler_outputs[0]).toTensor();
316+
ET_CHECK_OR_RETURN_ERROR(
317+
token_tensor.numel() > 0,
318+
Internal,
319+
"Sampler logits tensor is empty.");
320+
next_token = token_tensor.mutable_data_ptr<int64_t>()[0];
321+
}
288322

289323
if (!first_token_generated) {
290324
stats_.first_token_ms = ::executorch::extension::llm::time_in_ms();

extension/asr/runner/runner.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,8 @@ class ET_EXPERIMENTAL AsrRunner {
108108

109109
bool encoder_method_loaded_ = false;
110110
bool decoder_method_loaded_ = false;
111+
bool sampler_method_loaded_ = false;
112+
bool sampler_method_present_ = false;
111113

112114
Stats stats_;
113115
};

0 commit comments

Comments
 (0)