Skip to content

Commit 079799c

Browse files
authored
Run sampler as a method if available (#16888)
With PR huggingface/optimum-executorch#207 we are adding a new method, "sampler", to ASR models, alongside "encoder" and "text_decoder". The flow becomes: if the temperature is 0 and the sampler method is available, run that method; otherwise, fall back to the old path. This change should significantly improve performance on CUDA, since we no longer have to copy logits from device to CPU for sampling purposes. Benchmark result on RTX 5080: ``` ====================================================================== BENCHMARK SUMMARY ====================================================================== Total runs: 30 Generated tokens per run: 104 THROUGHPUT (tokens/sec): Min: 793.89 t/s Max: 845.53 t/s Mean: 820.35 t/s Stdev: 11.86 t/s MODEL LOAD TIME (ms): Min: 620 ms Max: 2170 ms Mean: 700 ms Stdev: 279 ms ENCODE TIME (ms, inference_start to prompt_eval_end): Min: 36 ms Max: 38 ms Mean: 37 ms Stdev: 1 ms DECODE TIME (ms, prompt_eval_end to inference_end): Min: 123 ms Max: 131 ms Mean: 127 ms Stdev: 2 ms ====================================================================== ```
1 parent 429f014 commit 079799c

File tree

4 files changed

+94
-21
lines changed

4 files changed

+94
-21
lines changed
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
732b11313b2006b4d8649500eaf5567ec6ac1e49
1+
f8aa919593cc51301ade73a2ee5491582521ab80

backends/cuda/runtime/cuda_backend.cpp

Lines changed: 43 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,15 @@
1212
#include <executorch/runtime/core/error.h>
1313
#include <executorch/runtime/core/evalue.h>
1414
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
15+
#include <cctype>
1516
#include <cstdio>
1617

1718
#include <array>
1819
#include <filesystem>
1920
#include <fstream>
2021
#include <mutex>
2122
#include <string>
23+
#include <string_view>
2224
#include <vector>
2325

2426
// Include our shim layer headers
@@ -60,6 +62,41 @@ constexpr char kSkipCopyOutputToCpuForMethod[] =
6062
class ET_EXPERIMENTAL CudaBackend final
6163
: public ::executorch::runtime::BackendInterface {
6264
private:
65+
// Trim leading/trailing whitespace from a view of the string.
static std::string_view trim(std::string_view s) {
  // Cast to unsigned char before std::isspace to avoid UB on negative chars.
  const auto is_space = [](char c) {
    return std::isspace(static_cast<unsigned char>(c)) != 0;
  };
  while (!s.empty() && is_space(s.front())) {
    s.remove_prefix(1);
  }
  while (!s.empty() && is_space(s.back())) {
    s.remove_suffix(1);
  }
  return s;
}
79+
80+
// Check if method_name appears in a comma-separated list.
static bool method_in_csv(
    const std::string& method_name,
    const std::string& csv) {
  // Same character set std::isspace matches in the "C" locale.
  constexpr std::string_view kWhitespace = " \t\n\v\f\r";
  std::string_view remaining(csv);
  while (true) {
    const size_t comma = remaining.find(',');
    // substr clamps the count, so npos (no comma) means "rest of string".
    std::string_view token = remaining.substr(0, comma);
    const size_t first = token.find_first_not_of(kWhitespace);
    if (first != std::string_view::npos) {
      const size_t last = token.find_last_not_of(kWhitespace);
      token = token.substr(first, last - first + 1);
      if (token == method_name) {
        return true;
      }
    }
    if (comma == std::string_view::npos) {
      return false;
    }
    remaining.remove_prefix(comma + 1);
  }
}
99+
63100
void set_skip_copy_method(
64101
const std::array<char, kMaxOptionValueLength>& raw) {
65102
std::lock_guard<std::mutex> guard(skip_copy_method_mutex_);
@@ -83,7 +120,7 @@ class ET_EXPERIMENTAL CudaBackend final
83120
return false;
84121
}
85122
std::lock_guard<std::mutex> guard(skip_copy_method_mutex_);
86-
return method_name == skip_copy_method_;
123+
return method_in_csv(method_name, skip_copy_method_);
87124
}
88125

89126
Error load_function_pointers_into_handle(
@@ -316,7 +353,7 @@ class ET_EXPERIMENTAL CudaBackend final
316353
ET_CHECK_OR_RETURN_ERROR(
317354
create_err == Error::Ok,
318355
Internal,
319-
"Failed to create GPU tensor for input %d",
356+
"Failed to create GPU tensor for input %" ET_PRIsize_t,
320357
i);
321358

322359
gpu_inputs[i] = gpu_input_handle;
@@ -325,7 +362,7 @@ class ET_EXPERIMENTAL CudaBackend final
325362
ET_CHECK_OR_RETURN_ERROR(
326363
aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0) == Error::Ok,
327364
Internal,
328-
"Failed to copy input %d from CPU to GPU",
365+
"Failed to copy input %" ET_PRIsize_t " from CPU to GPU",
329366
i);
330367
}
331368
// Process output tensors: create GPU counterparts for ExecuTorch CPU
@@ -352,7 +389,7 @@ class ET_EXPERIMENTAL CudaBackend final
352389
ET_CHECK_OR_RETURN_ERROR(
353390
create_err == Error::Ok,
354391
Internal,
355-
"Failed to create GPU tensor for output %d",
392+
"Failed to create GPU tensor for output %" ET_PRIsize_t,
356393
i);
357394

358395
gpu_outputs[i] = gpu_output_handle;
@@ -382,11 +419,11 @@ class ET_EXPERIMENTAL CudaBackend final
382419
// For DYNAMIC_BOUND tensors we try to resize
383420
ET_CHECK_OK_OR_RETURN_ERROR(
384421
resize_tensor(*cpu_output_tensor, gpu_outputs[i]->sizes()),
385-
"Error resizing tensor at output index %d",
422+
"Error resizing tensor at output index %" ET_PRIsize_t,
386423
i);
387424
ET_CHECK_OK_OR_RETURN_ERROR(
388425
aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0),
389-
"Failed to copy GPU output %d back to CPU",
426+
"Failed to copy GPU output %" ET_PRIsize_t " back to CPU",
390427
i);
391428
}
392429
} else {

extension/asr/runner/runner.cpp

Lines changed: 48 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ namespace {
2727

2828
constexpr const char* kEncoderMethodName = "encoder";
2929
constexpr const char* kDecoderMethodName = "text_decoder";
30+
constexpr const char* kSamplerMethodName = "sampler";
3031

3132
} // namespace
3233

@@ -47,7 +48,8 @@ AsrRunner::AsrRunner(
4748

4849
bool AsrRunner::is_loaded() const {
4950
return module_ && encoder_method_loaded_ && decoder_method_loaded_ &&
50-
tokenizer_ && tokenizer_->is_loaded() && !eos_token_ids_.empty();
51+
(!sampler_method_present_ || sampler_method_loaded_) && tokenizer_ &&
52+
tokenizer_->is_loaded() && !eos_token_ids_.empty();
5153
}
5254

5355
Error AsrRunner::load_tokenizer() {
@@ -96,6 +98,8 @@ Error AsrRunner::load() {
9698
ET_CHECK_OK_OR_RETURN_ERROR(method_names_result.error());
9799
const auto& method_names = method_names_result.get();
98100

101+
sampler_method_present_ = method_names.count(kSamplerMethodName);
102+
99103
ET_CHECK_OR_RETURN_ERROR(
100104
method_names.count(kEncoderMethodName) &&
101105
method_names.count(kDecoderMethodName),
@@ -109,13 +113,21 @@ Error AsrRunner::load() {
109113

110114
ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kDecoderMethodName));
111115
decoder_method_loaded_ = true;
116+
117+
if (sampler_method_present_) {
118+
ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kSamplerMethodName));
119+
sampler_method_loaded_ = true;
120+
}
112121
#ifdef CUDA_AVAILABLE
122+
// Skip copying outputs to CPU. When a sampler exists, keep both encoder and
123+
// decoder outputs on device and pass decoder logits directly into sampler.
113124
executorch::runtime::BackendOptions<1> backend_options;
114-
// For decoder still copy output from GPU to CPU for sampling.
115-
// TODO: change sampler to use a CUDA kernel to sample and then skip copying
116-
// decoder output as well
125+
std::string skip_methods = kEncoderMethodName;
126+
if (sampler_method_present_) {
127+
skip_methods.append(",").append(kDecoderMethodName);
128+
}
117129
ET_CHECK_OK_OR_RETURN_ERROR(backend_options.set_option(
118-
"skip_copy_output_to_cpu_for_method", kEncoderMethodName));
130+
"skip_copy_output_to_cpu_for_method", skip_methods.c_str()));
119131
const auto opt_err =
120132
executorch::runtime::set_option("CudaBackend", backend_options.view());
121133
if (opt_err != ::executorch::runtime::Error::Ok) {
@@ -264,6 +276,7 @@ Result<std::vector<int64_t>> AsrRunner::transcribe(
264276
decoder_inputs.emplace_back(cache_position_ptr);
265277
// Add some green coloring for the first generated token
266278
// token_callback("\033[1;32m");
279+
const bool use_sampler_method = sampler_method_loaded_;
267280
while (generated_tokens < config.max_new_tokens) {
268281
input_id = tokens.back();
269282
auto decoder_result = module_->execute(kDecoderMethodName, decoder_inputs);
@@ -276,15 +289,36 @@ Result<std::vector<int64_t>> AsrRunner::transcribe(
276289
"Decoder returned %zu outputs; expected a single tensor.",
277290
decoder_outputs.size());
278291

279-
::executorch::aten::Tensor logits_tensor =
280-
std::move(decoder_outputs[0]).toTensor();
281-
const int64_t vocab_size = logits_tensor.numel();
282-
ET_CHECK_OR_RETURN_ERROR(
283-
vocab_size > 0, Internal, "Decoder logits tensor is empty.");
284-
285-
const int64_t next_token =
286-
static_cast<int64_t>(::executorch::extension::llm::logits_to_token(
287-
logits_tensor, config.temperature));
292+
int64_t next_token = 0;
293+
if (!use_sampler_method || config.temperature != 0.0f) {
294+
::executorch::aten::Tensor logits_tensor =
295+
std::move(decoder_outputs[0]).toTensor();
296+
const int64_t vocab_size = logits_tensor.numel();
297+
ET_CHECK_OR_RETURN_ERROR(
298+
vocab_size > 0, Internal, "Decoder logits tensor is empty.");
299+
next_token =
300+
static_cast<int64_t>(::executorch::extension::llm::logits_to_token(
301+
logits_tensor, config.temperature));
302+
} else {
303+
auto sampler_result =
304+
module_->execute(kSamplerMethodName, decoder_outputs);
305+
ET_CHECK_OK_OR_RETURN_ERROR(sampler_result.error());
306+
307+
auto sampler_outputs = std::move(*sampler_result);
308+
ET_CHECK_OR_RETURN_ERROR(
309+
sampler_outputs.size() == 1 && sampler_outputs[0].isTensor(),
310+
Internal,
311+
"Sampler returned %zu outputs; expected a single tensor.",
312+
sampler_outputs.size());
313+
314+
::executorch::aten::Tensor token_tensor =
315+
std::move(sampler_outputs[0]).toTensor();
316+
ET_CHECK_OR_RETURN_ERROR(
317+
token_tensor.numel() > 0,
318+
Internal,
319+
"Sampler logits tensor is empty.");
320+
next_token = token_tensor.mutable_data_ptr<int64_t>()[0];
321+
}
288322

289323
if (!first_token_generated) {
290324
stats_.first_token_ms = ::executorch::extension::llm::time_in_ms();

extension/asr/runner/runner.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,8 @@ class ET_EXPERIMENTAL AsrRunner {
108108

109109
bool encoder_method_loaded_ = false;
110110
bool decoder_method_loaded_ = false;
111+
bool sampler_method_loaded_ = false;
112+
bool sampler_method_present_ = false;
111113

112114
Stats stats_;
113115
};

0 commit comments

Comments
 (0)