Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions parakeet-cpp/include/parakeet/engine.h
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,11 @@ class PARAKEET_API Engine {
// the lifetime of the Engine.
std::string backend_name() const;

// True when a GPU was detected but the engine fell back to CPU because it is
// a known-bad backend (Mali). A CPU backend with this set is expected, not a
// regression.
bool gpu_unsupported() const;

struct Impl;

private:
Expand Down
30 changes: 27 additions & 3 deletions parakeet-cpp/src/parakeet_ctc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,9 @@ struct ParakeetCtcModel::Impl {
ggml_backend_t backend_blas = nullptr;
ggml_backend_t backend_gpu = nullptr;
ggml_backend_t backend_active = nullptr;
// True when a GPU was detected but skipped as known-bad (Mali), so
// backend_active fell back to CPU.
bool gpu_unsupported = false;
ggml_backend_buffer_t weights_buffer = nullptr;
std::vector<std::unique_ptr<EncoderGraph>> encoder_graphs;
static constexpr size_t k_encoder_graph_cache_max = 3;
Expand Down Expand Up @@ -291,7 +294,9 @@ const char * dev_reg_name(ggml_backend_dev_t dev) {
// with, those entry points live in separate shared libraries that
// are dlopen()'d at runtime and are not linkable from libparakeet.
// The registry walk reaches the same backends in both modes.
ggml_backend_t init_gpu_backend(int n_gpu_layers, bool verbose) {
ggml_backend_t init_gpu_backend(int n_gpu_layers, bool verbose,
bool & out_skipped_unsupported_gpu) {
out_skipped_unsupported_gpu = false;
if (n_gpu_layers <= 0) return nullptr;

ensure_backends_loaded();
Expand Down Expand Up @@ -351,7 +356,20 @@ ggml_backend_t init_gpu_backend(int n_gpu_layers, bool verbose) {
opencl_other.push_back({dev, name, desc, reg_name});
}
} else {
other_gpu.push_back({dev, name, desc, reg_name});
// ARM Mali (Valhall) Vulkan mis-computes every parakeet model (its
// narrow subgroup width breaks the ggml-vulkan shaders), so guard it
// by name as a known-bad backend (like the Adreno-6xx skip above) and
// route it to CPU.
const bool is_mali = (name && std::strstr(name, "Mali")) ||
(desc && std::strstr(desc, "Mali"));
if (is_mali) {
out_skipped_unsupported_gpu = true;
if (verbose) PARAKEET_LOG_INFO(
"parakeet: Mali GPU '%s' mis-computes on Vulkan; using CPU\n",
name ? name : (desc ? desc : "unknown"));
} else {
other_gpu.push_back({dev, name, desc, reg_name});
}
}
}

Expand Down Expand Up @@ -576,8 +594,10 @@ int load_from_gguf(const std::string & gguf_path,
backend_set_n_threads(impl->backend_blas, resolved_threads);
}

impl->backend_gpu = init_gpu_backend(n_gpu_layers, verbose);
bool skipped_unsupported_gpu = false;
impl->backend_gpu = init_gpu_backend(n_gpu_layers, verbose, skipped_unsupported_gpu);
impl->backend_active = impl->backend_gpu ? impl->backend_gpu : impl->backend_cpu;
impl->gpu_unsupported = skipped_unsupported_gpu && impl->backend_gpu == nullptr;

gguf_init_params params = { /*no_alloc=*/ true, &impl->ctx };
impl->gguf = gguf_init_from_file(gguf_path.c_str(), params);
Expand Down Expand Up @@ -946,6 +966,10 @@ bool model_has_gpu_backend(const ParakeetCtcModel & m) {
return m.impl && m.impl->backend_gpu != nullptr;
}

// Reports the gpu_unsupported flag stored on the model's Impl.
// A model with no Impl reports false.
bool model_gpu_unsupported(const ParakeetCtcModel & m) {
    if (!m.impl) {
        return false;
    }
    return m.impl->gpu_unsupported;
}

std::string model_active_backend_name(const ParakeetCtcModel & m) {
if (!m.impl) return "CPU";
ggml_backend_t b = m.impl->backend_active;
Expand Down
3 changes: 3 additions & 0 deletions parakeet-cpp/src/parakeet_ctc.h
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,9 @@ int load_from_gguf(const std::string & gguf_path,
void print_model_summary(const ParakeetCtcModel & m);

bool model_has_gpu_backend(const ParakeetCtcModel & m);
// True when a GPU was detected but routed to CPU as a known-bad backend (Mali).
// Lets hosts treat the CPU backend as expected, not a GPU regression.
bool model_gpu_unsupported(const ParakeetCtcModel & m);
std::string model_active_backend_name(const ParakeetCtcModel & m);
ggml_backend_t model_active_backend(ParakeetCtcModel & m);

Expand Down
4 changes: 4 additions & 0 deletions parakeet-cpp/src/parakeet_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,10 @@ std::string Engine::backend_name() const {
return model_active_backend_name(pimpl_->model);
}

// Forwards to model_gpu_unsupported() for the model held by this Engine's Impl.
bool Engine::gpu_unsupported() const {
    const auto & model = pimpl_->model;
    return model_gpu_unsupported(model);
}

// Sets the Impl's cancel_flag to true.
// NOTE(review): the code that polls this flag is not visible here — confirm
// where cancellation is observed.
void Engine::cancel() {
    auto & flag = pimpl_->cancel_flag;
    flag.store(true);
}
Expand Down
82 changes: 59 additions & 23 deletions parakeet-cpp/src/parakeet_tdt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -269,10 +269,10 @@ LstmBodyOuts build_lstm_body(TdtRuntimeWeights & rt,
// readback is ~17 us); on a discrete GPU PCIe bus it's an
// order-of-magnitude saving per emission step (~250 / call).
struct JointBodyOuts {
ggml_tensor * token_argmax; // i32[1], over logits[0 : V_plus_1]
ggml_tensor * dur_argmax; // i32[1], over logits[V_plus_1 : V_plus_1 + num_durations]
ggml_tensor * token_out; // argmax i32[1] (argmax_on_gpu) OR token logits f32[V_plus_1]
ggml_tensor * dur_out; // argmax i32[1] (argmax_on_gpu) OR dur logits f32[num_durations]
};
JointBodyOuts build_joint_body(const TdtRuntimeWeights & rt,
JointBodyOuts build_joint_body(TdtRuntimeWeights & rt,
ggml_context * gctx,
ggml_tensor * pred_src,
ggml_tensor * frame_idx_in) {
Expand Down Expand Up @@ -314,9 +314,20 @@ JointBodyOuts build_joint_body(const TdtRuntimeWeights & rt,
tok_logits = ggml_cont(gctx, tok_logits);
dur_logits = ggml_cont(gctx, dur_logits);

// ggml-opencl has no ARGMAX kernel (graph_compute would abort), so gate the
// on-device argmax on backend support and fall back to a host argmax of the
// logit slices otherwise. tok_am is unused on that path (never expanded).
ggml_tensor * tok_am = ggml_argmax(gctx, tok_logits); // i32[1]
rt.argmax_on_gpu = ggml_backend_supports_op(rt.backend, tok_am);

JointBodyOuts outs{};
outs.token_argmax = ggml_argmax(gctx, tok_logits); // i32[1]
outs.dur_argmax = ggml_argmax(gctx, dur_logits); // i32[1]
if (rt.argmax_on_gpu) {
outs.token_out = tok_am;
outs.dur_out = ggml_argmax(gctx, dur_logits); // i32[1]
} else {
outs.token_out = tok_logits;
outs.dur_out = dur_logits;
}
return outs;
}

Expand Down Expand Up @@ -360,10 +371,10 @@ void build_joint_graph(TdtRuntimeWeights & rt) {
ggml_set_input(rt.joint_frame_idx_in);

JointBodyOuts outs = build_joint_body(rt, gctx, rt.pred_persist, rt.joint_frame_idx_in);
rt.joint_token_out = outs.token_argmax;
rt.joint_dur_out = outs.dur_argmax;
ggml_set_name(rt.joint_token_out, "joint.token_argmax");
ggml_set_name(rt.joint_dur_out, "joint.dur_argmax");
rt.joint_token_out = outs.token_out;
rt.joint_dur_out = outs.dur_out;
ggml_set_name(rt.joint_token_out, rt.argmax_on_gpu ? "joint.token_argmax" : "joint.token_logits");
ggml_set_name(rt.joint_dur_out, rt.argmax_on_gpu ? "joint.dur_argmax" : "joint.dur_logits");
ggml_set_output(rt.joint_token_out);
ggml_set_output(rt.joint_dur_out);

Expand Down Expand Up @@ -399,10 +410,10 @@ void build_lstm_joint_graph(TdtRuntimeWeights & rt) {
// Use the pred_cpy node (not pred_persist directly) so the joint mat_muls
// depend on the LSTM update finishing first.
JointBodyOuts joint_outs = build_joint_body(rt, gctx, lstm_outs.pred_cpy, rt.lj_frame_idx_in);
rt.lj_token_out = joint_outs.token_argmax;
rt.lj_dur_out = joint_outs.dur_argmax;
ggml_set_name(rt.lj_token_out, "lstm_joint.token_argmax");
ggml_set_name(rt.lj_dur_out, "lstm_joint.dur_argmax");
rt.lj_token_out = joint_outs.token_out;
rt.lj_dur_out = joint_outs.dur_out;
ggml_set_name(rt.lj_token_out, rt.argmax_on_gpu ? "lstm_joint.token_argmax" : "lstm_joint.token_logits");
ggml_set_name(rt.lj_dur_out, rt.argmax_on_gpu ? "lstm_joint.dur_argmax" : "lstm_joint.dur_logits");
ggml_set_output(rt.lj_token_out);
ggml_set_output(rt.lj_dur_out);
// Mark the LSTM cpy nodes as outputs too so gallocr keeps them alive
Expand Down Expand Up @@ -619,6 +630,15 @@ int tdt_prepare_runtime(const ParakeetCtcModel & model, TdtRuntimeWeights & W) {
// because of native quantised matmul and faster argmax / large gemvs.
W.use_graphs = !backend_is_cpu(W.backend);

// ggml-opencl drops the in-place ggml_cpy writes that update the TDT LSTM
// persistent state (h/c/pred), so the state never advances and the decode
// emits one constant token per frame. Run the per-step decode on the host on
// OpenCL; the encoder still runs on the GPU. (EOU/Sortformer don't use this
// persistent-state pattern and stay on the GPU.)
if (W.use_graphs && std::strcmp(backend_reg_name(W.backend), "OpenCL") == 0) {
W.use_graphs = false;
}

if (!W.use_graphs) {
// ---- CPU fallback: dequantise weights to host f32 ----
dequantize_to_f32(model.tdt.predict_embed, W.embed);
Expand Down Expand Up @@ -732,6 +752,30 @@ bool run_lstm_init_step(TdtRuntimeWeights & rt, int token_id) {
return true;
}

// Copies one decode step's joint outputs from the backend into host ints.
//   rt.argmax_on_gpu == true : tok_t / dur_t each carry a single i32 index,
//                              which is read back directly.
//   rt.argmax_on_gpu == false: tok_t / dur_t carry f32 logit slices of
//                              rt.V_plus_1 and rt.num_durations elements; they
//                              are copied to host scratch and reduced there with
//                              argmax_f32.
// The scratch vectors are thread_local and reused across calls, so once sized
// they do not need to reallocate on each step.
void resolve_joint_step(TdtRuntimeWeights & rt,
                        ggml_tensor * tok_t, ggml_tensor * dur_t,
                        int * tok_out, int * dur_out) {
    if (!rt.argmax_on_gpu) {
        static thread_local std::vector<float> token_scratch;
        static thread_local std::vector<float> duration_scratch;

        const size_t n_tokens    = (size_t) rt.V_plus_1;
        const size_t n_durations = (size_t) rt.num_durations;
        token_scratch.resize(n_tokens);
        duration_scratch.resize(n_durations);

        ggml_backend_tensor_get(tok_t, token_scratch.data(), 0, n_tokens * sizeof(float));
        ggml_backend_tensor_get(dur_t, duration_scratch.data(), 0, n_durations * sizeof(float));

        *tok_out = argmax_f32(token_scratch.data(), rt.V_plus_1);
        *dur_out = argmax_f32(duration_scratch.data(), rt.num_durations);
        return;
    }

    // On-device argmax: each output tensor is exactly one int32 index.
    int32_t token_index = 0;
    int32_t duration_index = 0;
    ggml_backend_tensor_get(tok_t, &token_index, 0, sizeof(int32_t));
    ggml_backend_tensor_get(dur_t, &duration_index, 0, sizeof(int32_t));
    *tok_out = (int) token_index;
    *dur_out = (int) duration_index;
}

// Joint-only step (used after a blank emission). pred_persist is unchanged
// from the previous step; only enc_proj_persist[frame_idx] varies. The
// graph runs token + duration argmax on-device, so the host reads
Expand All @@ -750,11 +794,7 @@ bool run_joint_step(TdtRuntimeWeights & rt,
return false;
}

int32_t tok_val = 0, dur_val = 0;
ggml_backend_tensor_get(rt.joint_token_out, &tok_val, 0, sizeof(int32_t));
ggml_backend_tensor_get(rt.joint_dur_out, &dur_val, 0, sizeof(int32_t));
*tok_out = (int) tok_val;
*dur_out = (int) dur_val;
resolve_joint_step(rt, rt.joint_token_out, rt.joint_dur_out, tok_out, dur_out);
return true;
}

Expand All @@ -777,11 +817,7 @@ bool run_lstm_joint_step(TdtRuntimeWeights & rt,
return false;
}

int32_t tok_val = 0, dur_val = 0;
ggml_backend_tensor_get(rt.lj_token_out, &tok_val, 0, sizeof(int32_t));
ggml_backend_tensor_get(rt.lj_dur_out, &dur_val, 0, sizeof(int32_t));
*tok_out = (int) tok_val;
*dur_out = (int) dur_val;
resolve_joint_step(rt, rt.lj_token_out, rt.lj_dur_out, tok_out, dur_out);
return true;
}

Expand Down
11 changes: 7 additions & 4 deletions parakeet-cpp/src/parakeet_tdt.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@ struct TdtRuntimeWeights {
ggml_backend_t backend = nullptr;
int n_threads = 0;
bool use_graphs = false;
// false on ggml-opencl (no ARGMAX kernel): the joint graph emits raw logits
// for the host to argmax; true elsewhere keeps the argmax on-device.
bool argmax_on_gpu = true;

// ---- CPU-fallback host weights (populated only when !use_graphs) ----
std::vector<float> embed;
Expand Down Expand Up @@ -111,8 +114,8 @@ struct TdtRuntimeWeights {
ggml_cgraph * g_joint = nullptr;
ggml_gallocr_t alloc_joint = nullptr;
ggml_tensor * joint_frame_idx_in = nullptr; // i32[1]
ggml_tensor * joint_token_out = nullptr; // i32[1] — token argmax
ggml_tensor * joint_dur_out = nullptr; // i32[1] — duration argmax
ggml_tensor * joint_token_out = nullptr; // i32 argmax, or f32 token logits when !argmax_on_gpu
ggml_tensor * joint_dur_out = nullptr; // i32 argmax, or f32 dur logits when !argmax_on_gpu

// (3) Fused LSTM + joint graph: used after a non-blank emission.
// LSTM updates h/c/pred from the last emitted token, then joint
Expand All @@ -123,8 +126,8 @@ struct TdtRuntimeWeights {
ggml_gallocr_t alloc_lstm_joint = nullptr;
ggml_tensor * lj_token_in = nullptr; // i32[1]
ggml_tensor * lj_frame_idx_in = nullptr; // i32[1]
ggml_tensor * lj_token_out = nullptr; // i32[1] — token argmax
ggml_tensor * lj_dur_out = nullptr; // i32[1] — duration argmax
ggml_tensor * lj_token_out = nullptr; // i32 argmax, or f32 token logits when !argmax_on_gpu
ggml_tensor * lj_dur_out = nullptr; // i32 argmax, or f32 dur logits when !argmax_on_gpu

struct EncProjGraph {
// Each cached graph owns its own ggml_context for the cgraph + tensor
Expand Down
Loading