diff --git a/parakeet-cpp/include/parakeet/engine.h b/parakeet-cpp/include/parakeet/engine.h index 236157e1746..95cbec1f9c3 100644 --- a/parakeet-cpp/include/parakeet/engine.h +++ b/parakeet-cpp/include/parakeet/engine.h @@ -248,6 +248,11 @@ class PARAKEET_API Engine { // the lifetime of the Engine. std::string backend_name() const; + // True when a GPU was detected but the engine fell back to CPU because it is + // a known-bad backend (Mali). A CPU backend with this set is expected, not a + // regression. + bool gpu_unsupported() const; + struct Impl; private: diff --git a/parakeet-cpp/src/parakeet_ctc.cpp b/parakeet-cpp/src/parakeet_ctc.cpp index af2046d0938..44439b2b05d 100644 --- a/parakeet-cpp/src/parakeet_ctc.cpp +++ b/parakeet-cpp/src/parakeet_ctc.cpp @@ -108,6 +108,9 @@ struct ParakeetCtcModel::Impl { ggml_backend_t backend_blas = nullptr; ggml_backend_t backend_gpu = nullptr; ggml_backend_t backend_active = nullptr; + // True when a GPU was detected but skipped as known-bad (Mali), so + // backend_active fell back to CPU. + bool gpu_unsupported = false; ggml_backend_buffer_t weights_buffer = nullptr; std::vector> encoder_graphs; static constexpr size_t k_encoder_graph_cache_max = 3; @@ -291,7 +294,9 @@ const char * dev_reg_name(ggml_backend_dev_t dev) { // with, those entry points live in separate shared libraries that // are dlopen()'d at runtime and are not linkable from libparakeet. // The registry walk reaches the same backends in both modes. -ggml_backend_t init_gpu_backend(int n_gpu_layers, bool verbose) { +ggml_backend_t init_gpu_backend(int n_gpu_layers, bool verbose, + bool & out_skipped_unsupported_gpu) { + out_skipped_unsupported_gpu = false; if (n_gpu_layers <= 0) return nullptr; ensure_backends_loaded(); @@ -351,7 +356,20 @@ ggml_backend_t init_gpu_backend(int n_gpu_layers, bool verbose) { opencl_other.push_back({dev, name, desc, reg_name}); } } else { - other_gpu.push_back({dev, name, desc, reg_name}); + // ARM Mali (Valhall) Vulkan mis-computes every parakeet model (its + // narrow subgroup width breaks the ggml-vulkan shaders), so guard it + // by name as a known-bad backend (like the Adreno-6xx skip above) and + // route it to CPU. + const bool is_mali = (name && std::strstr(name, "Mali")) || + (desc && std::strstr(desc, "Mali")); + if (is_mali) { + out_skipped_unsupported_gpu = true; + if (verbose) PARAKEET_LOG_INFO( + "parakeet: Mali GPU '%s' mis-computes on Vulkan; using CPU\n", + name ? name : (desc ? desc : "unknown")); + } else { + other_gpu.push_back({dev, name, desc, reg_name}); + } } } @@ -576,8 +594,10 @@ int load_from_gguf(const std::string & gguf_path, backend_set_n_threads(impl->backend_blas, resolved_threads); } - impl->backend_gpu = init_gpu_backend(n_gpu_layers, verbose); + bool skipped_unsupported_gpu = false; + impl->backend_gpu = init_gpu_backend(n_gpu_layers, verbose, skipped_unsupported_gpu); impl->backend_active = impl->backend_gpu ? impl->backend_gpu : impl->backend_cpu; + impl->gpu_unsupported = skipped_unsupported_gpu && impl->backend_gpu == nullptr; gguf_init_params params = { /*no_alloc=*/ true, &impl->ctx }; impl->gguf = gguf_init_from_file(gguf_path.c_str(), params); @@ -946,6 +966,10 @@ bool model_has_gpu_backend(const ParakeetCtcModel & m) { return m.impl && m.impl->backend_gpu != nullptr; } +bool model_gpu_unsupported(const ParakeetCtcModel & m) { + return m.impl && m.impl->gpu_unsupported; +} + std::string model_active_backend_name(const ParakeetCtcModel & m) { if (!m.impl) return "CPU"; ggml_backend_t b = m.impl->backend_active; diff --git a/parakeet-cpp/src/parakeet_ctc.h b/parakeet-cpp/src/parakeet_ctc.h index 1187e75ce3f..a69681d9479 100644 --- a/parakeet-cpp/src/parakeet_ctc.h +++ b/parakeet-cpp/src/parakeet_ctc.h @@ -329,6 +329,9 @@ int load_from_gguf(const std::string & gguf_path, void print_model_summary(const ParakeetCtcModel & m); bool model_has_gpu_backend(const ParakeetCtcModel & m); +// True when a GPU was detected but routed to CPU as a known-bad backend (Mali). +// Lets hosts treat the CPU backend as expected, not a GPU regression. +bool model_gpu_unsupported(const ParakeetCtcModel & m); std::string model_active_backend_name(const ParakeetCtcModel & m); ggml_backend_t model_active_backend(ParakeetCtcModel & m); diff --git a/parakeet-cpp/src/parakeet_engine.cpp b/parakeet-cpp/src/parakeet_engine.cpp index 030714c3aba..4642f0ce80c 100644 --- a/parakeet-cpp/src/parakeet_engine.cpp +++ b/parakeet-cpp/src/parakeet_engine.cpp @@ -227,6 +227,10 @@ std::string Engine::backend_name() const { return model_active_backend_name(pimpl_->model); } +bool Engine::gpu_unsupported() const { + return model_gpu_unsupported(pimpl_->model); +} + void Engine::cancel() { pimpl_->cancel_flag.store(true); } diff --git a/parakeet-cpp/src/parakeet_tdt.cpp b/parakeet-cpp/src/parakeet_tdt.cpp index 944b5de7588..0aa6a93595f 100644 --- a/parakeet-cpp/src/parakeet_tdt.cpp +++ b/parakeet-cpp/src/parakeet_tdt.cpp @@ -269,10 +269,10 @@ LstmBodyOuts build_lstm_body(TdtRuntimeWeights & rt, // readback is ~17 us); on a discrete GPU PCIe bus it's an // order-of-magnitude saving per emission step (~250 / call). struct JointBodyOuts { - ggml_tensor * token_argmax; // i32[1], over logits[0 : V_plus_1] - ggml_tensor * dur_argmax; // i32[1], over logits[V_plus_1 : V_plus_1 + num_durations] + ggml_tensor * token_out; // argmax i32[1] (argmax_on_gpu) OR token logits f32[V_plus_1] + ggml_tensor * dur_out; // argmax i32[1] (argmax_on_gpu) OR dur logits f32[num_durations] }; -JointBodyOuts build_joint_body(const TdtRuntimeWeights & rt, +JointBodyOuts build_joint_body(TdtRuntimeWeights & rt, ggml_context * gctx, ggml_tensor * pred_src, ggml_tensor * frame_idx_in) { @@ -314,9 +314,20 @@ JointBodyOuts build_joint_body(const TdtRuntimeWeights & rt, tok_logits = ggml_cont(gctx, tok_logits); dur_logits = ggml_cont(gctx, dur_logits); + // ggml-opencl has no ARGMAX kernel (graph_compute would abort), so gate the + // on-device argmax on backend support and fall back to a host argmax of the + // logit slices otherwise. tok_am is unused on that path (never expanded). + ggml_tensor * tok_am = ggml_argmax(gctx, tok_logits); // i32[1] + rt.argmax_on_gpu = ggml_backend_supports_op(rt.backend, tok_am); + JointBodyOuts outs{}; - outs.token_argmax = ggml_argmax(gctx, tok_logits); // i32[1] - outs.dur_argmax = ggml_argmax(gctx, dur_logits); // i32[1] + if (rt.argmax_on_gpu) { + outs.token_out = tok_am; + outs.dur_out = ggml_argmax(gctx, dur_logits); // i32[1] + } else { + outs.token_out = tok_logits; + outs.dur_out = dur_logits; + } return outs; } @@ -360,10 +371,10 @@ void build_joint_graph(TdtRuntimeWeights & rt) { ggml_set_input(rt.joint_frame_idx_in); JointBodyOuts outs = build_joint_body(rt, gctx, rt.pred_persist, rt.joint_frame_idx_in); - rt.joint_token_out = outs.token_argmax; - rt.joint_dur_out = outs.dur_argmax; - ggml_set_name(rt.joint_token_out, "joint.token_argmax"); - ggml_set_name(rt.joint_dur_out, "joint.dur_argmax"); + rt.joint_token_out = outs.token_out; + rt.joint_dur_out = outs.dur_out; + ggml_set_name(rt.joint_token_out, rt.argmax_on_gpu ? "joint.token_argmax" : "joint.token_logits"); + ggml_set_name(rt.joint_dur_out, rt.argmax_on_gpu ? "joint.dur_argmax" : "joint.dur_logits"); ggml_set_output(rt.joint_token_out); ggml_set_output(rt.joint_dur_out); @@ -399,10 +410,10 @@ void build_lstm_joint_graph(TdtRuntimeWeights & rt) { // Use the pred_cpy node (not pred_persist directly) so the joint mat_muls // depend on the LSTM update finishing first. JointBodyOuts joint_outs = build_joint_body(rt, gctx, lstm_outs.pred_cpy, rt.lj_frame_idx_in); - rt.lj_token_out = joint_outs.token_argmax; - rt.lj_dur_out = joint_outs.dur_argmax; - ggml_set_name(rt.lj_token_out, "lstm_joint.token_argmax"); - ggml_set_name(rt.lj_dur_out, "lstm_joint.dur_argmax"); + rt.lj_token_out = joint_outs.token_out; + rt.lj_dur_out = joint_outs.dur_out; + ggml_set_name(rt.lj_token_out, rt.argmax_on_gpu ? "lstm_joint.token_argmax" : "lstm_joint.token_logits"); + ggml_set_name(rt.lj_dur_out, rt.argmax_on_gpu ? "lstm_joint.dur_argmax" : "lstm_joint.dur_logits"); ggml_set_output(rt.lj_token_out); ggml_set_output(rt.lj_dur_out); // Mark the LSTM cpy nodes as outputs too so gallocr keeps them alive @@ -619,6 +630,15 @@ int tdt_prepare_runtime(const ParakeetCtcModel & model, TdtRuntimeWeights & W) { // because of native quantised matmul and faster argmax / large gemvs. W.use_graphs = !backend_is_cpu(W.backend); + // ggml-opencl drops the in-place ggml_cpy writes that update the TDT LSTM + // persistent state (h/c/pred), so the state never advances and the decode + // emits one constant token per frame. Run the per-step decode on the host on + // OpenCL; the encoder still runs on the GPU. (EOU/Sortformer don't use this + // persistent-state pattern and stay on the GPU.) + if (W.use_graphs && std::strcmp(backend_reg_name(W.backend), "OpenCL") == 0) { + W.use_graphs = false; + } + if (!W.use_graphs) { // ---- CPU fallback: dequantise weights to host f32 ---- dequantize_to_f32(model.tdt.predict_embed, W.embed); @@ -732,6 +752,30 @@ bool run_lstm_init_step(TdtRuntimeWeights & rt, int token_id) { return true; } +// Read the joint token/dur outputs into host ints: i32 argmax indices when +// argmax_on_gpu, else the raw f32 logit slices (ggml-opencl) argmaxed on host. +// thread_local scratch keeps the per-step readback allocation-free. +void resolve_joint_step(TdtRuntimeWeights & rt, + ggml_tensor * tok_t, ggml_tensor * dur_t, + int * tok_out, int * dur_out) { + if (rt.argmax_on_gpu) { + int32_t tok_val = 0, dur_val = 0; + ggml_backend_tensor_get(tok_t, &tok_val, 0, sizeof(int32_t)); + ggml_backend_tensor_get(dur_t, &dur_val, 0, sizeof(int32_t)); + *tok_out = (int) tok_val; + *dur_out = (int) dur_val; + return; + } + static thread_local std::vector tok_logits; + static thread_local std::vector dur_logits; + tok_logits.resize((size_t) rt.V_plus_1); + dur_logits.resize((size_t) rt.num_durations); + ggml_backend_tensor_get(tok_t, tok_logits.data(), 0, (size_t) rt.V_plus_1 * sizeof(float)); + ggml_backend_tensor_get(dur_t, dur_logits.data(), 0, (size_t) rt.num_durations * sizeof(float)); + *tok_out = argmax_f32(tok_logits.data(), rt.V_plus_1); + *dur_out = argmax_f32(dur_logits.data(), rt.num_durations); +} + // Joint-only step (used after a blank emission). pred_persist is unchanged // from the previous step; only enc_proj_persist[frame_idx] varies. The // graph runs token + duration argmax on-device, so the host reads @@ -750,11 +794,7 @@ bool run_joint_step(TdtRuntimeWeights & rt, return false; } - int32_t tok_val = 0, dur_val = 0; - ggml_backend_tensor_get(rt.joint_token_out, &tok_val, 0, sizeof(int32_t)); - ggml_backend_tensor_get(rt.joint_dur_out, &dur_val, 0, sizeof(int32_t)); - *tok_out = (int) tok_val; - *dur_out = (int) dur_val; + resolve_joint_step(rt, rt.joint_token_out, rt.joint_dur_out, tok_out, dur_out); return true; } @@ -777,11 +817,7 @@ bool run_lstm_joint_step(TdtRuntimeWeights & rt, return false; } - int32_t tok_val = 0, dur_val = 0; - ggml_backend_tensor_get(rt.lj_token_out, &tok_val, 0, sizeof(int32_t)); - ggml_backend_tensor_get(rt.lj_dur_out, &dur_val, 0, sizeof(int32_t)); - *tok_out = (int) tok_val; - *dur_out = (int) dur_val; + resolve_joint_step(rt, rt.lj_token_out, rt.lj_dur_out, tok_out, dur_out); return true; } diff --git a/parakeet-cpp/src/parakeet_tdt.h b/parakeet-cpp/src/parakeet_tdt.h index bb2e81f0b9f..69e9d6faa17 100644 --- a/parakeet-cpp/src/parakeet_tdt.h +++ b/parakeet-cpp/src/parakeet_tdt.h @@ -61,6 +61,9 @@ struct TdtRuntimeWeights { ggml_backend_t backend = nullptr; int n_threads = 0; bool use_graphs = false; + // false on ggml-opencl (no ARGMAX kernel): the joint graph emits raw logits + // for the host to argmax; true elsewhere keeps the argmax on-device. + bool argmax_on_gpu = true; // ---- CPU-fallback host weights (populated only when !use_graphs) ---- std::vector embed; @@ -111,8 +114,8 @@ struct TdtRuntimeWeights { ggml_cgraph * g_joint = nullptr; ggml_gallocr_t alloc_joint = nullptr; ggml_tensor * joint_frame_idx_in = nullptr; // i32[1] - ggml_tensor * joint_token_out = nullptr; // i32[1] — token argmax - ggml_tensor * joint_dur_out = nullptr; // i32[1] — duration argmax + ggml_tensor * joint_token_out = nullptr; // i32 argmax, or f32 token logits when !argmax_on_gpu + ggml_tensor * joint_dur_out = nullptr; // i32 argmax, or f32 dur logits when !argmax_on_gpu // (3) Fused LSTM + joint graph: used after a non-blank emission. // LSTM updates h/c/pred from the last emitted token, then joint @@ -123,8 +126,8 @@ struct TdtRuntimeWeights { ggml_gallocr_t alloc_lstm_joint = nullptr; ggml_tensor * lj_token_in = nullptr; // i32[1] ggml_tensor * lj_frame_idx_in = nullptr; // i32[1] - ggml_tensor * lj_token_out = nullptr; // i32[1] — token argmax - ggml_tensor * lj_dur_out = nullptr; // i32[1] — duration argmax + ggml_tensor * lj_token_out = nullptr; // i32 argmax, or f32 token logits when !argmax_on_gpu + ggml_tensor * lj_dur_out = nullptr; // i32 argmax, or f32 dur logits when !argmax_on_gpu struct EncProjGraph { // Each cached graph owns its own ggml_context for the cgraph + tensor