tetherto · pratiknarola-t · Jun 15, 2026 · Jun 15, 2026
@@ -248,6 +248,11 @@ class PARAKEET_API Engine {
     // the lifetime of the Engine.
     std::string backend_name() const;
 
+    // True when a GPU was detected but the engine fell back to CPU because that
+    // GPU is a known-bad backend (Mali). A CPU backend with this set is expected,
+    // not a regression.
+    bool gpu_unsupported() const;
+
     struct Impl;
 
 private:

@@ -108,6 +108,9 @@ struct ParakeetCtcModel::Impl {
     ggml_backend_t         backend_blas   = nullptr;
     ggml_backend_t         backend_gpu    = nullptr;
     ggml_backend_t         backend_active = nullptr;
+    // True when a GPU was detected but skipped as known-bad (Mali), so
+    // backend_active fell back to CPU.
+    bool                   gpu_unsupported = false;
     ggml_backend_buffer_t  weights_buffer = nullptr;
     std::vector<std::unique_ptr<EncoderGraph>> encoder_graphs;
     static constexpr size_t k_encoder_graph_cache_max = 3;
@@ -291,7 +294,9 @@ const char * dev_reg_name(ggml_backend_dev_t dev) {
 // with, those entry points live in separate shared libraries that
 // are dlopen()'d at runtime and are not linkable from libparakeet.
 // The registry walk reaches the same backends in both modes.
-ggml_backend_t init_gpu_backend(int n_gpu_layers, bool verbose) {
+ggml_backend_t init_gpu_backend(int n_gpu_layers, bool verbose,
+                                bool & out_skipped_unsupported_gpu) {
+    out_skipped_unsupported_gpu = false;
     if (n_gpu_layers <= 0) return nullptr;
 
     ensure_backends_loaded();
@@ -351,7 +356,20 @@ ggml_backend_t init_gpu_backend(int n_gpu_layers, bool verbose) {
                 opencl_other.push_back({dev, name, desc, reg_name});
             }
         } else {
-            other_gpu.push_back({dev, name, desc, reg_name});
+            // ARM Mali (Valhall) Vulkan mis-computes every parakeet model (its
+            // narrow subgroup width breaks the ggml-vulkan shaders), so guard it
+            // by name as a known-bad backend (like the Adreno-6xx skip above) and
+            // route it to CPU.
+            const bool is_mali = (name && std::strstr(name, "Mali")) ||
+                                 (desc && std::strstr(desc, "Mali"));
+            if (is_mali) {
+                out_skipped_unsupported_gpu = true;
+                if (verbose) PARAKEET_LOG_INFO(
+                    "parakeet: Mali GPU '%s' mis-computes on Vulkan; using CPU\n",
+                    name ? name : (desc ? desc : "unknown"));
+            } else {
+                other_gpu.push_back({dev, name, desc, reg_name});
+            }
         }
     }
 
@@ -576,8 +594,10 @@ int load_from_gguf(const std::string & gguf_path,
         backend_set_n_threads(impl->backend_blas, resolved_threads);
     }
 
-    impl->backend_gpu    = init_gpu_backend(n_gpu_layers, verbose);
+    bool skipped_unsupported_gpu = false;
+    impl->backend_gpu    = init_gpu_backend(n_gpu_layers, verbose, skipped_unsupported_gpu);
     impl->backend_active = impl->backend_gpu ? impl->backend_gpu : impl->backend_cpu;
+    impl->gpu_unsupported = skipped_unsupported_gpu && impl->backend_gpu == nullptr;
 
     gguf_init_params params = { /*no_alloc=*/ true, &impl->ctx };
     impl->gguf = gguf_init_from_file(gguf_path.c_str(), params);
@@ -946,6 +966,10 @@ bool model_has_gpu_backend(const ParakeetCtcModel & m) {
     return m.impl && m.impl->backend_gpu != nullptr;
 }
 
+bool model_gpu_unsupported(const ParakeetCtcModel & m) {  // reports Impl::gpu_unsupported, which load_from_gguf sets
+    return m.impl && m.impl->gpu_unsupported;  // short-circuits to false when the model has no impl
+}
+
 std::string model_active_backend_name(const ParakeetCtcModel & m) {
     if (!m.impl) return "CPU";
     ggml_backend_t b = m.impl->backend_active;

@@ -329,6 +329,9 @@ int load_from_gguf(const std::string & gguf_path,
 void print_model_summary(const ParakeetCtcModel & m);
 
 bool        model_has_gpu_backend(const ParakeetCtcModel & m);
+// True when a GPU was detected but routed to CPU as a known-bad backend (Mali).
+// Lets hosts treat the CPU backend as expected, not a GPU regression.
+bool        model_gpu_unsupported(const ParakeetCtcModel & m);
 std::string model_active_backend_name(const ParakeetCtcModel & m);
 ggml_backend_t model_active_backend(ParakeetCtcModel & m);
 

@@ -227,6 +227,10 @@ std::string Engine::backend_name() const {
     return model_active_backend_name(pimpl_->model);
 }
 
+bool Engine::gpu_unsupported() const {  // public accessor for the model-level "known-bad GPU skipped" flag
+    return model_gpu_unsupported(pimpl_->model);  // forwards the model's answer unchanged
+}
+
 void Engine::cancel() {
     pimpl_->cancel_flag.store(true);  // raise the cancel flag; NOTE(review): presumably polled by in-flight work — confirm
 }

@@ -269,10 +269,10 @@ LstmBodyOuts build_lstm_body(TdtRuntimeWeights & rt,
 // readback is ~17 us); on a discrete GPU PCIe bus it's an
 // order-of-magnitude saving per emission step (~250 / call).
 struct JointBodyOuts {
-    ggml_tensor * token_argmax;  // i32[1], over logits[0 : V_plus_1]
-    ggml_tensor * dur_argmax;    // i32[1], over logits[V_plus_1 : V_plus_1 + num_durations]
+    ggml_tensor * token_out;  // i32[1] token argmax when rt.argmax_on_gpu; otherwise the contiguous f32[V_plus_1] token logits for a host argmax
+    ggml_tensor * dur_out;    // i32[1] duration argmax when rt.argmax_on_gpu; otherwise the contiguous f32[num_durations] duration logits
 };
-JointBodyOuts build_joint_body(const TdtRuntimeWeights & rt,
+JointBodyOuts build_joint_body(TdtRuntimeWeights & rt,
                                ggml_context * gctx,
                                ggml_tensor * pred_src,
                                ggml_tensor * frame_idx_in) {
@@ -314,9 +314,20 @@ JointBodyOuts build_joint_body(const TdtRuntimeWeights & rt,
     tok_logits = ggml_cont(gctx, tok_logits);
     dur_logits = ggml_cont(gctx, dur_logits);
 
+    // ggml-opencl has no ARGMAX kernel (graph_compute would abort), so gate the
+    // on-device argmax on backend support and fall back to a host argmax of the
+    // logit slices otherwise. tok_am is unused on that path (never expanded).
+    ggml_tensor * tok_am = ggml_argmax(gctx, tok_logits);  // i32[1]
+    rt.argmax_on_gpu = ggml_backend_supports_op(rt.backend, tok_am);
+
     JointBodyOuts outs{};
-    outs.token_argmax = ggml_argmax(gctx, tok_logits);  // i32[1]
-    outs.dur_argmax   = ggml_argmax(gctx, dur_logits);  // i32[1]
+    if (rt.argmax_on_gpu) {
+        outs.token_out = tok_am;
+        outs.dur_out   = ggml_argmax(gctx, dur_logits);  // i32[1]
+    } else {
+        outs.token_out = tok_logits;
+        outs.dur_out   = dur_logits;
+    }
     return outs;
 }
 
@@ -360,10 +371,10 @@ void build_joint_graph(TdtRuntimeWeights & rt) {
     ggml_set_input(rt.joint_frame_idx_in);
 
     JointBodyOuts outs = build_joint_body(rt, gctx, rt.pred_persist, rt.joint_frame_idx_in);
-    rt.joint_token_out = outs.token_argmax;
-    rt.joint_dur_out   = outs.dur_argmax;
-    ggml_set_name(rt.joint_token_out, "joint.token_argmax");
-    ggml_set_name(rt.joint_dur_out,   "joint.dur_argmax");
+    rt.joint_token_out = outs.token_out;
+    rt.joint_dur_out   = outs.dur_out;
+    ggml_set_name(rt.joint_token_out, rt.argmax_on_gpu ? "joint.token_argmax" : "joint.token_logits");
+    ggml_set_name(rt.joint_dur_out,   rt.argmax_on_gpu ? "joint.dur_argmax"   : "joint.dur_logits");
     ggml_set_output(rt.joint_token_out);
     ggml_set_output(rt.joint_dur_out);
 
@@ -399,10 +410,10 @@ void build_lstm_joint_graph(TdtRuntimeWeights & rt) {
     // Use the pred_cpy node (not pred_persist directly) so the joint mat_muls
     // depend on the LSTM update finishing first.
     JointBodyOuts joint_outs = build_joint_body(rt, gctx, lstm_outs.pred_cpy, rt.lj_frame_idx_in);
-    rt.lj_token_out = joint_outs.token_argmax;
-    rt.lj_dur_out   = joint_outs.dur_argmax;
-    ggml_set_name(rt.lj_token_out, "lstm_joint.token_argmax");
-    ggml_set_name(rt.lj_dur_out,   "lstm_joint.dur_argmax");
+    rt.lj_token_out = joint_outs.token_out;
+    rt.lj_dur_out   = joint_outs.dur_out;
+    ggml_set_name(rt.lj_token_out, rt.argmax_on_gpu ? "lstm_joint.token_argmax" : "lstm_joint.token_logits");
+    ggml_set_name(rt.lj_dur_out,   rt.argmax_on_gpu ? "lstm_joint.dur_argmax"   : "lstm_joint.dur_logits");
     ggml_set_output(rt.lj_token_out);
     ggml_set_output(rt.lj_dur_out);
     // Mark the LSTM cpy nodes as outputs too so gallocr keeps them alive
@@ -619,6 +630,15 @@ int tdt_prepare_runtime(const ParakeetCtcModel & model, TdtRuntimeWeights & W) {
     // because of native quantised matmul and faster argmax / large gemvs.
     W.use_graphs = !backend_is_cpu(W.backend);
 
+    // ggml-opencl drops the in-place ggml_cpy writes that update the TDT LSTM
+    // persistent state (h/c/pred), so the state never advances and the decode
+    // emits one constant token per frame. Run the per-step decode on the host on
+    // OpenCL; the encoder still runs on the GPU. (EOU/Sortformer don't use this
+    // persistent-state pattern and stay on the GPU.)
+    if (W.use_graphs && std::strcmp(backend_reg_name(W.backend), "OpenCL") == 0) {
+        W.use_graphs = false;
+    }
+
     if (!W.use_graphs) {
         // ---- CPU fallback: dequantise weights to host f32 ----
         dequantize_to_f32(model.tdt.predict_embed, W.embed);
@@ -732,6 +752,30 @@ bool run_lstm_init_step(TdtRuntimeWeights & rt, int token_id) {
     return true;
 }
 
+// Turn one joint-graph result into host-side token / duration indices. When
+// rt.argmax_on_gpu the tensors already hold i32 indices; otherwise they hold raw
+// f32 logits, copied into reused thread_local buffers and argmaxed on the host.
+void resolve_joint_step(TdtRuntimeWeights & rt,
+                        ggml_tensor * tok_t, ggml_tensor * dur_t,
+                        int * tok_out, int * dur_out) {
+    if (!rt.argmax_on_gpu) {
+        static thread_local std::vector<float> host_tok;
+        static thread_local std::vector<float> host_dur;
+        host_tok.resize((size_t) rt.V_plus_1);
+        host_dur.resize((size_t) rt.num_durations);
+        ggml_backend_tensor_get(tok_t, host_tok.data(), 0, host_tok.size() * sizeof(float));
+        ggml_backend_tensor_get(dur_t, host_dur.data(), 0, host_dur.size() * sizeof(float));
+        *tok_out = argmax_f32(host_tok.data(), rt.V_plus_1);
+        *dur_out = argmax_f32(host_dur.data(), rt.num_durations);
+        return;
+    }
+    int32_t tok_idx = 0, dur_idx = 0;
+    ggml_backend_tensor_get(tok_t, &tok_idx, 0, sizeof tok_idx);
+    ggml_backend_tensor_get(dur_t, &dur_idx, 0, sizeof dur_idx);
+    *tok_out = (int) tok_idx;
+    *dur_out = (int) dur_idx;
+}
+
 // Joint-only step (used after a blank emission). pred_persist is unchanged
 // from the previous step; only enc_proj_persist[frame_idx] varies.  The
 // graph runs token + duration argmax on-device, so the host reads
@@ -750,11 +794,7 @@ bool run_joint_step(TdtRuntimeWeights & rt,
         return false;
     }
 
-    int32_t tok_val = 0, dur_val = 0;
-    ggml_backend_tensor_get(rt.joint_token_out, &tok_val, 0, sizeof(int32_t));
-    ggml_backend_tensor_get(rt.joint_dur_out,   &dur_val, 0, sizeof(int32_t));
-    *tok_out = (int) tok_val;
-    *dur_out = (int) dur_val;
+    resolve_joint_step(rt, rt.joint_token_out, rt.joint_dur_out, tok_out, dur_out);
     return true;
 }
 
@@ -777,11 +817,7 @@ bool run_lstm_joint_step(TdtRuntimeWeights & rt,
         return false;
     }
 
-    int32_t tok_val = 0, dur_val = 0;
-    ggml_backend_tensor_get(rt.lj_token_out, &tok_val, 0, sizeof(int32_t));
-    ggml_backend_tensor_get(rt.lj_dur_out,   &dur_val, 0, sizeof(int32_t));
-    *tok_out = (int) tok_val;
-    *dur_out = (int) dur_val;
+    resolve_joint_step(rt, rt.lj_token_out, rt.lj_dur_out, tok_out, dur_out);
     return true;
 }
 

@@ -61,6 +61,9 @@ struct TdtRuntimeWeights {
     ggml_backend_t     backend = nullptr;
     int                n_threads = 0;
     bool               use_graphs = false;
+    // Set by build_joint_body via ggml_backend_supports_op: false when the backend has no ARGMAX
+    // kernel (e.g. ggml-opencl), so the joint graph emits raw logits for a host argmax; true otherwise.
+    bool               argmax_on_gpu = true;
 
     // ---- CPU-fallback host weights (populated only when !use_graphs) ----
     std::vector<float>             embed;
@@ -111,8 +114,8 @@ struct TdtRuntimeWeights {
     ggml_cgraph *  g_joint     = nullptr;
     ggml_gallocr_t alloc_joint = nullptr;
     ggml_tensor *  joint_frame_idx_in = nullptr;  // i32[1]
-    ggml_tensor *  joint_token_out    = nullptr;  // i32[1] — token argmax
-    ggml_tensor *  joint_dur_out      = nullptr;  // i32[1] — duration argmax
+    ggml_tensor *  joint_token_out    = nullptr;  // i32 argmax, or f32 token logits when !argmax_on_gpu
+    ggml_tensor *  joint_dur_out      = nullptr;  // i32 argmax, or f32 dur logits when !argmax_on_gpu
 
     // (3) Fused LSTM + joint graph: used after a non-blank emission.
     //     LSTM updates h/c/pred from the last emitted token, then joint
@@ -123,8 +126,8 @@ struct TdtRuntimeWeights {
     ggml_gallocr_t alloc_lstm_joint = nullptr;
     ggml_tensor *  lj_token_in        = nullptr;  // i32[1]
     ggml_tensor *  lj_frame_idx_in    = nullptr;  // i32[1]
-    ggml_tensor *  lj_token_out       = nullptr;  // i32[1] — token argmax
-    ggml_tensor *  lj_dur_out         = nullptr;  // i32[1] — duration argmax
+    ggml_tensor *  lj_token_out       = nullptr;  // i32 argmax, or f32 token logits when !argmax_on_gpu
+    ggml_tensor *  lj_dur_out         = nullptr;  // i32 argmax, or f32 dur logits when !argmax_on_gpu
 
     struct EncProjGraph {
         // Each cached graph owns its own ggml_context for the cgraph + tensor