Add subgroup wasm gating example #797
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,19 @@ | ||
| # WebLLM Wasm Gating App | ||
|
|
||
| This folder provides a minimal demo showing capability-based routing between | ||
| baseline and subgroup WebGPU WASM builds in a web app setting. | ||
| To try it out, run the following commands in this folder: | ||
|
|
||
| ```bash | ||
| npm install | ||
| npm start | ||
| ``` | ||
|
|
||
| Edit `src/wasm_gating.ts` if you would like to point the example at your own | ||
| model path and baseline `model_lib`. The example will switch to | ||
| `-subgroups.wasm` when the adapter reports subgroup support. | ||
|
|
||
| Note: if you would like to hack on the WebLLM core package, you can change the | ||
| WebLLM dependency to `"file:../.."` and follow the build-from-source | ||
| instructions in the project to build WebLLM locally. This option is only | ||
| recommended for development on the core package. |
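The gist of the capability-based routing the README describes: request a WebGPU adapter, check whether it advertises the `subgroups` feature, and rewrite the baseline `model_lib` URL to its `-subgroups.wasm` variant if so. A minimal sketch, not part of the diff — `BASELINE_LIB` is a hypothetical placeholder, and the full check (including subgroup-size limits) lives in the `wasm_gating.ts` file further down:

```ts
// Sketch only: pick the model library URL based on adapter capabilities.
// BASELINE_LIB is a hypothetical placeholder; point it at your own baseline model_lib.
const BASELINE_LIB =
  "https://example.com/libs/Llama-3.2-1B-Instruct-q4f16_1-webgpu.wasm";

async function pickModelLib(): Promise<string> {
  const adapter = await (navigator as any).gpu?.requestAdapter({
    powerPreference: "high-performance",
  });
  if (adapter == null) {
    throw Error("Unable to request a WebGPU adapter.");
  }
  // Fall back to the baseline build when the adapter does not report subgroup support.
  return adapter.features.has("subgroups")
    ? BASELINE_LIB.replace(/\.wasm$/, "-subgroups.wasm")
    : BASELINE_LIB;
}
```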
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,25 @@ | ||
| { | ||
| "name": "wasm-gating", | ||
| "version": "0.1.0", | ||
| "private": true, | ||
| "scripts": { | ||
| "start": "parcel src/wasm_gating.html --port 8888", | ||
| "build": "parcel build src/wasm_gating.html --dist-dir lib" | ||
| }, | ||
| "devDependencies": { | ||
| "buffer": "^5.7.1", | ||
| "crypto-browserify": "^3.12.1", | ||
| "events": "^3.3.0", | ||
| "parcel": "^2.8.3", | ||
| "process": "^0.11.10", | ||
| "stream-browserify": "^3.0.0", | ||
| "string_decoder": "^1.3.0", | ||
| "tslib": "^2.3.1", | ||
| "typescript": "^4.9.5", | ||
| "url": "^0.11.3", | ||
| "vm-browserify": "^1.1.2" | ||
| }, | ||
| "dependencies": { | ||
| "@mlc-ai/web-llm": "^0.2.82" | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,26 @@ | ||
| <!doctype html> | ||
| <html> | ||
| <script> | ||
| webLLMGlobal = {}; | ||
| </script> | ||
| <body> | ||
| <h2>WebLLM Test Page</h2> | ||
| Open console to see output | ||
| <br /> | ||
| <br /> | ||
| <label id="init-label"> </label> | ||
| <br /> | ||
| <br /> | ||
| <h3>Prompt</h3> | ||
| <label id="prompt-label"> </label> | ||
| <br /> | ||
| <br /> | ||
| <h3>Response</h3> | ||
| <label id="generate-label"> </label> | ||
| <br /> | ||
| <br /> | ||
| <label id="stats-label"> </label> | ||
|
|
||
| <script type="module" src="./wasm_gating.ts"></script> | ||
| </body> | ||
| </html> |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,122 @@ | ||
| import * as webllm from "@mlc-ai/web-llm"; | ||
|
|
||
| function setLabel(id: string, text: string) { | ||
| const label = document.getElementById(id); | ||
| if (label == null) { | ||
| throw Error("Cannot find label " + id); | ||
| } | ||
| label.innerText = text; | ||
| } | ||
|
|
||
| async function main() { | ||
| const initProgressCallback = (report: webllm.InitProgressReport) => { | ||
| setLabel("init-label", report.text); | ||
| }; | ||
|
|
||
| const selectedModel = "Llama-3.2-1B-Instruct-q4f16_1-MLC"; | ||
| const adapter = await (navigator as any).gpu?.requestAdapter({ | ||
| powerPreference: "high-performance", | ||
| }); | ||
| if (adapter == null) { | ||
| throw Error("Unable to request a WebGPU adapter."); | ||
| } | ||
| const adapterInfo = | ||
| adapter.info || (await (adapter as any).requestAdapterInfo()); | ||
| const subgroupMinSize = adapterInfo.subgroupMinSize; | ||
| const subgroupMaxSize = adapterInfo.subgroupMaxSize; | ||
| const supportsSubgroups = | ||
| adapter.features.has("subgroups") && | ||
| subgroupMinSize !== undefined && | ||
| subgroupMinSize <= 32 && | ||
| subgroupMaxSize !== undefined && | ||
| 32 <= subgroupMaxSize && | ||
| adapter.limits.maxComputeInvocationsPerWorkgroup >= 1024; | ||
| console.log("supportsSubgroups: ", supportsSubgroups); | ||
| // Option 1: If we do not specify appConfig, we use `prebuiltAppConfig` defined in `config.ts` | ||
| const modelRecord = webllm.prebuiltAppConfig.model_list.find( | ||
| (entry) => entry.model_id === selectedModel, | ||
| ); | ||
| const appConfig = | ||
| supportsSubgroups && modelRecord !== undefined | ||
| ? { | ||
| model_list: [ | ||
| { | ||
| ...modelRecord, | ||
| model_lib: modelRecord.model_lib.replace( | ||
| /\.wasm$/, | ||
| "-subgroups.wasm", | ||
| ), | ||
| }, | ||
| ], | ||
| } | ||
| : undefined; | ||
| const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine( | ||
| selectedModel, | ||
| { | ||
| appConfig: appConfig, | ||
| initProgressCallback: initProgressCallback, | ||
| logLevel: "INFO", // specify the log level | ||
| }, | ||
| // customize kv cache, use either context_window_size or sliding_window_size (with attention sink) | ||
| { | ||
| context_window_size: 2048, | ||
| // sliding_window_size: 1024, | ||
| // attention_sink_size: 4, | ||
| }, | ||
| ); | ||
|
|
||
| // Option 2: Specify your own model other than the prebuilt ones | ||
| // const appConfig: webllm.AppConfig = { | ||
| // model_list: [ | ||
| // { | ||
| // model: "http://127.0.0.1:8000/models/Llama-3.2-1B-Instruct-q4f16_1-MLC/", | ||
| // model_id: "Llama-3.2-1B-Instruct-q4f16_1-MLC", | ||
| // model_lib: "http://127.0.0.1:8000/libs/Llama-3.2-1B-Instruct-q4f16_1-webgpu.wasm", | ||
| // overrides: { | ||
| // context_window_size: 2048, | ||
| // }, | ||
| // }, | ||
| // ], | ||
| // }; | ||
| // if (supportsSubgroups) { | ||
| // appConfig.model_list[0].model_lib = appConfig.model_list[0].model_lib.replace( | ||
| // /\.wasm$/, | ||
| // "-subgroups.wasm", | ||
| // ); | ||
| // } | ||
| // const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine( | ||
| // selectedModel, | ||
| // { appConfig: appConfig, initProgressCallback: initProgressCallback }, | ||
| // ); | ||
|
|
||
| // Option 3: Instantiate MLCEngine() and call reload() separately | ||
| // const engine: webllm.MLCEngineInterface = new webllm.MLCEngine({ | ||
| // appConfig: appConfig, // if do not specify, we use webllm.prebuiltAppConfig | ||
| // initProgressCallback: initProgressCallback, | ||
| // }); | ||
| // await engine.reload(selectedModel); | ||
|
|
||
| const reply0 = await engine.chat.completions.create({ | ||
| messages: [{ role: "user", content: "List three US states." }], | ||
| // below configurations are all optional | ||
| n: 3, | ||
| temperature: 1.5, | ||
| max_tokens: 256, | ||
| // 46510 and 7188 are "California", and 8421 and 51325 are "Texas" in Llama-3.1-8B-Instruct | ||
| // So we would have a higher chance of seeing the latter two, but never the first in the answer | ||
| logit_bias: { | ||
| "46510": -100, | ||
| "7188": -100, | ||
| "8421": 5, | ||
| "51325": 5, | ||
| }, | ||
| logprobs: true, | ||
| top_logprobs: 2, | ||
| }); | ||
|
Comment on lines +99 to +115

The comments explaining the specific token IDs for "California" and "Texas" are highly model-dependent. Suggested change:

```ts
// Example of using logit_bias to influence token generation.
// Specific token IDs and their corresponding words are model-dependent.
logit_bias: {
  "46510": -100,
  "7188": -100,
  "8421": 5,
  "51325": 5,
},
```
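If you want to adapt the bias map to whichever model is actually loaded, you can look the token IDs up with the model's tokenizer. A hedged sketch, assuming the transformers.js package `@huggingface/transformers` and a compatible tokenizer repo id, neither of which is a dependency of this example:

```ts
import { AutoTokenizer } from "@huggingface/transformers";

// Assumption: a tokenizer repo matching the loaded model; adjust the id as needed.
const tokenizer = await AutoTokenizer.from_pretrained(
  "onnx-community/Llama-3.2-1B-Instruct",
);

// Llama-style tokenizers treat a leading space as part of the word, so check both forms.
for (const word of ["California", " California", "Texas", " Texas"]) {
  // encode() returns the token IDs; depending on tokenizer config a BOS token may be prepended.
  console.log(JSON.stringify(word), tokenizer.encode(word));
}
```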
||
| console.log(reply0); | ||
| console.log(reply0.usage); | ||
|
|
||
| // To change model, either create a new engine via `CreateMLCEngine()`, or call `engine.reload(modelId)` | ||
| } | ||
|
|
||
| main(); | ||
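As the closing comment in `main()` notes, you can also switch models on an existing engine with `engine.reload()` instead of constructing a new one. A minimal sketch, assuming `engine` is the engine created above and that the target model id (an assumed one here) exists in the active app config:

```ts
// Sketch: reuse the existing engine and load a different model from the app config.
// "Qwen2.5-0.5B-Instruct-q4f16_1-MLC" is an assumed model id; pick any id from
// webllm.prebuiltAppConfig.model_list (or from your custom appConfig).
await engine.reload("Qwen2.5-0.5B-Instruct-q4f16_1-MLC");

const reply = await engine.chat.completions.create({
  messages: [{ role: "user", content: "List three US states." }],
});
console.log(reply.choices[0]?.message.content);
```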