pytorch · GregoryComer · Apr 10, 2026
diff --git a/.ci/scripts/build_llama_android.sh b/.ci/scripts/build_llama_android.sh
@@ -15,8 +15,12 @@ if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
 fi
 which "${PYTHON_EXECUTABLE}"
 
-install_executorch_and_backend_lib() {
-  echo "Installing executorch and xnnpack backend"
+build_llama_android() {
+  echo "Building llama runner for Android..."
+  pushd extension/llm/tokenizers
+  echo "Updating tokenizers submodule"
+  git submodule update --init
+  popd
   clean_executorch_install_folders
   mkdir cmake-android-out
   ANDROID_NDK=${ANDROID_NDK:-/opt/ndk}
@@ -26,30 +30,10 @@ install_executorch_and_backend_lib() {
     -DBUCK2="${BUCK2}" \
     -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \
     -DANDROID_ABI="${ANDROID_ABI}" \
-    -DCMAKE_INSTALL_PREFIX=cmake-android-out \
     -DCMAKE_BUILD_TYPE=Release \
     -DXNNPACK_ENABLE_ARM_BF16=OFF \
     -Bcmake-android-out .
 
-  cmake --build cmake-android-out -j4 --target install --config Release
+  cmake --build cmake-android-out -j4 --target llama_main --config Release
 }
-
-build_llama_runner() {
-    echo "Building llama runner for Android..."
-    pushd extension/llm/tokenizers
-    echo "Updating tokenizers submodule"
-    git submodule update --init
-    popd
-    ANDROID_ABI=arm64-v8a
-    cmake -DBUCK2="${BUCK2}" \
-    -DBUILD_TESTING=OFF \
-    -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK"/build/cmake/android.toolchain.cmake  \
-    -DANDROID_ABI="${ANDROID_ABI}" \
-    -DCMAKE_INSTALL_PREFIX=cmake-android-out \
-    -DCMAKE_BUILD_TYPE=Release \
-    -Bcmake-android-out/examples/models/llama examples/models/llama
-
-    cmake --build cmake-android-out/examples/models/llama -j4 --config Release
-}
-install_executorch_and_backend_lib
-build_llama_runner
+build_llama_android
diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh
@@ -149,35 +149,21 @@ fi
 
 which "${PYTHON_EXECUTABLE}"
 
-cmake_install_executorch_libraries() {
-    echo "Installing libexecutorch.a, libextension_module.so, libportable_ops_lib.a"
+cmake_build_llama() {  # Fresh configure of the llm preset, then build the llama_main target.
+    echo "Building llama runner"
+    pushd extension/llm/tokenizers
+    echo "Updating tokenizers submodule"
+    git submodule update --init
+    popd
     rm -rf cmake-out
     retry cmake --preset llm \
         -DEXECUTORCH_BUILD_TESTS=ON \
         -DBUILD_TESTING=OFF \
-        -DCMAKE_INSTALL_PREFIX=cmake-out \
         -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \
         -DEXECUTORCH_BUILD_QNN="$QNN" \
         -DEXECUTORCH_ENABLE_LOGGING=ON \
         -DQNN_SDK_ROOT="$QNN_SDK_ROOT"
-    cmake --build cmake-out -j9 --target install --config "$CMAKE_BUILD_TYPE"
-}
-
-cmake_build_llama_runner() {
-    echo "Building llama runner"
-    pushd extension/llm/tokenizers
-    echo "Updating tokenizers submodule"
-    git submodule update --init
-    popd
-    dir="examples/models/llama"
-    if [[ "$CMAKE_BUILD_TYPE" == "Debug" ]]; then
-        PRESET="llama-debug"
-    else
-        PRESET="llama-release"
-    fi
-    pushd "${dir}"
-    cmake --workflow --preset "${PRESET}"
-    popd
+    cmake --build cmake-out --parallel 9 --config "${CMAKE_BUILD_TYPE}" --target llama_main
 }
 
 cleanup_files() {
@@ -269,8 +255,7 @@ if [[ "${BUILD_TOOL}" == "buck2" ]]; then
   # shellcheck source=/dev/null
   $BUCK run examples/models/llama:main -- ${RUNTIME_ARGS} > result.txt
 elif [[ "${BUILD_TOOL}" == "cmake" ]]; then
-  cmake_install_executorch_libraries
-  cmake_build_llama_runner
+  cmake_build_llama
   # Run llama runner
   NOW=$(date +"%H:%M:%S")
   echo "Starting to run llama runner at ${NOW}"

diff --git a/.ci/scripts/test_llama_torchao_lowbit.sh b/.ci/scripts/test_llama_torchao_lowbit.sh
@@ -23,9 +23,8 @@ echo "Update tokenizers submodule"
 git submodule update --init
 popd
 
-# Install ET with CMake
+# Build llama runner with torchao
 cmake -DPYTHON_EXECUTABLE=python \
-    -DCMAKE_INSTALL_PREFIX=cmake-out \
     -DEXECUTORCH_ENABLE_LOGGING=1 \
     -DCMAKE_BUILD_TYPE=Release \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
@@ -41,14 +40,7 @@ cmake -DPYTHON_EXECUTABLE=python \
     -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
     -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
     -Bcmake-out .
-cmake --build cmake-out -j16 --config Release --target install
-
-# Install llama runner with torchao
-cmake -DPYTHON_EXECUTABLE=python \
-    -DCMAKE_BUILD_TYPE=Release \
-    -Bcmake-out/examples/models/llama \
-    examples/models/llama
-cmake --build cmake-out/examples/models/llama -j16 --config Release
+cmake --build cmake-out -j16 --config Release --target llama_main
 
 # Download stories llama110m artifacts
 download_stories_model_artifacts

diff --git a/.ci/scripts/test_lora.sh b/.ci/scripts/test_lora.sh
@@ -9,20 +9,15 @@ set -exu
 # shellcheck source=/dev/null
 source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
 
-cmake_install_executorch_libraries() {
-    echo "Installing libexecutorch.a, libextension_module.so, libportable_ops_lib.a"
-    rm -rf cmake-out
-    cmake --preset llm-release -DEXECUTORCH_ENABLE_LOGGING=ON
-    cmake --build --preset llm-release-install
-}
-
-cmake_build_llama_runner() {
+cmake_build_llama() {  # Wipe cmake-out, configure llm-release, build the llama_main target.
     echo "Building llama runner"
     pushd extension/llm/tokenizers
     echo "Updating tokenizers submodule"
     git submodule update --init
     popd
-    make llama-cpu
+    rm -rf cmake-out  # Remove the previous build tree before configuring.
+    cmake --preset llm-release -DEXECUTORCH_ENABLE_LOGGING=ON
+    cmake --build cmake-out --parallel 9 --config Release --target llama_main
 }
 
 cleanup_files() {
@@ -57,8 +52,7 @@ HF_QWEN_PATH=$(python -c "from huggingface_hub import snapshot_download; print(s
 echo "Model downloaded to: $HF_QWEN_PATH"
 
 ### BUILD LLAMA RUNNER.
-cmake_install_executorch_libraries
-cmake_build_llama_runner
+cmake_build_llama
 
 # Runner constants.
 RUNTIME_ARGS="--tokenizer_path=${HF_QWEN_PATH}/ --temperature=0 --seq_len=100 --warmup=1"

diff --git a/.ci/scripts/test_lora_multimethod.sh b/.ci/scripts/test_lora_multimethod.sh
@@ -9,20 +9,15 @@ set -exu
 # shellcheck source=/dev/null
 source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
 
-cmake_install_executorch_libraries() {
-    echo "Installing libexecutorch.a, libextension_module.so, libportable_ops_lib.a"
-    rm -rf cmake-out
-    cmake --preset llm-release -DEXECUTORCH_ENABLE_LOGGING=ON
-    cmake --build --preset llm-release-install
-}
-
-cmake_build_llama_runner() {
+cmake_build_llama() {  # Wipe cmake-out, configure llm-release, build the llama_main target.
     echo "Building llama runner"
     pushd extension/llm/tokenizers
     echo "Updating tokenizers submodule"
     git submodule update --init
     popd
-    make llama-cpu
+    rm -rf cmake-out  # Remove the previous build tree before configuring.
+    cmake --preset llm-release -DEXECUTORCH_ENABLE_LOGGING=ON
+    cmake --build cmake-out --parallel 9 --config Release --target llama_main
 }
 
 cleanup_files() {
@@ -55,8 +50,7 @@ $PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
     --config examples/models/qwen3/config/qwen3_multimethod.yaml
 
 ### BUILD LLAMA RUNNER ###
-cmake_install_executorch_libraries
-cmake_build_llama_runner
+cmake_build_llama
 
 # Runner constants.
 RUNTIME_ARGS="--tokenizer_path=${HF_QWEN_PATH}/ --temperature=0 --seq_len=100 --warmup=1"

diff --git a/.ci/scripts/test_torchao_huggingface_checkpoints.sh b/.ci/scripts/test_torchao_huggingface_checkpoints.sh
@@ -147,7 +147,6 @@ fi
 if [[ "$TEST_WITH_RUNNER" -eq 1 ]]; then
   echo "[runner] Building and testing llama_main ..."
     cmake -DPYTHON_EXECUTABLE=python \
-        -DCMAKE_INSTALL_PREFIX=cmake-out \
         -DEXECUTORCH_ENABLE_LOGGING=1 \
         -DCMAKE_BUILD_TYPE=Release \
         -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
@@ -163,15 +162,7 @@ if [[ "$TEST_WITH_RUNNER" -eq 1 ]]; then
         -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
         -DEXECUTORCH_BUILD_KERNELS_TORCHAO=${EXECUTORCH_BUILD_KERNELS_TORCHAO} \
         -Bcmake-out .
-    cmake --build cmake-out -j16 --config Release --target install
-
-
-    # Install llama runner
-    cmake -DPYTHON_EXECUTABLE=python \
-        -DCMAKE_BUILD_TYPE=Release \
-        -Bcmake-out/examples/models/llama \
-        examples/models/llama
-    cmake --build cmake-out/examples/models/llama -j16 --config Release
+    cmake --build cmake-out -j16 --config Release --target llama_main
 
     # Run the model
     ./cmake-out/examples/models/llama/llama_main --model_path=$MODEL_OUT --tokenizer_path="${HF_MODEL_DIR}/tokenizer.json" --prompt="Once upon a time,"

diff --git a/.github/workflows/mlx.yml b/.github/workflows/mlx.yml
@@ -365,9 +365,7 @@ jobs:
         echo "::endgroup::"
 
         echo "::group::Build Llama runner with MLX"
-        pushd examples/models/llama
-        ${CONDA_RUN} cmake --workflow --preset llama-release
-        popd
+        ${CONDA_RUN} cmake --build cmake-out --target llama_main
         echo "::endgroup::"
 
         echo "::group::Download stories110M artifacts"

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1140,6 +1140,7 @@ endif()
 if(EXECUTORCH_BUILD_KERNELS_LLM)
   # TODO: move all custom kernels to ${CMAKE_CURRENT_SOURCE_DIR}/kernels/custom
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/custom_ops)
+  executorch_target_link_options_shared_lib(custom_ops)
   list(APPEND _executorch_kernels custom_ops_aot_lib)
 endif()
 
@@ -1271,7 +1272,7 @@ if(EXECUTORCH_BUILD_EXECUTOR_RUNNER)
   endif()
 
   if(EXECUTORCH_BUILD_KERNELS_LLM)
-    list(APPEND _executor_runner_libs $<LINK_LIBRARY:WHOLE_ARCHIVE,custom_ops>)
+    list(APPEND _executor_runner_libs custom_ops)
   endif()
 
   if(EXECUTORCH_ENABLE_EVENT_TRACER)
@@ -1322,6 +1323,12 @@ if(EXECUTORCH_BUILD_ANDROID_JNI)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/android)
 endif()
 
+# Llama example: EXCLUDE_FROM_ALL keeps its targets out of the default build;
+# build them on request (e.g., cmake --build cmake-out --target llama_main).
+add_subdirectory(
+  ${CMAKE_CURRENT_SOURCE_DIR}/examples/models/llama EXCLUDE_FROM_ALL
+)
+
 include(Test.cmake)
 
 install(

diff --git a/CMakePresets.json b/CMakePresets.json
@@ -442,6 +442,36 @@
         "install"
       ],
       "jobs": 0
+    },
+    {
+      "name": "llama-release-build",
+      "displayName": "Build llama_main (Release)",
+      "configurePreset": "llm-release",
+      "configuration": "Release",
+      "targets": [
+        "llama_main"
+      ],
+      "jobs": 0
+    },
+    {
+      "name": "llama-cuda-build",
+      "displayName": "Build llama_main with CUDA (Release)",
+      "configurePreset": "llm-release-cuda",
+      "configuration": "Release",
+      "targets": [
+        "llama_main"
+      ],
+      "jobs": 0
+    },
+    {
+      "name": "llama-cuda-debug-build",
+      "displayName": "Build llama_main with CUDA (Debug)",
+      "configurePreset": "llm-debug-cuda",
+      "configuration": "Debug",
+      "targets": [
+        "llama_main"
+      ],
+      "jobs": 0
     }
   ],
   "workflowPresets": [
@@ -584,6 +614,48 @@
           "name": "mlx-debug-install"
         }
       ]
+    },
+    {
+      "name": "llama-release",
+      "displayName": "Configure and build llama_main (Release, CPU)",
+      "steps": [
+        {
+          "type": "configure",
+          "name": "llm-release"
+        },
+        {
+          "type": "build",
+          "name": "llama-release-build"
+        }
+      ]
+    },
+    {
+      "name": "llama-cuda",
+      "displayName": "Configure and build llama_main (Release, CUDA)",
+      "steps": [
+        {
+          "type": "configure",
+          "name": "llm-release-cuda"
+        },
+        {
+          "type": "build",
+          "name": "llama-cuda-build"
+        }
+      ]
+    },
+    {
+      "name": "llama-cuda-debug",
+      "displayName": "Configure and build llama_main (Debug, CUDA)",
+      "steps": [
+        {
+          "type": "configure",
+          "name": "llm-debug-cuda"
+        },
+        {
+          "type": "build",
+          "name": "llama-cuda-debug-build"
+        }
+      ]
     }
   ]
 }
diff --git a/Makefile b/Makefile
@@ -342,28 +342,22 @@ silero-vad-cpu:
 	@echo "  Binary: cmake-out/examples/models/silero_vad/silero_vad_stream_runner"
 
 llama-cpu:
-	@echo "==> Building and installing ExecuTorch..."
-	cmake --workflow --preset llm-release
 	@echo "==> Building Llama runner (CPU)..."
-	cd examples/models/llama && cmake --workflow --preset llama-release
+	cmake --workflow --preset=llama-release
 	@echo ""
 	@echo "✓ Build complete!"
 	@echo "  Binary: cmake-out/examples/models/llama/llama_main"
 
 llama-cuda:
-	@echo "==> Building and installing ExecuTorch with CUDA..."
-	cmake --workflow --preset llm-release-cuda
 	@echo "==> Building Llama runner with CUDA..."
-	cd examples/models/llama && cmake --workflow --preset llama-cuda
+	cmake --workflow --preset=llama-cuda
 	@echo ""
 	@echo "✓ Build complete!"
 	@echo "  Binary: cmake-out/examples/models/llama/llama_main"
 
 llama-cuda-debug:
-	@echo "==> Building and installing ExecuTorch with CUDA (debug mode)..."
-	cmake --workflow --preset llm-debug-cuda
 	@echo "==> Building Llama runner with CUDA (debug mode)..."
-	cd examples/models/llama && cmake --workflow --preset llama-cuda-debug
+	cmake --workflow --preset=llama-cuda-debug
 	@echo ""
 	@echo "✓ Build complete!"
 	@echo "  Binary: cmake-out/examples/models/llama/llama_main"