diff --git a/examples/qualcomm/custom_op/custom_ops_fast_gelu.py b/examples/qualcomm/custom_op/custom_ops_fast_gelu.py
new file mode 100644
index 00000000000..25e65535577
--- /dev/null
+++ b/examples/qualcomm/custom_op/custom_ops_fast_gelu.py
@@ -0,0 +1,294 @@
+# Copyright (c) Qualcomm Innovation Center, Inc.
+# All rights reserved.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+"""
+Example: Custom FastGELU operator integrated with ExecuTorch Qualcomm backend (HTP).
+"""
+
+import json
+import os
+import subprocess
+import sys
+from multiprocessing.connection import Client
+
+import numpy as np
+import torch
+
+from executorch.backends.qualcomm.quantizer.quantizer import QuantDtype
+from executorch.backends.qualcomm.serialization.qc_schema import (
+ _soc_info_table,
+ HtpArch,
+ QcomChipset,
+ QnnExecuTorchOpPackageInfo,
+ QnnExecuTorchOpPackageOptions,
+ QnnExecuTorchOpPackagePlatform,
+ QnnExecuTorchOpPackageTarget,
+)
+from executorch.examples.qualcomm.utils import (
+ build_executorch_binary,
+ generate_inputs,
+ make_output_dir,
+ make_quantizer,
+ setup_common_args_and_variables,
+ SimpleADB,
+)
+from torch.library import impl, Library
+
+# ------------------------------------------------------------------------------
+# 1. Register PyTorch custom operator (FastGELU)
+# ------------------------------------------------------------------------------
+
+my_op_lib = Library("my_ops", "DEF")
+my_op_lib.define("fast_gelu(Tensor input) -> Tensor")
+
+
+@impl(my_op_lib, "fast_gelu", "CompositeExplicitAutograd")
+def fast_gelu_impl(x: torch.Tensor) -> torch.Tensor:
+    """Evaluate the tanh-based GELU approximation on ``x``.
+
+    Computes ``0.5 * x * (1 + tanh(0.7978845608 * (x + 0.044715 * x^3)))``
+    and returns the result as a new tensor.
+    """
+    # Cubic polynomial fed to tanh; x^3 is spelled as repeated multiplication.
+    inner = x + 0.044715 * x * x * x
+    gate = 1.0 + torch.tanh(0.7978845608 * inner)
+    return 0.5 * x * gate
+
+
+# registering the out variant.
+my_op_lib.define("fast_gelu.out(Tensor input, *, Tensor(a!) output) -> Tensor(a!)")
+
+
+class Model(torch.nn.Module):
+    """Minimal module whose forward pass is a single ``my_ops.fast_gelu`` call."""
+
+    def forward(self, a):
+        # Call the ``default`` overload explicitly; ``annotate_custom`` matches
+        # graph nodes against this exact target.
+        fast_gelu = torch.ops.my_ops.fast_gelu.default
+        return fast_gelu(a)
+
+
+def annotate_custom(gm: torch.fx.GraphModule) -> None:
+    """Annotate every ``my_ops.fast_gelu.default`` node in ``gm`` for quantization.
+
+    Nodes are selected by comparing ``node.target`` with the custom op overload.
+    Nodes that already carry an annotation are left untouched. Each remaining
+    node gets the input/output activation specs of the per-channel PTQ config.
+    """
+    from executorch.backends.qualcomm.quantizer.annotators import _is_annotated
+    from executorch.backends.qualcomm.quantizer.qconfig import (
+        get_ptq_per_channel_quant_config,
+    )
+    from torch.fx import Node
+    from torchao.quantization.pt2e.quantizer import QuantizationAnnotation
+    from torchao.quantization.pt2e.quantizer.quantizer import Q_ANNOTATION_KEY
+
+    qconfig = get_ptq_per_channel_quant_config()
+    fast_gelu_nodes = (
+        candidate
+        for candidate in gm.graph.nodes
+        if candidate.target == torch.ops.my_ops.fast_gelu.default
+    )
+    for node in fast_gelu_nodes:
+        # Never overwrite an annotation that is already in place.
+        if _is_annotated([node]):
+            continue
+
+        # The op takes exactly one tensor argument: the activation to quantize.
+        source = node.args[0]
+        assert isinstance(source, Node)
+
+        node.meta[Q_ANNOTATION_KEY] = QuantizationAnnotation(
+            input_qspec_map={source: qconfig.input_activation},
+            output_qspec=qconfig.output_activation,
+            _annotated=True,
+        )
+
+
+def _run(cmd, cwd=None):
+    """Run ``cmd`` (optionally inside ``cwd``), sending its stdout to ours.
+
+    ``check=True`` turns a non-zero exit status into
+    ``subprocess.CalledProcessError``.
+    """
+    run_options = {"stdout": sys.stdout, "cwd": cwd, "check": True}
+    subprocess.run(cmd, **run_options)
+
+
+def prepare_op_package(
+    workspace: str, op_package_dir: str, arch: HtpArch, build_op_package: bool
+):
+    """Optionally build the FastGelu op package and describe it for the backend.
+
+    Args:
+        workspace: Directory used in the library paths recorded for the two
+            ``AARCH64_ANDROID`` package entries.
+        op_package_dir: Op package source directory; build outputs are looked
+            up under its ``build`` subdirectory.
+        arch: HTP architecture, interpolated into the ``htp_v{arch}`` make goal
+            and the ``hexagon-v{arch}`` build directory name.
+        build_op_package: When true, remove ``build`` and rebuild the x86,
+            aarch64 and Hexagon variants with ``make``.
+
+    Returns:
+        ``(op_package_options, op_package_paths)``: the populated
+        ``QnnExecuTorchOpPackageOptions`` and the list of built library paths
+        (the Hexagon ``_HTP`` copy and the aarch64 build).
+    """
+    hexagon_dir = f"{op_package_dir}/build/hexagon-v{arch}"
+    htp_library = f"{hexagon_dir}/libQnnFastGeluOpPackage_HTP.so"
+    aarch64_library = (
+        f"{op_package_dir}/build/aarch64-android/libQnnFastGeluOpPackage.so"
+    )
+
+    if build_op_package:
+        _run(["rm", "-rf", "build"], cwd=op_package_dir)
+        _run(["make", "htp_x86", "htp_aarch64", f"htp_v{arch}"], cwd=op_package_dir)
+        # Duplicate the Hexagon library under a distinct "_HTP" file name;
+        # presumably so it can sit next to the aarch64 library, which shares
+        # the original file name -- TODO confirm.
+        _run(["cp", f"{hexagon_dir}/libQnnFastGeluOpPackage.so", htp_library])
+
+    def _describe(package_path, target, platform):
+        # All entries share provider, package name and op naming; only the
+        # library path, target and platform differ.
+        info = QnnExecuTorchOpPackageInfo()
+        info.interface_provider = "FastGeluOpPackageInterfaceProvider"
+        info.op_package_name = "FastGeluOpPackage"
+        info.op_package_path = package_path
+        info.target = target
+        info.custom_op_name = "my_ops.fast_gelu.default"
+        info.qnn_op_type_name = "FastGelu"
+        info.platform = platform
+        return info
+
+    op_package_options = QnnExecuTorchOpPackageOptions()
+    op_package_options.op_package_infos = [
+        _describe(
+            f"{op_package_dir}/build/x86_64-linux-clang/libQnnFastGeluOpPackage.so",
+            QnnExecuTorchOpPackageTarget.CPU,
+            QnnExecuTorchOpPackagePlatform.X86_64,
+        ),
+        _describe(
+            f"{workspace}/libQnnFastGeluOpPackage.so",
+            QnnExecuTorchOpPackageTarget.CPU,
+            QnnExecuTorchOpPackagePlatform.AARCH64_ANDROID,
+        ),
+        _describe(
+            f"{workspace}/libQnnFastGeluOpPackage_HTP.so",
+            QnnExecuTorchOpPackageTarget.HTP,
+            QnnExecuTorchOpPackagePlatform.AARCH64_ANDROID,
+        ),
+    ]
+
+    op_package_paths = [htp_library, aarch64_library]
+    return op_package_options, op_package_paths
+
+
+def main(args):
+    """Compile the FastGELU example for the QNN backend and verify it on device.
+
+    Steps: optionally check the op-package build environment, build/describe
+    the op package, export the model to ``{args.artifact}/fastgelu_model``,
+    then (unless ``args.compile_only``) push, execute and pull through
+    ``SimpleADB`` and compare the device output with the eager result.
+
+    Args:
+        args: Parsed command-line namespace; reads ``build_op_package``,
+            ``artifact``, ``use_fp16``, ``model``, ``op_package_dir``,
+            ``compile_only``, ``build_folder``, ``device`` and ``host``.
+
+    Raises:
+        RuntimeError: If ``--build_op_package`` is given while
+            ``HEXAGON_SDK_ROOT`` or ``ANDROID_NDK_ROOT`` is unset.
+    """
+    if args.build_op_package:
+        # Both toolchains are needed for the build; fail early and explicitly.
+        for env_var in ("HEXAGON_SDK_ROOT", "ANDROID_NDK_ROOT"):
+            if env_var not in os.environ:
+                raise RuntimeError(f"Environment variable {env_var} must be set")
+            print(f"{env_var}={os.getenv(env_var)}")
+
+    # Ensure the artifact directory exists.
+    os.makedirs(args.artifact, exist_ok=True)
+
+    instance = Model()
+    sample_input = (torch.randn(1, 16384),)
+    pte_filename = "fastgelu_model"
+    workspace = f"/data/local/tmp/executorch/{pte_filename}"
+    soc_info = _soc_info_table[getattr(QcomChipset, args.model)]
+
+    op_package_options, op_package_paths = prepare_op_package(
+        workspace,
+        args.op_package_dir,
+        soc_info.htp_info.htp_arch,
+        args.build_op_package,
+    )
+
+    # fp16 mode discards PTQ: neither a quant dtype nor a quantizer is used.
+    quant_dtype = None if args.use_fp16 else QuantDtype.use_8a8w
+    quantizer = None
+    if not args.use_fp16:
+        quantizer = make_quantizer(
+            quant_dtype=quant_dtype, custom_annotations=(annotate_custom,)
+        )
+
+    build_executorch_binary(
+        instance,
+        sample_input,
+        args.model,
+        f"{args.artifact}/{pte_filename}",
+        sample_input,
+        op_package_options=op_package_options,
+        quant_dtype=quant_dtype,
+        custom_quantizer=quantizer,
+    )
+
+    if args.compile_only:
+        sys.exit(0)
+
+    output_data_folder = os.path.join(args.artifact, "outputs")
+
+    adb = SimpleADB(
+        qnn_sdk=os.getenv("QNN_SDK_ROOT"),
+        build_path=args.build_folder,
+        pte_path=f"{args.artifact}/{pte_filename}.pte",
+        workspace=workspace,
+        device_id=args.device,
+        host_id=args.host,
+        soc_model=args.model,
+    )
+    adb.push(inputs=sample_input, files=op_package_paths)
+    adb.execute()
+    adb.pull(output_path=args.artifact)
+
+    # Compare the device result with the eager reference on the host.
+    x86_golden = Model()(*sample_input)
+    device_output = torch.from_numpy(
+        np.fromfile(
+            os.path.join(output_data_folder, "output_0_0.raw"), dtype=np.float32
+        )
+    ).reshape(x86_golden.size())
+    result = torch.all(torch.isclose(x86_golden, device_output, atol=1e-2)).item()
+    print("is_close?", result)
+    if not result:
+        # Dump both tensors to help diagnose the mismatch.
+        print(f"x86_golden {x86_golden}")
+        print(f"device_out {device_output}")
+
+if __name__ == "__main__":
+    # Extend the shared example argument parser with the options of this script.
+    parser = setup_common_args_and_variables()
+    parser.add_argument(
+        "-a",
+        "--artifact",
+        help="path for storing generated artifacts by this example. Default ./custom_op",
+        default="./custom_op",
+        type=str,
+    )
+
+    parser.add_argument(
+        "-d",
+        "--op_package_dir",
+        help="Path to operator package which generates from QNN.",
+        type=str,
+        required=True,
+    )
+
+    parser.add_argument(
+        "-F",
+        "--use_fp16",
+        help="If specified, will run in fp16 precision and discard ptq setting",
+        action="store_true",
+        default=False,
+    )
+
+    parser.add_argument(
+        "--build_op_package",
+        help="Build op package based on op_package_dir. Please set up "
+        "`HEXAGON_SDK_ROOT` and `ANDROID_NDK_ROOT` environment variable. "
+        "And add clang compiler into `PATH`. Please refer to Qualcomm AI Engine "
+        "Direct SDK document to get more details",
+        action="store_true",
+        default=False,
+    )
+
+    args = parser.parse_args()
+    args.validate(args)
+
+    try:
+        main(args)
+    except Exception as e:
+        if args.ip and args.port != -1:
+            # A listener was configured: report the failure to it as JSON.
+            with Client((args.ip, args.port)) as conn:
+                conn.send(json.dumps({"Error": str(e)}))
+        else:
+            # Re-raise as-is so the original exception type and traceback survive.
+            raise
diff --git a/examples/qualcomm/custom_op/fastgelu_op_package_htp/FastGeluOpPackage/Makefile b/examples/qualcomm/custom_op/fastgelu_op_package_htp/FastGeluOpPackage/Makefile
new file mode 100644
index 00000000000..75d0abd47b1
--- /dev/null
+++ b/examples/qualcomm/custom_op/fastgelu_op_package_htp/FastGeluOpPackage/Makefile
@@ -0,0 +1,385 @@
+# check all setup prerequisites if the command goal is not clean
+ifneq ($(MAKECMDGOALS),clean)
+ifndef QNN_INCLUDE
+$(info "INFO: Qnn include not explicitly defined, attempting to use QNN_SDK_ROOT if it is valid")
+QNN_INCLUDE := $(QNN_SDK_ROOT)/include/QNN
+endif
+ifeq ($(wildcard $(QNN_INCLUDE)),)
+$(error "ERROR: QNN_INCLUDE path is not set. QNN include paths must be set to obtain BE headers necessary to compile the package")
+endif
+ifndef QNN_TARGET_LIB
+$(info "INFO: Qnn target not explicitly defined, attempting to use QNN_SDK_ROOT if it is valid")
+QNN_TARGET_LIB := $(QNN_SDK_ROOT)/lib/aarch64-android
+endif
+ifeq ($(wildcard $(QNN_TARGET_LIB)),)
+ifeq ($(MAKECMDGOALS),htp_aarch64)
+$(error "ERROR: QNN_TARGET_LIB is needed to compile package for aarch64")
+else ifeq ($(MAKECMDGOALS),all)
+$(info "WARNING:QNN_TARGET_LIB may need to be defined to compile packages")
+endif
+endif
+
+ifndef HEXAGON_SDK_ROOT
+$(error "ERROR: HEXAGON_SDK_ROOT is not set. Hexagon-SDK path must be set to the latest hexagon-sdk-x.y.z")
+endif
+
+ifeq ($(wildcard $(HEXAGON_SDK_ROOT)),)
+$(error "ERROR: HEXAGON_SDK_ROOT is not set correctly. Please set HEXAGON_SDK_ROOT to latest hexagon-sdk-X.Y.Z path")
+endif
+
+HEXAGON_SDK_BASE := $(dir $(HEXAGON_SDK_ROOT))
+
+$(info "HEXAGON_SDK_ROOT is [${HEXAGON_SDK_ROOT}]")
+# Users should note that the tools version may change between hexagon sdk versions
+# Following combination of SDK and Tool version is supported
+HEXAGON_SDK_ROOT_V68 := $(HEXAGON_SDK_BASE)/hexagon-sdk-4.2.0
+HEXAGON_SDK_ROOT_V69 := $(HEXAGON_SDK_BASE)/hexagon-sdk-4.3.0
+HEXAGON_SDK_ROOT_V73 := $(HEXAGON_SDK_BASE)/hexagon-sdk-5.4.0
+HEXAGON_SDK_ROOT_V75 := $(HEXAGON_SDK_BASE)/hexagon-sdk-5.4.0
+HEXAGON_SDK_ROOT_V79 := $(HEXAGON_SDK_BASE)/hexagon-sdk-6.0.0
+HEXAGON_SDK_ROOT_V81 := $(HEXAGON_SDK_BASE)/hexagon-sdk-6.2.0
+#Updated to point to latest sdk to match with libQnnHtp.so
+HEXAGON_SDK_ROOT_X86 := $(HEXAGON_SDK_ROOT_V81)
+
+HEXAGON_TOOLS_VERSION_V68 := 8.4.09
+HEXAGON_TOOLS_VERSION_V69 := 8.5.03
+HEXAGON_TOOLS_VERSION_V73 := 8.6.02
+HEXAGON_TOOLS_VERSION_V75 := 8.7.03
+HEXAGON_TOOLS_VERSION_V79 := 8.8.02
+HEXAGON_TOOLS_VERSION_V81 := 19.0.01
+#Updated to point to latest sdk to match with libQnnHtp.so
+HEXAGON_TOOLS_VERSION_X86 := 19.0.01
+
+ifndef ANDROID_NDK_ROOT
+ifeq ($(MAKECMDGOALS),htp_aarch64)
+$(error "ERROR: ANDROID_NDK_ROOT is not set. Android NDK path must be set to compile package for aarch64")
+else ifeq ($(MAKECMDGOALS),all)
+$(info "WARNING: ANDROID_NDK_ROOT is not set. Android NDK path must be set to compile package for aarch64")
+endif
+endif
+
+ifndef PACKAGE_NAME
+export
+PACKAGE_NAME := $(notdir $(shell pwd))
+$(info "INFO: No package name defined. Using current directory name: $(PACKAGE_NAME) as the package name")
+endif
+
+WORK := build
+SRC_DIR := src
+OP_SRC_DIR := src/ops
+OP_INCLUDE_DIR := ./include
+OP_INCLUDES = #$(wildcard $(OP_INCLUDE_DIR)/*.h) user defined if any op specific headers are needed, add -I to common flags
+LIBRARY_NAME := libQnn$(PACKAGE_NAME).so
+SUPPORTED_TARGETS = x86_64-linux-clang hexagon-v68 hexagon-v69 hexagon-v73 hexagon-v75 hexagon-v79 hexagon-v81 aarch64-android
+
+
+COMMON_CXX_FLAGS = -std=c++17 -I$(QNN_INCLUDE) -fPIC -Wall -Wreorder -Wno-missing-braces -Wno-unused-function
+COMMON_CXX_FLAGS += -Werror -Wno-format -Wno-unused-command-line-argument -fvisibility=default -stdlib=libc++
+COMMON_CXX_FLAGS += -DQNN_API="__attribute__((visibility(\"default\")))" -D__QAIC_HEADER_EXPORT="__attribute__((visibility(\"default\")))"
+
+X86_LIBNATIVE_RELEASE_DIR := $(HEXAGON_SDK_ROOT_X86)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_X86)/Tools
+
+# Report the resolved x86 SDK root once (the message was previously emitted twice).
+$(info "HEXAGON_SDK_ROOT_X86 is [${HEXAGON_SDK_ROOT_X86}]")
+$(info "X86_LIBNATIVE_RELEASE_DIR is [${X86_LIBNATIVE_RELEASE_DIR}]")
+
+# Ensure hexagon sdk tool version can be retrieved
+ifeq ($(wildcard $(X86_LIBNATIVE_RELEASE_DIR)/.),)
+$(error "Cannot retrieve hexagon tools from: $(X86_LIBNATIVE_RELEASE_DIR). \
+ \
+ Please check that hexagon tools version is correct. Expected: $(HEXAGON_TOOLS_VERSION_X86)")
+endif
+
+#Check tools for hexagon_v79 are present.
+ifeq ($(MAKECMDGOALS),htp_v79)
+ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V79)),)
+$(error "ERROR: HEXAGON_SDK_ROOT_V79 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V79)")
+endif
+endif
+
+#Check tools for hexagon_v75 are present.
+ifeq ($(MAKECMDGOALS),htp_v75)
+ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V75)),)
+$(error "ERROR: HEXAGON_SDK_ROOT_V75 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V75)")
+endif
+endif
+
+#Check tools for hexagon_v68 are present.
+ifeq ($(MAKECMDGOALS),htp_v68)
+ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V68)),)
+$(error "ERROR: HEXAGON_SDK_ROOT_V68 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V68)")
+endif
+endif
+
+ifeq ($(MAKECMDGOALS),htp_v69)
+ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V69)),)
+$(error "ERROR: HEXAGON_SDK_ROOT_V69 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V69)")
+endif
+endif
+
+ifeq ($(MAKECMDGOALS),htp_v73)
+ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V73)),)
+$(error "ERROR: HEXAGON_SDK_ROOT_V73 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V73)")
+endif
+endif
+
+#Check tools for hexagon_v81 are present.
+ifeq ($(MAKECMDGOALS),htp_v81)
+ifeq ($(wildcard $(HEXAGON_SDK_ROOT_V81)),)
+$(error "ERROR: HEXAGON_SDK_ROOT_V81 is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V81)")
+endif
+endif
+
+
+endif
+OP_SOURCES = $(wildcard $(OP_SRC_DIR)/*.cpp)
+OTHER_SOURCES = $(wildcard $(SRC_DIR)/*.cpp)
+HFILES = $(wildcard $(QNN_INCLUDE)/*.h)
+HFILES += $(wildcard $(QNN_INCLUDE)/HTP/*.h)
+HFILES += $(wildcard $(QNN_INCLUDE)/HTP/core/*.h)
+OP_OBJS = $(patsubst $(SRC_DIR)/%,%,$(patsubst %.cpp,%.o,$(OP_SOURCES)))
+OTHER_OBJS = $(patsubst $(SRC_DIR)/%,%,$(patsubst %.cpp,%.o,$(OTHER_SOURCES)))
+
+#======= Assembly ========
+OP_SOURCES_ASM_X86 += $(wildcard $(OP_SRC_DIR)/x86_asm/*.S)
+OP_OBJS_ASM_X86 += $(subst /x86_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_X86))))
+OP_SOURCES_ASM_V68 += $(wildcard $(OP_SRC_DIR)/v68_asm/*.S)
+OP_OBJS_ASM_V68 += $(subst /v68_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V68))))
+OP_SOURCES_ASM_V69 += $(wildcard $(OP_SRC_DIR)/v69_asm/*.S)
+OP_OBJS_ASM_V69 += $(subst /v69_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V69))))
+OP_SOURCES_ASM_V73 += $(wildcard $(OP_SRC_DIR)/v73_asm/*.S)
+OP_OBJS_ASM_V73 += $(subst /v73_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V73))))
+OP_SOURCES_ASM_V75 += $(wildcard $(OP_SRC_DIR)/v75_asm/*.S)
+OP_OBJS_ASM_V75 += $(subst /v75_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V75))))
+OP_SOURCES_ASM_V79 += $(wildcard $(OP_SRC_DIR)/v79_asm/*.S)
+OP_OBJS_ASM_V79 += $(subst /v79_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V79))))
+OP_SOURCES_ASM_V81 += $(wildcard $(OP_SRC_DIR)/v81_asm/*.S)
+OP_OBJS_ASM_V81 += $(subst /v81_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V81))))
+
+OP_SOURCES_ASM_ANDROID += $(wildcard $(OP_SRC_DIR)/android_asm/*.S)
+OP_OBJS_ASM_ANDROID += $(subst /android_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_ANDROID))))
+
+all: htp_v68 htp_x86 htp_aarch64
+
+#============================================================================================================
+# Setup compiler, compiler instructions and linker for x86
+X86_CXX ?= clang++-9
+# Checking if clang++-9 is present. If not switch to clang++
+ifeq ($(shell $(X86_CXX) -v 2>&1 | grep -c "clang version"), 0)
+ X86_CXX := clang++
+endif
+X86_LDFLAGS:= -Wl,--whole-archive -L$(X86_LIBNATIVE_RELEASE_DIR)/libnative/lib -lnative -Wl,--no-whole-archive -lpthread
+X86_C_FLAGS := -D__HVXDBL__ -I$(X86_LIBNATIVE_RELEASE_DIR)/libnative/include -ffast-math -DUSE_OS_LINUX
+X86_CXX_FLAGS = $(COMMON_CXX_FLAGS) $(X86_C_FLAGS) -fomit-frame-pointer -Wno-invalid-offsetof
+linux_objs =
+#============================================================================================================
+# Setup compiler, compiler instructions and linker for hexagon
+HEXAGON_CXX_FLAGS := $(COMMON_CXX_FLAGS) -mhvx -mhvx-length=128B -mhmx -DUSE_OS_QURT -O2 -Wno-reorder -DPREPARE_DISABLED
+
+HEXAGON_CXX_FLAGS_V68 := $(HEXAGON_CXX_FLAGS) -mv68 -I$(HEXAGON_SDK_ROOT_V68)/rtos/qurt/computev68/include/qurt -I$(HEXAGON_SDK_ROOT_V68)/rtos/qurt/computev68/include/posix -I$(HEXAGON_SDK_ROOT_V68)/incs -I$(HEXAGON_SDK_ROOT_V68)/incs/stddef
+HEXAGON_CXX_FLAGS_V69 := $(HEXAGON_CXX_FLAGS) -mv69 -I$(HEXAGON_SDK_ROOT_V69)/rtos/qurt/computev69/include/qurt -I$(HEXAGON_SDK_ROOT_V69)/rtos/qurt/computev69/include/posix -I$(HEXAGON_SDK_ROOT_V69)/incs -I$(HEXAGON_SDK_ROOT_V69)/incs/stddef
+HEXAGON_CXX_FLAGS_V73 := $(HEXAGON_CXX_FLAGS) -mv73 -I$(HEXAGON_SDK_ROOT_V73)/rtos/qurt/computev73/include/qurt -I$(HEXAGON_SDK_ROOT_V73)/rtos/qurt/computev73/include/posix -I$(HEXAGON_SDK_ROOT_V73)/incs -I$(HEXAGON_SDK_ROOT_V73)/incs/stddef
+HEXAGON_CXX_FLAGS_V75 := $(HEXAGON_CXX_FLAGS) -mv75 -I$(HEXAGON_SDK_ROOT_V75)/rtos/qurt/computev75/include/qurt -I$(HEXAGON_SDK_ROOT_V75)/rtos/qurt/computev75/include/posix -I$(HEXAGON_SDK_ROOT_V75)/incs -I$(HEXAGON_SDK_ROOT_V75)/incs/stddef
+HEXAGON_CXX_FLAGS_V79 := $(HEXAGON_CXX_FLAGS) -mv79 -I$(HEXAGON_SDK_ROOT_V79)/rtos/qurt/computev79/include/qurt -I$(HEXAGON_SDK_ROOT_V79)/rtos/qurt/computev79/include/posix -I$(HEXAGON_SDK_ROOT_V79)/incs -I$(HEXAGON_SDK_ROOT_V79)/incs/stddef
+HEXAGON_CXX_FLAGS_V81 := $(HEXAGON_CXX_FLAGS) -mv81 -I$(HEXAGON_SDK_ROOT_V81)/rtos/qurt/computev81/include/qurt -I$(HEXAGON_SDK_ROOT_V81)/rtos/qurt/computev81/include/posix -I$(HEXAGON_SDK_ROOT_V81)/incs -I$(HEXAGON_SDK_ROOT_V81)/incs/stddef
+
+HEXAGON_CXX_V68 := $(HEXAGON_SDK_ROOT_V68)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V68)/Tools/bin/hexagon-clang++
+HEXAGON_CXX_V69 := $(HEXAGON_SDK_ROOT_V69)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V69)/Tools/bin/hexagon-clang++
+HEXAGON_CXX_V73 := $(HEXAGON_SDK_ROOT_V73)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V73)/Tools/bin/hexagon-clang++
+HEXAGON_CXX_V75 := $(HEXAGON_SDK_ROOT_V75)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V75)/Tools/bin/hexagon-clang++
+HEXAGON_CXX_V79 := $(HEXAGON_SDK_ROOT_V79)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V79)/Tools/bin/hexagon-clang++
+HEXAGON_CXX_V81 := $(HEXAGON_SDK_ROOT_V81)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V81)/Tools/bin/hexagon-clang++
+
+HEX_LDFLAGS =
+hexagon_objs =
+#============================================================================================================
+# Setup compiler, compiler instructions and linker for aarch64
+AARCH64_C__FLAGS = -D__HVXDBL__ -I$(X86_LIBNATIVE_RELEASE_DIR)/libnative/include -ffast-math -DUSE_OS_LINUX -DANDROID
+AARCH64_CXX_FLAGS = $(COMMON_CXX_FLAGS) $(AARCH64_C__FLAGS) -fomit-frame-pointer -Wno-invalid-offsetof -Wno-unused-variable -Wno-unused-parameter -Wno-missing-braces -Wno-sign-compare -Wno-unused-private-field -Wno-unused-variable -Wno-ignored-qualifiers -Wno-missing-field-initializers
+ARM_CLANG_OPTS =--target=aarch64-none-linux-android21 --sysroot=$(ANDROID_NDK_ROOT)/toolchains/llvm/prebuilt/linux-x86_64/sysroot -stdlib=libc++ -static-libstdc++
+AARCH64_CXX = $(ANDROID_NDK_ROOT)/toolchains/llvm/prebuilt/linux-x86_64/bin/clang++ $(ARM_CLANG_OPTS)
+AARCH64_LDFLAGS = -L$(QNN_TARGET_LIB) -lQnnHtp -lQnnHtpPrepare
+aarch64_objs =
+#============================================================================================================
+# Setup targets and goals
+
+htp_x86: X86_BUILD
+
+htp_v68: HEXAGON_BUILD_V68
+
+htp_v69: HEXAGON_BUILD_V69
+
+htp_v73: HEXAGON_BUILD_V73
+
+htp_v75: HEXAGON_BUILD_V75
+
+htp_v79: HEXAGON_BUILD_V79
+
+htp_v81: HEXAGON_BUILD_V81
+
+htp_aarch64: AARCH64_BUILD
+
+AARCH64_BUILD: $(WORK)/aarch64-android/$(LIBRARY_NAME)
+
+HEXAGON_BUILD_V68: $(WORK)/hexagon-v68/$(LIBRARY_NAME)
+
+HEXAGON_BUILD_V69: $(WORK)/hexagon-v69/$(LIBRARY_NAME)
+
+HEXAGON_BUILD_V73: $(WORK)/hexagon-v73/$(LIBRARY_NAME)
+
+HEXAGON_BUILD_V75: $(WORK)/hexagon-v75/$(LIBRARY_NAME)
+
+HEXAGON_BUILD_V79: $(WORK)/hexagon-v79/$(LIBRARY_NAME)
+
+HEXAGON_BUILD_V81: $(WORK)/hexagon-v81/$(LIBRARY_NAME)
+
+X86_BUILD: $(WORK)/x86_64-linux-clang/$(LIBRARY_NAME)
+
+
+define build_objs =
+ifneq ($(filter $(2),$(SUPPORTED_TARGETS)),)
+$(2)_objs += $(foreach x,$(1),$(WORK)/$(2)/$(x))
+else
+$$(error "Unknown target option provided: $(2): Supported targets are: $(SUPPORTED_TARGETS)")
+endif
+endef
+
+$(eval $(call build_objs,$(OTHER_OBJS),x86_64-linux-clang))
+$(eval $(call build_objs,$(OP_OBJS),x86_64-linux-clang))
+$(eval $(call build_objs,$(OP_OBJS_ASM_X86),x86_64-linux-clang))
+$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v68))
+$(eval $(call build_objs,$(OP_OBJS),hexagon-v68))
+$(eval $(call build_objs,$(OP_OBJS_ASM_V68),hexagon-v68))
+$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v69))
+$(eval $(call build_objs,$(OP_OBJS),hexagon-v69))
+$(eval $(call build_objs,$(OP_OBJS_ASM_V69),hexagon-v69))
+$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v73))
+$(eval $(call build_objs,$(OP_OBJS),hexagon-v73))
+$(eval $(call build_objs,$(OP_OBJS_ASM_V73),hexagon-v73))
+$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v75))
+$(eval $(call build_objs,$(OP_OBJS),hexagon-v75))
+$(eval $(call build_objs,$(OP_OBJS_ASM_V75),hexagon-v75))
+$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v79))
+$(eval $(call build_objs,$(OP_OBJS),hexagon-v79))
+# Use the v79 assembly objects: the hexagon-v79 pattern rule compiles
+# $(OP_SRC_DIR)/v79_asm/%.S, so the v75 list would name objects with no source.
+$(eval $(call build_objs,$(OP_OBJS_ASM_V79),hexagon-v79))
+$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v81))
+$(eval $(call build_objs,$(OP_OBJS),hexagon-v81))
+$(eval $(call build_objs,$(OP_OBJS_ASM_V81),hexagon-v81))
+
+
+$(eval $(call build_objs,$(OTHER_OBJS),aarch64-android))
+$(eval $(call build_objs,$(OP_OBJS),aarch64-android))
+$(eval $(call build_objs,$(OP_OBJS_ASM_ANDROID),aarch64-android))
+
+# x86
+$(WORK)/x86_64-linux-clang $(WORK)/hexagon-v68 $(WORK)/hexagon-v69 $(WORK)/hexagon-v73 $(WORK)/hexagon-v75 $(WORK)/hexagon-v79 $(WORK)/hexagon-v81 $(WORK)/aarch64-android:
+ @mkdir -p $@/ops
+
+$(WORK)/x86_64-linux-clang/%.o: $(SRC_DIR)/%.cpp | $(WORK)/x86_64-linux-clang
+ $(X86_CXX) $(X86_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
+
+$(WORK)/x86_64-linux-clang/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/x86_64-linux-clang
+ $(X86_CXX) $(X86_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
+
+$(WORK)/x86_64-linux-clang/ops/%.o: $(OP_SRC_DIR)/x86_asm/%.S | $(WORK)/x86_64-linux-clang
+ $(X86_CXX) $(X86_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
+
+$(WORK)/x86_64-linux-clang/$(LIBRARY_NAME): $(x86_64-linux-clang_objs) | $(HFILES)
+ $(X86_CXX) -fPIC -std=c++17 -g -shared -o $@ $^ $(X86_LDFLAGS)
+
+# v68
+$(WORK)/hexagon-v68/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v68
+ $(HEXAGON_CXX_V68) $(HEXAGON_CXX_FLAGS_V68) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
+
+$(WORK)/hexagon-v68/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v68
+ $(HEXAGON_CXX_V68) $(HEXAGON_CXX_FLAGS_V68) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
+
+$(WORK)/hexagon-v68/ops/%.o: $(OP_SRC_DIR)/v68_asm/%.S | $(WORK)/hexagon-v68
+ $(HEXAGON_CXX_V68) $(HEXAGON_CXX_FLAGS_V68) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
+
+$(WORK)/hexagon-v68/$(LIBRARY_NAME): $(hexagon-v68_objs) | $(HFILES)
+ $(HEXAGON_CXX_V68) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS)
+
+# v69
+$(WORK)/hexagon-v69/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v69
+ $(HEXAGON_CXX_V69) $(HEXAGON_CXX_FLAGS_V69) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
+
+$(WORK)/hexagon-v69/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v69
+ $(HEXAGON_CXX_V69) $(HEXAGON_CXX_FLAGS_V69) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
+
+$(WORK)/hexagon-v69/ops/%.o: $(OP_SRC_DIR)/v69_asm/%.S | $(WORK)/hexagon-v69
+ $(HEXAGON_CXX_V69) $(HEXAGON_CXX_FLAGS_V69) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
+
+$(WORK)/hexagon-v69/$(LIBRARY_NAME): $(hexagon-v69_objs) | $(HFILES)
+ $(HEXAGON_CXX_V69) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS)
+
+# v73
+$(WORK)/hexagon-v73/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v73
+ $(HEXAGON_CXX_V73) $(HEXAGON_CXX_FLAGS_V73) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
+
+$(WORK)/hexagon-v73/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v73
+ $(HEXAGON_CXX_V73) $(HEXAGON_CXX_FLAGS_V73) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
+
+$(WORK)/hexagon-v73/ops/%.o: $(OP_SRC_DIR)/v73_asm/%.S | $(WORK)/hexagon-v73
+ $(HEXAGON_CXX_V73) $(HEXAGON_CXX_FLAGS_V73) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
+
+$(WORK)/hexagon-v73/$(LIBRARY_NAME): $(hexagon-v73_objs) | $(HFILES)
+ $(HEXAGON_CXX_V73) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS)
+
+#v75
+$(WORK)/hexagon-v75/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v75
+ $(HEXAGON_CXX_V75) $(HEXAGON_CXX_FLAGS_V75) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
+
+$(WORK)/hexagon-v75/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v75
+ $(HEXAGON_CXX_V75) $(HEXAGON_CXX_FLAGS_V75) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
+
+$(WORK)/hexagon-v75/ops/%.o: $(OP_SRC_DIR)/v75_asm/%.S | $(WORK)/hexagon-v75
+ $(HEXAGON_CXX_V75) $(HEXAGON_CXX_FLAGS_V75) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
+
+$(WORK)/hexagon-v75/$(LIBRARY_NAME): $(hexagon-v75_objs) | $(HFILES)
+ $(HEXAGON_CXX_V75) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS)
+
+#v79
+$(WORK)/hexagon-v79/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v79
+ $(HEXAGON_CXX_V79) $(HEXAGON_CXX_FLAGS_V79) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
+
+$(WORK)/hexagon-v79/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v79
+ $(HEXAGON_CXX_V79) $(HEXAGON_CXX_FLAGS_V79) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
+
+$(WORK)/hexagon-v79/ops/%.o: $(OP_SRC_DIR)/v79_asm/%.S | $(WORK)/hexagon-v79
+ $(HEXAGON_CXX_V79) $(HEXAGON_CXX_FLAGS_V79) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
+
+$(WORK)/hexagon-v79/$(LIBRARY_NAME): $(hexagon-v79_objs) | $(HFILES)
+ $(HEXAGON_CXX_V79) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS)
+
+# v81
+$(WORK)/hexagon-v81/%.o: $(SRC_DIR)/%.cpp | $(WORK)/hexagon-v81
+ $(HEXAGON_CXX_V81) $(HEXAGON_CXX_FLAGS_V81) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
+
+$(WORK)/hexagon-v81/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/hexagon-v81
+ $(HEXAGON_CXX_V81) $(HEXAGON_CXX_FLAGS_V81) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
+
+$(WORK)/hexagon-v81/ops/%.o: $(OP_SRC_DIR)/v81_asm/%.S | $(WORK)/hexagon-v81
+ $(HEXAGON_CXX_V81) $(HEXAGON_CXX_FLAGS_V81) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
+
+$(WORK)/hexagon-v81/$(LIBRARY_NAME): $(hexagon-v81_objs) | $(HFILES)
+ $(HEXAGON_CXX_V81) -fPIC -std=c++17 -g -shared -o $@ $^ $(HEX_LDFLAGS)
+
+
+# aarch64
+$(WORK)/aarch64-android/%.o: $(SRC_DIR)/%.cpp | $(WORK)/aarch64-android
+ $(AARCH64_CXX) $(AARCH64_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
+
+$(WORK)/aarch64-android/ops/%.o: $(OP_SRC_DIR)/%.cpp | $(WORK)/aarch64-android
+ $(AARCH64_CXX) $(AARCH64_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
+
+$(WORK)/aarch64-android/ops/%.o: $(OP_SRC_DIR)/android_asm/%.S | $(WORK)/aarch64-android
+ $(AARCH64_CXX) $(AARCH64_CXX_FLAGS) -DTHIS_PKG_NAME=$(PACKAGE_NAME) -MMD -c $< -o $@
+
+$(WORK)/aarch64-android/$(LIBRARY_NAME): $(aarch64-android_objs) | $(HFILES)
+ $(AARCH64_CXX) -fPIC -std=c++17 -g -shared -o $@ $^ $(AARCH64_LDFLAGS)
+
+clean:
+ -rm -rf $(WORK)
+
+# The alias goals below never create a file of their own name; declaring them
+# phony keeps a stray file or directory with such a name from masking the build.
+.PHONY: all clean htp_x86 htp_aarch64 htp_v68 htp_v69 htp_v73 htp_v75 htp_v79 htp_v81
+.PHONY: X86_BUILD AARCH64_BUILD HEXAGON_BUILD_V68 HEXAGON_BUILD_V69 HEXAGON_BUILD_V73
+.PHONY: HEXAGON_BUILD_V75 HEXAGON_BUILD_V79 HEXAGON_BUILD_V81
diff --git a/examples/qualcomm/custom_op/fastgelu_op_package_htp/FastGeluOpPackage/config/fastgelu_op_package_htp.xml b/examples/qualcomm/custom_op/fastgelu_op_package_htp/FastGeluOpPackage/config/fastgelu_op_package_htp.xml
new file mode 100644
index 00000000000..ebe553edc41
--- /dev/null
+++ b/examples/qualcomm/custom_op/fastgelu_op_package_htp/FastGeluOpPackage/config/fastgelu_op_package_htp.xml
@@ -0,0 +1,39 @@
+
+
+
+
+ FastGelu
+
+ FastGELU: y = 0.5 * x * (1 + tanh(√(2/π) * (x + 0.044715 * x^3)))
+
+
+
+ x
+ Input tensor
+ true
+ QNN_DATATYPE_FLOAT_32
+ QNN_DATATYPE_UFIXED_POINT_8
+
+ ND
+ UNDEFINED
+ elementwise
+
+
+
+
+
+ HTP
+
+
+
diff --git a/examples/qualcomm/custom_op/fastgelu_op_package_htp/FastGeluOpPackage/src/FastGeluOpPackageInterface.cpp b/examples/qualcomm/custom_op/fastgelu_op_package_htp/FastGeluOpPackage/src/FastGeluOpPackageInterface.cpp
new file mode 100644
index 00000000000..af62ee8a84e
--- /dev/null
+++ b/examples/qualcomm/custom_op/fastgelu_op_package_htp/FastGeluOpPackage/src/FastGeluOpPackageInterface.cpp
@@ -0,0 +1,274 @@
+//==============================================================================
+// Auto Generated Code for FastGeluOpPackage
+//==============================================================================
+
+#include "HTP/QnnHtpCommon.h"
+#include "HTP/core/constraints.h"
+#include "HTP/core/op_package_feature_support.h"
+#include "HTP/core/op_register_ext.h"
+#include "HTP/core/optimize.h"
+#include "HTP/core/simple_reg.h"
+#include "HTP/core/unique_types.h"
+#include "QnnOpPackage.h"
+#include "QnnSdkBuildId.h"
+
+DEFINE_UNIQ_TY()
+BEGIN_PKG_OPS_OPTS_LIST()
+
+/** Note that the order of declarations given here defines the order in which
+ * ops and graph optimizations are registered to the HTP Core.
+ * Append the latest OpName at the bottom.
+ * PKG_FastGelu is the same identifier that ops/FastGelu.cpp passes to
+ * BEGIN_PKG_OP_DEFINITION / END_PKG_OP_DEFINITION.
+ */
+DECLARE_PKG_OPS_OPTS_LIST(PKG_FastGelu)
+
+END_PKG_OPS_OPTS_LIST()
+
+// op package info
+static constexpr auto sg_packageName = THIS_PKG_NAME_STR; // package name passed in as compile flag
+
+// Names of the ops exported by this package, reported through
+// FastGeluOpPackageGetInfo(). The element type and count are spelled out
+// explicitly: a double-braced initializer gives class template argument
+// deduction nothing to deduce from under -std=c++17.
+static std::array<const char*, 1> sg_opNames{{"FastGelu"}};
+
+static Qnn_ApiVersion_t sg_sdkApiVersion = QNN_HTP_API_VERSION_INIT;
+static QnnOpPackage_Info_t sg_packageInfo = QNN_OP_PACKAGE_INFO_INIT;
+
+// global data
+static QnnOpPackage_GlobalInfrastructure_t sg_globalInfra =
+    nullptr; // global infrastructure not in use for now
+static bool sg_packageInitialized = false;
+
+/*
+ * user provided logging call back function
+ * currently only supported on linux x86-64 and nonrpc versions
+ * typedef void (*QnnLog_Callback_t)(const char* fmt,
+ * QnnLog_Level_t level,
+ * uint64_t timestamp,
+ * va_list args);
+ * usage: if(sg_logInitialized && level <= sg_maxLogLevel)
+ * sg_logCallback(fmt, level, timestamp, args);
+ *
+ * for cross rpc versions, skel side user provided logging call back function
+ * can be defined as part of op packages. maximal log level sg_maxLogLevel
+ * can be set by Qnn_ErrorHandle_t FastGeluOpPackageLogSetLevel(QnnLog_Level_t maxLogLevel)
+ */
+/*
+ * for alternative logging method provided by HTP core, please refer to log.h
+ */
+// User-provided logging callback; stays null until FastGeluOpPackageLogInitialize().
+static QnnLog_Callback_t sg_logCallback = nullptr;
+// Maximal log level honoured by the user-provided logging method.
+static QnnLog_Level_t sg_maxLogLevel = static_cast<QnnLog_Level_t>(0);
+// Tracks whether the user-provided logging method has been initialized.
+static bool sg_logInitialized = false;
+
+
+/*
+ * Op initialization.
+ * Needs to be global in the package.
+ * One initialization per package, before any op definitions.
+ * Syntax: INIT_PACKAGE_OP_DEF()
+ */
+INIT_PACKAGE_OP_DEF()
+
+/*
+ * Optimization initialization.
+ * Needs to be global in the package.
+ * One initialization per package, before any optimization definitions.
+ * Syntax: INIT_PACKAGE_OPTIMIZATION_DEF()
+ */
+INIT_PACKAGE_OPTIMIZATION_DEF()
+
+/*
+ * Op parameter order initialization.
+ * Needs to be global in the package.
+ * One initialization per package, before any op parameter order definitions.
+ * Syntax: INIT_PACKAGE_PARAM_ORDER_DEF()
+ */
+INIT_PACKAGE_PARAM_ORDER_DEF()
+
+/*
+ * axis parameter name list
+ * optional
+ * needs to be global in the package
+ * one list per package
+ * for listing axis parameter names passed into Qnn_AddNode API
+ * HTP backend auto-adjusts values in axis parameters based on HTP backfilling
+ * note: HTP backend backfills tensor dimensions to 4 dimensions
+ * syntax: LIST_PACKAGE_AXIS_PARAMS(...)
+ * e.g. LIST_PACKAGE_AXIS_PARAMS("Axis", "AXIS", "axis")
+ */
+// LIST_PACKAGE_AXIS_PARAMS()
+
+/*
+ * per-channel quantized op name list
+ * optional
+ * needs to be global in the package
+ * one list per package
+ * for listing op names which support per-channel quantization
+ * per-axis quantization info of an op is embedded in axisScaleOffsetEncoding
+ * inside Qnn_Tensor_t types
+ * HTP backend only supports per-channel scale ops
+ * i.e. along last dimension, offset is always zero
+ * if an op name is marked as having per-channel scale support, and in
+ * QNN_AddNode, at least one input, parameter, or output has
+ * QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET type:
+ * then:
+ * HTP backend will pass to op implementation function the following:
+ * output(s), input(s), parameter(s),
+ * outputPerChannelScale(s), inputPerChannelScale(s), paramPerChannelScale(s)
+ *
+ * optimization rules can be used to remove extra perChannelScale tensors
+ *
+ * syntax: LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS(...)
+ * e.g. LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS(sg_op1Name, sg_op2Name)
+ */
+
+// LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS()
+
+/*
+ * Declare and define the special initialize function for the HTP backend to load.
+ */
+INIT_PKG_CORE_INIT_FUNC()
+
+/* op package API's */
+
+/**
+ * Initialize the op package.
+ *
+ * Runs the package-level registration macros, stores the infrastructure
+ * handle and marks the package as initialized.
+ *
+ * @param infrastructure global infrastructure handle, stored in sg_globalInfra
+ * @return QNN_SUCCESS, or QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED
+ *         when the package was already initialized
+ */
+Qnn_ErrorHandle_t FastGeluOpPackageInit(QnnOpPackage_GlobalInfrastructure_t infrastructure) {
+  if (sg_packageInitialized) {
+    return QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED;
+  }
+
+  // Registers all op parameter orders defined in the package.
+  REGISTER_PACKAGE_PARAM_ORDERS()
+
+  // Registers all axis parameter names listed with LIST_PACKAGE_AXIS_PARAMS(...).
+  REGISTER_PACKAGE_AXIS_PARAMS()
+
+  // Registers all per-channel scale op names listed with
+  // LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS(...).
+  REGISTER_PACKAGE_PER_CHANNEL_QUANTIZED_OPS()
+
+  sg_globalInfra = infrastructure;
+  sg_packageInitialized = true;
+  return QNN_SUCCESS;
+}
+
+/**
+ * Report package metadata (name, op names, SDK build id and API version).
+ *
+ * @param info out-parameter; on success it points at the package-owned
+ *             sg_packageInfo, which is re-populated on every call
+ * @return QNN_SUCCESS, QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED when the
+ *         package is not initialized, or QNN_OP_PACKAGE_ERROR_INVALID_INFO
+ *         when info is null
+ */
+Qnn_ErrorHandle_t FastGeluOpPackageGetInfo(const QnnOpPackage_Info_t** info) {
+  if (!sg_packageInitialized) {
+    return QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED;
+  }
+  if (info == nullptr) {
+    return QNN_OP_PACKAGE_ERROR_INVALID_INFO;
+  }
+
+  // Start from the default initializer, then fill in the package specifics.
+  sg_packageInfo = QNN_OP_PACKAGE_INFO_INIT;
+  sg_packageInfo.packageName = sg_packageName;
+  sg_packageInfo.operationNames = sg_opNames.data();
+  sg_packageInfo.numOperations = sg_opNames.size();
+  sg_packageInfo.sdkBuildId = QNN_SDK_BUILD_ID;
+  sg_packageInfo.sdkApiVersion = &sg_sdkApiVersion;
+
+  *info = &sg_packageInfo;
+  return QNN_SUCCESS;
+}
+
+/**
+ * Install the user-provided logging callback and maximal log level.
+ *
+ * @param callback    logging callback; must not be null
+ * @param maxLogLevel maximal level to log; must be >= QNN_LOG_LEVEL_ERROR
+ * @return QNN_SUCCESS, QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED when
+ *         logging was already initialized, or QNN_LOG_ERROR_INVALID_ARGUMENT
+ *         for a null callback or an out-of-range level
+ */
+Qnn_ErrorHandle_t FastGeluOpPackageLogInitialize(QnnLog_Callback_t callback, QnnLog_Level_t maxLogLevel) {
+  if (sg_logInitialized) {
+    return QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED;
+  }
+  const bool badArguments = !callback || maxLogLevel < QNN_LOG_LEVEL_ERROR;
+  if (badArguments) {
+    return QNN_LOG_ERROR_INVALID_ARGUMENT;
+  }
+
+  sg_logCallback = callback;
+  sg_maxLogLevel = maxLogLevel;
+  sg_logInitialized = true;
+  return QNN_SUCCESS;
+}
+
+/**
+ * Change the maximal log level used by the user-provided logging method.
+ *
+ * @param maxLogLevel new maximal level; must be >= QNN_LOG_LEVEL_ERROR
+ * @return QNN_SUCCESS, or QNN_LOG_ERROR_INVALID_ARGUMENT for a lower level
+ */
+Qnn_ErrorHandle_t FastGeluOpPackageLogSetLevel(QnnLog_Level_t maxLogLevel) {
+  if (maxLogLevel < QNN_LOG_LEVEL_ERROR) {
+    return QNN_LOG_ERROR_INVALID_ARGUMENT;
+  }
+
+  sg_maxLogLevel = maxLogLevel;
+  return QNN_SUCCESS;
+}
+
+/**
+ * Tear down user-provided logging and reset the logging globals to their
+ * initial values.
+ *
+ * @return QNN_SUCCESS, or QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED when
+ *         logging was never initialized
+ */
+Qnn_ErrorHandle_t FastGeluOpPackageLogTerminate() {
+  if (!sg_logInitialized) {
+    return QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED;
+  }
+
+  sg_logInitialized = false;
+  sg_maxLogLevel = static_cast<QnnLog_Level_t>(0);
+  sg_logCallback = nullptr;
+  return QNN_SUCCESS;
+}
+
+/**
+ * Validate an op config against the ops registered by this package.
+ *
+ * The config is accepted only when its package name equals sg_packageName,
+ * its type name is "FastGelu", and it carries no parameters, exactly one
+ * input and exactly one output.
+ *
+ * @param opConfig op config to validate (the v1 layout is read)
+ * @return QNN_SUCCESS when the config is valid, otherwise
+ *         QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE
+ */
+Qnn_ErrorHandle_t FastGeluOpPackageValidateOpConfig (Qnn_OpConfig_t opConfig){
+  const char* const packageName = opConfig.v1.packageName;
+  const char* const typeName = opConfig.v1.typeName;
+
+  // Constructing a std::string from a null pointer is undefined behaviour, so
+  // a config without names is rejected before any comparison.
+  if (packageName == nullptr || typeName == nullptr) {
+    return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE;
+  }
+
+  if (std::string(sg_packageName) != packageName) {
+    return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE;
+  }
+
+  /* auto-generated validation code below
+   * Check if op config type matches any registered ops
+   * If a match is found, check number of inputs, outputs and params
+   */
+  if (std::string(typeName) != "FastGelu") {
+    return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE;
+  }
+  if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 1 ||
+      opConfig.v1.numOfOutputs != 1) {
+    return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE;
+  }
+
+  /*
+   * additional validation code here
+   * */
+
+  return QNN_SUCCESS;
+}
+
+/* The following three functions in this comment are not called by HTP backend for now,
+ * no auto-generated implementations are created. Users should see example for full function signatures.
+ * (version 1.3.0) Qnn_ErrorHandle_t FastGeluOpPackageCreateKernels (QnnOpPackage_GraphInfrastructure_t
+ * graphInfrastructure, QnnOpPackage_Node_t node, QnnOpPackage_Kernel_t** kernels, uint32_t*
+ * numKernels)
+ * (version 1.3.0) Qnn_ErrorHandle_t FastGeluOpPackageFreeKernels (QnnOpPackage_Kernel_t* kernels)
+ *
+ * (version 1.4.0) Qnn_ErrorHandle_t FastGeluOpPackageCreateOpImpl (QnnOpPackage_GraphInfrastructure_t
+ * graphInfrastructure, QnnOpPackage_Node_t node, QnnOpPackage_OpImpl_t* opImpl)
+ *(version 1.4.0) Qnn_ErrorHandle_t FastGeluOpPackageFreeOpImpl (QnnOpPackage_OpImpl_t opImpl)
+ */
+
+/**
+ * Terminate the op package: drop the stored infrastructure handle and mark
+ * the package as uninitialized.
+ *
+ * @return QNN_SUCCESS, or QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED when
+ *         the package was not initialized
+ */
+Qnn_ErrorHandle_t FastGeluOpPackageTerminate() {
+  if (!sg_packageInitialized) {
+    return QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED;
+  }
+
+  sg_packageInitialized = false;
+  sg_globalInfra = nullptr;
+  return QNN_SUCCESS;
+}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* latest version */
+/**
+ * Fill in the op package interface table (interface version 1.4.0).
+ *
+ * createOpImpl and freeOpImpl are left null; every other entry points at the
+ * corresponding FastGeluOpPackage* function of this file.
+ *
+ * @param interface table to populate; must not be null
+ * @return QNN_SUCCESS, or QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT when
+ *         interface is null
+ */
+Qnn_ErrorHandle_t FastGeluOpPackageInterfaceProvider(QnnOpPackage_Interface_t* interface) {
+  if (interface == nullptr) {
+    return QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT;
+  }
+
+  interface->interfaceVersion = {1, 4, 0};
+
+  auto& api = interface->v1_4;
+  // Package lifecycle and introspection.
+  api.init = FastGeluOpPackageInit;
+  api.terminate = FastGeluOpPackageTerminate;
+  api.getInfo = FastGeluOpPackageGetInfo;
+  api.validateOpConfig = FastGeluOpPackageValidateOpConfig;
+  // No op-impl entry points are provided by this package.
+  api.createOpImpl = nullptr;
+  api.freeOpImpl = nullptr;
+  // Logging.
+  api.logInitialize = FastGeluOpPackageLogInitialize;
+  api.logSetLevel = FastGeluOpPackageLogSetLevel;
+  api.logTerminate = FastGeluOpPackageLogTerminate;
+  return QNN_SUCCESS;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+
diff --git a/examples/qualcomm/custom_op/fastgelu_op_package_htp/FastGeluOpPackage/src/ops/FastGelu.cpp b/examples/qualcomm/custom_op/fastgelu_op_package_htp/FastGeluOpPackage/src/ops/FastGelu.cpp
new file mode 100644
index 00000000000..e8ac754f6d9
--- /dev/null
+++ b/examples/qualcomm/custom_op/fastgelu_op_package_htp/FastGeluOpPackage/src/ops/FastGelu.cpp
@@ -0,0 +1,271 @@
+//==============================================================================
+// Auto Generated Code for FastGeluOpPackage
+//==============================================================================
+
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+#include "HTP/core/constraints.h"
+#include "HTP/core/op_package_feature_support.h"
+#include "HTP/core/op_register_ext.h"
+#include "HTP/core/optimize.h"
+#include "HTP/core/simple_reg.h"
+#include "QnnOpPackage.h"
+
+BEGIN_PKG_OP_DEFINITION(PKG_FastGelu);
+
+// op execute function declarations
+// fastgeluImpl is a function template over the tensor type; the template
+// parameter list is required for the declaration to be valid C++.
+template <typename TensorType>
+GraphStatus fastgeluImpl(TensorType& y, const TensorType& x);
+
+// forward declaration of sample cost function
+static float fastgeluCostFunc(const Op* op);
+
+/*
+ * method 1 for defining op, using default cost value (i.e. GLACIAL) and default
+ * flag (Flags::RESOURCE_HVX)
+ * syntax: DEF_PACKAGE_OP(F,OP)
+ * e.g. DEF_PACKAGE_OP((fastgeluImpl<Tensor>), "FastGelu")
+ *
+ * The macro needs a concrete function, so the template is instantiated
+ * explicitly with Tensor.
+ */
+DEF_PACKAGE_OP((fastgeluImpl<Tensor>), "FastGelu")
+
+/*
+ * method 2 for defining op with specified cost value (one of GLACIAL, SNAIL,
+ * FAST, FREE) and provided flags syntax:
+ * DEF_PACKAGE_OP_AND_COST_AND_FLAGS(F,OP,COST,...) can use zero or more flags,
+ * FLAG options are IS_CONST, INHIBIT_CONST_PROP, RESOURCE_HVX, RESOURCE_HMX(not
+ * supported in external op packages) e.g.
+ * DEF_PACKAGE_OP_AND_COST_AND_FLAGS((fastgeluImpl),
+ * "FastGelu", SNAIL)
+ */
+
+/*
+ * method 3 for defining op with cost function pointer and provided flags
+ * cost function pointer type: typedef float (*cost_function) (const Op * op);
+ * syntax: DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS(F,OP,COST_F,...)
+ * e.g. DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS((fastgeluImpl),
+ * "FastGelu", fastgeluCostFunc, Flags::RESOURCE_HVX)
+ */
+
+/*
+ * optimization definitions
+ * need to be global in the package
+ * one definition per optimization
+ * syntax:
+ * DEF_PACKAGE_OPTIMIZATION(PRIORITY,MATCHCODE,CONSTRAINTCODE,REPLACECODE)
+ * PRIORITY predefined values include EARLY(2000), MIDDLE(3000), LATE(4000)
+ * HTP core provides some replacement functions for op package to use
+ * for more information about optimization rules, please refer to HTP core
+ * documentations
+ */
+
+/*
+ * op parameter order definitions
+ * need to be global in the package
+ * one definition per op, and this is optional
+ * syntax:
+ * DEF_PACKAGE_PARAM_ORDER(OP,PARAM1,MANDATORY1,DEFAULT1,PARAM2,MANDATORY2,DEFAULT2...)
+ * one or more parameters can be specified for each op
+ * order of parameters listed determines the order of parameters passed into op
+ * execution functions if an op does not have a parameter order definition,
+ * parameter order passed into Qnn_addNode will be passed into op execution
+ * functions if an op has a parameter order definition, any parameter passed
+ * into Qnn_addNode with unlisted name will be abandoned if two or more op
+ * packages with the same package name will be registered, they cannot list
+ * conflicting parameter orders
+ * PARAM refers to parameter name as a string literal
+ * MANDATORY refers to whether this parameter is required to be provided at
+ * Qnn_addNode DEFAULT is used when MANDATORY is false if provided as
+ * Qnn_Param_t*, DEFAULT will be used for graph construction when this parameter
+ * is not provided at Qnn_addNode if provided as nullptr, graph construction
+ * will skip this parameter when this parameter is not provided at Qnn_addNode
+ */
+
+/* execute functions for ops */
+
+// template
+// GraphStatus fastgeluImpl(TensorType& y, const TensorType& x) {
+// const uint32_t numElements = x.total_storage_elements();
+
+// if (y.total_storage_elements() != numElements) {
+// return GraphStatus::ErrorFatal;
+// }
+
+// const float kAlpha = 0.7978845608f; // sqrt(2/pi)
+// const float kCoeff = 0.044715f;
+
+// float* yData = reinterpret_cast(y.raw_data());
+// const float* xData = reinterpret_cast(x.raw_data_const());
+
+// for (uint32_t i = 0; i < numElements; ++i) {
+// const float v = xData[i];
+// const float inner = kAlpha * (v + kCoeff * v * v * v);
+// yData[i] = 0.5f * v * (1.0f + std::tanh(inner));
+// }
+
+// return GraphStatus::Success;
+// }
+
+/**
+ * FastGELU execute function:
+ *   y = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
+ *
+ * Supported dtype combinations:
+ *   - Float32 in / Float32 out: on Hexagon the data is converted to FP16 and
+ *     processed with HVX using a polynomial tanh approximation; elsewhere a
+ *     scalar std::tanh loop is used.
+ *   - QUInt8 in / QUInt8 out: a 256-entry lookup table is built from the
+ *     input/output scale and offset, then applied element-wise.
+ *
+ * @param y output tensor; must hold as many storage elements as x
+ * @param x input tensor
+ * @return GraphStatus::Success on success; GraphStatus::ErrorPrecision when
+ *         the input dtype is neither Float32 nor QUInt8;
+ *         GraphStatus::ErrorFatal on an element-count mismatch, an
+ *         unsupported in/out dtype pairing, or a non-positive output scale
+ */
+template <typename TensorType>
+GraphStatus fastgeluImpl(TensorType& y, const TensorType& x) {
+  const uint32_t N = x.total_storage_elements();
+
+  if (y.total_storage_elements() != N) {
+    return GraphStatus::ErrorFatal;
+  }
+
+  const auto in_info = x.get_dtype_intfc();
+  const auto out_info = y.get_dtype_intfc();
+
+  // Reject only when the input is NEITHER supported dtype. The two tests must
+  // be joined with &&: with || the condition holds for every dtype and the op
+  // would always fail.
+  if (in_info.dtype != DType::Float32 && in_info.dtype != DType::QUInt8) {
+    return GraphStatus::ErrorPrecision;
+  }
+
+  if (in_info.dtype == DType::Float32 && out_info.dtype == DType::Float32) {
+    const float* xData = static_cast<const float*>(x.raw_data_const());
+    float* yData = static_cast<float*>(y.raw_data());
+
+#ifdef __hexagon__
+    constexpr size_t kVecBytes = 128;
+    constexpr size_t kLanes = kVecBytes / sizeof(Float16); // 64
+
+    // --- Temporary FP16 buffers ---
+    // Every iteration below loads and stores one full vector, so the buffers
+    // are rounded up to a whole number of vectors. This keeps the last
+    // iteration inside the allocation when N is not a multiple of kLanes;
+    // the padding lanes are computed and then discarded.
+    const size_t padded = (static_cast<size_t>(N) + kLanes - 1) / kLanes * kLanes;
+    std::vector<Float16> tmp_in(padded);
+    std::vector<Float16> tmp_out(padded);
+
+    for (uint32_t i = 0; i < N; ++i) {
+      tmp_in[i] = static_cast<Float16>(xData[i]);
+    }
+
+    // Broadcast an FP16 constant to all lanes: the bit pattern of the half
+    // value is reinterpreted as uint16_t and splatted.
+    const auto splat_hf = [](float value) -> HVX_Vector {
+      union {
+        Float16 f;
+        uint16_t b;
+      } bits = {static_cast<Float16>(value)};
+      return Q6_Vh_vsplat_R(bits.b);
+    };
+
+    const HVX_Vector v_alpha = splat_hf(0.7978845608f); // sqrt(2/pi)
+    const HVX_Vector v_coeff = splat_hf(0.044715f);
+    const HVX_Vector v_half = splat_hf(0.5f);
+    const HVX_Vector v_one = splat_hf(1.0f);
+    const HVX_Vector v_27 = splat_hf(27.0f);
+    const HVX_Vector v_inv27 = splat_hf(1.0f / 27.0f);
+    const HVX_Vector v_1_3 = splat_hf(1.0f / 3.0f);
+    const HVX_Vector v_1_9 = splat_hf(1.0f / 9.0f);
+
+    // NOTE(review): inv_den below is a three-term series for 1/(27 + 9 z^2),
+    // i.e. (1/27) * 1/(1 + z^2/3). That series only converges for z^2 < 3, so
+    // the tanh estimate degrades quickly for larger |z|. Confirm the accuracy
+    // over the expected input range or clamp z before relying on this path.
+    for (size_t i = 0; i < padded; i += kLanes) {
+      HVX_Vector vx = q6op_V_vldu_A(&tmp_in[i]); // x
+      HVX_Vector vx2 = Q6_Vhf_vmpy_VhfVhf(vx, vx); // x^2
+      HVX_Vector vx3 = Q6_Vhf_vmpy_VhfVhf(vx2, vx); // x^3
+
+      // z = alpha * (x + c*x^3)
+      HVX_Vector vcx3 = Q6_Vhf_vmpy_VhfVhf(vx3, v_coeff);
+      HVX_Vector vsum = Q6_Vhf_vadd_VhfVhf(vx, vcx3);
+      HVX_Vector vz = Q6_Vhf_vmpy_VhfVhf(vsum, v_alpha);
+
+      // z^2, z^4
+      HVX_Vector vz2 = Q6_Vhf_vmpy_VhfVhf(vz, vz);
+      HVX_Vector vz4 = Q6_Vhf_vmpy_VhfVhf(vz2, vz2);
+
+      // inv_den ~= (1/27) * (1 - (1/3) z^2 + (1/9) z^4)
+      HVX_Vector term1 = Q6_Vhf_vmpy_VhfVhf(vz2, v_1_3); // (1/3) z^2
+      HVX_Vector one_m_t = Q6_Vhf_vsub_VhfVhf(v_one, term1); // 1 - (1/3) z^2
+      HVX_Vector term2 = Q6_Vhf_vmpy_VhfVhf(vz4, v_1_9); // (1/9) z^4
+      HVX_Vector poly =
+          Q6_Vhf_vadd_VhfVhf(one_m_t, term2); // 1 - 1/3 z^2 + 1/9 z^4
+      HVX_Vector inv_den = Q6_Vhf_vmpy_VhfVhf(poly, v_inv27); // * (1/27)
+
+      // num = z * (27 + z^2) = 27z + z^3
+      HVX_Vector z3 = Q6_Vhf_vmpy_VhfVhf(vz2, vz);
+      HVX_Vector t27z = Q6_Vhf_vmpy_VhfVhf(vz, v_27);
+      HVX_Vector num = Q6_Vhf_vadd_VhfVhf(t27z, z3);
+
+      // tanh(z) ~= num * inv_den
+      HVX_Vector vtanh = Q6_Vhf_vmpy_VhfVhf(num, inv_den);
+
+      // y = 0.5 * x * (1 + tanh)
+      HVX_Vector one_plus_tanh = Q6_Vhf_vadd_VhfVhf(v_one, vtanh);
+      HVX_Vector t = Q6_Vhf_vmpy_VhfVhf(vx, one_plus_tanh);
+      HVX_Vector vy = Q6_Vhf_vmpy_VhfVhf(t, v_half);
+
+      q6op_vstu_AV(&tmp_out[i], vy);
+    }
+
+    // Widen the FP16 results back to Float32. This copy belongs to the HVX
+    // path only: the scalar fallback writes yData directly and must not be
+    // overwritten from a buffer it never filled.
+    for (uint32_t i = 0; i < N; ++i) {
+      yData[i] = static_cast<float>(tmp_out[i]);
+    }
+#else
+    // Scalar fallback
+    for (uint32_t i = 0; i < N; ++i) {
+      const float v = xData[i];
+      const float inner = 0.7978845608f * (v + 0.044715f * v * v * v);
+      yData[i] = 0.5f * v * (1.0f + std::tanh(inner));
+    }
+#endif
+    return GraphStatus::Success;
+  }
+
+  if (in_info.dtype == DType::QUInt8 && out_info.dtype == DType::QUInt8) {
+    const uint8_t* xData = static_cast<const uint8_t*>(x.raw_data_const());
+    uint8_t* yData = static_cast<uint8_t*>(y.raw_data());
+
+    const float x_scale = in_info.scale;
+    const float y_scale = out_info.scale;
+    const int32_t x_zero = in_info.offset;
+    const int32_t y_zero = out_info.offset;
+
+    // Requantization divides by the output scale.
+    if (!(y_scale > 0.0f)) {
+      return GraphStatus::ErrorFatal;
+    }
+
+    // The table depends on this call's scales and offsets, so it is rebuilt
+    // per call (256 entries). A function-static table initialised once would
+    // be reused with stale quantization parameters by any other FastGelu
+    // node, and its lazy initialisation would not be thread-safe.
+    uint8_t lut[256];
+    for (int q = 0; q < 256; ++q) {
+      const float x_f = static_cast<float>(q - x_zero) * x_scale;
+      const float inner = 0.7978845608f * (x_f + 0.044715f * x_f * x_f * x_f);
+      const float y_f = 0.5f * x_f * (1.0f + std::tanh(inner));
+      // Clamp while still in float so an out-of-range value can never
+      // overflow the integer conversion.
+      const float y_q = std::round(y_f / y_scale) + static_cast<float>(y_zero);
+      lut[q] = static_cast<uint8_t>(std::clamp(y_q, 0.0f, 255.0f));
+    }
+
+    for (uint32_t i = 0; i < N; ++i) {
+      yData[i] = lut[xData[i]];
+    }
+    return GraphStatus::Success;
+  }
+
+  // Supported input dtype paired with a mismatching output dtype.
+  return GraphStatus::ErrorFatal;
+}
+
+/**
+ * Sample cost function for FastGelu.
+ *
+ * Placeholder: no cost computation is implemented, so the cost is always 0.
+ * Marked unused because the op is registered with DEF_PACKAGE_OP, which does
+ * not take a cost function.
+ *
+ * @param op op being costed (not inspected)
+ * @return 0.0f
+ */
+__attribute__((unused)) static float fastgeluCostFunc(const Op* op) {
+  // add cost computation here
+  return 0.0f;
+}
+
+/* At the bottom of the op file, call END_PKG_OP_DEFINITION(<name>),
+ where <name> is the same name that was passed to BEGIN_PKG_OP_DEFINITION
+*/
+END_PKG_OP_DEFINITION(PKG_FastGelu);