# ------------------------------------------------------------------------------
# 1. Register PyTorch custom operator (FastGELU)
# ------------------------------------------------------------------------------

# Coefficients of the tanh-based GELU approximation computed below.
_TANH_ARG_SCALE = 0.7978845608  # approximately sqrt(2 / pi)
_CUBIC_COEFF = 0.044715

my_op_lib = Library("my_ops", "DEF")
my_op_lib.define("fast_gelu(Tensor input) -> Tensor")


@impl(my_op_lib, "fast_gelu", "CompositeExplicitAutograd")
def fast_gelu_impl(x: torch.Tensor) -> torch.Tensor:
    """Reference kernel for ``my_ops::fast_gelu``.

    Evaluates ``0.5 * x * (1 + tanh(0.7978845608 * (x + 0.044715 * x^3)))``
    element-wise and returns a new tensor.
    """
    cubic_term = _CUBIC_COEFF * x * x * x
    tanh_arg = _TANH_ARG_SCALE * (x + cubic_term)
    return 0.5 * x * (1.0 + torch.tanh(tanh_arg))


# registering the out variant.
# Only the schema is declared here; no kernel for it is registered in this block.
my_op_lib.define("fast_gelu.out(Tensor input, *, Tensor(a!) output) -> Tensor(a!)")
class Model(torch.nn.Module):
    """Minimal module whose forward pass is one ``my_ops.fast_gelu`` call."""

    def forward(self, a):
        return torch.ops.my_ops.fast_gelu.default(a)


def annotate_custom(gm: torch.fx.GraphModule) -> None:
    """
    This function is specific for custom op.
    The source_fn of the rewritten nn module turns out to be "my_ops.fast_gelu.default"

    Every ``my_ops.fast_gelu.default`` node in ``gm`` that is not annotated yet
    receives a ``QuantizationAnnotation`` built from the PTQ per-channel quant
    config: its first argument gets the config's input-activation spec and its
    output gets the output-activation spec.
    """
    from executorch.backends.qualcomm.quantizer.annotators import _is_annotated
    from executorch.backends.qualcomm.quantizer.qconfig import (
        get_ptq_per_channel_quant_config,
    )
    from torch.fx import Node
    from torchao.quantization.pt2e.quantizer import QuantizationAnnotation
    from torchao.quantization.pt2e.quantizer.quantizer import Q_ANNOTATION_KEY

    quantization_config = get_ptq_per_channel_quant_config()
    for node in gm.graph.nodes:
        if node.target != torch.ops.my_ops.fast_gelu.default:
            continue

        # skip annotation if it is already annotated
        if _is_annotated([node]):
            continue

        input_act = node.args[0]
        assert isinstance(input_act, Node)

        node.meta[Q_ANNOTATION_KEY] = QuantizationAnnotation(
            input_qspec_map={input_act: quantization_config.input_activation},
            output_qspec=quantization_config.output_activation,
            _annotated=True,
        )


def _run(cmd, cwd=None):
    """Run ``cmd`` (an argv list) in ``cwd``; raises CalledProcessError on failure."""
    subprocess.run(cmd, stdout=sys.stdout, cwd=cwd, check=True)


def _make_op_package_info(op_package_path, target, platform):
    """Build one FastGelu op-package descriptor.

    Only the library path, the target and the platform differ between the
    three descriptors this example registers; everything else is fixed.
    """
    info = QnnExecuTorchOpPackageInfo()
    info.interface_provider = "FastGeluOpPackageInterfaceProvider"
    info.op_package_name = "FastGeluOpPackage"
    info.op_package_path = op_package_path
    info.target = target
    info.custom_op_name = "my_ops.fast_gelu.default"
    info.qnn_op_type_name = "FastGelu"
    info.platform = platform
    return info


def prepare_op_package(
    workspace: str, op_package_dir: str, arch: HtpArch, build_op_package: bool
):
    """Optionally build the FastGelu op package and describe it for the backend.

    Args:
        workspace: Directory the device-side libraries are referenced from
            (used as the prefix of the aarch64/HTP ``op_package_path`` values).
        op_package_dir: Host directory of the op package (contains the Makefile).
        arch: HTP architecture; selects the ``htp_v<arch>`` make goal and the
            ``build/hexagon-v<arch>`` output directory.
        build_op_package: When true, wipe ``build`` and rebuild the x86,
            aarch64 and Hexagon libraries with ``make``.

    Returns:
        ``(op_package_options, op_package_paths)`` where ``op_package_paths``
        lists the host-side libraries the caller hands to ``adb.push``.

    Raises:
        subprocess.CalledProcessError: if any build command fails.
    """
    hexagon_build_dir = f"{op_package_dir}/build/hexagon-v{arch}"
    htp_lib_on_host = f"{hexagon_build_dir}/libQnnFastGeluOpPackage_HTP.so"

    if build_op_package:
        _run(["rm", "-rf", "build"], cwd=op_package_dir)
        _run(["make", "htp_x86", "htp_aarch64", f"htp_v{arch}"], cwd=op_package_dir)
        # The Hexagon and aarch64 builds produce the same file name; keep a
        # copy of the Hexagon library under an "_HTP" name so both can be
        # referenced side by side from the same workspace directory.
        _run(
            [
                "cp",
                f"{hexagon_build_dir}/libQnnFastGeluOpPackage.so",
                htp_lib_on_host,
            ]
        )

    op_package_paths = [
        htp_lib_on_host,
        f"{op_package_dir}/build/aarch64-android/libQnnFastGeluOpPackage.so",
    ]

    op_package_infos_HTP = _make_op_package_info(
        f"{workspace}/libQnnFastGeluOpPackage_HTP.so",
        QnnExecuTorchOpPackageTarget.HTP,
        QnnExecuTorchOpPackagePlatform.AARCH64_ANDROID,
    )
    op_package_infos_aarch64_CPU = _make_op_package_info(
        f"{workspace}/libQnnFastGeluOpPackage.so",
        QnnExecuTorchOpPackageTarget.CPU,
        QnnExecuTorchOpPackagePlatform.AARCH64_ANDROID,
    )
    # The x86 package is loaded on the host, so it points at the build tree.
    op_package_infos_x86_CPU = _make_op_package_info(
        f"{op_package_dir}/build/x86_64-linux-clang/libQnnFastGeluOpPackage.so",
        QnnExecuTorchOpPackageTarget.CPU,
        QnnExecuTorchOpPackagePlatform.X86_64,
    )

    op_package_options = QnnExecuTorchOpPackageOptions()
    op_package_options.op_package_infos = [
        op_package_infos_x86_CPU,
        op_package_infos_aarch64_CPU,
        op_package_infos_HTP,
    ]

    return op_package_options, op_package_paths


def main(args):
    """Compile the FastGELU model, run it on device and compare with the host result.

    Steps: (optionally) build the op package, lower ``Model`` to a ``.pte``
    with the op-package options attached, stop here when ``--compile_only`` is
    set, otherwise push/execute/pull through ``SimpleADB`` and compare
    ``outputs/output_0_0.raw`` against the eager result with ``atol=1e-2``.

    Raises:
        RuntimeError: if ``--build_op_package`` is given and
            ``HEXAGON_SDK_ROOT`` or ``ANDROID_NDK_ROOT`` is not set.
    """
    if args.build_op_package:
        for env_var in ("HEXAGON_SDK_ROOT", "ANDROID_NDK_ROOT"):
            if env_var not in os.environ:
                raise RuntimeError(f"Environment variable {env_var} must be set")
            print(f"{env_var}={os.getenv(env_var)}")

    # ensure the working directory exist.
    os.makedirs(args.artifact, exist_ok=True)

    instance = Model()
    sample_input = (torch.randn(1, 16384),)
    pte_filename = "fastgelu_model"
    workspace = f"/data/local/tmp/executorch/{pte_filename}"
    soc_info = _soc_info_table[getattr(QcomChipset, args.model)]

    op_package_options, op_package_paths = prepare_op_package(
        workspace,
        args.op_package_dir,
        soc_info.htp_info.htp_arch,
        args.build_op_package,
    )

    # fp16 runs skip quantization entirely: no dtype and no quantizer.
    quant_dtype = None if args.use_fp16 else QuantDtype.use_8a8w
    quantizer = None
    if not args.use_fp16:
        quantizer = make_quantizer(
            quant_dtype=quant_dtype, custom_annotations=(annotate_custom,)
        )

    build_executorch_binary(
        instance,
        sample_input,
        args.model,
        f"{args.artifact}/{pte_filename}",
        sample_input,
        op_package_options=op_package_options,
        quant_dtype=quant_dtype,
        custom_quantizer=quantizer,
    )

    if args.compile_only:
        sys.exit(0)

    output_data_folder = os.path.join(args.artifact, "outputs")

    adb = SimpleADB(
        qnn_sdk=os.getenv("QNN_SDK_ROOT"),
        build_path=args.build_folder,
        pte_path=f"{args.artifact}/{pte_filename}.pte",
        workspace=workspace,
        device_id=args.device,
        host_id=args.host,
        soc_model=args.model,
    )
    adb.push(inputs=sample_input, files=op_package_paths)
    adb.execute()
    adb.pull(output_path=args.artifact)

    # Compare results
    model = Model()
    x86_golden = model(*sample_input)

    # The pulled output is read back as raw float32 and reshaped to the golden shape.
    device_output = torch.from_numpy(
        np.fromfile(
            os.path.join(output_data_folder, "output_0_0.raw"), dtype=np.float32
        )
    ).reshape(x86_golden.size())
    result = torch.all(torch.isclose(x86_golden, device_output, atol=1e-2)).item()
    print(
        "is_close?",
        result,
    )
    if not result:
        print(f"x86_golden {x86_golden}")
        print(f"device_out {device_output}")


if __name__ == "__main__":
    parser = setup_common_args_and_variables()
    parser.add_argument(
        "-a",
        "--artifact",
        help="path for storing generated artifacts by this example. Default ./custom_op",
        default="./custom_op",
        type=str,
    )

    parser.add_argument(
        "-d",
        "--op_package_dir",
        help="Path to operator package which generates from QNN.",
        type=str,
        required=True,
    )

    parser.add_argument(
        "-F",
        "--use_fp16",
        help="If specified, will run in fp16 precision and discard ptq setting",
        action="store_true",
        default=False,
    )

    parser.add_argument(
        "--build_op_package",
        help="Build op package based on op_package_dir. Please set up "
        "`HEXAGON_SDK_ROOT` and `ANDROID_NDK_ROOT` environment variable. "
        "And add clang compiler into `PATH`. Please refer to Qualcomm AI Engine "
        "Direct SDK document to get more details",
        action="store_true",
        default=False,
    )

    args = parser.parse_args()
    args.validate(args)

    try:
        main(args)
    except Exception as e:
        if args.ip and args.port != -1:
            # Report the failure to the listening client instead of raising.
            with Client((args.ip, args.port)) as conn:
                conn.send(json.dumps({"Error": str(e)}))
        else:
            # Bare raise keeps the original exception type and traceback.
            raise
# Package layout and target list.
# These are defined before (and outside) the prerequisite guard below so that
# they are also set for `make clean`: the clean recipe removes $(WORK), and the
# build_objs macro raises an error for any target missing from SUPPORTED_TARGETS.
ifndef PACKAGE_NAME
export
PACKAGE_NAME := $(notdir $(shell pwd))
$(info "INFO: No package name defined. Using current directory name: $(PACKAGE_NAME) as the package name")
endif

WORK := build
SRC_DIR := src
OP_SRC_DIR := src/ops
OP_INCLUDE_DIR := ./include
OP_INCLUDES = #$(wildcard $(OP_INCLUDE_DIR)/*.h) user defined if any op specific headers are needed, add -I to common flags
LIBRARY_NAME := libQnn$(PACKAGE_NAME).so
SUPPORTED_TARGETS = x86_64-linux-clang hexagon-v68 hexagon-v69 hexagon-v73 hexagon-v75 hexagon-v79 hexagon-v81 aarch64-android

# check all setup prerequisites if the command goal is not clean
#
# Goal-specific checks use $(filter <goal>,$(MAKECMDGOALS)) rather than an exact
# ifeq comparison: MAKECMDGOALS lists every goal given on the command line
# (e.g. "htp_x86 htp_aarch64 htp_v75"), so an exact match fails as soon as more
# than one goal is requested.
ifneq ($(MAKECMDGOALS),clean)
ifndef QNN_INCLUDE
$(info "INFO: Qnn include not explicitly defined, attempting to use QNN_SDK_ROOT if it is valid")
QNN_INCLUDE := $(QNN_SDK_ROOT)/include/QNN
endif
ifeq ($(wildcard $(QNN_INCLUDE)),)
$(error "ERROR: QNN_INCLUDE path is not set. QNN include paths must be set to obtain BE headers necessary to compile the package")
endif
ifndef QNN_TARGET_LIB
$(info "INFO: Qnn target not explicitly defined, attempting to use QNN_SDK_ROOT if it is valid")
QNN_TARGET_LIB := $(QNN_SDK_ROOT)/lib/aarch64-android
endif
ifeq ($(wildcard $(QNN_TARGET_LIB)),)
ifneq ($(filter htp_aarch64,$(MAKECMDGOALS)),)
$(error "ERROR: QNN_TARGET_LIB is needed to compile package for aarch64")
else ifneq ($(filter all,$(MAKECMDGOALS)),)
$(info "WARNING:QNN_TARGET_LIB may need to be defined to compile packages")
endif
endif

ifndef HEXAGON_SDK_ROOT
$(error "ERROR: HEXAGON_SDK_ROOT is not set. Hexagon-SDK path must be set to the latest hexagon-sdk-x.y.z")
endif

ifeq ($(wildcard $(HEXAGON_SDK_ROOT)),)
$(error "ERROR: HEXAGON_SDK_ROOT is not set correctly. Please set HEXAGON_SDK_ROOT to latest hexagon-sdk-X.Y.Z path")
endif

HEXAGON_SDK_BASE := $(dir $(HEXAGON_SDK_ROOT))

$(info "HEXAGON_SDK_ROOT is [${HEXAGON_SDK_ROOT}]")
# Users should note that the tools version may change between hexagon sdk versions
# Following combination of SDK and Tool version is supported
HEXAGON_SDK_ROOT_V68 := $(HEXAGON_SDK_BASE)/hexagon-sdk-4.2.0
HEXAGON_SDK_ROOT_V69 := $(HEXAGON_SDK_BASE)/hexagon-sdk-4.3.0
HEXAGON_SDK_ROOT_V73 := $(HEXAGON_SDK_BASE)/hexagon-sdk-5.4.0
HEXAGON_SDK_ROOT_V75 := $(HEXAGON_SDK_BASE)/hexagon-sdk-5.4.0
HEXAGON_SDK_ROOT_V79 := $(HEXAGON_SDK_BASE)/hexagon-sdk-6.0.0
HEXAGON_SDK_ROOT_V81 := $(HEXAGON_SDK_BASE)/hexagon-sdk-6.2.0
#Updated to point to latest sdk to match with libQnnHtp.so
HEXAGON_SDK_ROOT_X86 := $(HEXAGON_SDK_ROOT_V81)

HEXAGON_TOOLS_VERSION_V68 := 8.4.09
HEXAGON_TOOLS_VERSION_V69 := 8.5.03
HEXAGON_TOOLS_VERSION_V73 := 8.6.02
HEXAGON_TOOLS_VERSION_V75 := 8.7.03
HEXAGON_TOOLS_VERSION_V79 := 8.8.02
HEXAGON_TOOLS_VERSION_V81 := 19.0.01
#Updated to point to latest sdk to match with libQnnHtp.so
HEXAGON_TOOLS_VERSION_X86 := 19.0.01

ifndef ANDROID_NDK_ROOT
ifneq ($(filter htp_aarch64,$(MAKECMDGOALS)),)
$(error "ERROR: ANDROID_NDK_ROOT is not set. Android NDK path must be set to compile package for aarch64")
else ifneq ($(filter all,$(MAKECMDGOALS)),)
$(info "WARNING: ANDROID_NDK_ROOT is not set. Android NDK path must be set to compile package for aarch64")
endif
endif


COMMON_CXX_FLAGS = -std=c++17 -I$(QNN_INCLUDE) -fPIC -Wall -Wreorder -Wno-missing-braces -Wno-unused-function
COMMON_CXX_FLAGS += -Werror -Wno-format -Wno-unused-command-line-argument -fvisibility=default -stdlib=libc++
COMMON_CXX_FLAGS += -DQNN_API="__attribute__((visibility(\"default\")))" -D__QAIC_HEADER_EXPORT="__attribute__((visibility(\"default\")))"

X86_LIBNATIVE_RELEASE_DIR := $(HEXAGON_SDK_ROOT_X86)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_X86)/Tools

$(info "HEXAGON_SDK_ROOT_X86 is [${HEXAGON_SDK_ROOT_X86}]")
$(info "X86_LIBNATIVE_RELEASE_DIR is [${X86_LIBNATIVE_RELEASE_DIR}]")

# Ensure hexagon sdk tool version can be retrieved
ifeq ($(wildcard $(X86_LIBNATIVE_RELEASE_DIR)/.),)
$(error "Cannot retrieve hexagon tools from: $(X86_LIBNATIVE_RELEASE_DIR). \
         \
         Please check that hexagon tools version is correct. Expected: $(HEXAGON_TOOLS_VERSION_X86)")
endif

# Check that the Hexagon SDK of every requested htp_v<N> goal is present.
# $(if) only expands the branch it selects, so $(error) fires solely for a
# requested goal whose HEXAGON_SDK_ROOT_V<N> directory does not exist.
$(foreach v,68 69 73 75 79 81,$(if $(filter htp_v$(v),$(MAKECMDGOALS)),$(if $(wildcard $(HEXAGON_SDK_ROOT_V$(v))),,$(error "ERROR: HEXAGON_SDK_ROOT_V$(v) is set incorrectly. Cannot retrieve $(HEXAGON_SDK_ROOT_V$(v))"))))


endif
# Source discovery. Object names are kept relative to $(SRC_DIR): the inner
# patsubst swaps the .cpp suffix for .o, the outer one strips the "$(SRC_DIR)/"
# prefix (so sources under $(OP_SRC_DIR) become ops/<name>.o).
OP_SOURCES = $(wildcard $(OP_SRC_DIR)/*.cpp)
OTHER_SOURCES = $(wildcard $(SRC_DIR)/*.cpp)
# QNN headers; used below as order-only prerequisites of the library targets.
HFILES = $(wildcard $(QNN_INCLUDE)/*.h)
HFILES += $(wildcard $(QNN_INCLUDE)/HTP/*.h)
HFILES += $(wildcard $(QNN_INCLUDE)/HTP/core/*.h)
OP_OBJS = $(patsubst $(SRC_DIR)/%,%,$(patsubst %.cpp,%.o,$(OP_SOURCES)))
OTHER_OBJS = $(patsubst $(SRC_DIR)/%,%,$(patsubst %.cpp,%.o,$(OTHER_SOURCES)))

#======= Assembly ========
# Optional per-target assembly sources live in $(OP_SRC_DIR)/<target>_asm/.
# The subst drops that directory from the object path, so the objects sit next
# to the C++ op objects under ops/.
OP_SOURCES_ASM_X86 += $(wildcard $(OP_SRC_DIR)/x86_asm/*.S)
OP_OBJS_ASM_X86 += $(subst /x86_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_X86))))
OP_SOURCES_ASM_V68 += $(wildcard $(OP_SRC_DIR)/v68_asm/*.S)
OP_OBJS_ASM_V68 += $(subst /v68_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V68))))
OP_SOURCES_ASM_V69 += $(wildcard $(OP_SRC_DIR)/v69_asm/*.S)
OP_OBJS_ASM_V69 += $(subst /v69_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V69))))
OP_SOURCES_ASM_V73 += $(wildcard $(OP_SRC_DIR)/v73_asm/*.S)
OP_OBJS_ASM_V73 += $(subst /v73_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V73))))
OP_SOURCES_ASM_V75 += $(wildcard $(OP_SRC_DIR)/v75_asm/*.S)
OP_OBJS_ASM_V75 += $(subst /v75_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V75))))
OP_SOURCES_ASM_V79 += $(wildcard $(OP_SRC_DIR)/v79_asm/*.S)
OP_OBJS_ASM_V79 += $(subst /v79_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V79))))
OP_SOURCES_ASM_V81 += $(wildcard $(OP_SRC_DIR)/v81_asm/*.S)
OP_OBJS_ASM_V81 += $(subst /v81_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_V81))))

OP_SOURCES_ASM_ANDROID += $(wildcard $(OP_SRC_DIR)/android_asm/*.S)
OP_OBJS_ASM_ANDROID += $(subst /android_asm/,/,$(patsubst $(SRC_DIR)/%,%,$(patsubst %.S,%.o,$(OP_SOURCES_ASM_ANDROID))))

# Default goal: only the v68 Hexagon library plus the x86 and aarch64 ones.
# Other Hexagon versions have to be requested explicitly (htp_v69 ... htp_v81).
all: htp_v68 htp_x86 htp_aarch64

#============================================================================================================
# Setup compiler, compiler instructions and linker for x86
X86_CXX ?= clang++-9
# Checking if clang++-9 is present. If not switch to clang++
ifeq ($(shell $(X86_CXX) -v 2>&1 | grep -c "clang version"), 0)
  X86_CXX := clang++
endif
X86_LDFLAGS:= -Wl,--whole-archive -L$(X86_LIBNATIVE_RELEASE_DIR)/libnative/lib -lnative -Wl,--no-whole-archive -lpthread
X86_C_FLAGS := -D__HVXDBL__ -I$(X86_LIBNATIVE_RELEASE_DIR)/libnative/include -ffast-math -DUSE_OS_LINUX
X86_CXX_FLAGS = $(COMMON_CXX_FLAGS) $(X86_C_FLAGS) -fomit-frame-pointer -Wno-invalid-offsetof
# NOTE(review): linux_objs does not appear to be referenced by the rules in this
# Makefile (the object lists are named <target-dir>_objs) - confirm before relying on it.
linux_objs =
#============================================================================================================
# Setup compiler, compiler instructions and linker for hexagon
HEXAGON_CXX_FLAGS := $(COMMON_CXX_FLAGS) -mhvx -mhvx-length=128B -mhmx -DUSE_OS_QURT -O2 -Wno-reorder -DPREPARE_DISABLED

# Per-architecture flags: the shared Hexagon flags plus -mv<N> and the include
# directories of the SDK release mapped to that architecture.
HEXAGON_CXX_FLAGS_V68 := $(HEXAGON_CXX_FLAGS) -mv68 -I$(HEXAGON_SDK_ROOT_V68)/rtos/qurt/computev68/include/qurt -I$(HEXAGON_SDK_ROOT_V68)/rtos/qurt/computev68/include/posix -I$(HEXAGON_SDK_ROOT_V68)/incs -I$(HEXAGON_SDK_ROOT_V68)/incs/stddef
HEXAGON_CXX_FLAGS_V69 := $(HEXAGON_CXX_FLAGS) -mv69 -I$(HEXAGON_SDK_ROOT_V69)/rtos/qurt/computev69/include/qurt -I$(HEXAGON_SDK_ROOT_V69)/rtos/qurt/computev69/include/posix -I$(HEXAGON_SDK_ROOT_V69)/incs -I$(HEXAGON_SDK_ROOT_V69)/incs/stddef
HEXAGON_CXX_FLAGS_V73 := $(HEXAGON_CXX_FLAGS) -mv73 -I$(HEXAGON_SDK_ROOT_V73)/rtos/qurt/computev73/include/qurt -I$(HEXAGON_SDK_ROOT_V73)/rtos/qurt/computev73/include/posix -I$(HEXAGON_SDK_ROOT_V73)/incs -I$(HEXAGON_SDK_ROOT_V73)/incs/stddef
HEXAGON_CXX_FLAGS_V75 := $(HEXAGON_CXX_FLAGS) -mv75 -I$(HEXAGON_SDK_ROOT_V75)/rtos/qurt/computev75/include/qurt -I$(HEXAGON_SDK_ROOT_V75)/rtos/qurt/computev75/include/posix -I$(HEXAGON_SDK_ROOT_V75)/incs -I$(HEXAGON_SDK_ROOT_V75)/incs/stddef
HEXAGON_CXX_FLAGS_V79 := $(HEXAGON_CXX_FLAGS) -mv79 -I$(HEXAGON_SDK_ROOT_V79)/rtos/qurt/computev79/include/qurt -I$(HEXAGON_SDK_ROOT_V79)/rtos/qurt/computev79/include/posix -I$(HEXAGON_SDK_ROOT_V79)/incs -I$(HEXAGON_SDK_ROOT_V79)/incs/stddef
HEXAGON_CXX_FLAGS_V81 := $(HEXAGON_CXX_FLAGS) -mv81 -I$(HEXAGON_SDK_ROOT_V81)/rtos/qurt/computev81/include/qurt -I$(HEXAGON_SDK_ROOT_V81)/rtos/qurt/computev81/include/posix -I$(HEXAGON_SDK_ROOT_V81)/incs -I$(HEXAGON_SDK_ROOT_V81)/incs/stddef

# Per-architecture compiler: hexagon-clang++ from the tools version paired with each SDK.
HEXAGON_CXX_V68 := $(HEXAGON_SDK_ROOT_V68)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V68)/Tools/bin/hexagon-clang++
HEXAGON_CXX_V69 := $(HEXAGON_SDK_ROOT_V69)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V69)/Tools/bin/hexagon-clang++
HEXAGON_CXX_V73 := $(HEXAGON_SDK_ROOT_V73)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V73)/Tools/bin/hexagon-clang++
HEXAGON_CXX_V75 := $(HEXAGON_SDK_ROOT_V75)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V75)/Tools/bin/hexagon-clang++
HEXAGON_CXX_V79 := $(HEXAGON_SDK_ROOT_V79)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V79)/Tools/bin/hexagon-clang++
HEXAGON_CXX_V81 := $(HEXAGON_SDK_ROOT_V81)/tools/HEXAGON_Tools/$(HEXAGON_TOOLS_VERSION_V81)/Tools/bin/hexagon-clang++

HEX_LDFLAGS =
hexagon_objs =
#============================================================================================================
# Setup compiler, compiler instructions and linker for aarch64
AARCH64_C__FLAGS = -D__HVXDBL__ -I$(X86_LIBNATIVE_RELEASE_DIR)/libnative/include -ffast-math -DUSE_OS_LINUX -DANDROID
AARCH64_CXX_FLAGS = $(COMMON_CXX_FLAGS) $(AARCH64_C__FLAGS) -fomit-frame-pointer -Wno-invalid-offsetof -Wno-unused-variable -Wno-unused-parameter -Wno-missing-braces -Wno-sign-compare -Wno-unused-private-field -Wno-unused-variable -Wno-ignored-qualifiers -Wno-missing-field-initializers
ARM_CLANG_OPTS =--target=aarch64-none-linux-android21 --sysroot=$(ANDROID_NDK_ROOT)/toolchains/llvm/prebuilt/linux-x86_64/sysroot -stdlib=libc++ -static-libstdc++
AARCH64_CXX = $(ANDROID_NDK_ROOT)/toolchains/llvm/prebuilt/linux-x86_64/bin/clang++ $(ARM_CLANG_OPTS)
AARCH64_LDFLAGS = -L$(QNN_TARGET_LIB) -lQnnHtp -lQnnHtpPrepare
aarch64_objs =
#============================================================================================================
# Setup targets and goals
# Each user-facing htp_* goal forwards to a *_BUILD target, which in turn
# depends on the shared library inside that target's $(WORK) sub-directory.

htp_x86: X86_BUILD

htp_v68: HEXAGON_BUILD_V68

htp_v69: HEXAGON_BUILD_V69

htp_v73: HEXAGON_BUILD_V73

htp_v75: HEXAGON_BUILD_V75

htp_v79: HEXAGON_BUILD_V79

htp_v81: HEXAGON_BUILD_V81

htp_aarch64: AARCH64_BUILD

AARCH64_BUILD: $(WORK)/aarch64-android/$(LIBRARY_NAME)

HEXAGON_BUILD_V68: $(WORK)/hexagon-v68/$(LIBRARY_NAME)

HEXAGON_BUILD_V69: $(WORK)/hexagon-v69/$(LIBRARY_NAME)

HEXAGON_BUILD_V73: $(WORK)/hexagon-v73/$(LIBRARY_NAME)

HEXAGON_BUILD_V75: $(WORK)/hexagon-v75/$(LIBRARY_NAME)

HEXAGON_BUILD_V79: $(WORK)/hexagon-v79/$(LIBRARY_NAME)

HEXAGON_BUILD_V81: $(WORK)/hexagon-v81/$(LIBRARY_NAME)

X86_BUILD: $(WORK)/x86_64-linux-clang/$(LIBRARY_NAME)


# build_objs: append objects to a target's object list.
#   $(1)  object paths relative to the target directory
#   $(2)  target directory name; must be one of $(SUPPORTED_TARGETS)
# Appends $(WORK)/$(2)/<obj> for every object to the variable "$(2)_objs".
# The $$(error ...) is escaped so that it is only raised when the expanded text
# is evaluated and the target is not supported.
define build_objs =
ifneq ($(filter $(2),$(SUPPORTED_TARGETS)),)
$(2)_objs += $(foreach x,$(1),$(WORK)/$(2)/$(x))
else
$$(error "Unknown target option provided: $(2): Supported targets are: $(SUPPORTED_TARGETS)")
endif
endef
# Populate <target>_objs for every supported target: the package-level objects,
# the op objects and that target's own assembly objects (if any).
$(eval $(call build_objs,$(OTHER_OBJS),x86_64-linux-clang))
$(eval $(call build_objs,$(OP_OBJS),x86_64-linux-clang))
$(eval $(call build_objs,$(OP_OBJS_ASM_X86),x86_64-linux-clang))
$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v68))
$(eval $(call build_objs,$(OP_OBJS),hexagon-v68))
$(eval $(call build_objs,$(OP_OBJS_ASM_V68),hexagon-v68))
$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v69))
$(eval $(call build_objs,$(OP_OBJS),hexagon-v69))
$(eval $(call build_objs,$(OP_OBJS_ASM_V69),hexagon-v69))
$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v73))
$(eval $(call build_objs,$(OP_OBJS),hexagon-v73))
$(eval $(call build_objs,$(OP_OBJS_ASM_V73),hexagon-v73))
$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v75))
$(eval $(call build_objs,$(OP_OBJS),hexagon-v75))
$(eval $(call build_objs,$(OP_OBJS_ASM_V75),hexagon-v75))
$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v79))
$(eval $(call build_objs,$(OP_OBJS),hexagon-v79))
# v79 links its own assembly objects (from v79_asm/), matching the
# $(OP_SRC_DIR)/v79_asm/%.S compile rule; the v75 list was used here by mistake.
$(eval $(call build_objs,$(OP_OBJS_ASM_V79),hexagon-v79))
$(eval $(call build_objs,$(OTHER_OBJS),hexagon-v81))
$(eval $(call build_objs,$(OP_OBJS),hexagon-v81))
$(eval $(call build_objs,$(OP_OBJS_ASM_V81),hexagon-v81))


$(eval $(call build_objs,$(OTHER_OBJS),aarch64-android))
$(eval $(call build_objs,$(OP_OBJS),aarch64-android))
$(eval $(call build_objs,$(OP_OBJS_ASM_ANDROID),aarch64-android))
# Output directories, one per supported target; ops/ is created along with each.
$(addprefix $(WORK)/,$(SUPPORTED_TARGETS)):
	@mkdir -p $@/ops

# compile_and_link_rules: emit the four rules every target needs.
#   $(1)  target directory under $(WORK) (also the prefix of its <dir>_objs list)
#   $(2)  name of the variable holding the compiler command
#   $(3)  name of the variable holding the compile flags
#   $(4)  assembly source directory under $(OP_SRC_DIR)
#   $(5)  name of the variable holding the linker flags
# "$$" defers expansion until the generated text is evaluated, so the resulting
# rules read exactly like hand-written ones (automatic variables included).
define compile_and_link_rules
$$(WORK)/$(1)/%.o: $$(SRC_DIR)/%.cpp | $$(WORK)/$(1)
	$$($(2)) $$($(3)) -DTHIS_PKG_NAME=$$(PACKAGE_NAME) -MMD -c $$< -o $$@

$$(WORK)/$(1)/ops/%.o: $$(OP_SRC_DIR)/%.cpp | $$(WORK)/$(1)
	$$($(2)) $$($(3)) -DTHIS_PKG_NAME=$$(PACKAGE_NAME) -MMD -c $$< -o $$@

$$(WORK)/$(1)/ops/%.o: $$(OP_SRC_DIR)/$(4)/%.S | $$(WORK)/$(1)
	$$($(2)) $$($(3)) -DTHIS_PKG_NAME=$$(PACKAGE_NAME) -MMD -c $$< -o $$@

$$(WORK)/$(1)/$$(LIBRARY_NAME): $$($(1)_objs) | $$(HFILES)
	$$($(2)) -fPIC -std=c++17 -g -shared -o $$@ $$^ $$($(5))

endef

# x86
$(eval $(call compile_and_link_rules,x86_64-linux-clang,X86_CXX,X86_CXX_FLAGS,x86_asm,X86_LDFLAGS))

# v68, v69, v73, v75, v79, v81
$(foreach v,68 69 73 75 79 81,$(eval $(call compile_and_link_rules,hexagon-v$(v),HEXAGON_CXX_V$(v),HEXAGON_CXX_FLAGS_V$(v),v$(v)_asm,HEX_LDFLAGS)))

# aarch64
$(eval $(call compile_and_link_rules,aarch64-android,AARCH64_CXX,AARCH64_CXX_FLAGS,android_asm,AARCH64_LDFLAGS))

# Remove the whole build tree; the leading "-" ignores a failing rm.
clean:
	-rm -rf $(WORK)

.PHONY: all clean
b/examples/qualcomm/custom_op/fastgelu_op_package_htp/FastGeluOpPackage/src/FastGeluOpPackageInterface.cpp @@ -0,0 +1,274 @@ +//============================================================================== +// Auto Generated Code for FastGeluOpPackage +//============================================================================== + +#include "HTP/QnnHtpCommon.h" +#include "HTP/core/constraints.h" +#include "HTP/core/op_package_feature_support.h" +#include "HTP/core/op_register_ext.h" +#include "HTP/core/optimize.h" +#include "HTP/core/simple_reg.h" +#include "HTP/core/unique_types.h" +#include "QnnOpPackage.h" +#include "QnnSdkBuildId.h" + +DEFINE_UNIQ_TY() +BEGIN_PKG_OPS_OPTS_LIST() + +/** Note that the order of declarations given here defines the order in which ops and graph optimizations are + * registered to the HTP Core. + * Append the latest OpName at the bottom + */ +DECLARE_PKG_OPS_OPTS_LIST(PKG_FastGelu) + +END_PKG_OPS_OPTS_LIST() + +// op package info +static constexpr auto sg_packageName = THIS_PKG_NAME_STR; // package name passed in as compile flag + +static std::array sg_opNames{{"FastGelu"}}; + +static Qnn_ApiVersion_t sg_sdkApiVersion = QNN_HTP_API_VERSION_INIT; +static QnnOpPackage_Info_t sg_packageInfo = QNN_OP_PACKAGE_INFO_INIT; + +// global data +static QnnOpPackage_GlobalInfrastructure_t sg_globalInfra = +nullptr; // global infrastructure not in use for now +static bool sg_packageInitialized = false; + +/* + * user provided logging call back function + * currently only supported on linux x86-64 and nonrpc versions + * typedef void (*QnnLog_Callback_t)(const char* fmt, + * QnnLog_Level_t level, + * uint64_t timestamp, + * va_list args); + * usage: if(sg_logInitialized && level <= sg_maxLogLevel) + * sg_logCallback(fmt, level, timestamp, args); + * + * for cross rpc versions, skel side user provided logging call back function + * can be defined as part of op packages. 
maximal log level sg_maxLogLevel + * can be set by Qnn_ErrorHandle_t FastGeluOpPackageLogSetLevel(QnnLog_Level_t maxLogLevel) + */ +/* + * for alternative logging method provided by HTP core, please refer to log.h + */ +static QnnLog_Callback_t sg_logCallback = + nullptr; // user provided call back function pointer for logging +static QnnLog_Level_t sg_maxLogLevel = + (QnnLog_Level_t)0; // maximal log level used in user provided logging +static bool sg_logInitialized = + false; // tracks whether user provided logging method has been initialized + + +/* +* op initialization +* needs to be global in the package +* one initialization per package before any op definitions +* syntax: INIT_PACKAGE_OP_DEF() +*/ +INIT_PACKAGE_OP_DEF() + +/* +* optimization initialization +* needs to be global in the package +* one initialization per package before any optimization definitions +* syntax: INIT_PACKAGE_OPTIMIZATION_DEF() +*/ +INIT_PACKAGE_OPTIMIZATION_DEF() + +/* + * op parameter order initialization + * needs to be global in the package + * one initialization per package before any op parameter order definitions + * syntax: INIT_PACKAGE_PARAM_ORDER_DEF() + */ +INIT_PACKAGE_PARAM_ORDER_DEF() + +/* + * axis parameter name list + * optional + * needs to be global in the package + * one list per package + * for listing axis parameter names passed into Qnn_AddNode API + * HTP backend auto-adjusts values in axis parameters based on HTP backfilling + * note: HTP backend backfills tensor dimensions to 4 dimensions + * syntax: LIST_PACKAGE_AXIS_PARAMS(...) + * e.g. 
LIST_PACKAGE_AXIS_PARAMS("Axis", "AXIS", "axis") + */ +// LIST_PACKAGE_AXIS_PARAMS() + +/* + * per-channel quantized op name list + * optional + * needs to be global in the package + * one list per package + * for listing op names which support per-channel quantization + * per-axis quantization info of an op is embeded in axisScaleOffsetEncoding + * inside Qnn_Tensor_t types + * HTP backend only supports per-channel scale ops + * i.e. along last dimension, offset is always zero + * if an op name is marked as having per-channel scale support, and in + * QNN_AddNode, at least one input, parameter, or output has + * QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET type: + * then: + * HTP backend will pass to op implementation function the following: + * output(s), input(s), parameter(s), + * outputPerChannelScale(s), inputPerChannelScale(s), paramPerChannelScale(s) + * + * optimization rules can be used to remove extra perChannelScale tensors + * + * syntax: LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS(...) + * e.g. LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS(sg_op1Name, sg_op2Name) + */ + +// LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS() + +/* +* Declare and define the special intialize function for HTP Backend to load +*/ +INIT_PKG_CORE_INIT_FUNC() + +/* op package API's */ + +Qnn_ErrorHandle_t FastGeluOpPackageInit(QnnOpPackage_GlobalInfrastructure_t infrastructure) { + if (sg_packageInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED; + + /* + * op parameter order registration + * registers all defined op parameter orders in the package + * syntax: REGISTER_PACKAGE_PARAM_ORDERS() + */ + REGISTER_PACKAGE_PARAM_ORDERS() + + /* + * op axis parameter name registration + * registers all axis parameter names in the package + * used with LIST_PACKAGE_AXIS_PARAMS(...) 
+ * syntax: REGISTER_PACKAGE_AXIS_PARAMS() + */ + REGISTER_PACKAGE_AXIS_PARAMS() + + /* + * per-channel scale op name registration + * registers all per-channel scale op names in the package + * used with LIST_PACKAGE_PER_CHANNEL_QUANTIZED_OPS(...) + * syntax: REGISTER_PACKAGE_PER_CHANNEL_QUANTIZED_OPS() + */ + REGISTER_PACKAGE_PER_CHANNEL_QUANTIZED_OPS() + + sg_globalInfra = infrastructure; + sg_packageInitialized = true; + return QNN_SUCCESS; +} + +Qnn_ErrorHandle_t FastGeluOpPackageGetInfo(const QnnOpPackage_Info_t** info) { + if (!sg_packageInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED; + if (!info) return QNN_OP_PACKAGE_ERROR_INVALID_INFO; + + sg_packageInfo = QNN_OP_PACKAGE_INFO_INIT; + sg_packageInfo.packageName = sg_packageName; + sg_packageInfo.operationNames = sg_opNames.data(); + sg_packageInfo.numOperations = sg_opNames.size(); + sg_packageInfo.sdkBuildId = QNN_SDK_BUILD_ID; + sg_packageInfo.sdkApiVersion = &sg_sdkApiVersion; + + *info = &sg_packageInfo; + return QNN_SUCCESS; +} + +Qnn_ErrorHandle_t FastGeluOpPackageLogInitialize(QnnLog_Callback_t callback, QnnLog_Level_t maxLogLevel) { + if (sg_logInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED; + if (!callback) return QNN_LOG_ERROR_INVALID_ARGUMENT; + if (maxLogLevel < QNN_LOG_LEVEL_ERROR) return QNN_LOG_ERROR_INVALID_ARGUMENT; + sg_logCallback = callback; + sg_maxLogLevel = maxLogLevel; + sg_logInitialized = true; + return QNN_SUCCESS; +} + +Qnn_ErrorHandle_t FastGeluOpPackageLogSetLevel(QnnLog_Level_t maxLogLevel) { + if (maxLogLevel < QNN_LOG_LEVEL_ERROR) return QNN_LOG_ERROR_INVALID_ARGUMENT; + sg_maxLogLevel = maxLogLevel; + return QNN_SUCCESS; +} + +Qnn_ErrorHandle_t FastGeluOpPackageLogTerminate() { + if (!sg_logInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED; + sg_logCallback = nullptr; + sg_maxLogLevel = (QnnLog_Level_t)0; + sg_logInitialized = false; + return QNN_SUCCESS; +} + +Qnn_ErrorHandle_t FastGeluOpPackageValidateOpConfig 
(Qnn_OpConfig_t opConfig){ + if (std::string(sg_packageName) != opConfig.v1.packageName) { + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + + /* auto-generated validation code below + * Check if op config type matches any registered ops + * If a match is found, check number of inputs, outputs and params + */ + if (std::string(opConfig.v1.typeName) == "FastGelu"){ + if (opConfig.v1.numOfParams != 0 || opConfig.v1.numOfInputs != 1 || opConfig.v1.numOfOutputs != 1){ + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + } + else{ + return QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE; + } + + /* + * additional validation code here + * */ + + return QNN_SUCCESS; +} + +/* The following three functions in this comment are not called by HTP backend for now, + * no auto-generated implementations are created. Users should see example for full function signatures. + * (version 1.3.0) Qnn_ErrorHandle_t FastGeluOpPackageCreateKernels (QnnOpPackage_GraphInfrastructure_t + * graphInfrastructure, QnnOpPackage_Node_t node, QnnOpPackage_Kernel_t** kernels, uint32_t* + * numKernels) + * (version 1.3.0) Qnn_ErrorHandle_t FastGeluOpPackageFreeKernels (QnnOpPackage_Kernel_t* kernels) + * + * (version 1.4.0) Qnn_ErrorHandle_t FastGeluOpPackageCreateOpImpl (QnnOpPackage_GraphInfrastructure_t + * graphInfrastructure, QnnOpPackage_Node_t node, QnnOpPackage_OpImpl_t* opImpl) + *(version 1.4.0) Qnn_ErrorHandle_t FastGeluOpPackageFreeOpImpl (QnnOpPackage_OpImpl_t opImpl) + */ + +Qnn_ErrorHandle_t FastGeluOpPackageTerminate() { +if (!sg_packageInitialized) return QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED; + +sg_globalInfra = nullptr; +sg_packageInitialized = false; +return QNN_SUCCESS; +} + +#ifdef __cplusplus +extern "C" { +#endif + + +/* latest version */ +Qnn_ErrorHandle_t FastGeluOpPackageInterfaceProvider(QnnOpPackage_Interface_t* interface) { + if (!interface) return QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT; + interface->interfaceVersion = {1, 4, 0}; + interface->v1_4.init = 
FastGeluOpPackageInit; + interface->v1_4.terminate = FastGeluOpPackageTerminate; + interface->v1_4.getInfo = FastGeluOpPackageGetInfo; + interface->v1_4.validateOpConfig = FastGeluOpPackageValidateOpConfig; + interface->v1_4.createOpImpl = nullptr; + interface->v1_4.freeOpImpl = nullptr; + interface->v1_4.logInitialize = FastGeluOpPackageLogInitialize; + interface->v1_4.logSetLevel = FastGeluOpPackageLogSetLevel; + interface->v1_4.logTerminate = FastGeluOpPackageLogTerminate; + return QNN_SUCCESS; +} + +#ifdef __cplusplus +} +#endif + + diff --git a/examples/qualcomm/custom_op/fastgelu_op_package_htp/FastGeluOpPackage/src/ops/FastGelu.cpp b/examples/qualcomm/custom_op/fastgelu_op_package_htp/FastGeluOpPackage/src/ops/FastGelu.cpp new file mode 100644 index 00000000000..e8ac754f6d9 --- /dev/null +++ b/examples/qualcomm/custom_op/fastgelu_op_package_htp/FastGeluOpPackage/src/ops/FastGelu.cpp @@ -0,0 +1,271 @@ +//============================================================================== +// Auto Generated Code for FastGeluOpPackage +//============================================================================== + +#include +#include +#include "HTP/core/constraints.h" +#include "HTP/core/op_package_feature_support.h" +#include "HTP/core/op_register_ext.h" +#include "HTP/core/optimize.h" +#include "HTP/core/simple_reg.h" +#include "QnnOpPackage.h" + +BEGIN_PKG_OP_DEFINITION(PKG_FastGelu); + +// op execute function declarations +template +GraphStatus fastgeluImpl(TensorType& y, const TensorType& x); + +// forward declaration of sample cost function +static float fastgeluCostFunc(const Op* op); + +/* + * method 1 for defining op, using default cost value (i.e. GLACIAL) and default + * flag (Flags::RESOURCE_HVX) syntax: DEF_PACKAGE_OP(F,OP) e.g. 
+ * DEF_PACKAGE_OP((fastgeluImpl), "FastGelu") + */ +DEF_PACKAGE_OP((fastgeluImpl), "FastGelu") + +/* + * method 2 for defining op with specified cost value (one of GLACIAL, SNAIL, + * FAST, FREE) and provided flags syntax: + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS(F,OP,COST,...) can use zero or more flags, + * FLAG options are IS_CONST, INHIBIT_CONST_PROP, RESOURCE_HVX, RESOURCE_HMX(not + * supported in external op packages) e.g. + * DEF_PACKAGE_OP_AND_COST_AND_FLAGS((fastgeluImpl), + * "FastGelu", SNAIL) + */ + +/* + * method 3 for defining op with cost function pointer and provided flags + * cost function pointer type: typedef float (*cost_function) (const Op * op); + * syntax: DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS(F,OP,COST_F,...) + * e.g. DEF_PACKAGE_OP_AND_COST_F_AND_FLAGS((fastgeluImpl), + * "FastGelu", fastgeluCostFunc, Flags::RESOURCE_HVX) + */ + +/* + * optimization definitions + * need to be global in the package + * one definition per optimization + * syntax: + * DEF_PACKAGE_OPTIMIZATION(PRIORITY,MATCHCODE,CONSTRAINTCODE,REPLACECODE) + * PRIORITY predefined values include EARLY(2000), MIDDLE(3000), LATE(4000) + * HTP core provides some replacement functions for op package to use + * for more information about optimization rules, please refer to HTP core + * documentations + */ + +/* + * op parameter order definitions + * need to be global in the package + * one definition per op, and this is optional + * syntax: + * DEF_PACKAGE_PARAM_ORDER(OP,PARAM1,MANDATORY1,DEFAULT1,PARAM2,MANDATORY2,DEFAULT2...) 
+ * one or more parameters can be specified for each op + * order of parameters listed determines the order of parameters passed into op + * execution functions if an op does not have a parameter order definition, + * parameter order passed into Qnn_addNode will be passed into op execution + * functions if an op has a parameter order definition, any parameter passed + * into Qnn_addNode with unlisted name will be abandoned if two or more op + * packages with the same package name will be registered, they cannot list + * conflicting parameter orders + * PARAM refers to parameter name as a string literal + * MANDATORY refers to whether this parameter is required to be provided at + * Qnn_addNode DEFAULT is used when MANDATORY is false if provided as + * Qnn_Param_t*, DEFAULT will be used for graph construction when this parameter + * is not provided at Qnn_addNode if provided as nullptr, graph construction + * will skip this parameter when this parameter is not provided at Qnn_addNode + */ + +/* execute functions for ops */ + +// template +// GraphStatus fastgeluImpl(TensorType& y, const TensorType& x) { +// const uint32_t numElements = x.total_storage_elements(); + +// if (y.total_storage_elements() != numElements) { +// return GraphStatus::ErrorFatal; +// } + +// const float kAlpha = 0.7978845608f; // sqrt(2/pi) +// const float kCoeff = 0.044715f; + +// float* yData = reinterpret_cast(y.raw_data()); +// const float* xData = reinterpret_cast(x.raw_data_const()); + +// for (uint32_t i = 0; i < numElements; ++i) { +// const float v = xData[i]; +// const float inner = kAlpha * (v + kCoeff * v * v * v); +// yData[i] = 0.5f * v * (1.0f + std::tanh(inner)); +// } + +// return GraphStatus::Success; +// } + +template +GraphStatus fastgeluImpl(TensorType& y, const TensorType& x) { + const uint32_t N = x.total_storage_elements(); + + if (y.total_storage_elements() != N) { + return GraphStatus::ErrorFatal; + } + + const auto in_info = x.get_dtype_intfc(); + const auto out_info 
= y.get_dtype_intfc(); + + if (in_info.dtype != DType::Float32 || in_info.dtype != DType::QUInt8) { + return GraphStatus::ErrorPrecision; + } + if (in_info.dtype == DType::Float32 && out_info.dtype == DType::Float32) { + const float* xData = static_cast(x.raw_data_const()); + float* yData = static_cast(y.raw_data()); + + // --- Temporary FP16 buffers --- + std::vector tmp_in(N); + std::vector tmp_out(N); + + for (uint32_t i = 0; i < N; ++i) { + tmp_in[i] = static_cast(xData[i]); + } + +#ifdef __hexagon__ + union { + Float16 f; + uint16_t b; + } kAlpha = {(Float16)0.7978845608f}; // sqrt(2/pi) + union { + Float16 f; + uint16_t b; + } kCoeff = {(Float16)0.044715f}; + union { + Float16 f; + uint16_t b; + } kHalf = {(Float16)0.5f}; + union { + Float16 f; + uint16_t b; + } kOne = {(Float16)1.0f}; + union { + Float16 f; + uint16_t b; + } k27 = {(Float16)27.0f}; + union { + Float16 f; + uint16_t b; + } kInv27 = {(Float16)(1.0f / 27.0f)}; + union { + Float16 f; + uint16_t b; + } kOne3 = {(Float16)(1.0f / 3.0f)}; + union { + Float16 f; + uint16_t b; + } kOne9 = {(Float16)(1.0f / 9.0f)}; + + HVX_Vector v_alpha = Q6_Vh_vsplat_R(kAlpha.b); + HVX_Vector v_coeff = Q6_Vh_vsplat_R(kCoeff.b); + HVX_Vector v_half = Q6_Vh_vsplat_R(kHalf.b); + HVX_Vector v_one = Q6_Vh_vsplat_R(kOne.b); + HVX_Vector v_27 = Q6_Vh_vsplat_R(k27.b); + HVX_Vector v_inv27 = Q6_Vh_vsplat_R(kInv27.b); + HVX_Vector v_1_3 = Q6_Vh_vsplat_R(kOne3.b); + HVX_Vector v_1_9 = Q6_Vh_vsplat_R(kOne9.b); + + const int VBYTES = 128; + const int ELEMS = VBYTES / sizeof(Float16); // 64 + + for (uint32_t i = 0; i < N; i += ELEMS) { + HVX_Vector vx = q6op_V_vldu_A(&tmp_in[i]); // x + HVX_Vector vx2 = Q6_Vhf_vmpy_VhfVhf(vx, vx); // x^2 + HVX_Vector vx3 = Q6_Vhf_vmpy_VhfVhf(vx2, vx); // x^3 + + // z = α * (x + c*x^3) + HVX_Vector vcx3 = Q6_Vhf_vmpy_VhfVhf(vx3, v_coeff); + HVX_Vector vsum = Q6_Vhf_vadd_VhfVhf(vx, vcx3); + HVX_Vector vz = Q6_Vhf_vmpy_VhfVhf(vsum, v_alpha); + + // z^2, z^4 + HVX_Vector vz2 = Q6_Vhf_vmpy_VhfVhf(vz, 
vz); + HVX_Vector vz4 = Q6_Vhf_vmpy_VhfVhf(vz2, vz2); + + // inv_den ≈ (1/27) * (1 - (1/3) z^2 + (1/9) z^4) + HVX_Vector term1 = Q6_Vhf_vmpy_VhfVhf(vz2, v_1_3); // (1/3) z^2 + HVX_Vector one_m_t = Q6_Vhf_vsub_VhfVhf(v_one, term1); // 1 - (1/3) z^2 + HVX_Vector term2 = Q6_Vhf_vmpy_VhfVhf(vz4, v_1_9); // (1/9) z^4 + HVX_Vector poly = + Q6_Vhf_vadd_VhfVhf(one_m_t, term2); // 1 - 1/3 z^2 + 1/9 z^4 + HVX_Vector inv_den = Q6_Vhf_vmpy_VhfVhf(poly, v_inv27); // * (1/27) + + // num = z * (27 + z^2) = 27z + z^3 + HVX_Vector z3 = Q6_Vhf_vmpy_VhfVhf(vz2, vz); + HVX_Vector t27z = Q6_Vhf_vmpy_VhfVhf(vz, v_27); + HVX_Vector num = Q6_Vhf_vadd_VhfVhf(t27z, z3); + + // tanh(z) ≈ num * inv_den + HVX_Vector vtanh = Q6_Vhf_vmpy_VhfVhf(num, inv_den); + + // y = 0.5 * x * (1 + tanh) + HVX_Vector one_plus_tanh = Q6_Vhf_vadd_VhfVhf(v_one, vtanh); + HVX_Vector t = Q6_Vhf_vmpy_VhfVhf(vx, one_plus_tanh); + HVX_Vector vy = Q6_Vhf_vmpy_VhfVhf(t, v_half); + + q6op_vstu_AV(&tmp_out[i], vy); + } +#else + // Scalar fallback + for (uint32_t i = 0; i < N; ++i) { + const float v = xData[i]; + const float inner = 0.7978845608f * (v + 0.044715f * v * v * v); + yData[i] = 0.5f * v * (1.0f + std::tanh(inner)); + } +#endif + + for (uint32_t i = 0; i < N; ++i) { + yData[i] = static_cast(tmp_out[i]); + } + return GraphStatus::Success; + } else if (in_info.dtype == DType::QUInt8) { + const uint8_t* xData = static_cast(x.raw_data_const()); + uint8_t* yData = static_cast(y.raw_data()); + + const float x_scale = in_info.scale; + const float y_scale = out_info.scale; + const int32_t x_zero = in_info.offset; + const int32_t y_zero = out_info.offset; + + alignas(128) static uint8_t lut[256]; + static bool lut_init = false; + if (!lut_init) { + for (int i = 0; i < 256; ++i) { + float x_f = (i - x_zero) * x_scale; + float inner = 0.7978845608f * (x_f + 0.044715f * x_f * x_f * x_f); + float y_f = 0.5f * x_f * (1.0f + std::tanh(inner)); + int y_q = static_cast(std::round(y_f / y_scale)) + y_zero; + lut[i] = 
static_cast(std::clamp(y_q, 0, 255)); + } + lut_init = true; + } + for (uint32_t i = 0; i < N; ++i) { + yData[i] = lut[xData[i]]; + } + return GraphStatus::Success; + } else { + return GraphStatus::ErrorFatal; + } +} + +__attribute__((unused)) static float fastgeluCostFunc(const Op* op) { + /* + * add code here + * */ + + float cost = 0.0; // add cost computation here + return cost; +} + +/* At the bottom of the op file, call END_PKG_OP_DEFINITION(), + where is as BEGIN_PKG_OP_DEFINITION +*/ +END_PKG_OP_DEFINITION(PKG_FastGelu);