|
1 | | -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. |
| 1 | +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. |
2 | 2 | # SPDX-License-Identifier: Apache-2.0 |
3 | 3 | # |
4 | 4 | # Licensed under the Apache License, Version 2.0 (the "License"); |
|
19 | 19 | in ONNX computation graphs to minimize TensorRT inference latency. It uses pattern-based |
20 | 20 | region analysis to efficiently explore and optimize Q/DQ insertion strategies. |
21 | 21 |
|
22 | | -**Key Features:** |
23 | | -
|
24 | | -- **Automated Region Discovery**: Hierarchical decomposition of computation graphs into |
25 | | - LEAF and COMPOSITE regions with automatic pattern identification |
26 | | -
|
27 | | -- **Pattern-Based Optimization**: Groups structurally similar regions and optimizes them |
28 | | - together, making the process efficient and consistent |
29 | | -
|
30 | | -- **TensorRT Performance Measurement**: Direct integration with TensorRT Python API for |
31 | | - accurate latency profiling of each Q/DQ configuration |
32 | | -
|
33 | | -- **State Management**: Checkpoint/resume capability for long-running optimizations with |
34 | | - incremental state saving after each region |
35 | | -
|
36 | | -- **Pattern Cache**: Warm-start optimization using learned schemes from previous runs, |
37 | | - enabling transfer learning across models |
38 | | -
|
39 | 22 | **Core Components:** |
40 | 23 |
|
41 | 24 | Autotuner Classes: |
|
64 | 47 | - TensorRTPyBenchmark: Benchmark using TensorRT Python API (recommended) |
65 | 48 | - TrtExecBenchmark: Benchmark using trtexec command-line tool (legacy) |
66 | 49 |
|
67 | | -**Quick Start:** |
68 | | -
|
69 | | - >>> from modelopt.onnx.quantization.autotune import QDQAutotuner, Config |
70 | | - >>> import onnx |
71 | | - >>> # Load model and initialize autotuner |
72 | | - >>> model = onnx.load("model.onnx") |
73 | | - >>> autotuner = QDQAutotuner(model) |
74 | | - >>> # Configure autotuning parameters |
75 | | - >>> config = Config(default_quant_type="int8") |
76 | | - >>> autotuner.initialize(config) |
77 | | - >>> # Generate and test Q/DQ schemes |
78 | | - >>> # (see workflows.region_pattern_autotuning_workflow for complete example) |
79 | | -
|
80 | | -**Command-Line Interface:** |
81 | | -
|
82 | | - The package can be run directly as a module: |
83 | | -
|
84 | | - $ python -m modelopt.onnx.quantization.autotune --model model.onnx --output ./output |
85 | | - $ python -m modelopt.onnx.quantization.autotune --model model.onnx --quant-type fp8 |
86 | | -
|
87 | 50 | **See Also:** |
88 | 51 |
|
89 | 52 | - workflows.region_pattern_autotuning_workflow: Complete end-to-end optimization |
|
101 | 64 | PatternCache, |
102 | 65 | PatternSchemes, |
103 | 66 | Region, |
104 | | - RegionError, |
105 | 67 | RegionType, |
106 | 68 | ) |
107 | | - |
108 | | -# Insertion points (from dedicated module) |
109 | 69 | from .insertion_points import ( |
110 | 70 | ChildRegionInputInsertionPoint, |
111 | 71 | NodeInputInsertionPoint, |
112 | 72 | RegionOutputInsertionPoint, |
113 | 73 | ResolvedInsertionPoint, |
114 | 74 | ) |
115 | | - |
116 | | -# Pattern analysis |
117 | 75 | from .region_pattern import RegionPattern |
118 | | - |
119 | | -# Region search |
120 | 76 | from .region_search import CombinedRegionSearch |
121 | 77 |
|
122 | | -# Public API |
123 | 78 | __all__ = [ |
124 | | - # Exceptions |
125 | 79 | "AutotunerError", |
126 | 80 | "AutotunerNotInitializedError", |
127 | 81 | "ChildRegionInputInsertionPoint", |
128 | 82 | "CombinedRegionSearch", |
129 | | - # Configuration and state |
130 | 83 | "Config", |
131 | | - # Q/DQ insertion |
132 | 84 | "InsertionScheme", |
133 | 85 | "InvalidSchemeError", |
134 | 86 | "NodeInputInsertionPoint", |
135 | | - "ResolvedInsertionPoint", |
136 | 87 | "PatternCache", |
137 | 88 | "PatternSchemes", |
138 | | - # Region classes |
139 | 89 | "Region", |
140 | 90 | "RegionError", |
141 | 91 | "RegionOutputInsertionPoint", |
142 | 92 | "RegionPattern", |
143 | 93 | "RegionType", |
| 94 | + "ResolvedInsertionPoint", |
144 | 95 | ] |
0 commit comments