Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 21 additions & 3 deletions tests/inference_speed.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@

from gptqmodel.utils.torch import torch_empty_cache


os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"


Expand Down Expand Up @@ -44,6 +43,25 @@ class InferenceSpeed(unittest.TestCase):
MAX_DELTA_FLOOR_PERCENT = 0.25
MAX_POSITIVE_DELTA_CEIL_PERCENT = 0.25

@staticmethod
def _move_batch_to_device(batch, device):
if device is None:
return batch

if hasattr(batch, "items"):
for key, value in list(batch.items()):
if torch.is_tensor(value):
batch[key] = value.to(device)
return batch

if torch.is_tensor(batch):
return batch.to(device)

if hasattr(batch, "to"):
return batch.to(device)

return batch

def inference(self, model_path, backend, tokens_per_second, assert_result=True, optimize=False, fullgraph=False, warmup_runs=0, device=None):
if device == "cuda" and torch.cuda.is_available():
device = f"cuda:{torch.cuda.current_device()}"
Expand All @@ -59,8 +77,8 @@ def inference(self, model_path, backend, tokens_per_second, assert_result=True,

tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token_id = tokenizer.eos_token_id
inp = tokenizer(self.PROMPTS, padding=True, truncation=True, return_tensors="pt", padding_side='left').to(
model.device)
inp = tokenizer(self.PROMPTS, padding=True, truncation=True, return_tensors="pt", padding_side='left')
inp = self._move_batch_to_device(inp, model.device)

# compile kernels need JIT compile (Bitblas, IPEX, Triton) so we should do some warmup before actual speed run
if warmup_runs > 0:
Expand Down
43 changes: 42 additions & 1 deletion tests/test_inference_speed_harness.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
# SPDX-License-Identifier: Apache-2.0

import inference_speed as inference_speed_module
import torch

import inference_speed as inference_speed_module
from inference_speed import InferenceSpeed


Expand Down Expand Up @@ -114,3 +115,43 @@ def from_quantized(cls, model_path, backend, device):
)

assert captured["device"] == "cuda:3"


def test_inference_speed_ignores_non_tensor_batch_values_when_moving_to_device(monkeypatch):
    """A tokenizer batch carrying an optional ``None`` field must be moved
    key-by-key (tensors only) rather than via ``BatchEncoding.to()``, which
    would fail on the ``None`` entry."""

    class _SpeedHarness(InferenceSpeed):
        # Minimal configuration: one run generating one token from one prompt.
        NUM_RUNS = 1
        MAX_NEW_TOKENS = 1
        PROMPTS = ["a"]

    class _OptionalFieldBatch(dict):
        def __init__(self):
            payload = {
                "input_ids": torch.zeros((1, 3), dtype=torch.long),
                "attention_mask": torch.ones((1, 3), dtype=torch.long),
                "token_type_ids": None,
            }
            super().__init__(payload)

        def to(self, _device):
            # Tripwire: whole-batch .to() must never be reached.
            raise AssertionError("BatchEncoding.to() should not be used for optional None fields")

        class _OptionalFieldTokenizer(_FakeTokenizer):
            def __call__(self, prompts, **_kwargs):
                return _OptionalFieldBatch()

    class _OptionalFieldTokenizer(_FakeTokenizer):
        def __call__(self, prompts, **_kwargs):
            return _OptionalFieldBatch()

    # Two ticks one second apart -> 1 new token over 1.0s == 1.0 tok/s.
    clock = iter([0.0, 1.0])

    monkeypatch.setattr(inference_speed_module, "logger", _FakeLogger())
    monkeypatch.setattr(inference_speed_module, "GPTQModel", _FakeModel)
    monkeypatch.setattr(inference_speed_module, "AutoTokenizer", _OptionalFieldTokenizer)
    monkeypatch.setattr(inference_speed_module, "torch_empty_cache", lambda: None)
    monkeypatch.setattr(inference_speed_module.time, "time", lambda: next(clock))

    measured_tps = _SpeedHarness().inference(
        model_path="unused",
        backend="fake-backend",
        tokens_per_second=1.0,
        warmup_runs=0,
        device="cpu",
    )

    assert measured_tps == 1.0
Loading