diff --git a/tests/inference_speed.py b/tests/inference_speed.py
index 687d12ec4..00d2140a1 100644
--- a/tests/inference_speed.py
+++ b/tests/inference_speed.py
@@ -10,7 +10,6 @@
 from gptqmodel.utils.torch import torch_empty_cache
 
 
-
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 
 
@@ -44,6 +43,25 @@ class InferenceSpeed(unittest.TestCase):
     MAX_DELTA_FLOOR_PERCENT = 0.25
     MAX_POSITIVE_DELTA_CEIL_PERCENT = 0.25
 
+    @staticmethod
+    def _move_batch_to_device(batch, device):
+        if device is None:
+            return batch
+
+        if hasattr(batch, "items"):
+            for key, value in list(batch.items()):
+                if torch.is_tensor(value):
+                    batch[key] = value.to(device)
+            return batch
+
+        if torch.is_tensor(batch):
+            return batch.to(device)
+
+        if hasattr(batch, "to"):
+            return batch.to(device)
+
+        return batch
+
     def inference(self, model_path, backend, tokens_per_second, assert_result=True, optimize=False, fullgraph=False, warmup_runs=0, device=None):
         if device == "cuda" and torch.cuda.is_available():
             device = f"cuda:{torch.cuda.current_device()}"
@@ -59,8 +77,8 @@ def inference(self, model_path, backend, tokens_per_second, assert_result=True,
         tokenizer = AutoTokenizer.from_pretrained(model_path)
         tokenizer.pad_token_id = tokenizer.eos_token_id
 
-        inp = tokenizer(self.PROMPTS, padding=True, truncation=True, return_tensors="pt", padding_side='left').to(
-            model.device)
+        inp = tokenizer(self.PROMPTS, padding=True, truncation=True, return_tensors="pt", padding_side='left')
+        inp = self._move_batch_to_device(inp, model.device)
 
         # compile kernels need JIT compile (Bitblas, IPEX, Triton) so we should do some warmup before actual speed run
         if warmup_runs > 0:
diff --git a/tests/test_inference_speed_harness.py b/tests/test_inference_speed_harness.py
index e14738c21..90b143f2e 100644
--- a/tests/test_inference_speed_harness.py
+++ b/tests/test_inference_speed_harness.py
@@ -1,8 +1,9 @@
 # SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
 # SPDX-License-Identifier: Apache-2.0
 
-import inference_speed as inference_speed_module
 import torch
+
+import inference_speed as inference_speed_module
 from inference_speed import InferenceSpeed
 
 
@@ -114,3 +115,43 @@ def from_quantized(cls, model_path, backend, device):
     )
 
     assert captured["device"] == "cuda:3"
+
+
+def test_inference_speed_ignores_non_tensor_batch_values_when_moving_to_device(monkeypatch):
+    class _Harness(InferenceSpeed):
+        NUM_RUNS = 1
+        MAX_NEW_TOKENS = 1
+        PROMPTS = ["a"]
+
+    class _BatchWithOptional(dict):
+        def __init__(self):
+            super().__init__({
+                "input_ids": torch.zeros((1, 3), dtype=torch.long),
+                "attention_mask": torch.ones((1, 3), dtype=torch.long),
+                "token_type_ids": None,
+            })
+
+        def to(self, _device):
+            raise AssertionError("BatchEncoding.to() should not be used for optional None fields")
+
+    class _TokenizerWithOptional(_FakeTokenizer):
+        def __call__(self, prompts, **_kwargs):
+            return _BatchWithOptional()
+
+    timestamps = iter([0.0, 1.0])
+
+    monkeypatch.setattr(inference_speed_module, "logger", _FakeLogger())
+    monkeypatch.setattr(inference_speed_module, "GPTQModel", _FakeModel)
+    monkeypatch.setattr(inference_speed_module, "AutoTokenizer", _TokenizerWithOptional)
+    monkeypatch.setattr(inference_speed_module, "torch_empty_cache", lambda: None)
+    monkeypatch.setattr(inference_speed_module.time, "time", lambda: next(timestamps))
+
+    measured_tps = _Harness().inference(
+        model_path="unused",
+        backend="fake-backend",
+        tokens_per_second=1.0,
+        warmup_runs=0,
+        device="cpu",
+    )
+
+    assert measured_tps == 1.0
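
For reviewers, a minimal standalone sketch of the dict branch that `_move_batch_to_device` adds, showing why per-key moves survive a batch whose optional fields are `None` (the case the new harness test locks in). The batch contents below are illustrative only, not taken from the test suite:

```python
import torch

# Stand-in for a dict-like tokenizer batch such as transformers.BatchEncoding.
# The None "token_type_ids" value is a hypothetical optional field, mirroring
# the _BatchWithOptional fixture in the new test.
batch = {
    "input_ids": torch.zeros((1, 3), dtype=torch.long),
    "attention_mask": torch.ones((1, 3), dtype=torch.long),
    "token_type_ids": None,
}

# Only tensor values are moved; non-tensor entries (here, None) pass through
# untouched instead of raising when the whole batch is moved at once.
for key, value in list(batch.items()):
    if torch.is_tensor(value):
        batch[key] = value.to("cpu")

assert batch["token_type_ids"] is None
assert batch["input_ids"].device.type == "cpu"
```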