-
Notifications
You must be signed in to change notification settings - Fork 666
Evaluating Video-MME_1fps on SmolVLM fails with a CUDA OOM error #1401
Copy link
Copy link
Open
Description
I'm using 4 A100 (80 GB) GPUs for evaluation. Evaluating Video-MME_1fps on SmolVLM fails with a CUDA out-of-memory (OOM) error, whereas Video-MME_8frame works fine.
Command:
torchrun --nproc-per-node=4 run.py --data Video-MME_1fps --model SmolVLM --work-dir eval
Error message:
Traceback (most recent call last):
File "/dump/algoswnvme1/wthsu/LLM/VLMEvalKit/run.py", line 320, in main
model = infer_data_job_video(
^^^^^^^^^^^^^^^^^^^^^
File "/dump/algoswnvme1/wthsu/LLM/VLMEvalKit/vlmeval/inference_video.py", line 227, in infer_data_job_video
model = infer_data(
^^^^^^^^^^^
File "/dump/algoswnvme1/wthsu/LLM/VLMEvalKit/vlmeval/inference_video.py", line 191, in infer_data
response = model.generate(message=struct, dataset=dataset_name)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/dump/algoswnvme1/wthsu/LLM/VLMEvalKit/vlmeval/vlm/base.py", line 116, in generate
return self.generate_inner(message, dataset)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/dump/algoswnvme1/wthsu/LLM/VLMEvalKit/vlmeval/vlm/smolvlm.py", line 97, in generate_inner
generated_ids = self.model.generate(**inputs, **self.kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/dump/algoswnvme1/wthsu/miniforge3/envs/llm/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/dump/algoswnvme1/wthsu/miniforge3/envs/llm/lib/python3.11/site-packages/transformers/generation/utils.py", line 2629, in generate
result = self._sample(
^^^^^^^^^^^^^
File "/dump/algoswnvme1/wthsu/miniforge3/envs/llm/lib/python3.11/site-packages/transformers/generation/utils.py", line 3610, in _sample
outputs = self(**model_inputs, return_dict=True)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/dump/algoswnvme1/wthsu/miniforge3/envs/llm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/dump/algoswnvme1/wthsu/miniforge3/envs/llm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/dump/algoswnvme1/wthsu/miniforge3/envs/llm/lib/python3.11/site-packages/transformers/utils/generic.py", line 959, in wrapper
output = func(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/dump/algoswnvme1/wthsu/miniforge3/envs/llm/lib/python3.11/site-packages/transformers/models/idefics3/modeling_idefics3.py", line 973, in forward
outputs = self.model(
^^^^^^^^^^^
File "/dump/algoswnvme1/wthsu/miniforge3/envs/llm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/dump/algoswnvme1/wthsu/miniforge3/envs/llm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/dump/algoswnvme1/wthsu/miniforge3/envs/llm/lib/python3.11/site-packages/transformers/utils/generic.py", line 959, in wrapper
output = func(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/dump/algoswnvme1/wthsu/miniforge3/envs/llm/lib/python3.11/site-packages/transformers/models/idefics3/modeling_idefics3.py", line 795, in forward
image_hidden_states = self.get_image_features(pixel_values, pixel_attention_mask)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/dump/algoswnvme1/wthsu/miniforge3/envs/llm/lib/python3.11/site-packages/transformers/models/idefics3/modeling_idefics3.py", line 722, in get_image_features
image_hidden_states = self.vision_model(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/dump/algoswnvme1/wthsu/miniforge3/envs/llm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/dump/algoswnvme1/wthsu/miniforge3/envs/llm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/dump/algoswnvme1/wthsu/miniforge3/envs/llm/lib/python3.11/site-packages/transformers/models/idefics3/modeling_idefics3.py", line 572, in forward
encoder_outputs = self.encoder(
^^^^^^^^^^^^^
File "/dump/algoswnvme1/wthsu/miniforge3/envs/llm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/dump/algoswnvme1/wthsu/miniforge3/envs/llm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/dump/algoswnvme1/wthsu/miniforge3/envs/llm/lib/python3.11/site-packages/transformers/models/idefics3/modeling_idefics3.py", line 397, in forward
layer_outputs = encoder_layer(
^^^^^^^^^^^^^^
File "/dump/algoswnvme1/wthsu/miniforge3/envs/llm/lib/python3.11/site-packages/transformers/modeling_layers.py", line 94, in __call__
return super().__call__(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/dump/algoswnvme1/wthsu/miniforge3/envs/llm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/dump/algoswnvme1/wthsu/miniforge3/envs/llm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/dump/algoswnvme1/wthsu/miniforge3/envs/llm/lib/python3.11/site-packages/transformers/models/idefics3/modeling_idefics3.py", line 326, in forward
hidden_states = self.mlp(hidden_states)
^^^^^^^^^^^^^^^^^^^^^^^
File "/dump/algoswnvme1/wthsu/miniforge3/envs/llm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/dump/algoswnvme1/wthsu/miniforge3/envs/llm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/dump/algoswnvme1/wthsu/miniforge3/envs/llm/lib/python3.11/site-packages/transformers/models/idefics3/modeling_idefics3.py", line 271, in forward
hidden_states = self.activation_fn(hidden_states)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/dump/algoswnvme1/wthsu/miniforge3/envs/llm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/dump/algoswnvme1/wthsu/miniforge3/envs/llm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/dump/algoswnvme1/wthsu/miniforge3/envs/llm/lib/python3.11/site-packages/transformers/activations.py", line 37, in forward
return nn.functional.gelu(input, approximate="tanh")
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 14.74 GiB. GPU 0 has a total capacity of 79.25 GiB of which 6.89 GiB is free. Including non-PyTorch memory, this process has 72.35 GiB memory in use. Of the allocated memory 49.20 GiB is allocated by PyTorch, and 22.51 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels