Skip to content
11 changes: 10 additions & 1 deletion run.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,7 +396,16 @@ def main():
judge_kwargs['model'] = 'gpt-4-turbo'
elif listinstr(['VGRPBench'], dataset_name):
judge_kwargs['model'] = 'gpt-4o'
elif listinstr(['MathVista', 'MathVerse', 'MathVision', 'LENS', 'DynaMath', 'VL-RewardBench', 'LogicVista', 'MOAT', 'OCR_Reasoning', 'VTCBench', 'Asclepius', 'MMSafetyBench', 'MSSBench', 'SIUO', 'SIUO_GEN', 'XSTest', 'Flames'], dataset_name): # noqa: E501
elif listinstr(['MSSBench', 'SIUO', 'SIUO_GEN'], dataset_name):
# Align SafeWork-R1 default judge for MSSBench / SIUO.
judge_kwargs['model'] = 'gpt-4o-2024-11-20'
elif listinstr(['XSTest', 'Flames'], dataset_name):
# Align SafeWork-R1 default judge for XSTest / Flames.
judge_kwargs['model'] = 'gpt-4o'
elif listinstr(['MMSafetyBench'], dataset_name):
# Align SafeWork-R1 default judge for MM-SafetyBench.
judge_kwargs['model'] = 'gpt-4o-mini'
elif listinstr(['MathVista', 'MathVerse', 'MathVision', 'LENS', 'DynaMath', 'VL-RewardBench', 'LogicVista', 'MOAT', 'OCR_Reasoning', 'VTCBench', 'Asclepius'], dataset_name): # noqa: E501
judge_kwargs['model'] = 'gpt-4o-mini'
elif listinstr(['OlympiadBench'], dataset_name):
use_api_judger = judge_kwargs.get("olympiad_use_api_judger", False)
Expand Down
1 change: 1 addition & 0 deletions vlmeval/dataset/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,7 @@ def evaluate(self, eval_file, **judge_kwargs):
# added by EASI team
VIDEO_DATASET += [SiteBenchVideo, VsiBench, VsiSuperRecall, VsiSuperCount, MMSIVideoBench, STIBench, DSRBench] # noqa: E501


TEXT_DATASET = [
TextMCQDataset, SGI_Bench_Wet_Experiment, SGI_Bench_Dry_Experiment,
SGI_Bench_Deep_Research, SGI_Bench_Idea_Generation, XSTestDataset, FlamesDataset
Expand Down
8 changes: 5 additions & 3 deletions vlmeval/dataset/flames.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
import re
import warnings

from ..smp import d2df, dump, load
from ..smp.file import get_intermediate_file_path
from ..smp.file import dump, get_intermediate_file_path, load
from ..smp.misc import d2df
from ..utils import track_progress_rich
from .text_base import TextBaseDataset
from .utils import DEBUG_MESSAGE, build_judge
Expand Down Expand Up @@ -72,14 +72,16 @@ def evaluate(self, eval_file, **judge_kwargs):
data = load(eval_file)
assert 'prediction' in data and 'question' in data

model_name = judge_kwargs.pop('model', 'gpt-4o-mini')
model_name = judge_kwargs.pop('model', 'gpt-4o')
nproc = judge_kwargs.pop('nproc', 4)
tmp_file = get_intermediate_file_path(eval_file, f'_{model_name}_judge', 'pkl')

if model_name == 'exact_matching':
data['score_1to3'] = [_parse_flames_score(x) for x in data['prediction']]
data['judge_log'] = ['rule'] * len(data)
else:
# Align with SafeWork-R1 Flames scorer defaults.
judge_kwargs.setdefault('temperature', 0.0)
judge = build_judge(model=model_name, **judge_kwargs)
if hasattr(judge, 'working') and not judge.working():
warnings.warn('Judge is not working. Fallback to rule parser.\n' + DEBUG_MESSAGE)
Expand Down
2 changes: 1 addition & 1 deletion vlmeval/dataset/hipho.py
Original file line number Diff line number Diff line change
Expand Up @@ -841,7 +841,7 @@ def _build_result_item(self, row, index, result):
earned_points = max(result['fine_grained_score'], result['coarse_grained_score'])

return {
"id": str(row.get('id', f"{self.dataset_name}_{index+1}")),
"id": str(row.get('id', f"{self.dataset_name}_{index + 1}")),
"context": str(row.get('context', '')).strip(),
"question": str(row.get('question', '')).strip(),
"solution": str(row.get('solution', '')).strip(),
Expand Down
65 changes: 51 additions & 14 deletions vlmeval/dataset/m3oralbench.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,20 @@

import pandas as pd

from ..smp import d2df, dump, load, toliststr
from ..smp.file import get_intermediate_file_path
from ..smp.file import dump, get_intermediate_file_path, load
from ..smp.misc import d2df, toliststr
from .image_base import ImageBaseDataset


def _extract_option(pred):
s = str(pred or "")
# Align with SafeWork-R1 extraction: boxed answer first.
boxed = re.findall(r"\\boxed\{\s*([A-Z])\s*\}", s)
if len(boxed):
if boxed:
return boxed[-1]
# Fallback: use the last parenthesized option.
m = re.findall(r"\(([A-Z])\)", s)
if len(m):
return m[-1]
m = re.findall(r"\b([A-G])\b", s.upper())
if len(m):
if m:
return m[-1]
return ""

Expand All @@ -31,17 +30,55 @@ class M3oralBenchDataset(ImageBaseDataset):
def supported_datasets(cls):
return ['M3oralBench']

def __init__(self, dataset='M3oralBench', **kwargs):
    """Initialise the dataset and load the optional per-sample instructions.

    After the base-class initialisation, ``self._instruction_map`` is filled
    from the file at ``self.QUERY_JSON`` (defined elsewhere on the class; used
    here as a path-like object with ``.exists()``). Each item that has an
    ``id`` and a non-blank ``instruction`` contributes one entry
    ``int(id) -> stripped instruction``.

    Loading is best-effort: if the file is absent, or reading/parsing fails for
    any reason, the map is left empty. A failure is reported with a warning
    instead of being swallowed silently.

    Args:
        dataset: Dataset name forwarded to the base class.
        **kwargs: Extra keyword arguments forwarded to the base class.
    """
    # Local import keeps this block self-contained with respect to the
    # module's import list.
    import warnings

    super().__init__(dataset=dataset, **kwargs)
    self._instruction_map = {}
    if not self.QUERY_JSON.exists():
        return

    try:
        instruction_map = {}
        for item in load(str(self.QUERY_JSON)):
            if 'id' not in item:
                continue
            instruction = str(item.get('instruction', '')).strip()
            if instruction:
                instruction_map[int(item['id'])] = instruction
    except Exception as err:
        # Deliberate best-effort: a broken query file must not prevent the
        # dataset from being built; the map stays empty (all-or-nothing).
        warnings.warn(
            f'Failed to load M3oralBench instructions from {self.QUERY_JSON}: {err!r}. '
            'Continuing with an empty instruction map.'
        )
    else:
        self._instruction_map = instruction_map

def build_prompt(self, line):
    """Build the multimodal message list for one sample.

    The text prompt is chosen in this order:

    1. the entry of ``self._instruction_map`` for the sample id, where the id
       is taken from ``sample_id``, then ``id``, then ``index + 1``;
    2. the sample's own non-blank ``instruction`` field;
    3. a prompt assembled from ``question`` and the option fields ``A``-``G``.

    Args:
        line: Either an integer row position into ``self.data`` or a row-like
            mapping supporting ``.get``, ``in`` and ``[]``.

    Returns:
        A list of message dicts: one ``{'type': 'image', 'value': path}`` per
        image, followed by a single ``{'type': 'text', 'value': prompt}``.
    """
    if isinstance(line, int):
        line = self.data.iloc[line]
    # NOTE(review): both branches are iterated below as a sequence of image
    # paths - confirm dump_image returns a list.
    tgt_path = self.dump_image(line) if not self.meta_only else toliststr(line['image_path'])

    # Resolve the sample id. NaN cells are treated as missing so that an
    # empty `sample_id` column does not block the `id` / `index` fallbacks.
    line_id = None
    for key in ('sample_id', 'id'):
        candidate = line.get(key, None)
        if candidate is not None and not pd.isna(candidate):
            line_id = candidate
            break
    if line_id is None:
        raw_index = line.get('index', None)
        if raw_index is not None and not pd.isna(raw_index):
            try:
                # The id is derived as `index + 1` when no explicit id exists.
                line_id = int(raw_index) + 1
            except (TypeError, ValueError, OverflowError):
                line_id = None

    prompt = None
    if line_id is not None:
        try:
            prompt = self._instruction_map.get(int(line_id), None)
        except (TypeError, ValueError, OverflowError):
            prompt = None

    if (
        not prompt
        and 'instruction' in line
        and not pd.isna(line['instruction'])
        and str(line['instruction']).strip()
    ):
        prompt = str(line['instruction'])

    if not prompt:
        options = {k: line[k] for k in 'ABCDEFG' if k in line and not pd.isna(line[k])}
        parts = [f"Question: {line['question']}\n"]
        if options:
            parts.append('Options:\n')
            parts.extend(f'{k}. {v}\n' for k, v in options.items())
            # The format hint is only meaningful when there are options to list.
            letters = "/".join(f"({k})" for k in options)
            parts.append(f'Please answer with only one option in this format: {letters}.')
        prompt = ''.join(parts)

    # Align with SafeWork-R1 M3oralBench input construction: user content is image first, then text.
    msgs = [dict(type='image', value=p) for p in tgt_path]
    msgs.append(dict(type='text', value=prompt))
    return msgs
Expand Down
Loading
Loading