open-compass · Gugugugugutian · Mar 19, 2026 · Mar 19, 2026 · Mar 19, 2026 · Mar 20, 2026
diff --git a/run.py b/run.py
@@ -396,7 +396,16 @@ def main():
                             judge_kwargs['model'] = 'gpt-4-turbo'
                     elif listinstr(['VGRPBench'], dataset_name):
                         judge_kwargs['model'] = 'gpt-4o'
-                    elif listinstr(['MathVista', 'MathVerse', 'MathVision', 'LENS', 'DynaMath', 'VL-RewardBench', 'LogicVista', 'MOAT', 'OCR_Reasoning', 'VTCBench', 'Asclepius', 'MMSafetyBench', 'MSSBench', 'SIUO', 'SIUO_GEN', 'XSTest', 'Flames'], dataset_name):  # noqa: E501
+                    elif listinstr(['MSSBench', 'SIUO', 'SIUO_GEN'], dataset_name):
+                        # Align with SafeWork-R1's default judge for MSSBench / SIUO.
+                        judge_kwargs['model'] = 'gpt-4o-2024-11-20'
+                    elif listinstr(['XSTest', 'Flames'], dataset_name):
+                        # Align with SafeWork-R1's default judge for XSTest / Flames.
+                        judge_kwargs['model'] = 'gpt-4o'
+                    elif listinstr(['MMSafetyBench'], dataset_name):
+                        # Align with SafeWork-R1's default judge for MM-SafetyBench.
+                        judge_kwargs['model'] = 'gpt-4o-mini'
+                    elif listinstr(['MathVista', 'MathVerse', 'MathVision', 'LENS', 'DynaMath', 'VL-RewardBench', 'LogicVista', 'MOAT', 'OCR_Reasoning', 'VTCBench', 'Asclepius'], dataset_name):  # noqa: E501
                         judge_kwargs['model'] = 'gpt-4o-mini'
                     elif listinstr(['OlympiadBench'], dataset_name):
                         use_api_judger = judge_kwargs.get("olympiad_use_api_judger", False)

diff --git a/vlmeval/dataset/__init__.py b/vlmeval/dataset/__init__.py
@@ -314,6 +314,7 @@ def evaluate(self, eval_file, **judge_kwargs):
 # add by EASI team
 VIDEO_DATASET += [SiteBenchVideo, VsiBench, VsiSuperRecall, VsiSuperCount, MMSIVideoBench, STIBench, DSRBench]  # noqa: E501
 
+
 TEXT_DATASET = [
     TextMCQDataset, SGI_Bench_Wet_Experiment, SGI_Bench_Dry_Experiment,
     SGI_Bench_Deep_Research, SGI_Bench_Idea_Generation, XSTestDataset, FlamesDataset

diff --git a/vlmeval/dataset/flames.py b/vlmeval/dataset/flames.py
@@ -2,8 +2,8 @@
 import re
 import warnings
 
-from ..smp import d2df, dump, load
-from ..smp.file import get_intermediate_file_path
+from ..smp.file import dump, get_intermediate_file_path, load
+from ..smp.misc import d2df
 from ..utils import track_progress_rich
 from .text_base import TextBaseDataset
 from .utils import DEBUG_MESSAGE, build_judge
@@ -72,14 +72,16 @@ def evaluate(self, eval_file, **judge_kwargs):
         data = load(eval_file)
         assert 'prediction' in data and 'question' in data
 
-        model_name = judge_kwargs.pop('model', 'gpt-4o-mini')
+        model_name = judge_kwargs.pop('model', 'gpt-4o')
         nproc = judge_kwargs.pop('nproc', 4)
         tmp_file = get_intermediate_file_path(eval_file, f'_{model_name}_judge', 'pkl')
 
         if model_name == 'exact_matching':
             data['score_1to3'] = [_parse_flames_score(x) for x in data['prediction']]
             data['judge_log'] = ['rule'] * len(data)
         else:
+            # Align with SafeWork-R1 Flames scorer defaults.
+            judge_kwargs.setdefault('temperature', 0.0)
             judge = build_judge(model=model_name, **judge_kwargs)
             if hasattr(judge, 'working') and not judge.working():
                 warnings.warn('Judge is not working. Fallback to rule parser.\n' + DEBUG_MESSAGE)

diff --git a/vlmeval/dataset/hipho.py b/vlmeval/dataset/hipho.py
@@ -841,7 +841,7 @@ def _build_result_item(self, row, index, result):
         earned_points = max(result['fine_grained_score'], result['coarse_grained_score'])
 
         return {
-            "id": str(row.get('id', f"{self.dataset_name}_{index+1}")),
+            "id": str(row.get('id', f"{self.dataset_name}_{index + 1}")),
             "context": str(row.get('context', '')).strip(),
             "question": str(row.get('question', '')).strip(),
             "solution": str(row.get('solution', '')).strip(),

diff --git a/vlmeval/dataset/m3oralbench.py b/vlmeval/dataset/m3oralbench.py
@@ -2,21 +2,20 @@
 
 import pandas as pd
 
-from ..smp import d2df, dump, load, toliststr
-from ..smp.file import get_intermediate_file_path
+from ..smp.file import dump, get_intermediate_file_path, load
+from ..smp.misc import d2df, toliststr
 from .image_base import ImageBaseDataset
 
 
 def _extract_option(pred):
     s = str(pred or "")
+    # Align with SafeWork-R1 extraction: boxed answer first.
     boxed = re.findall(r"\\boxed\{\s*([A-Z])\s*\}", s)
-    if len(boxed):
+    if boxed:
         return boxed[-1]
+    # Fallback: use the last parenthesized option.
     m = re.findall(r"\(([A-Z])\)", s)
-    if len(m):
-        return m[-1]
-    m = re.findall(r"\b([A-G])\b", s.upper())
-    if len(m):
+    if m:
         return m[-1]
     return ""
 
@@ -31,17 +30,55 @@ class M3oralBenchDataset(ImageBaseDataset):
     def supported_datasets(cls):
         return ['M3oralBench']
 
+    def __init__(self, dataset='M3oralBench', **kwargs):
+        super().__init__(dataset=dataset, **kwargs)
+        self._instruction_map = {}
+        if self.QUERY_JSON.exists():
+            try:
+                items = load(str(self.QUERY_JSON))
+                self._instruction_map = {
+                    int(item['id']): str(item.get('instruction', '')).strip()
+                    for item in items
+                    if 'id' in item and str(item.get('instruction', '')).strip()
+                }
+            except Exception:
+                self._instruction_map = {}
+
     def build_prompt(self, line):
         if isinstance(line, int):
             line = self.data.iloc[line]
         tgt_path = self.dump_image(line) if not self.meta_only else toliststr(line['image_path'])
-        options = {k: line[k] for k in 'ABCDEFG' if k in line and not pd.isna(line[k])}
-        prompt = f"Question: {line['question']}\n"
-        if len(options):
-            prompt += 'Options:\n'
-            for k, v in options.items():
-                prompt += f'{k}. {v}\n'
-            prompt += 'Please answer with only one option letter.'
+        prompt = None
+        line_id = line.get('sample_id', None)
+        if line_id is None:
+            line_id = line.get('id', None)
+        if line_id is None and line.get('index', None) is not None:
+            try:
+                line_id = int(line['index']) + 1
+            except Exception:
+                line_id = None
+        if line_id is not None:
+            try:
+                prompt = self._instruction_map.get(int(line_id), None)
+            except Exception:
+                prompt = None
+        if (
+            not prompt
+            and 'instruction' in line
+            and not pd.isna(line['instruction'])
+            and str(line['instruction']).strip()
+        ):
+            prompt = str(line['instruction'])
+        if prompt is None:
+            options = {k: line[k] for k in 'ABCDEFG' if k in line and not pd.isna(line[k])}
+            prompt = f"Question: {line['question']}\n"
+            if len(options):
+                prompt += 'Options:\n'
+                for k, v in options.items():
+                    prompt += f'{k}. {v}\n'
+                letters = "/".join([f"({k})" for k in options.keys()])
+                prompt += f'Please answer with only one option in this format: {letters}.'
+        # Align with SafeWork-R1 M3oralBench input construction: user content is image first, then text.
         msgs = [dict(type='image', value=p) for p in tgt_path]
         msgs.append(dict(type='text', value=prompt))
         return msgs