-
Notifications
You must be signed in to change notification settings - Fork 49
Question about CALVIN benchmark #79
Copy link
Copy link
Open
Description
Thank you for sharing this great work! I've been trying to reproduce X-VLA on the CALVIN benchmark, but I'm encountering a significant performance gap.
Results Comparison:
My training model result: 1.97
Official checkpoint (2toINF/X-VLA-Calvin-ABC_D) evaluated on my GPU: 4.32
My Setup:
I wrote my own script to convert the original CALVIN ABC-D dataset to HDF5 format, and I followed your guidance to convert the CALVIN proprioception (robot_obs[...,:6]) into absolute end-effector actions. I also noticed that the training loss stays around 0.05 throughout training.
Questions:
- Could you help verify if my processing code is correct?
- Would it be possible to release your CALVIN dataset processing scripts or the processed datasets?
- Are there any specific details about data preprocessing that might be critical for achieving the reported performance?
Here is my conversion script:
import os

import cv2
import h5py
import numpy as np
from tqdm import tqdm
# --- Configuration -------------------------------------------------
# Source directory holding the raw CALVIN episode_*.npz files.
root = "datasets/56/task_ABC_D/training"
# Destination directory for the converted per-trajectory HDF5 files.
save_root = "datasets/processed/task_ABC_D/training"
# JPEG quality used when re-encoding camera frames.
JPEG_QUALITY = 95
# Make sure the output directory exists before the conversion loop runs.
os.makedirs(save_root, exist_ok=True)
def encode_jpg_rgb(img_rgb: np.ndarray, quality: int = JPEG_QUALITY) -> np.ndarray:
    """Encode an RGB uint8 image as a JPEG byte stream.

    Args:
        img_rgb: H x W x 3 uint8 image in RGB channel order.
        quality: JPEG quality in [0, 100]; defaults to the module-level
            JPEG_QUALITY setting (generalized from the previous hard-coded
            constant — existing callers are unaffected).

    Returns:
        1-D uint8 ndarray containing the encoded JPEG bytes.

    Raises:
        RuntimeError: if OpenCV fails to encode the image.
    """
    assert img_rgb.dtype == np.uint8
    # OpenCV expects BGR channel order, so reverse the last axis.
    img_bgr = img_rgb[..., ::-1]
    ok, enc = cv2.imencode(
        ".jpg",
        img_bgr,
        [int(cv2.IMWRITE_JPEG_QUALITY), quality],
    )
    if not ok:
        raise RuntimeError("cv2.imencode failed")
    return enc
def load_episode_data(root, idx):
    """Load one CALVIN episode frame from its ``.npz`` file.

    Args:
        root: Directory containing ``episode_XXXXXXX.npz`` files.
        idx: Integer frame index (zero-padded to 7 digits in the filename).

    Returns:
        Dict with keys:
            "third_view":  static-camera RGB image (``rgb_static``).
            "wrist_view":  gripper-camera RGB image (``rgb_gripper``).
            "robot_state": proprioceptive state vector (``robot_obs``).
        Shapes noted in the original were (200,200,3)/(84,84,3)/(15,) —
        confirm against the actual dataset.

    Raises:
        FileNotFoundError: if the episode file does not exist; re-raised
            with the full path so the failing frame is obvious.
    """
    path = os.path.join(root, f"episode_{idx:07d}.npz")
    try:
        # NpzFile is a context manager: the `with` block guarantees the
        # underlying file handle is closed even if a key lookup raises
        # (the previous explicit data.close() was skipped on error).
        with np.load(path) as data:
            return {
                "third_view": data["rgb_static"],
                "wrist_view": data["rgb_gripper"],
                "robot_state": data["robot_obs"],
            }
    except FileNotFoundError as exc:
        raise FileNotFoundError(f"Episode file not found: {path}") from exc
# --- Load language annotations and build (start, end, instruction) segments ---
anno_path = os.path.join(root, "lang_annotations/auto_lang_ann.npy")
anno = np.load(anno_path, allow_pickle=True).item()
instructions = anno["language"]["ann"]
indices = anno["info"]["indx"]
# Pair each (start, end) frame-index tuple with its instruction, ordered by
# the segment's starting frame.
segments = sorted(
    (
        (start.item(), stop.item(), text)
        for (start, stop), text in zip(indices, instructions)
    ),
    key=lambda seg: seg[0],
)
print("Total trajectories:", len(segments))
# ---------------------------
# Convert
# ---------------------------
# One HDF5 file is written per language-annotated trajectory; camera frames
# are stored JPEG-compressed as variable-length uint8 rows.
for traj_id, (start, end, lang) in enumerate(tqdm(segments)):
    save_path = os.path.join(save_root, f"traj_{traj_id:06d}.hdf5")
    third_imgs = []
    wrist_imgs = []
    proprios = []
    # Segment endpoints are inclusive, hence end + 1.
    for frame_idx in range(start, end + 1):
        fr = load_episode_data(root, frame_idx)
        # images
        third_imgs.append(encode_jpg_rgb(fr["third_view"]))
        wrist_imgs.append(encode_jpg_rgb(fr["wrist_view"]))
        # proprio = robot_obs[:6] + robot_obs[-1]
        # NOTE(review): robot_state[:6] is taken as the 6-DoF EE pose and
        # robot_state[-1:] as the gripper channel. If robot_obs is the (15,)
        # CALVIN state vector, the last element may be the gripper *action*
        # rather than the gripper width — confirm against the CALVIN
        # robot_obs layout; a wrong gripper index would corrupt the
        # proprio signal and could explain a benchmark gap.
        robot_state = fr["robot_state"]
        proprio = np.concatenate([robot_state[:6], robot_state[-1:]],axis=0,)
        proprios.append(proprio)
    num_frames = len(proprios)
    # ---------------------------
    # Write HDF5
    # ---------------------------
    with h5py.File(save_path, "w") as f:
        # Instruction stored as a UTF-8 variable-length string scalar.
        f.create_dataset("language_instruction", data=lang, dtype=h5py.string_dtype("utf-8"),)
        # observation group
        g = f.create_group("observation")
        # Variable-length uint8 rows: each row holds one JPEG byte stream,
        # so frames with different encoded sizes can share one dataset.
        vlen_u8 = h5py.vlen_dtype(np.uint8)
        d_third = g.create_dataset("third_image", (num_frames,), dtype=vlen_u8)
        d_wrist = g.create_dataset("wrist_image", (num_frames,), dtype=vlen_u8)
        for i in range(num_frames):
            d_third[i] = third_imgs[i]
            d_wrist[i] = wrist_imgs[i]
        # proprio
        # (T, 7) float64 array: 6 pose values + 1 gripper value per frame.
        f.create_dataset("proprio", data=np.stack(proprios).astype(np.float64),)
print("\n✓ Processing complete!")
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels