-
Notifications
You must be signed in to change notification settings - Fork 49
Question about CALVIN benchmark #79
Copy link
Copy link
Open
Description
Thank you for sharing this great work! I've been trying to reproduce X-VLA on the CALVIN benchmark, but I'm encountering a significant performance gap.
Results Comparison:
My training model result: 1.97
Official checkpoint (2toINF/X-VLA-Calvin-ABC_D) evaluated on my GPU: 4.32
My Setup:
I wrote my own script to convert the original CALVIN ABC-D dataset to HDF5 format, and I followed your guidance to convert the CALVIN proprioception (robot_obs[...,:6]) into absolute end-effector actions. I also noticed that the training loss stays around 0.05 throughout training.
Questions:
- Could you help verify if my processing code is correct?
- Would it be possible to release your CALVIN dataset processing scripts or the processed datasets?
- Are there any specific details about data preprocessing that might be critical for achieving the reported performance?
Here is my conversion script:
import os

import cv2
import h5py
import numpy as np
from tqdm import tqdm
# --- Configuration -------------------------------------------------
# Source directory holding the raw CALVIN episode_*.npz files.
root = "datasets/56/task_ABC_D/training"
# Destination directory for the converted per-trajectory HDF5 files.
save_root = "datasets/processed/task_ABC_D/training"
# JPEG quality used when re-encoding camera frames.
JPEG_QUALITY = 95
# Make sure the output directory exists before the conversion loop runs.
os.makedirs(save_root, exist_ok=True)
def encode_jpg_rgb(img_rgb: np.ndarray, quality: int = JPEG_QUALITY) -> np.ndarray:
    """Encode an RGB uint8 image as a JPEG byte stream.

    Args:
        img_rgb: H x W x 3 uint8 image in RGB channel order.
        quality: JPEG quality in [0, 100]; defaults to the module-level
            JPEG_QUALITY setting (generalized from the previous hard-coded
            constant — existing callers are unaffected).

    Returns:
        1-D uint8 ndarray containing the encoded JPEG bytes.

    Raises:
        RuntimeError: if OpenCV fails to encode the image.
    """
    assert img_rgb.dtype == np.uint8
    # OpenCV expects BGR channel order, so reverse the last axis.
    img_bgr = img_rgb[..., ::-1]
    ok, enc = cv2.imencode(
        ".jpg",
        img_bgr,
        [int(cv2.IMWRITE_JPEG_QUALITY), quality],
    )
    if not ok:
        raise RuntimeError("cv2.imencode failed")
    return enc
def load_episode_data(root, idx):
    """Load one CALVIN episode frame from its ``.npz`` file.

    Args:
        root: Directory containing ``episode_XXXXXXX.npz`` files.
        idx: Integer frame index (zero-padded to 7 digits in the filename).

    Returns:
        Dict with keys:
            "third_view":  static-camera RGB image (``rgb_static``).
            "wrist_view":  gripper-camera RGB image (``rgb_gripper``).
            "robot_state": proprioceptive state vector (``robot_obs``).
        Shapes noted in the original were (200,200,3)/(84,84,3)/(15,) —
        confirm against the actual dataset.

    Raises:
        FileNotFoundError: if the episode file does not exist; re-raised
            with the full path so the failing frame is obvious.
    """
    path = os.path.join(root, f"episode_{idx:07d}.npz")
    try:
        # NpzFile is a context manager: the `with` block guarantees the
        # underlying file handle is closed even if a key lookup raises
        # (the previous explicit data.close() was skipped on error).
        with np.load(path) as data:
            return {
                "third_view": data["rgb_static"],
                "wrist_view": data["rgb_gripper"],
                "robot_state": data["robot_obs"],
            }
    except FileNotFoundError as exc:
        raise FileNotFoundError(f"Episode file not found: {path}") from exc
# --- Load language annotations and build (start, end, instruction) segments ---
anno_path = os.path.join(root, "lang_annotations/auto_lang_ann.npy")
anno = np.load(anno_path, allow_pickle=True).item()
instructions = anno["language"]["ann"]
indices = anno["info"]["indx"]
# Pair each (start, end) frame-index tuple with its instruction, ordered by
# the segment's starting frame.
segments = sorted(
    (
        (start.item(), stop.item(), text)
        for (start, stop), text in zip(indices, instructions)
    ),
    key=lambda seg: seg[0],
)
print("Total trajectories:", len(segments))
# ---------------------------
# Convert
# ---------------------------
# One HDF5 file is written per language-annotated trajectory; camera frames
# are stored JPEG-compressed as variable-length uint8 rows.
for traj_id, (start, end, lang) in enumerate(tqdm(segments)):
    save_path = os.path.join(save_root, f"traj_{traj_id:06d}.hdf5")
    third_imgs = []
    wrist_imgs = []
    proprios = []
    # Segment endpoints are inclusive, hence end + 1.
    for frame_idx in range(start, end + 1):
        fr = load_episode_data(root, frame_idx)
        # images
        third_imgs.append(encode_jpg_rgb(fr["third_view"]))
        wrist_imgs.append(encode_jpg_rgb(fr["wrist_view"]))
        # proprio = robot_obs[:6] + robot_obs[-1]
        # NOTE(review): robot_state[:6] is taken as the 6-DoF EE pose and
        # robot_state[-1:] as the gripper channel. If robot_obs is the (15,)
        # CALVIN state vector, the last element may be the gripper *action*
        # rather than the gripper width — confirm against the CALVIN
        # robot_obs layout; a wrong gripper index would corrupt the
        # proprio signal and could explain a benchmark gap.
        robot_state = fr["robot_state"]
        proprio = np.concatenate([robot_state[:6], robot_state[-1:]],axis=0,)
        proprios.append(proprio)
    num_frames = len(proprios)
    # ---------------------------
    # Write HDF5
    # ---------------------------
    with h5py.File(save_path, "w") as f:
        # Instruction stored as a UTF-8 variable-length string scalar.
        f.create_dataset("language_instruction", data=lang, dtype=h5py.string_dtype("utf-8"),)
        # observation group
        g = f.create_group("observation")
        # Variable-length uint8 rows: each row holds one JPEG byte stream,
        # so frames with different encoded sizes can share one dataset.
        vlen_u8 = h5py.vlen_dtype(np.uint8)
        d_third = g.create_dataset("third_image", (num_frames,), dtype=vlen_u8)
        d_wrist = g.create_dataset("wrist_image", (num_frames,), dtype=vlen_u8)
        for i in range(num_frames):
            d_third[i] = third_imgs[i]
            d_wrist[i] = wrist_imgs[i]
        # proprio
        # (T, 7) float64 array: 6 pose values + 1 gripper value per frame.
        f.create_dataset("proprio", data=np.stack(proprios).astype(np.float64),)
print("\n✓ Processing complete!")
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels