Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -47,4 +47,5 @@ docs/image.png

__pycache__
/design
/plan
/tests
13 changes: 13 additions & 0 deletions CubeAPI/src/cubemaster/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -657,6 +657,19 @@ pub struct CreateSandboxRequest {
skip_serializing_if = "Option::is_none"
)]
pub cube_network_config: Option<CubeNetworkConfig>,

/// Auto-pause: when true, CubeMaster publishes this sandbox to the
/// auto-pause registry consumed by CubeProxy-sidecar; once the proxy
/// reports it idle for `timeout` seconds the sidecar pauses it.
/// Field name matches CubeMaster's `auto_pause` JSON tag.
#[serde(skip_serializing_if = "std::ops::Not::not", default)]
pub auto_pause: bool,

/// Auto-resume: when true, an incoming request hitting a paused sandbox
/// is transparently resumed instead of erroring. Field name matches
/// CubeMaster's `auto_resume` JSON tag.
#[serde(skip_serializing_if = "std::ops::Not::not", default)]
pub auto_resume: bool,
}

/// Network egress control sent to CubeMaster.
Expand Down
6 changes: 2 additions & 4 deletions CubeAPI/src/handlers/agenthub.rs
Original file line number Diff line number Diff line change
Expand Up @@ -458,8 +458,7 @@ pub async fn create_agent_instance(
.create_sandbox(NewSandbox {
template_id: template_id.clone(),
timeout,
auto_pause: false,
auto_resume: None,
lifecycle: None,
secure: None,
allow_internet_access: Some(true),
network: network_config,
Expand Down Expand Up @@ -1995,8 +1994,7 @@ pub async fn clone_agent_instance(
.create_sandbox(NewSandbox {
template_id: snapshot_id.clone(),
timeout,
auto_pause: false,
auto_resume: None,
lifecycle: None,
secure: None,
allow_internet_access: Some(true),
network: network_config,
Expand Down
50 changes: 39 additions & 11 deletions CubeAPI/src/models/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -116,10 +116,37 @@ pub struct EgressRuleInject {
pub format: Option<String>,
}

/// Auto-resume configuration for paused sandboxes.
#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)]
pub struct SandboxAutoResumeConfig {
pub enabled: bool,
/// Lifecycle policy attached to a sandbox, shaped like the e2b SDK's
/// `lifecycle` object (canonical reference:
/// https://e2b.dev/docs/sandbox/auto-resume).
///
/// On the wire the keys are camelCase (`onTimeout`, `autoResume`). Both keys
/// are optional; an omitted key falls back to its type's `Default` value.
///
/// * `on_timeout` selects what happens when the sandbox idle timer fires. The
///   default, "kill", deletes the sandbox, which is the historical behaviour.
/// * `auto_resume` is only meaningful together with `on_timeout = "pause"`:
///   it asks the proxy/sidecar to wake a paused sandbox when activity arrives
///   instead of returning an error.
#[derive(Debug, Clone, Default, Serialize, Deserialize, ToSchema)]
#[serde(rename_all = "camelCase")]
pub struct SandboxLifecycleConfig {
    /// Timeout action: "kill" (default) or "pause".
    #[serde(default)]
    pub on_timeout: SandboxOnTimeout,

    /// Whether activity should automatically resume a paused sandbox.
    /// Defaults to false; has no effect unless `on_timeout` is "pause".
    #[serde(default)]
    pub auto_resume: bool,
}

/// Action taken when a sandbox's idle timeout fires.
///
/// Serialized in lowercase (`"kill"` / `"pause"`). `Kill` is the default, so
/// a request that omits the field keeps the historical behaviour.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize, ToSchema)]
#[serde(rename_all = "lowercase")]
pub enum SandboxOnTimeout {
    /// Delete the sandbox once it times out (default).
    #[default]
    Kill,
    /// Pause the sandbox once it times out instead of deleting it.
    Pause,
}

/// Volume mount inside the sandbox.
Expand All @@ -133,8 +160,9 @@ pub struct SandboxVolumeMount {

/// Request body for POST /sandboxes
/// Field names match exactly what the E2B SDK sends.
/// Rule: ID abbreviations → uppercase (templateID, sandboxID, envVars, autoPause);
/// allow_internet_access is a known SDK snake_case quirk.
/// Rule: ID abbreviations → uppercase (templateID, sandboxID, envVars);
/// allow_internet_access is a known SDK snake_case quirk;
/// lifecycle is a nested object — see SandboxLifecycleConfig.
#[derive(Debug, Deserialize, Validate, ToSchema)]
#[allow(dead_code)]
pub struct NewSandbox {
Expand All @@ -145,11 +173,11 @@ pub struct NewSandbox {
#[serde(default = "default_timeout")]
pub timeout: i32,

#[serde(rename = "autoPause", default)]
pub auto_pause: bool,

#[serde(rename = "autoResume", skip_serializing_if = "Option::is_none")]
pub auto_resume: Option<SandboxAutoResumeConfig>,
/// Sandbox lifecycle configuration. Maps to e2b's `lifecycle` object so
/// callers that already speak e2b can pass through unchanged. Absent
/// (None) means today's behaviour: idle sandboxes are killed.
#[serde(skip_serializing_if = "Option::is_none")]
pub lifecycle: Option<SandboxLifecycleConfig>,

#[serde(skip_serializing_if = "Option::is_none")]
pub secure: Option<bool>,
Expand Down
127 changes: 126 additions & 1 deletion CubeAPI/src/services/sandboxes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,21 @@ impl SandboxService {
let cube_network_config =
build_cube_network_config(body.allow_internet_access, body.network.as_ref())?;

// Derive the two CubeMaster-side bools from the e2b-shaped lifecycle
// object. Absent lifecycle keeps today's behaviour: idle sandboxes
// are killed (auto_pause = false), and auto_resume defaults off.
let (auto_pause, auto_resume) = body
.lifecycle
.as_ref()
.map(|lc| {
use crate::models::SandboxOnTimeout;
(
matches!(lc.on_timeout, SandboxOnTimeout::Pause),
lc.auto_resume,
)
})
.unwrap_or((false, false));

let req = CreateSandboxRequest {
request_id: new_request_id(),
instance_type: self.instance_type.clone(),
Expand All @@ -150,6 +165,8 @@ impl SandboxService {
exposed_ports: vec![],
network_type: Some("tap".to_string()),
cube_network_config,
auto_pause,
auto_resume,
};

let resp = self
Expand Down Expand Up @@ -718,7 +735,7 @@ mod tests {
use std::collections::HashMap;

use super::{build_cube_network_config, filter_by_metadata, from_cubemaster_info};
use crate::cubemaster::{ListSandboxResponse, SandboxInfo};
use crate::cubemaster::{CreateSandboxRequest, ListSandboxResponse, SandboxInfo};
use crate::models::{
EgressRule, EgressRuleAction, EgressRuleInject, EgressRuleMatch, SandboxNetworkConfig,
SandboxState,
Expand Down Expand Up @@ -953,4 +970,112 @@ mod tests {
.iter()
.all(|sandbox| sandbox.state == SandboxState::Paused));
}

/// Pins the JSON shape of the two lifecycle flags on `CreateSandboxRequest`.
///
/// CubeMaster reads them under the exact keys `auto_pause` / `auto_resume`.
/// A rename, or dropping them during serialization, would make the auto-pause
/// sidecar silently treat every new sandbox as opted-out, so both the
/// "omitted when false" and the "present when true" forms are asserted here.
#[test]
fn create_sandbox_request_serializes_lifecycle_flags() {
    let disabled = CreateSandboxRequest {
        request_id: "req-1".to_string(),
        instance_type: "cubebox".to_string(),
        timeout: Some(60),
        annotations: HashMap::new(),
        labels: None,
        volumes: None,
        containers: vec![],
        exposed_ports: vec![],
        network_type: None,
        cube_network_config: None,
        auto_pause: false,
        auto_resume: false,
    };

    // Flags off: `skip_serializing_if = Not::not` must drop both keys.
    {
        let json = serde_json::to_value(&disabled).unwrap();
        assert!(
            json.get("auto_pause").is_none(),
            "auto_pause=false should be omitted, got: {json}"
        );
        assert!(
            json.get("auto_resume").is_none(),
            "auto_resume=false should be omitted, got: {json}"
        );
    }

    // Flags on: keys appear in snake_case, matching CubeMaster's
    // `json:"auto_pause,omitempty"` and `json:"auto_resume,omitempty"` tags.
    let enabled = CreateSandboxRequest {
        auto_pause: true,
        auto_resume: true,
        ..disabled
    };
    let json = serde_json::to_value(&enabled).unwrap();
    let on = serde_json::Value::Bool(true);
    assert_eq!(json.get("auto_pause"), Some(&on));
    assert_eq!(json.get("auto_resume"), Some(&on));
}

/// The inbound API accepts an e2b-style `lifecycle` object (nested struct
/// with camelCase keys); CubeAPI flattens it into the two CubeMaster-side
/// bools when it builds the create-sandbox RPC. This test deserializes each
/// meaningful request shape and checks the resulting `(auto_pause,
/// auto_resume)` pair.
#[test]
fn lifecycle_object_translates_to_cubemaster_bools() {
    use crate::models::{NewSandbox, SandboxLifecycleConfig, SandboxOnTimeout};

    // Stand-in for the lifecycle decoding performed when the create-sandbox
    // request is assembled. NOTE(review): this is a copy of that logic, not a
    // call into it — keep the two in sync, or extract a shared helper.
    fn translate(lifecycle: Option<&SandboxLifecycleConfig>) -> (bool, bool) {
        match lifecycle {
            Some(lc) => (lc.on_timeout == SandboxOnTimeout::Pause, lc.auto_resume),
            None => (false, false),
        }
    }

    // (scenario label, request body, expected (auto_pause, auto_resume))
    let cases = [
        // Absent lifecycle => preserve historical behaviour.
        (
            "absent lifecycle",
            serde_json::json!({
                "templateID": "tpl",
            }),
            (false, false),
        ),
        // Explicit kill (with autoResume=true) is still kill — auto_resume
        // doesn't auto-imply pause. Enforcing the e2b semantic ("auto_resume
        // only meaningful when on_timeout=pause") is delegated to CubeMaster.
        (
            "kill with autoResume",
            serde_json::json!({
                "templateID": "tpl",
                "lifecycle": {"onTimeout": "kill", "autoResume": true},
            }),
            (false, true),
        ),
        // Pause + auto_resume — the canonical e2b auto-resume case.
        (
            "pause with autoResume",
            serde_json::json!({
                "templateID": "tpl",
                "lifecycle": {"onTimeout": "pause", "autoResume": true},
            }),
            (true, true),
        ),
        // Pause without auto_resume — caller must call connect() manually.
        (
            "pause only",
            serde_json::json!({
                "templateID": "tpl",
                "lifecycle": {"onTimeout": "pause"},
            }),
            (true, false),
        ),
        // Empty lifecycle object — defaults: kill on timeout, no auto-resume.
        (
            "empty lifecycle object",
            serde_json::json!({
                "templateID": "tpl",
                "lifecycle": {},
            }),
            (false, false),
        ),
    ];

    for (label, payload, expected) in cases {
        let body: NewSandbox = serde_json::from_value(payload)
            .unwrap_or_else(|err| panic!("{label}: request body failed to deserialize: {err}"));
        assert_eq!(translate(body.lifecycle.as_ref()), expected, "{label}");
    }
}
}
9 changes: 9 additions & 0 deletions CubeMaster/cmd/cubemaster/app/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
"github.com/tencentcloud/CubeSandbox/CubeMaster/pkg/cubelet/grpcconn"
"github.com/tencentcloud/CubeSandbox/CubeMaster/pkg/errorcode"
"github.com/tencentcloud/CubeSandbox/CubeMaster/pkg/instancecache"
"github.com/tencentcloud/CubeSandbox/CubeMaster/pkg/lifecycle"
"github.com/tencentcloud/CubeSandbox/CubeMaster/pkg/localcache"
"github.com/tencentcloud/CubeSandbox/CubeMaster/pkg/nodemeta"
"github.com/tencentcloud/CubeSandbox/CubeMaster/pkg/scheduler"
Expand Down Expand Up @@ -162,6 +163,14 @@ func coreInit(ctx context.Context, cfg *config.Config) error {
return err
}

// lifecycle wires the auto-pause / auto-resume metadata channel into the
// sandbox create/destroy hooks. It is non-fatal: a Redis hiccup must not
// block CubeMaster from serving sandboxes, only the sidecar's view goes
// stale until the next reconcile.
if err := lifecycle.Init(ctx); err != nil {
log.G(ctx).Warnf("lifecycle init fail (non-fatal): %v", err)
}

scheduler.InitScheduler(ctx)

if err := sandbox.Init(ctx, cfg); err != nil {
Expand Down
89 changes: 89 additions & 0 deletions CubeMaster/pkg/lifecycle/init.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
// Copyright (c) 2026 Tencent Inc.
// SPDX-License-Identifier: Apache-2.0
//

package lifecycle

import (
"context"
"time"

"github.com/tencentcloud/CubeSandbox/CubeMaster/pkg/base/log"
"github.com/tencentcloud/CubeSandbox/CubeMaster/pkg/base/wrapredis"
"github.com/tencentcloud/CubeSandbox/CubeMaster/pkg/service/sandbox"
sandboxtypes "github.com/tencentcloud/CubeSandbox/CubeMaster/pkg/service/sandbox/types"
"github.com/tencentcloud/CubeSandbox/CubeMaster/pkg/task"
)

// Init installs the lifecycle metadata store and registers the sandbox
// create/destroy hooks that feed it. Call it exactly once at process start,
// after wrapredis is reachable.
//
// The lifecycle channel is a side channel for the auto-pause sidecar, so a
// missing Redis pool is not treated as an error: Init logs a warning, leaves
// the hooks unregistered and returns nil. As written, Init always returns
// nil; the error result exists so callers (main.go) can log and proceed.
//
// The single shared wrapredis pool is used. The sidecar consumes lifecycle
// metadata and the bypass_host_proxy map from the same Redis instance, so
// any pool that can write proxy entries can also write lifecycle entries.
func Init(ctx context.Context) error {
	redisPool := wrapredis.GetRedis()
	if isNilPool(redisPool) {
		log.G(ctx).Warnf("lifecycle: redis pool unavailable; auto-pause metadata channel disabled")
		return nil
	}

	setDefaultStore(NewStore(redisPool))

	// Creation has a single success hook.
	sandbox.RegisterAfterCreateSandboxSuccessHook(onAfterCreate)

	// Deletion has two: the synchronous destroy path
	// (sandbox_remove.callCubelet) and the asynchronous task executor each
	// finish with their own success hook. Registering on both means a delete
	// is published exactly once whichever mode handled it.
	sandbox.RegisterAfterDestroySandboxSuccessHook(onAfterDestroy)
	task.RegisterAfterDestroyTaskSuccessHook(onAfterDestroy)

	log.G(ctx).Infof("lifecycle: auto-pause metadata channel ready (key=%s, stream=%s)",
		MetaKey, EventStreamKey)
	return nil
}

// isNilPool reports whether the wrapper returned by wrapredis.GetRedis is
// unusable: either the wrapper pointer itself is nil, or it carries no
// underlying connection pool (RedisConnPool == nil).
func isNilPool(w *wrapredis.RedisWrap) bool {
	if w == nil {
		return true
	}
	return w.RedisConnPool == nil
}

// onAfterCreate is the sandbox-creation success hook. It snapshots the
// lifecycle-relevant fields of the create request, together with the
// placement (host ID/IP) and the current wall-clock time in milliseconds,
// and hands the record to the default store's PublishCreate.
//
// It is a no-op when no store has been installed or when req is nil, and it
// always returns nil.
func onAfterCreate(ctx context.Context, sandboxID, hostID, hostIP string, req *sandboxtypes.CreateCubeSandboxReq) error {
	store := getDefaultStore()
	if store == nil {
		return nil
	}
	if req == nil {
		return nil
	}

	meta := SandboxLifecycleMeta{
		SandboxID:      sandboxID,
		HostID:         hostID,
		HostIP:         hostIP,
		InstanceType:   req.InstanceType,
		TimeoutSeconds: req.Timeout,
		AutoPause:      req.AutoPause,
		AutoResume:     req.AutoResume,
		CreatedAt:      time.Now().UnixMilli(),
	}

	// Template ID is conventionally carried via annotations from CubeAPI; the
	// field is informational, so a missing entry is tolerated. Indexing a nil
	// map is safe in Go and simply reports "not found".
	if templateID, found := req.Annotations["template_id"]; found {
		meta.TemplateID = templateID
	}

	store.PublishCreate(ctx, &meta)
	return nil
}

// onAfterDestroy is the sandbox-deletion success hook. It forwards the
// sandbox ID to the default store's PublishDelete, does nothing when no
// store has been installed, and always returns nil.
func onAfterDestroy(ctx context.Context, sandboxID string) error {
	store := getDefaultStore()
	if store == nil {
		return nil
	}
	store.PublishDelete(ctx, sandboxID)
	return nil
}
Loading
Loading