Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CubeMaster/conf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ redis:
max_active: 32
idle_timeout: 30
max_retry: 2
node_metric_ttl_sec: 600
# 0 = disabled; route key has no refresh path (see cubemaster.yaml comment).
sandbox_proxy_ttl_sec: 0

scheduler:
priority_select_num: 1
Expand Down
15 changes: 9 additions & 6 deletions CubeMaster/integration/mock_init.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,12 +123,15 @@ func mocktest_InitGlobalResources() {

func mocktest_CleanupGlobalResources() {

conn := wrapredis.GetRedis()

redisKeysBefore, _ := redis.Int(conn.Do("DBSIZE"))
conn.Do("FLUSHDB")

redisRecordsCleaned := redisKeysBefore
// Clear only this test's dedicated miniredis instance instead of issuing a
// raw FLUSHDB over the shared connection. If RedisConf.Nodes were ever
// pointed at a real shared Redis (multiple services share db_no=0), FLUSHDB
// would wipe other services' data. Flushing the owned mock server is safe.
redisRecordsCleaned := 0
if mocktest_RedisSrv != nil {
redisRecordsCleaned = len(mocktest_RedisSrv.Keys())
mocktest_RedisSrv.FlushDB()
}
stdlog.Printf("Redis database cleanup completed, cleaned %d keys", redisRecordsCleaned)

if mocktest_OssDb != nil {
Expand Down
24 changes: 24 additions & 0 deletions CubeMaster/pkg/base/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,17 @@ type RedisConf struct {

Nodes string `yaml:"nodes"`
MaxRetry int `yaml:"max_retry"`

// NodeMetricTTLSec is the safety TTL (seconds) for node-metric keys so an
// offline node's entry auto-expires; refreshed on every heartbeat write.
// A value <= 0 disables the TTL.
NodeMetricTTLSec int `yaml:"node_metric_ttl_sec"`
// SandboxProxyTTLSec is an OPTIONAL safety TTL (seconds) for sandbox proxy
// routing keys. It defaults to 0 (disabled) because the route key has no
// refresh path; enabling it is only safe if the TTL exceeds the maximum
// sandbox lifetime, otherwise a live route would expire and break routing.
// Normal teardown removes the key via DEL.
SandboxProxyTTLSec int `yaml:"sandbox_proxy_ttl_sec"`
}

type SchedulerConf struct {
Expand Down Expand Up @@ -762,6 +773,19 @@ func preComHandleConf(config *Config) error {
if config.Common.DescribeTaskExpireTime == 0 {
config.Common.DescribeTaskExpireTime = 86400
}

if config.RedisConf != nil {
if config.RedisConf.NodeMetricTTLSec == 0 {
// Node metrics are rewritten on every heartbeat, so a short safety
// TTL only auto-cleans offline nodes and never expires live ones.
config.RedisConf.NodeMetricTTLSec = 600
}
// SandboxProxyTTLSec intentionally has no positive default: the proxy
// route key is written once at sandbox creation with no refresh path, so
// any TTL shorter than the max sandbox lifetime would expire a live
// route and break CubeProxy. Lifecycle is managed by explicit DEL.
// Leave it 0 (disabled) unless a refresh mechanism is added first.
}
if config.Common.DbMaxRetryCount == 0 {
config.Common.DbMaxRetryCount = 5
}
Expand Down
109 changes: 109 additions & 0 deletions CubeMaster/pkg/base/rediskey/rediskey.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
// Copyright (c) 2026 Tencent Inc.
// SPDX-License-Identifier: Apache-2.0
//

// Package rediskey centralizes Redis key construction so every key shares a
// unified namespace ("cube:{ver}:{scope}:{resource}[:{sub}]:{id}") instead of
// scattered string literals.
//
// Writes use the new namespaced keys only. Reads try the new key first and
// fall back to the legacy key so a simultaneous CubeMaster/CubeProxy upgrade
// can still see data written before the cutover.
//
// See docs/architecture/redis-key-spec.md for the full convention.
package rediskey

import "strings"

const (
// Prefix is the fixed product namespace guarding against collisions with
// third-party keys in the shared Redis instance.
Prefix = "cube"
// Version is the schema version; bump only on breaking value-structure
// changes (adding fields / Hash members does not require a bump).
Version = "v1"

// ScopeMaster marks data read/written only by CubeMaster.
ScopeMaster = "master"
// ScopeProxy marks data private to CubeProxy.
ScopeProxy = "proxy"
// ScopeAPI marks data private to CubeAPI (reserved, not yet used).
ScopeAPI = "api"
// ScopeShared marks cross-service contracts (renaming needs coordination).
ScopeShared = "shared"
)

func join(segs ...string) string { return strings.Join(segs, ":") }

// ---- standard key builders ----

// NodeMetric is the per-node resource metric Hash key.
func NodeMetric(nodeID string) string {
return join(Prefix, Version, ScopeMaster, "node", "metric", nodeID)
}

// SandboxProxy is the sandbox proxy routing Hash key, shared with CubeProxy.
func SandboxProxy(sandboxID string) string {
return join(Prefix, Version, ScopeShared, "sandbox", "proxy", sandboxID)
}

// InstanceInfo is the instance metadata Hash key.
func InstanceInfo(insID string) string {
return join(Prefix, Version, ScopeMaster, "instance", "info", insID)
}

// DescribeTask is the async describe-task result Hash key.
func DescribeTask(taskID string) string {
return join(Prefix, Version, ScopeMaster, "task", "describe", taskID)
}

// InstanceMeta is the generic instance metadata key (string/list).
func InstanceMeta(objs ...string) string {
return join(append([]string{Prefix, Version, ScopeMaster, "instance", "meta"}, objs...)...)
}

// SandboxLifecycleMeta is the global HSet of sandbox lifecycle snapshots.
func SandboxLifecycleMeta() string {
return join(Prefix, Version, ScopeShared, "sandbox", "lifecycle", "meta")
}

// SandboxLifecycleEvents is the append-only create/delete event stream.
func SandboxLifecycleEvents() string {
return join(Prefix, Version, ScopeShared, "sandbox", "lifecycle", "events")
}

// SandboxLifecycleState is the per-sandbox pause/resume coordination key.
func SandboxLifecycleState(sandboxID string) string {
return join(Prefix, Version, ScopeShared, "sandbox", "lifecycle", "state", sandboxID)
}

// ---- legacy key builders (read fallback / delete cleanup only) ----

// LegacyNodeMetric is the bare node ID used before namespacing.
func LegacyNodeMetric(nodeID string) string { return nodeID }

// LegacySandboxProxy is the pre-standardization sandbox proxy key.
func LegacySandboxProxy(sandboxID string) string { return "bypass_host_proxy:" + sandboxID }

// LegacyInstanceInfo is the pre-standardization instance info key.
func LegacyInstanceInfo(insID string) string { return "cube_instance_info:" + insID }

// LegacyDescribeTask is the pre-standardization describe-task key.
func LegacyDescribeTask(taskID string) string { return "describetask:" + taskID }

// LegacyInstanceMeta is the pre-standardization metadata key.
func LegacyInstanceMeta(objs ...string) string {
return join(append([]string{"instance", "metadata"}, objs...)...)
}

// ReadKeysWithFallback returns the key try-order for reads: new first, legacy
// second. Used while legacy keys may still exist in Redis after upgrade.
func ReadKeysWithFallback(newKey, legacyKey string) []string {
return []string{newKey, legacyKey}
}

// DeleteKeys returns both the new and legacy keys so teardown removes residue
// from either naming generation.
func DeleteKeys(newKey, legacyKey string) []string {
return []string{newKey, legacyKey}
}
74 changes: 40 additions & 34 deletions CubeMaster/pkg/instancecache/metadata.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@ package instancecache

import (
"context"
"strings"
"time"

"github.com/tencentcloud/CubeSandbox/CubeMaster/pkg/base/constants"
"github.com/tencentcloud/CubeSandbox/CubeMaster/pkg/base/log"
"github.com/tencentcloud/CubeSandbox/CubeMaster/pkg/base/rediskey"
"github.com/tencentcloud/CubeSandbox/CubeMaster/pkg/base/wrapredis"
"github.com/tencentcloud/CubeSandbox/CubeMaster/pkg/errorcode"
"github.com/tencentcloud/CubeSandbox/cubelog"
Expand All @@ -32,18 +32,18 @@ func trace(ctx context.Context, action string, op string, start time.Time, err e
}
}

// KeyMetadata builds the namespaced metadata key for the given segments.
func KeyMetadata(objs ...string) string {
segs := []string{"instance", "metadata"}
segs = append(segs, objs...)
return strings.Join(segs, ":")
return rediskey.InstanceMeta(objs...)
}

func MetadataSet(ctx context.Context, key string, value string) (err error) {
const (
redisOp = "SET"
)
func MetadataSet(ctx context.Context, value string, objs ...string) (err error) {
const redisOp = "SET"
start := time.Now()
defer trace(ctx, "Create", redisOp, start, err)
defer func() {
trace(ctx, "Create", redisOp, start, err)
}()
key := rediskey.InstanceMeta(objs...)
_, err = wrapredis.GetRedis().Do(redisOp, key, value)
if err != nil {
log.G(ctx).Errorf("redis %s error, key: %s, err: %s", redisOp, key, err)
Expand All @@ -55,13 +55,13 @@ func MetadataSet(ctx context.Context, key string, value string) (err error) {
return nil
}

func MetadataPush(ctx context.Context, key string, value string) (err error) {
const (
redisOp = "RPUSH"
)
func MetadataPush(ctx context.Context, value string, objs ...string) (err error) {
const redisOp = "RPUSH"
start := time.Now()
defer trace(ctx, "Create", redisOp, start, err)

defer func() {
trace(ctx, "Create", redisOp, start, err)
}()
key := rediskey.InstanceMeta(objs...)
_, err = wrapredis.GetRedis().Do(redisOp, key, value)
if err != nil {
log.G(ctx).Errorf("redis %s error, key: %s, err: %s", redisOp, key, err)
Expand All @@ -73,13 +73,13 @@ func MetadataPush(ctx context.Context, key string, value string) (err error) {
return nil
}

func MetadataLRem(ctx context.Context, key string, value string) (err error) {
const (
redisOp = "LREM"
)
func MetadataLRem(ctx context.Context, value string, objs ...string) (err error) {
const redisOp = "LREM"
start := time.Now()
defer trace(ctx, "Destroy", redisOp, start, err)

defer func() {
trace(ctx, "Destroy", redisOp, start, err)
}()
key := rediskey.InstanceMeta(objs...)
_, err = wrapredis.GetRedis().Do(redisOp, key, 0, value)
if err != nil {
log.G(ctx).Errorf("redis %s error, key: %s, err: %s", redisOp, key, err)
Expand All @@ -91,19 +91,25 @@ func MetadataLRem(ctx context.Context, key string, value string) (err error) {
return nil
}

func MetadataDel(ctx context.Context, key string) (err error) {
const (
redisOp = "DEL"
)
func MetadataDel(ctx context.Context, objs ...string) (err error) {
const redisOp = "DEL"
start := time.Now()
defer trace(ctx, "Destroy", redisOp, start, err)
_, err = wrapredis.GetRedis().Do(redisOp, key)
if err != nil {
log.G(ctx).Errorf("redis %s error, key: %s, err: %s", redisOp, key, err)
return err
}
if log.IsDebug() {
log.G(ctx).Debugf("redis.%s:%s", redisOp, key)
defer func() {
trace(ctx, "Destroy", redisOp, start, err)
}()
var firstErr error
for _, key := range rediskey.DeleteKeys(rediskey.InstanceMeta(objs...), rediskey.LegacyInstanceMeta(objs...)) {
if _, e := wrapredis.GetRedis().Do(redisOp, key); e != nil {
log.G(ctx).Errorf("redis %s error, key: %s, err: %s", redisOp, key, e)
if firstErr == nil {
firstErr = e
}
continue
}
if log.IsDebug() {
log.G(ctx).Debugf("redis.%s:%s", redisOp, key)
}
}
return nil
err = firstErr
return err
}
5 changes: 3 additions & 2 deletions CubeMaster/pkg/lifecycle/init.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,9 @@ import (
// traffic. Callers (main.go) should log a warning and proceed.
//
// We use the single shared wrapredis pool. The sidecar consumes lifecycle
// metadata and the bypass_host_proxy map from the same Redis instance, so
// any pool that can write proxy entries can also write lifecycle entries.
// metadata and the sandbox proxy map (cube:v1:shared:sandbox:proxy) from the
// same Redis instance, so any pool that can write proxy entries can also write
// lifecycle entries.
func Init(ctx context.Context) error {
pool := wrapredis.GetRedis()
if isNilPool(pool) {
Expand Down
23 changes: 14 additions & 9 deletions CubeMaster/pkg/lifecycle/schema.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,28 +7,33 @@
//
// CubeMaster is the single writer for the canonical view:
//
// - cube:sandbox:meta HSet, field=sandboxID, value=JSON snapshot.
// Sidecars HGETALL it on startup to bootstrap the registry.
// - cube:sandbox:events Stream, append-only event log of
// create/delete operations. Sidecars consume via XREADGROUP for
// - cube:v1:shared:sandbox:lifecycle:meta HSet, field=sandboxID,
// value=JSON snapshot. Sidecars HGETALL it on startup to bootstrap
// the registry.
// - cube:v1:shared:sandbox:lifecycle:events Stream, append-only event
// log of create/delete operations. Sidecars consume via XREADGROUP for
// incremental updates after the bootstrap.
//
// Updates (pause/resume action) intentionally do NOT publish to the stream:
// state transitions are driven and observed by the sidecar itself, so making
// CubeMaster also publish them would just be a redundant round trip.
package lifecycle

// Redis key / field constants. Keep them centralized so the sidecar (Go) and
// any other consumer can import the same source of truth.
const (
import "github.com/tencentcloud/CubeSandbox/CubeMaster/pkg/base/rediskey"

// Redis key constants. Keep them centralized so the sidecar (Go) and any
// other consumer can import the same source of truth.
var (
// MetaKey is the HSet snapshot of every live sandbox the sidecar should
// know about. Field = sandbox ID, value = JSON-encoded SandboxLifecycleMeta.
MetaKey = "cube:sandbox:meta"
MetaKey = rediskey.SandboxLifecycleMeta()

// EventStreamKey is the append-only stream of create/delete events. The
// sidecar maintains a consumer group on it; entries trim with MAXLEN ~.
EventStreamKey = "cube:sandbox:events"
EventStreamKey = rediskey.SandboxLifecycleEvents()
)

const (
// EventStreamMaxLen caps the stream so an offline sidecar cannot drive
// unbounded Redis growth. Sidecars also bootstrap from MetaKey, so any
// trimmed events are recovered on the next full sync.
Expand Down
Loading
Loading