Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/gateway/index.ts
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
export { ensureGateway, findExistingGatewayProcess } from './process';
export { ensureGateway, findExistingGatewayProcess, killGateway } from './process';
export { waitForProcess } from './utils';
13 changes: 13 additions & 0 deletions src/gateway/process.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,19 @@ describe('findExistingGatewayProcess', () => {
expect(result).toBe(gatewayProcess);
});

// The startup script may be launched as `bash <absolute path>`; the process
// listed with that command line must still be recognised as the gateway.
it('matches bash-invoked startup script with full path', async () => {
  const bashLaunchedGateway = createFullMockProcess({
    id: 'gateway-1',
    command: 'bash /usr/local/bin/start-openclaw.sh',
    status: 'running',
  });
  const { sandbox: mockSandbox, listProcessesMock: listMock } = createMockSandbox();
  listMock.mockResolvedValue([bashLaunchedGateway]);

  const found = await findExistingGatewayProcess(mockSandbox);
  expect(found).toBe(bashLaunchedGateway);
});

it('matches legacy clawdbot gateway command (transition compat)', async () => {
const gatewayProcess = createFullMockProcess({
id: 'gateway-1',
Expand Down
48 changes: 48 additions & 0 deletions src/gateway/process.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,53 @@ import type { OpenClawEnv } from '../types';
import { GATEWAY_PORT, STARTUP_TIMEOUT_MS } from '../config';
import { buildEnvVars } from './env';

/**
 * Forcefully stop the gateway and remove the lock files it leaves behind.
 *
 * start-openclaw.sh execs into "openclaw", which forks "openclaw-gateway".
 * Process.kill() only terminates the tracked PID, so the forked child can
 * keep holding port 18789. To make sure nothing survives, several kill
 * strategies are issued in one shell invocation, followed by a kill through
 * the Process API, lock-file removal, and a fixed two-second pause.
 *
 * Every step except the process lookup is best-effort: failures of the shell
 * commands or of Process.kill() are ignored.
 *
 * @param sandbox - Sandbox in which the gateway is running.
 * @returns Resolves once all steps have been attempted and the pause elapsed.
 */
export async function killGateway(sandbox: Sandbox): Promise<void> {
  // Runs a single step and discards any error it raises.
  const bestEffort = async (step: () => unknown): Promise<void> => {
    try {
      await step();
    } catch {
      // Process may not exist, may already be dead, or tools may be unavailable
    }
  };

  const killCommands = [
    // Strategy 1: pgrep by exact name (most precise)
    'kill -9 $(pgrep -x "openclaw-gateway" 2>/dev/null) $(pgrep -x "openclaw" 2>/dev/null) 2>/dev/null',
    // Strategy 2: pkill by pattern (broader match)
    'pkill -9 -f "openclaw" 2>/dev/null',
    // Strategy 3: ss to find PID by port (most reliable but needs ss)
    `kill -9 $(ss -tlnp sport = :${GATEWAY_PORT} 2>/dev/null | grep -oP "pid=\\K[0-9]+") 2>/dev/null`,
    // Trailing no-op so the combined command line always exits successfully
    'true',
  ];
  await bestEffort(() => sandbox.exec(killCommands.join('; ')));

  // Also kill via the Process API, in case the tracked process is still listed
  const trackedGateway = await findExistingGatewayProcess(sandbox);
  if (trackedGateway) {
    await bestEffort(() => trackedGateway.kill());
  }

  // Remove lock files that would otherwise prevent a restart
  await bestEffort(() =>
    sandbox.exec(
      'rm -f /tmp/openclaw-gateway.lock /root/.openclaw/gateway.lock /home/openclaw/.openclaw/gateway.lock 2>/dev/null; true',
    ),
  );

  // Give the killed processes time to fully exit
  await new Promise((resolve) => setTimeout(resolve, 2000));
}

/**
* Check if the gateway port is already listening via a TCP probe.
* Used as a safety net when listProcesses() fails to detect the gateway.
Expand All @@ -26,6 +73,7 @@ export async function findExistingGatewayProcess(sandbox: Sandbox): Promise<Proc
// Don't match CLI commands like "openclaw devices list"
const isGatewayProcess =
proc.command.includes('start-openclaw.sh') ||
proc.command.includes('/usr/local/bin/start-openclaw.sh') ||
proc.command.includes('openclaw gateway') ||
// Legacy: match old startup script during transition
proc.command.includes('start-moltbot.sh') ||
Expand Down
92 changes: 77 additions & 15 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ import { getSandbox, Sandbox, type SandboxOptions } from '@cloudflare/sandbox';
import type { AppEnv, OpenClawEnv } from './types';
import { GATEWAY_PORT } from './config';
import { createAccessMiddleware } from './auth';
import { ensureGateway, findExistingGatewayProcess } from './gateway';
import { ensureGateway, findExistingGatewayProcess, killGateway } from './gateway';
import { publicRoutes, api, adminUi, debug, cdp } from './routes';
import { redactSensitiveParams } from './utils/logging';
import { restoreIfNeeded, createSnapshot } from './persistence';
Expand All @@ -48,6 +48,18 @@ function transformErrorMessage(message: string, host: string): string {
return message;
}

/**
 * Tell whether a thrown value signals that the gateway process has crashed.
 *
 * The Sandbox SDK throws an error whose message contains "is not listening"
 * when containerFetch/wsConnect is called but the target process is no
 * longer listening.
 *
 * @param error - Value caught from a containerFetch/wsConnect call.
 * @returns `true` only for `Error` instances whose message contains
 *   "is not listening"; `false` for everything else, including non-Error values.
 */
function isGatewayCrashedError(error: unknown): boolean {
  return error instanceof Error && error.message.includes('is not listening');
}

// killGateway is imported from './gateway' (shared with restart handler)

export { Sandbox };

/**
Expand Down Expand Up @@ -133,7 +145,12 @@ app.use('*', async (c, next) => {
await next();
});

// Middleware: Initialize sandbox and restore backup if available
// Middleware: Initialize sandbox stub and restore backup if available.
// Note: we intentionally do NOT call sandbox.start() here. The Sandbox SDK's
// containerFetch() auto-starts the container when needed, and the catch-all
// proxy route uses ensureGateway() which handles startup explicitly.
// Adding start() here would add an unnecessary RPC call on every request,
// including static assets and health checks that don't need the container.
app.use('*', async (c, next) => {
const options = buildSandboxOptions(c.env);
const sandbox = getSandbox(c.env.Sandbox, 'openclaw', options);
Expand Down Expand Up @@ -241,20 +258,25 @@ app.all('*', async (c) => {

console.log('[PROXY] Handling request:', url.pathname);

// Restore from backup before starting the gateway.
// This is only called here (catch-all) and from /api/status — NOT from admin
// routes like sync or debug/cli, because the SDK resets the FUSE overlay on
// createBackup, wiping upper-layer writes.
try {
await restoreIfNeeded(sandbox, c.env.BACKUP_BUCKET);
} catch (err) {
console.error('[PROXY] Backup restore failed:', err);
}

// Check if gateway is already running
const existingProcess = await findExistingGatewayProcess(sandbox);
const isGatewayReady = existingProcess !== null && existingProcess.status === 'running';

// Only restore from backup when the gateway needs to start.
// Restoring on every request (including WebSocket reconnects) would mount a
// FUSE overlay that interferes with createBackup — the SDK resets the overlay
// on backup, wiping upper-layer writes.
if (!isGatewayReady) {
try {
await Promise.race([
restoreIfNeeded(sandbox, c.env.BACKUP_BUCKET),
new Promise((_, reject) => setTimeout(() => reject(new Error('Restore timeout')), 15_000)),
]);
} catch (err) {
console.error('[PROXY] Backup restore failed/timeout:', err);
}
}

// For browser requests (non-WebSocket, non-API), show loading page if gateway isn't ready
const isWebSocketRequest = request.headers.get('Upgrade')?.toLowerCase() === 'websocket';
const acceptsHtml = request.headers.get('Accept')?.includes('text/html');
Expand Down Expand Up @@ -317,8 +339,26 @@ app.all('*', async (c) => {
wsRequest = new Request(tokenUrl.toString(), request);
}

// Get WebSocket connection to the container
const containerResponse = await sandbox.wsConnect(wsRequest, GATEWAY_PORT);
// Get WebSocket connection to the container (with retry on crash)
let containerResponse: Response;
try {
containerResponse = await sandbox.wsConnect(wsRequest, GATEWAY_PORT);
} catch (err) {
if (isGatewayCrashedError(err)) {
console.log('[WS] Gateway crashed, attempting restart and retry...');
await killGateway(sandbox);
await ensureGateway(sandbox, c.env);
try {
containerResponse = await sandbox.wsConnect(wsRequest, GATEWAY_PORT);
} catch (retryErr) {
console.error('[WS] Retry after restart also failed:', retryErr);
return new Response('Gateway crashed and recovery failed', { status: 503 });
}
} else {
console.error('[WS] WebSocket proxy error:', err);
return new Response('WebSocket proxy error', { status: 502 });
}
}
console.log('[WS] wsConnect response status:', containerResponse.status);

// Get the container-side WebSocket
Expand Down Expand Up @@ -447,7 +487,29 @@ app.all('*', async (c) => {
}

console.log('[HTTP] Proxying:', url.pathname + url.search);
const httpResponse = await sandbox.containerFetch(request, GATEWAY_PORT);

let httpResponse: Response;
try {
httpResponse = await sandbox.containerFetch(request, GATEWAY_PORT);
} catch (err) {
if (isGatewayCrashedError(err)) {
console.log('[HTTP] Gateway crashed, attempting restart and retry...');
await killGateway(sandbox);
await ensureGateway(sandbox, c.env);
try {
httpResponse = await sandbox.containerFetch(request, GATEWAY_PORT);
} catch (retryErr) {
console.error('[HTTP] Retry after restart also failed:', retryErr);
return c.json({ error: 'Gateway crashed and recovery failed' }, 503);
}
} else {
console.error('[HTTP] Proxy error:', err);
return c.json(
{ error: 'Proxy error', message: err instanceof Error ? err.message : String(err) },
502,
);
}
}
console.log('[HTTP] Response status:', httpResponse.status);

// Add debug header to verify worker handled the request
Expand Down
58 changes: 4 additions & 54 deletions src/routes/api.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { Hono } from 'hono';
import type { AppEnv } from '../types';
import { createAccessMiddleware } from '../auth';
import { ensureGateway, findExistingGatewayProcess, waitForProcess } from '../gateway';
import { ensureGateway, findExistingGatewayProcess, killGateway, waitForProcess } from '../gateway';
import { createSnapshot, getLastBackupId, clearPersistenceCache } from '../persistence';

// CLI commands can take 10-15 seconds to complete due to WebSocket connection overhead
Expand Down Expand Up @@ -245,60 +245,10 @@ adminApi.post('/gateway/restart', async (c) => {
const sandbox = c.get('sandbox');

try {
// Find and kill the existing gateway process
// Kill the gateway process (shared logic with crash retry)
const existingProcess = await findExistingGatewayProcess(sandbox);

// Kill via the Process API first
if (existingProcess) {
console.log('[Restart] Killing via Process API:', existingProcess.id);
try {
await existingProcess.kill();
} catch {
// Ignore
}
}

// Force kill the gateway via exec — more reliable than Process.kill()
// because start-openclaw.sh execs into "openclaw" which forks
// "openclaw-gateway". Process.kill() only kills the tracked shell PID,
// but the forked child keeps port 18789. (Credit: dalexeenko #261)
//
// Use multiple strategies since we don't know what tools are available.
try {
const killResult = await sandbox.exec(
[
// Strategy 1: pgrep by exact name (most precise)
'kill -9 $(pgrep -x "openclaw-gateway" 2>/dev/null) $(pgrep -x "openclaw" 2>/dev/null) 2>/dev/null',
// Strategy 2: pkill by pattern (broader match)
'pkill -9 -f "openclaw" 2>/dev/null',
// Strategy 3: find by port (most reliable but needs ss/fuser)
'kill -9 $(ss -tlnp sport = :18789 2>/dev/null | grep -oP "pid=\\K[0-9]+") 2>/dev/null',
// Always succeed
'true',
].join('; '),
);
console.log('[Restart] Kill result:', killResult.stdout?.trim(), killResult.stderr?.trim());
} catch (e) {
console.error('[Restart] Kill exec failed:', e);
}

// Clean up lock files that prevent restart
try {
await sandbox.exec(
'rm -f /tmp/openclaw-gateway.lock /root/.openclaw/gateway.lock /home/openclaw/.openclaw/gateway.lock 2>/dev/null; true',
);
} catch {
// Ignore
}

// Wait for process to fully die and verify
await new Promise((r) => setTimeout(r, 3000));
try {
const check = await sandbox.exec('ps aux | grep -v grep | grep openclaw || echo "ALL DEAD"');
console.log('[Restart] Surviving processes:', check.stdout?.trim());
} catch {
// Ignore
}
console.log('[Restart] Killing gateway, existing process:', existingProcess?.id ?? 'none');
await killGateway(sandbox);

// Clear the restore flag so the next request re-restores from R2.
// We intentionally do NOT start the gateway here — the next incoming
Expand Down
23 changes: 16 additions & 7 deletions src/routes/public.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,19 +35,28 @@ publicRoutes.get('/logo-small.png', (c) => {
publicRoutes.get('/api/status', async (c) => {
const sandbox = c.get('sandbox');

// Restore from backup before checking/starting the gateway
try {
await restoreIfNeeded(sandbox, c.env.BACKUP_BUCKET);
} catch (err) {
console.error('[api/status] Backup restore failed:', err);
}

try {
let process = await findExistingGatewayProcess(sandbox);
console.log('[api/status] existing process:', process?.id ?? 'none', process?.status ?? '');
if (!process) {
// Restore from backup only when the gateway needs to start.
// Restoring while the gateway is running would mount a FUSE overlay
// that interferes with createBackup.
try {
await Promise.race([
restoreIfNeeded(sandbox, c.env.BACKUP_BUCKET),
new Promise((_, reject) =>
setTimeout(() => reject(new Error('Restore timeout')), 15_000),
),
]);
} catch (err) {
console.error('[api/status] Backup restore failed/timeout:', err);
}

// No gateway process found — start it with a short timeout.
// The loading page polls /api/status every few seconds, so even
// if this attempt times out, subsequent polls will retry.
console.log('[api/status] No process found, starting gateway...');
const QUICK_START_TIMEOUT = 30_000;
try {
await Promise.race([
Expand Down
20 changes: 20 additions & 0 deletions test/e2e/_setup.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,28 @@ navigate to main page and wait for worker to be ready
===
# Read the gateway token and worker URL from the fixture directory.
TOKEN=$(cat "$CCTR_FIXTURE_DIR/gateway-token.txt")
WORKER_URL=$(cat "$CCTR_FIXTURE_DIR/worker-url.txt")

# Start a background debug loop that polls /api/status and logs each response
# to stderr. This helps diagnose why the gateway sometimes fails to start in CI.
# The loop makes at most 60 attempts with an 8-second sleep between them and
# stops early as soon as a response reports ok=true.
(
for i in $(seq 1 60); do
# A failed curl (10-second max time) is logged as a synthetic JSON error
# instead of aborting the loop.
RESP=$(./curl-auth -s -m 10 "$WORKER_URL/api/status" 2>/dev/null || echo '{"error":"curl failed"}')
echo "[debug-poll $i] $(date +%H:%M:%S) $RESP" >&2
# A missing "ok" field falls back to false; jq errors are silenced, leaving
# OK empty, which also fails the comparison below.
OK=$(echo "$RESP" | jq -r '.ok // false' 2>/dev/null)
if [ "$OK" = "true" ]; then
echo "[debug-poll] Gateway is up!" >&2
break
fi
sleep 8
done
) &
# Remember the background subshell's PID so it can be stopped afterwards.
DEBUG_PID=$!

# Open the main page with the gateway token passed as a query parameter.
plwr -S moltworker-e2e open "$WORKER_URL/?token=$TOKEN" -T 480000
# Wait for "Pairing required" — this ensures the WS connection has been
# attempted and the gateway has registered a pending pairing request.
plwr -S moltworker-e2e wait 'text=Pairing required' -T 480000

# Clean up debug loop; errors are ignored because it may already have exited.
kill $DEBUG_PID 2>/dev/null || true
---
Loading
Loading