Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -336,6 +336,82 @@ describe('Scale down runners', () => {
checkNonTerminated(runners);
});

it(`Should not terminate a runner that became busy between deregister and post-deregister check.`, async () => {
// setup: runner appears idle on first check, deregister succeeds,
// but post-deregister re-check finds it busy (race condition)
const runners = [
createRunnerTestData(
'race-condition-1',
type,
MINIMUM_TIME_RUNNING_IN_MINUTES + 1,
true,
false,
false,
),
];

mockGitHubRunners(runners);
mockAwsRunners(runners);

// First call returns not-busy (pre-deregister), second returns busy (post-deregister)
let callCount = 0;
const busyCheckMock = () => {
callCount++;
if (callCount <= 1) {
return { data: { busy: false } };
}
return { data: { busy: true } };
};
mockOctokit.actions.getSelfHostedRunnerForRepo.mockImplementation(busyCheckMock);
mockOctokit.actions.getSelfHostedRunnerForOrg.mockImplementation(busyCheckMock);

// act
await expect(scaleDown()).resolves.not.toThrow();

// assert: runner should NOT be terminated
checkTerminated(runners);
checkNonTerminated(runners);
});

it(`Should terminate a runner when post-deregister busy check returns 404.`, async () => {
// setup: after deregistration, GitHub API returns 404 (runner fully removed)
const runners = [
createRunnerTestData(
'deregistered-404',
type,
MINIMUM_TIME_RUNNING_IN_MINUTES + 1,
true,
false,
true,
),
];

mockGitHubRunners(runners);
mockAwsRunners(runners);

// First call returns not-busy, second throws 404
let callCount = 0;
const busyCheckMock = () => {
callCount++;
if (callCount <= 1) {
return { data: { busy: false } };
}
const error = new Error('Not Found');
(error as any).status = 404;
Object.setPrototypeOf(error, RequestError.prototype);
throw error;
};
mockOctokit.actions.getSelfHostedRunnerForRepo.mockImplementation(busyCheckMock);
mockOctokit.actions.getSelfHostedRunnerForOrg.mockImplementation(busyCheckMock);

// act
await expect(scaleDown()).resolves.not.toThrow();

// assert: runner should be terminated (404 = not busy)
checkTerminated(runners);
checkNonTerminated(runners);
});

it(`Should terminate orphan (Non JIT)`, async () => {
// setup
const orphanRunner = createRunnerTestData('orphan-1', type, MINIMUM_BOOT_TIME + 1, false, false, false);
Expand Down
74 changes: 49 additions & 25 deletions lambdas/functions/control-plane/src/scale-runners/scale-down.ts
Original file line number Diff line number Diff line change
Expand Up @@ -138,39 +138,63 @@ async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promi
return;
}

// Step 1: Check busy state as a fast-path to skip runners that are obviously busy.
const states = await Promise.all(
ghRunnerIds.map(async (ghRunnerId) => {
// Get busy state instead of using the output of listGitHubRunners(...) to minimize to race condition.
return await getGitHubRunnerBusyState(githubAppClient, ec2runner, ghRunnerId);
}),
);

if (states.every((busy) => busy === false)) {
const statuses = await Promise.all(
ghRunnerIds.map(async (ghRunnerId) => {
return (
ec2runner.type === 'Org'
? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({
runner_id: ghRunnerId,
org: ec2runner.owner,
})
: await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({
runner_id: ghRunnerId,
owner: ec2runner.owner.split('/')[0],
repo: ec2runner.owner.split('/')[1],
})
).status;
}),
);
if (!states.every((busy) => busy === false)) {
logger.info(`Runner '${ec2runner.instanceId}' cannot be de-registered, because it is still busy.`);
return;
}

if (statuses.every((status) => status == 204)) {
await terminateRunner(ec2runner.instanceId);
logger.info(`AWS runner instance '${ec2runner.instanceId}' is terminated and GitHub runner is de-registered.`);
} else {
logger.error(`Failed to de-register GitHub runner: ${statuses}`);
}
// Step 2: De-register the runner from GitHub. This prevents GitHub from assigning new jobs
// to this runner, closing the race window where a job could be assigned between the busy
// check above and the termination below.
const statuses = await Promise.all(
ghRunnerIds.map(async (ghRunnerId) => {
return (
ec2runner.type === 'Org'
? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({
runner_id: ghRunnerId,
org: ec2runner.owner,
})
: await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({
runner_id: ghRunnerId,
owner: ec2runner.owner.split('/')[0],
repo: ec2runner.owner.split('/')[1],
})
).status;
}),
);

if (!statuses.every((status) => status == 204)) {
logger.error(`Failed to de-register GitHub runner: ${statuses}`);
return;
}

// Step 3: Re-check busy state after de-registration. A job may have been assigned between
// step 1 and step 2. After de-registration no new jobs can be assigned, so this check is
// now stable. If the runner is busy, the in-flight job will complete using its job-scoped
// OAuth token (the runner worker uses credentials from the job message, not the runner
// registration). We leave the instance running and it will be cleaned up as an orphan.
const postDeregisterStates = await Promise.all(
ghRunnerIds.map(async (ghRunnerId) => {
return await getGitHubRunnerBusyState(githubAppClient, ec2runner, ghRunnerId);
}),
);

if (postDeregisterStates.every((busy) => busy === false)) {
await terminateRunner(ec2runner.instanceId);
logger.info(`AWS runner instance '${ec2runner.instanceId}' is terminated and GitHub runner is de-registered.`);
} else {
logger.info(`Runner '${ec2runner.instanceId}' cannot be de-registered, because it is still busy.`);
logger.warn(
`Runner '${ec2runner.instanceId}' became busy between idle check and de-registration. ` +
`Skipping termination to allow the in-flight job to complete. ` +
`The instance will be cleaned up as an orphan on a subsequent cycle.`,
);
}
} catch (e) {
logger.error(`Runner '${ec2runner.instanceId}' cannot be de-registered. Error: ${e}`, {
Expand Down
Loading