diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts index 2dfb190a38..0d0d0ac708 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.test.ts @@ -336,6 +336,82 @@ describe('Scale down runners', () => { checkNonTerminated(runners); }); + it(`Should not terminate a runner that became busy between deregister and post-deregister check.`, async () => { + // setup: runner appears idle on first check, deregister succeeds, + // but post-deregister re-check finds it busy (race condition) + const runners = [ + createRunnerTestData( + 'race-condition-1', + type, + MINIMUM_TIME_RUNNING_IN_MINUTES + 1, + true, + false, + false, + ), + ]; + + mockGitHubRunners(runners); + mockAwsRunners(runners); + + // First call returns not-busy (pre-deregister), second returns busy (post-deregister) + let callCount = 0; + const busyCheckMock = () => { + callCount++; + if (callCount <= 1) { + return { data: { busy: false } }; + } + return { data: { busy: true } }; + }; + mockOctokit.actions.getSelfHostedRunnerForRepo.mockImplementation(busyCheckMock); + mockOctokit.actions.getSelfHostedRunnerForOrg.mockImplementation(busyCheckMock); + + // act + await expect(scaleDown()).resolves.not.toThrow(); + + // assert: runner should NOT be terminated + checkTerminated(runners); + checkNonTerminated(runners); + }); + + it(`Should terminate a runner when post-deregister busy check returns 404.`, async () => { + // setup: after deregistration, GitHub API returns 404 (runner fully removed) + const runners = [ + createRunnerTestData( + 'deregistered-404', + type, + MINIMUM_TIME_RUNNING_IN_MINUTES + 1, + true, + false, + true, + ), + ]; + + mockGitHubRunners(runners); + mockAwsRunners(runners); + + // First call returns not-busy, second throws 404 + let callCount = 0; + const busyCheckMock = () => { + callCount++; + if (callCount <= 1) { + return { data: { busy: false } }; + } + const error = new Error('Not Found'); + (error as any).status = 404; + Object.setPrototypeOf(error, RequestError.prototype); + throw error; + }; + mockOctokit.actions.getSelfHostedRunnerForRepo.mockImplementation(busyCheckMock); + mockOctokit.actions.getSelfHostedRunnerForOrg.mockImplementation(busyCheckMock); + + // act + await expect(scaleDown()).resolves.not.toThrow(); + + // assert: runner should be terminated (404 = not busy) + checkTerminated(runners); + checkNonTerminated(runners); + }); + it(`Should terminate orphan (Non JIT)`, async () => { // setup const orphanRunner = createRunnerTestData('orphan-1', type, MINIMUM_BOOT_TIME + 1, false, false, false); diff --git a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts index 6086af7714..85e5ea8b4e 100644 --- a/lambdas/functions/control-plane/src/scale-runners/scale-down.ts +++ b/lambdas/functions/control-plane/src/scale-runners/scale-down.ts @@ -138,39 +138,63 @@ async function removeRunner(ec2runner: RunnerInfo, ghRunnerIds: number[]): Promi return; } + // Step 1: Check busy state as a fast-path to skip runners that are obviously busy. const states = await Promise.all( ghRunnerIds.map(async (ghRunnerId) => { - // Get busy state instead of using the output of listGitHubRunners(...) to minimize to race condition. return await getGitHubRunnerBusyState(githubAppClient, ec2runner, ghRunnerId); }), ); - if (states.every((busy) => busy === false)) { - const statuses = await Promise.all( - ghRunnerIds.map(async (ghRunnerId) => { - return ( - ec2runner.type === 'Org' - ? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({ - runner_id: ghRunnerId, - org: ec2runner.owner, - }) - : await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({ - runner_id: ghRunnerId, - owner: ec2runner.owner.split('/')[0], - repo: ec2runner.owner.split('/')[1], - }) - ).status; - }), - ); + if (!states.every((busy) => busy === false)) { + logger.info(`Runner '${ec2runner.instanceId}' cannot be de-registered, because it is still busy.`); + return; + } - if (statuses.every((status) => status == 204)) { - await terminateRunner(ec2runner.instanceId); - logger.info(`AWS runner instance '${ec2runner.instanceId}' is terminated and GitHub runner is de-registered.`); - } else { - logger.error(`Failed to de-register GitHub runner: ${statuses}`); - } + // Step 2: De-register the runner from GitHub. This prevents GitHub from assigning new jobs + // to this runner, closing the race window where a job could be assigned between the busy + // check above and the termination below. + const statuses = await Promise.all( + ghRunnerIds.map(async (ghRunnerId) => { + return ( + ec2runner.type === 'Org' + ? await githubAppClient.actions.deleteSelfHostedRunnerFromOrg({ + runner_id: ghRunnerId, + org: ec2runner.owner, + }) + : await githubAppClient.actions.deleteSelfHostedRunnerFromRepo({ + runner_id: ghRunnerId, + owner: ec2runner.owner.split('/')[0], + repo: ec2runner.owner.split('/')[1], + }) + ).status; + }), + ); + + if (!statuses.every((status) => status == 204)) { + logger.error(`Failed to de-register GitHub runner: ${statuses}`); + return; + } + + // Step 3: Re-check busy state after de-registration. A job may have been assigned between + // step 1 and step 2. After de-registration no new jobs can be assigned, so this check is + // now stable. If the runner is busy, the in-flight job will complete using its job-scoped + // OAuth token (the runner worker uses credentials from the job message, not the runner + // registration). We leave the instance running and it will be cleaned up as an orphan. + const postDeregisterStates = await Promise.all( + ghRunnerIds.map(async (ghRunnerId) => { + return await getGitHubRunnerBusyState(githubAppClient, ec2runner, ghRunnerId); + }), + ); + + if (postDeregisterStates.every((busy) => busy === false)) { + await terminateRunner(ec2runner.instanceId); + logger.info(`AWS runner instance '${ec2runner.instanceId}' is terminated and GitHub runner is de-registered.`); } else { - logger.info(`Runner '${ec2runner.instanceId}' cannot be de-registered, because it is still busy.`); + logger.warn( + `Runner '${ec2runner.instanceId}' became busy between idle check and de-registration. ` + + `Skipping termination to allow the in-flight job to complete. ` + + `The instance will be cleaned up as an orphan on a subsequent cycle.`, + ); } } catch (e) { logger.error(`Runner '${ec2runner.instanceId}' cannot be de-registered. Error: ${e}`, {