Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions src/services/api/errors.ts
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,24 @@ export function getAssistantMessageFromError(
// Check for request too large errors (413 status)
// This typically happens when a large PDF + conversation context exceeds the 32MB API limit
if (error instanceof APIError && error.status === 413) {
const lowerMessage = error.message.toLowerCase()
const looksLikeTokenRateLimit =
lowerMessage.includes('tokens per minute') ||
lowerMessage.includes('"code":"rate_limit_exceeded"') ||
lowerMessage.includes('"type":"tokens"')

if (looksLikeTokenRateLimit) {
const stripped = error.message.replace(/^413\s+/, '')
const innerMessage = stripped.match(/"message"\s*:\s*"([^"]*)"/)?.[1]
const detail = innerMessage || stripped

return createAssistantAPIErrorMessage({
content: `${API_ERROR_MESSAGE_PREFIX}: Token limit exceeded for this provider request · ${detail}. Try /compact, reduce enabled tools, or use a provider/model with higher limits.`,
error: 'rate_limit',
errorDetails: error.message,
})
}

return createAssistantAPIErrorMessage({
content: getRequestTooLargeErrorMessage(),
error: 'invalid_request',
Expand Down
101 changes: 101 additions & 0 deletions src/services/api/openaiShim.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,107 @@ test('preserves usage from final OpenAI stream chunk with empty choices', async
expect(usageEvent?.usage?.output_tokens).toBe(45)
})

test('compacts Groq payloads by token budget', async () => {
  let capturedBody: Record<string, unknown> | undefined

  process.env.OPENAI_BASE_URL = 'https://api.groq.com/openai/v1'

  // Stub fetch: capture the outgoing request body and answer with a minimal
  // successful non-streaming chat completion.
  globalThis.fetch = (async (_input, init) => {
    capturedBody = JSON.parse(String(init?.body))

    const responsePayload = {
      id: 'chatcmpl-1',
      model: 'llama-3.1',
      choices: [
        {
          message: { role: 'assistant', content: 'done' },
          finish_reason: 'stop',
        },
      ],
    }

    return new Response(JSON.stringify(responsePayload), {
      headers: { 'Content-Type': 'application/json' },
    })
  }) as FetchType

  const client = createOpenAIShimClient({}) as OpenAIShimClient

  // A ~300KB tool description blows past the Groq prompt budget, so the shim
  // must strip it before sending the request.
  await client.beta.messages.create({
    model: 'llama-3.1',
    system: 'test',
    messages: [{ role: 'user', content: 'oi' }],
    tools: [
      {
        name: 'LargeTokenTool',
        description: 'd'.repeat(300 * 1024),
        input_schema: { type: 'object', properties: {} },
      },
    ],
    max_tokens: 256,
    stream: false,
  })

  const firstTool = (
    capturedBody?.tools as Array<{ function?: { description?: string } }> | undefined
  )?.[0]
  expect(firstTool?.function?.description).toBeUndefined()
  expect(capturedBody).not.toHaveProperty('stream_options')
  expect(capturedBody).toHaveProperty('max_tokens')
})

test('caps Groq max_tokens when prompt is near token budget', async () => {
  let capturedBody: Record<string, unknown> | undefined

  process.env.OPENAI_BASE_URL = 'https://api.groq.com/openai/v1'

  // Stub fetch: record the request body and reply with a minimal completed
  // chat response.
  globalThis.fetch = (async (_input, init) => {
    capturedBody = JSON.parse(String(init?.body))

    const responsePayload = {
      id: 'chatcmpl-1',
      model: 'llama-3.1',
      choices: [
        {
          message: { role: 'assistant', content: 'done' },
          finish_reason: 'stop',
        },
      ],
    }

    return new Response(JSON.stringify(responsePayload), {
      headers: { 'Content-Type': 'application/json' },
    })
  }) as FetchType

  const client = createOpenAIShimClient({}) as OpenAIShimClient

  // ~17KB of prompt content eats most of the Groq request budget, so the
  // requested 4000 completion tokens must be reduced by the shim.
  await client.beta.messages.create({
    model: 'llama-3.1',
    system: 'test',
    messages: [{ role: 'user', content: 'x'.repeat(17 * 1024) }],
    max_tokens: 4000,
    stream: false,
  })

  const cappedMaxTokens = capturedBody?.max_tokens
  expect(typeof cappedMaxTokens).toBe('number')
  expect(cappedMaxTokens as number).toBeLessThan(4000)
  expect(cappedMaxTokens as number).toBeGreaterThanOrEqual(1)
})

test('preserves Gemini tool call extra_content in follow-up requests', async () => {
let requestBody: Record<string, unknown> | undefined

Expand Down
109 changes: 109 additions & 0 deletions src/services/api/openaiShim.ts
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ const GITHUB_429_MAX_RETRIES = 3
const GITHUB_429_BASE_DELAY_SEC = 1
const GITHUB_429_MAX_DELAY_SEC = 32
const GEMINI_API_HOST = 'generativelanguage.googleapis.com'
// Heuristic budgets for compacting Groq requests (consumed by
// compactPayloadForGroq / clampGroqMaxTokens below).
const GROQ_MAX_REQUEST_TOKENS = 12_000
// Target estimated prompt size after compaction, leaving completion headroom.
const GROQ_TARGET_PROMPT_TOKENS = 9_000
// Subtracted from the remaining budget when clamping max_tokens, to absorb
// estimation error.
const GROQ_COMPLETION_TOKEN_SAFETY_MARGIN = 500
// Rough bytes-per-token divisor: estimated tokens = JSON byte length / 2.
const GROQ_TOKEN_ESTIMATE_DIVISOR = 2

function isGithubModelsMode(): boolean {
return isEnvTruthy(process.env.CLAUDE_CODE_USE_GITHUB)
Expand All @@ -76,6 +80,15 @@ function hasGeminiApiHost(baseUrl: string | undefined): boolean {
}
}

/**
 * Returns true when the configured base URL points at Groq: either the host
 * is groq.com / a subdomain of it, or — when the value is not a parseable
 * URL — the string loosely contains "groq.com".
 */
function isGroqBaseUrl(baseUrl: string): boolean {
  let host: string
  try {
    host = new URL(baseUrl).hostname
  } catch {
    // Not a well-formed URL; fall back to a case-insensitive substring match.
    return baseUrl.toLowerCase().includes('groq.com')
  }
  const lowered = host.toLowerCase()
  return lowered === 'groq.com' || lowered.endsWith('.groq.com')
}

function formatRetryAfterHint(response: Response): string {
const ra = response.headers.get('retry-after')
return ra ? ` (Retry-After: ${ra})` : ''
Expand Down Expand Up @@ -116,6 +129,96 @@ interface OpenAITool {
}
}

/** Size of a value when serialized: byte length of its UTF-8 encoded JSON. */
function estimateJsonBytes(value: unknown): number {
  const serialized = JSON.stringify(value)
  return new TextEncoder().encode(serialized).length
}

function estimateGroqPromptTokens(value: unknown): number {
Comment on lines +132 to +136
Copy link

Copilot AI Apr 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

estimateJsonBytes allocates a new TextEncoder and stringifies the entire payload on every call, and compactPayloadForGroq calls this repeatedly (including inside a loop). For large payloads this can become a noticeable CPU/memory hotspot. Consider reusing a module-scoped TextEncoder and reducing full-body JSON.stringify calls (e.g., estimate only the prompt-bearing fields or cache the serialized form between compaction steps).

Copilot uses AI. Check for mistakes.
return Math.ceil(estimateJsonBytes(value) / GROQ_TOKEN_ESTIMATE_DIVISOR)
}

/**
 * Recursively removes every `description` property from a JSON-ish value.
 * Arrays and nested objects are walked; primitives (and null) pass through
 * unchanged. Returns a new structure — the input is never mutated.
 */
function stripToolSchemaDescriptions(value: unknown): unknown {
  if (Array.isArray(value)) {
    return value.map(item => stripToolSchemaDescriptions(item))
  }
  if (value === null || typeof value !== 'object') {
    return value
  }

  const result: Record<string, unknown> = {}
  for (const [key, child] of Object.entries(value as Record<string, unknown>)) {
    if (key === 'description') continue
    result[key] = stripToolSchemaDescriptions(child)
  }
  return result
}

function compactPayloadForGroq(body: Record<string, unknown>): void {
let promptTokens = estimateGroqPromptTokens(body)
if (promptTokens <= GROQ_TARGET_PROMPT_TOKENS) return

if (Array.isArray(body.tools)) {
body.tools = stripToolSchemaDescriptions(body.tools) as typeof body.tools
promptTokens = estimateGroqPromptTokens(body)
}
if (promptTokens <= GROQ_TARGET_PROMPT_TOKENS) return

if (Array.isArray(body.messages)) {
const messages = [
...(body.messages as Array<{ role?: string } & Record<string, unknown>>),
]

while (promptTokens > GROQ_TARGET_PROMPT_TOKENS) {
const firstNonSystemIndex = messages.findIndex(
(message, index) =>
message.role !== 'system' && index < messages.length - 1,
)
if (firstNonSystemIndex === -1) break

messages.splice(firstNonSystemIndex, 1)
body.messages = messages
Comment on lines +171 to +179
Copy link

Copilot AI Apr 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The message-trimming loop removes the first non-system message without accounting for tool-call/message pairing (assistant tool_calls ↔ subsequent role:'tool' messages). This can leave orphaned tool results or tool calls in body.messages, which OpenAI-compatible APIs typically reject (400) and would negate the intended 413 mitigation. Consider trimming whole “turn” segments and preserving tool_call/tool_result adjacency (e.g., when removing an assistant message with tool_calls, also remove the following tool messages for those ids; or only remove complete user+assistant(+tool) groups from the front).

Copilot uses AI. Check for mistakes.
promptTokens = estimateGroqPromptTokens(body)
}
}
if (promptTokens <= GROQ_TARGET_PROMPT_TOKENS) return

if (body.tools) {
delete body.tools
body.tool_choice = 'none'
promptTokens = estimateGroqPromptTokens(body)
}
if (promptTokens <= GROQ_TARGET_PROMPT_TOKENS) return

if (Array.isArray(body.messages)) {
const messages = body.messages as Array<
{ role?: string } & Record<string, unknown>
>
const lastUserMessage = [...messages]
.reverse()
.find(message => message.role === 'user')
const lastMessage = lastUserMessage ?? messages[messages.length - 1]
body.messages = lastMessage ? [lastMessage] : []
}
}

function clampGroqMaxTokens(body: Record<string, unknown>): void {
const currentMaxTokens =
typeof body.max_tokens === 'number'
? body.max_tokens
: typeof body.max_completion_tokens === 'number'
? body.max_completion_tokens
: 4000

const estimatedPromptTokens = estimateGroqPromptTokens(body)
const availableCompletionTokens =
GROQ_MAX_REQUEST_TOKENS -
estimatedPromptTokens -
GROQ_COMPLETION_TOKEN_SAFETY_MARGIN

body.max_tokens = Math.min(currentMaxTokens, Math.max(1, availableCompletionTokens))
delete body.max_completion_tokens
}

function convertSystemPrompt(
system: unknown,
): string {
Expand Down Expand Up @@ -1090,6 +1193,12 @@ class OpenAIShimMessages {
}
}

if (isGroqBaseUrl(request.baseUrl)) {
delete body.stream_options
compactPayloadForGroq(body)
clampGroqMaxTokens(body)
}

const headers: Record<string, string> = {
'Content-Type': 'application/json',
...this.defaultHeaders,
Expand Down
Loading