-
Notifications
You must be signed in to change notification settings - Fork 6.8k
fix: avoid Groq token-limit 413 for small prompts #449
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -61,6 +61,10 @@ const GITHUB_429_MAX_RETRIES = 3 | |
| const GITHUB_429_BASE_DELAY_SEC = 1 | ||
| const GITHUB_429_MAX_DELAY_SEC = 32 | ||
| const GEMINI_API_HOST = 'generativelanguage.googleapis.com' | ||
// Total token budget assumed for one Groq request (prompt + completion).
// clampGroqMaxTokens subtracts the estimated prompt size and the safety margin
// from this value to decide how many completion tokens may be requested.
const GROQ_MAX_REQUEST_TOKENS = 12_000
// compactPayloadForGroq stops shrinking the request body as soon as the
// estimated prompt size is at or below this value.
const GROQ_TARGET_PROMPT_TOKENS = 9_000
// Extra tokens held back from the completion budget in clampGroqMaxTokens.
const GROQ_COMPLETION_TOKEN_SAFETY_MARGIN = 500
// The token estimate is the UTF-8 byte length of the JSON payload divided by
// this value, i.e. the estimate assumes roughly this many bytes per token.
const GROQ_TOKEN_ESTIMATE_DIVISOR = 2
|
|
||
| function isGithubModelsMode(): boolean { | ||
| return isEnvTruthy(process.env.CLAUDE_CODE_USE_GITHUB) | ||
|
|
@@ -76,6 +80,15 @@ function hasGeminiApiHost(baseUrl: string | undefined): boolean { | |
| } | ||
| } | ||
|
|
||
/**
 * Reports whether `baseUrl` points at `groq.com` or one of its subdomains.
 *
 * The decision is always made on the parsed hostname, never on a substring of
 * the raw input, so look-alike hosts such as `notgroq.com` or
 * `groq.com.evil.example` are rejected. Inputs without a `scheme://` prefix
 * (e.g. `api.groq.com/openai/v1` or `api.groq.com:443`) are parsed as if they
 * were `https://` URLs.
 *
 * @param baseUrl Base URL (or bare host) configured for the provider.
 * @returns `true` only when the hostname is `groq.com` or ends with `.groq.com`.
 */
function isGroqBaseUrl(baseUrl: string): boolean {
  const candidate = baseUrl.trim()
  if (candidate === '') return false

  // Without "scheme://", `new URL` either throws or mistakes "host:port" for a
  // scheme and yields an empty hostname, so supply a scheme ourselves.
  const hasScheme = /^[a-z][a-z0-9+.-]*:\/\//i.test(candidate)
  const parseable = hasScheme ? candidate : `https://${candidate}`

  try {
    const hostname = new URL(parseable).hostname.toLowerCase()
    return hostname === 'groq.com' || hostname.endsWith('.groq.com')
  } catch {
    // Unparseable input cannot be attributed to any host.
    return false
  }
}
|
|
||
| function formatRetryAfterHint(response: Response): string { | ||
| const ra = response.headers.get('retry-after') | ||
| return ra ? ` (Retry-After: ${ra})` : '' | ||
|
|
@@ -116,6 +129,96 @@ interface OpenAITool { | |
| } | ||
| } | ||
|
|
||
/** Shared encoder so repeated size estimates do not allocate a new one per call. */
const JSON_BYTE_ENCODER = new TextEncoder()

/**
 * Returns the UTF-8 byte length of `value` serialized with `JSON.stringify`.
 *
 * @param value Any value; it is serialized exactly as `JSON.stringify` would.
 * @returns Byte length of the serialized form, or 0 when `JSON.stringify`
 *   produces no output (e.g. for `undefined` or a function).
 * @throws Whatever `JSON.stringify` throws (e.g. on circular structures).
 */
function estimateJsonBytes(value: unknown): number {
  // At runtime JSON.stringify may return undefined even though its declared
  // return type is string.
  const serialized: string | undefined = JSON.stringify(value)
  if (serialized === undefined) return 0
  return JSON_BYTE_ENCODER.encode(serialized).length
}
|
|
||
/**
 * Rough token estimate for a payload: the serialized JSON byte size divided by
 * GROQ_TOKEN_ESTIMATE_DIVISOR, rounded up to a whole token.
 *
 * @param value Payload (or payload fragment) to size.
 * @returns Estimated token count as an integer.
 */
function estimateGroqPromptTokens(value: unknown): number {
  const serializedBytes = estimateJsonBytes(value)
  const estimatedTokens = serializedBytes / GROQ_TOKEN_ESTIMATE_DIVISOR
  return Math.ceil(estimatedTokens)
}
|
|
||
/**
 * Returns a deep copy of `value` with free-text `description` annotations
 * removed, to shrink tool definitions before sending them.
 *
 * Only string-valued `description` entries are dropped. A `description` key
 * whose value is an object is kept (and recursed into), because in a schema's
 * `properties` map that key is a parameter that happens to be named
 * "description"; removing it would change the tool's interface.
 *
 * @param value Tool list, schema fragment, or any JSON-like value.
 * @returns A new array/object tree for container inputs; primitives and
 *   `null`/`undefined` are returned unchanged. The input is not mutated.
 */
function stripToolSchemaDescriptions(value: unknown): unknown {
  if (Array.isArray(value)) {
    return value.map(item => stripToolSchemaDescriptions(item))
  }
  if (!value || typeof value !== 'object') return value

  const reduced: Record<string, unknown> = {}
  for (const [key, child] of Object.entries(value as Record<string, unknown>)) {
    // Drop annotation text only; a non-string `description` is real structure.
    if (key === 'description' && typeof child === 'string') continue
    reduced[key] = stripToolSchemaDescriptions(child)
  }

  return reduced
}
|
|
||
/**
 * Shrinks an OpenAI-style chat request body in place until its estimated
 * prompt size is at most GROQ_TARGET_PROMPT_TOKENS. Steps are tried in order
 * of increasing destructiveness and the function returns as soon as the
 * estimate fits:
 *
 * 1. strip `description` text from `body.tools`;
 * 2. drop the oldest non-system messages (the final message is always kept);
 * 3. remove `body.tools` and set `body.tool_choice` to `'none'`;
 * 4. keep only the most recent `user` message, or the last message when there
 *    is no user message.
 *
 * The body can still exceed the target after step 4; nothing is thrown.
 *
 * @param body Request payload. `tools`, `tool_choice` and `messages` may be
 *   replaced or removed; the original `messages` array itself is not mutated.
 */
function compactPayloadForGroq(body: Record<string, unknown>): void {
  type ChatMessage = { role?: string } & Record<string, unknown>

  let promptTokens = estimateGroqPromptTokens(body)
  if (promptTokens <= GROQ_TARGET_PROMPT_TOKENS) return

  // Step 1: tool descriptions are the cheapest thing to lose.
  if (Array.isArray(body.tools)) {
    body.tools = stripToolSchemaDescriptions(body.tools)
    promptTokens = estimateGroqPromptTokens(body)
  }
  if (promptTokens <= GROQ_TARGET_PROMPT_TOKENS) return

  // Step 2: trim history from the oldest end, preserving system messages and
  // the final message.
  if (Array.isArray(body.messages)) {
    const messages = [...(body.messages as ChatMessage[])]

    while (promptTokens > GROQ_TARGET_PROMPT_TOKENS) {
      const dropIndex = messages.findIndex(
        (message, index) =>
          message.role !== 'system' && index < messages.length - 1,
      )
      if (dropIndex === -1) break

      messages.splice(dropIndex, 1)
      // `tool` messages now sitting at the cut point were results for a turn
      // that no longer exists. OpenAI-style chat APIs are expected to reject
      // tool results without their originating tool_calls, so drop them with
      // the turn (the final message is still never removed).
      while (
        dropIndex < messages.length - 1 &&
        messages[dropIndex]?.role === 'tool'
      ) {
        messages.splice(dropIndex, 1)
      }

      body.messages = messages
      promptTokens = estimateGroqPromptTokens(body)
    }
  }
  if (promptTokens <= GROQ_TARGET_PROMPT_TOKENS) return

  // Step 3: give up on tool calling for this request.
  if (body.tools) {
    delete body.tools
    body.tool_choice = 'none'
    promptTokens = estimateGroqPromptTokens(body)
  }
  if (promptTokens <= GROQ_TARGET_PROMPT_TOKENS) return

  // Step 4: last resort, send only the latest user turn.
  if (Array.isArray(body.messages)) {
    const messages = body.messages as ChatMessage[]
    const lastUserMessage = [...messages]
      .reverse()
      .find(message => message.role === 'user')
    const survivor = lastUserMessage ?? messages[messages.length - 1]
    body.messages = survivor ? [survivor] : []
  }
}
|
|
||
/**
 * Caps the requested completion length so that estimated prompt tokens plus
 * completion tokens plus the safety margin stay within
 * GROQ_MAX_REQUEST_TOKENS.
 *
 * The requested length is taken from `max_tokens` if it is a number, otherwise
 * from `max_completion_tokens` if that is a number, otherwise 4000. The result
 * (never lowered below 1 by the budget) is written to `body.max_tokens`, and
 * `body.max_completion_tokens` is removed.
 *
 * @param body Request payload, mutated in place.
 */
function clampGroqMaxTokens(body: Record<string, unknown>): void {
  let requested = 4000
  if (typeof body.max_tokens === 'number') {
    requested = body.max_tokens
  } else if (typeof body.max_completion_tokens === 'number') {
    requested = body.max_completion_tokens
  }

  const promptEstimate = estimateGroqPromptTokens(body)
  const headroom =
    GROQ_MAX_REQUEST_TOKENS -
    promptEstimate -
    GROQ_COMPLETION_TOKEN_SAFETY_MARGIN
  // Even with no headroom left, ask for at least one token.
  const ceiling = Math.max(1, headroom)

  body.max_tokens = Math.min(requested, ceiling)
  delete body.max_completion_tokens
}
|
|
||
| function convertSystemPrompt( | ||
| system: unknown, | ||
| ): string { | ||
|
|
@@ -1090,6 +1193,12 @@ class OpenAIShimMessages { | |
| } | ||
| } | ||
|
|
||
| if (isGroqBaseUrl(request.baseUrl)) { | ||
| delete body.stream_options | ||
| compactPayloadForGroq(body) | ||
| clampGroqMaxTokens(body) | ||
| } | ||
|
|
||
| const headers: Record<string, string> = { | ||
| 'Content-Type': 'application/json', | ||
| ...this.defaultHeaders, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`estimateJsonBytes` allocates a new `TextEncoder` and stringifies the entire payload on every call, and `compactPayloadForGroq` calls this repeatedly (including inside a loop). For large payloads this can become a noticeable CPU/memory hotspot. Consider reusing a module-scoped `TextEncoder` and reducing full-body `JSON.stringify` calls (e.g., estimate only the prompt-bearing fields or cache the serialized form between compaction steps).