Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 134 additions & 0 deletions examples/wait_for_ready.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
"""Example: Using wait_for_ready for cold start handling.

This example demonstrates how to use the `wait_for_ready` parameter to
automatically handle model cold starts. When a model is not loaded (cold),
the SDK will trigger a warmup request and poll until the model becomes
healthy before making the actual request.

This is useful for:
- Serverless deployments where models may be unloaded after idle time
- First requests to a model that hasn't been used recently
- Ensuring reliable request completion without manual status checking

Usage:
export KAFEIDO_API_KEY="your-api-key"
python examples/wait_for_ready.py
"""

import asyncio

from kafeido import OpenAI, AsyncOpenAI, WarmupTimeoutError


def sync_example():
    """Demonstrate wait_for_ready with the blocking client."""
    print("=== Synchronous Example ===\n")

    client = OpenAI()

    # 1) Simplest form: block until the model is warm, then send the request.
    print("1. Basic chat completion with wait_for_ready:")
    try:
        completion = client.chat.completions.create(
            model="gpt-oss-20b",
            messages=[{"role": "user", "content": "Hello! What is 2+2?"}],
            wait_for_ready=True,  # Automatically handle cold start
        )
    except WarmupTimeoutError as e:
        print(f" Model {e.model} didn't warm up in {e.waited_seconds:.1f}s\n")
    else:
        print(f" Response: {completion.choices[0].message.content}\n")

    # 2) Override the default warmup deadline for slow-loading models.
    print("2. Chat completion with custom timeout (2 minutes):")
    try:
        completion = client.chat.completions.create(
            model="gpt-oss-20b",
            messages=[{"role": "user", "content": "Tell me a short joke."}],
            wait_for_ready=True,
            warmup_timeout=120.0,  # Wait up to 2 minutes
        )
    except WarmupTimeoutError as e:
        print(f" Model {e.model} didn't warm up in {e.waited_seconds:.1f}s\n")
    else:
        print(f" Response: {completion.choices[0].message.content}\n")

    # 3) wait_for_ready composes with streaming responses as well.
    print("3. Streaming with wait_for_ready:")
    try:
        stream = client.chat.completions.create(
            model="gpt-oss-20b",
            messages=[{"role": "user", "content": "Count from 1 to 5."}],
            stream=True,
            wait_for_ready=True,
        )
        print(" Response: ", end="")
        for chunk in stream:
            delta_text = chunk.choices[0].delta.content
            if delta_text:
                print(delta_text, end="", flush=True)
        print("\n")
    except WarmupTimeoutError as e:
        print(f" Model {e.model} didn't warm up in {e.waited_seconds:.1f}s\n")


async def async_example():
    """Demonstrate wait_for_ready with the async client."""
    print("=== Asynchronous Example ===\n")

    async with AsyncOpenAI() as client:
        # Same wait_for_ready flag, awaited instead of blocking.
        print("1. Async chat completion with wait_for_ready:")
        try:
            reply = await client.chat.completions.create(
                model="gpt-oss-20b",
                messages=[{"role": "user", "content": "What is the capital of France?"}],
                wait_for_ready=True,
            )
        except WarmupTimeoutError as e:
            print(f" Model {e.model} didn't warm up in {e.waited_seconds:.1f}s\n")
        else:
            print(f" Response: {reply.choices[0].message.content}\n")


def audio_example():
    """Show (as printed sample code) wait_for_ready with audio transcription."""
    print("=== Audio Transcription Example ===\n")

    # Client is constructed to mirror the other examples; the transcription
    # call itself is only printed because it needs a real audio file.
    client = OpenAI()  # noqa: F841

    print("Audio transcription with wait_for_ready:")
    print(" (Requires an audio file to run)")
    snippet = """
    # Example code:
    with open("audio.mp3", "rb") as f:
        transcript = client.audio.transcriptions.create(
            file=f,
            model="whisper-large-v3",
            wait_for_ready=True,  # Wait for whisper model to load
            warmup_timeout=180.0,  # ASR models may take longer
        )
        print(transcript.text)
    """
    print(snippet)


def main():
    """Run every example in sequence."""
    banner = "=" * 60
    print(banner)
    print("Kafeido SDK - wait_for_ready Example")
    print(banner)
    print()

    # Blocking client examples.
    sync_example()

    # Async client example, driven on a fresh event loop.
    asyncio.run(async_example())

    # Audio example only prints sample code (needs a real audio file to run).
    audio_example()

    print(banner)
    print("Done!")


if __name__ == "__main__":
    main()
2 changes: 2 additions & 0 deletions kafeido/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from kafeido.version import __version__
from kafeido.client import OpenAI
from kafeido._async_client import AsyncOpenAI
from kafeido._warmup import WarmupTimeoutError
from kafeido.types import (
# Errors
OpenAIError,
Expand Down Expand Up @@ -95,6 +96,7 @@
"UnprocessableEntityError",
"RateLimitError",
"InternalServerError",
"WarmupTimeoutError",
# Chat
"ChatCompletion",
"ChatCompletionChunk",
Expand Down
19 changes: 14 additions & 5 deletions kafeido/_async_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from kafeido._auth import get_api_key
from kafeido._http_client import AsyncHTTPClient
from kafeido._warmup import AsyncWarmupHelper
from kafeido.resources._async_chat import AsyncChat
from kafeido.resources._async_audio import AsyncAudio
from kafeido.resources._async_models import AsyncModels
Expand Down Expand Up @@ -76,13 +77,21 @@ def __init__(
max_retries=max_retries,
)

# Initialize resources
self._chat = AsyncChat(self._http_client)
self._audio = AsyncAudio(self._http_client)
# Initialize models resource first (needed for warmup helper)
self._models = AsyncModels(self._http_client)

# Initialize warmup helper for cold start handling
self._warmup_helper = AsyncWarmupHelper(
status_fn=self._models.status,
warmup_fn=lambda m: self._models.warmup(model=m),
)

# Initialize resources with warmup helper
self._chat = AsyncChat(self._http_client, self._warmup_helper)
self._audio = AsyncAudio(self._http_client, self._warmup_helper)
self._files = AsyncFiles(self._http_client)
self._ocr = AsyncOCR(self._http_client)
self._vision = AsyncVision(self._http_client)
self._ocr = AsyncOCR(self._http_client, self._warmup_helper)
self._vision = AsyncVision(self._http_client, self._warmup_helper)
self._jobs = AsyncJobs(self._http_client)

@property
Expand Down
181 changes: 181 additions & 0 deletions kafeido/_warmup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
"""Cold start waiting / warmup helpers.

This module provides helpers for handling model cold starts by automatically
triggering warmup and polling until the model is ready before making requests.
"""

from __future__ import annotations

import asyncio
import time
from typing import TYPE_CHECKING, Awaitable, Callable, Optional

if TYPE_CHECKING:
from kafeido.types.models import ModelStatus, WarmupResponse


# Default configuration shared by WarmupHelper and AsyncWarmupHelper.
DEFAULT_POLL_INTERVAL = 2.0  # seconds between status checks
DEFAULT_MAX_WAIT_TIME = 300.0  # 5 minutes max wait before WarmupTimeoutError
# Status string that marks a model as ready; compared against
# ModelStatus.status.status inside wait_for_ready.
HEALTHY_STATUS = "healthy"


class WarmupTimeoutError(Exception):
    """Raised when a model fails to become ready within the allowed wait time.

    Attributes:
        model: The model ID that timed out.
        waited_seconds: How long we waited before timing out.
    """

    def __init__(self, model: str, waited_seconds: float) -> None:
        message = (
            f"Model '{model}' did not become ready within {waited_seconds:.1f}s"
        )
        super().__init__(message)
        self.model = model
        self.waited_seconds = waited_seconds


class WarmupHelper:
    """Synchronous warmup helper for cold start waiting.

    This helper triggers model warmup and polls until the model becomes healthy.
    """

    def __init__(
        self,
        status_fn: Callable[[str], "ModelStatus"],
        warmup_fn: Callable[[str], "WarmupResponse"],
        poll_interval: float = DEFAULT_POLL_INTERVAL,
        max_wait_time: float = DEFAULT_MAX_WAIT_TIME,
    ) -> None:
        """Initialize warmup helper.

        Args:
            status_fn: Function to get model status (typically models.status).
            warmup_fn: Function to trigger warmup (typically models.warmup).
            poll_interval: Seconds between status checks.
            max_wait_time: Maximum seconds to wait before timeout.
        """
        self._status_fn = status_fn
        self._warmup_fn = warmup_fn
        self._poll_interval = poll_interval
        self._max_wait_time = max_wait_time

    def wait_for_ready(
        self, model: str, timeout: Optional[float] = None
    ) -> None:
        """Wait for model to be ready, triggering warmup if needed.

        This method will:
        1. Trigger a warmup request to start loading the model
        2. If model is already warm, return immediately
        3. Otherwise, poll the status endpoint until the model is healthy
        4. Raise WarmupTimeoutError if the model doesn't become ready in time

        Args:
            model: The model ID to wait for.
            timeout: Optional timeout override in seconds. If None, uses
                the default max_wait_time from initialization.

        Raises:
            WarmupTimeoutError: If model doesn't become ready within timeout.
        """
        max_wait = timeout if timeout is not None else self._max_wait_time

        # Trigger warmup so the backend starts loading the model; if it is
        # already warm there is nothing to poll for.
        warmup_response = self._warmup_fn(model)
        if warmup_response.already_warm:
            return

        # Use a monotonic clock so wall-clock adjustments can't corrupt the
        # elapsed-time measurement.
        start_time = time.monotonic()

        while True:
            elapsed = time.monotonic() - start_time
            if elapsed >= max_wait:
                raise WarmupTimeoutError(model, elapsed)

            status = self._status_fn(model)
            if status.status and status.status.status == HEALTHY_STATUS:
                return  # Model is ready

            # Bug fix: never sleep past the deadline. The previous
            # unconditional sleep(poll_interval) could overshoot a short
            # timeout by up to a full poll interval. The max(..., 0.0) clamp
            # also keeps time.sleep from receiving a negative value.
            remaining = max_wait - (time.monotonic() - start_time)
            time.sleep(min(self._poll_interval, max(remaining, 0.0)))


class AsyncWarmupHelper:
    """Asynchronous warmup helper for cold start waiting.

    This helper triggers model warmup and polls until the model becomes healthy,
    using async/await for non-blocking operation.
    """

    def __init__(
        self,
        status_fn: Callable[[str], Awaitable["ModelStatus"]],
        warmup_fn: Callable[[str], Awaitable["WarmupResponse"]],
        poll_interval: float = DEFAULT_POLL_INTERVAL,
        max_wait_time: float = DEFAULT_MAX_WAIT_TIME,
    ) -> None:
        """Initialize async warmup helper.

        Args:
            status_fn: Async function to get model status.
            warmup_fn: Async function to trigger warmup.
            poll_interval: Seconds between status checks.
            max_wait_time: Maximum seconds to wait before timeout.
        """
        self._status_fn = status_fn
        self._warmup_fn = warmup_fn
        self._poll_interval = poll_interval
        self._max_wait_time = max_wait_time

    async def wait_for_ready(
        self, model: str, timeout: Optional[float] = None
    ) -> None:
        """Wait for model to be ready asynchronously.

        This method will:
        1. Trigger a warmup request to start loading the model
        2. If model is already warm, return immediately
        3. Otherwise, poll the status endpoint until the model is healthy
        4. Raise WarmupTimeoutError if the model doesn't become ready in time

        Args:
            model: The model ID to wait for.
            timeout: Optional timeout override in seconds. If None, uses
                the default max_wait_time from initialization.

        Raises:
            WarmupTimeoutError: If model doesn't become ready within timeout.
        """
        max_wait = timeout if timeout is not None else self._max_wait_time

        # Trigger warmup so the backend starts loading the model; if it is
        # already warm there is nothing to poll for.
        warmup_response = await self._warmup_fn(model)
        if warmup_response.already_warm:
            return

        # Monotonic clock: immune to wall-clock adjustments.
        start_time = time.monotonic()

        while True:
            elapsed = time.monotonic() - start_time
            if elapsed >= max_wait:
                raise WarmupTimeoutError(model, elapsed)

            status = await self._status_fn(model)
            if status.status and status.status.status == HEALTHY_STATUS:
                return  # Model is ready

            # Bug fix: never sleep past the deadline. The previous
            # unconditional asyncio.sleep(poll_interval) could overshoot a
            # short timeout by up to a full poll interval; clamp at 0.0 so
            # the sleep duration is never negative.
            remaining = max_wait - (time.monotonic() - start_time)
            await asyncio.sleep(min(self._poll_interval, max(remaining, 0.0)))
Loading