Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 134 additions & 0 deletions examples/wait_for_ready.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
"""Example: Using wait_for_ready for cold start handling.

This example demonstrates how to use the `wait_for_ready` parameter to
automatically handle model cold starts. When a model is not loaded (cold),
the SDK will trigger a warmup request and poll until the model becomes
healthy before making the actual request.

This is useful for:
- Serverless deployments where models may be unloaded after idle time
- First requests to a model that hasn't been used recently
- Ensuring reliable request completion without manual status checking

Usage:
export KAFEIDO_API_KEY="your-api-key"
python examples/wait_for_ready.py
"""

import asyncio

from kafeido import OpenAI, AsyncOpenAI, WarmupTimeoutError


def sync_example():
    """Demonstrate wait_for_ready with the blocking client."""
    print("=== Synchronous Example ===\n")

    client = OpenAI()

    # 1) Simplest form: block until the model is warm, then send the request.
    print("1. Basic chat completion with wait_for_ready:")
    try:
        completion = client.chat.completions.create(
            model="gpt-oss-20b",
            messages=[{"role": "user", "content": "Hello! What is 2+2?"}],
            wait_for_ready=True,  # Automatically handle cold start
        )
    except WarmupTimeoutError as e:
        print(f" Model {e.model} didn't warm up in {e.waited_seconds:.1f}s\n")
    else:
        print(f" Response: {completion.choices[0].message.content}\n")

    # 2) Override the default warmup deadline for slow-loading models.
    print("2. Chat completion with custom timeout (2 minutes):")
    try:
        completion = client.chat.completions.create(
            model="gpt-oss-20b",
            messages=[{"role": "user", "content": "Tell me a short joke."}],
            wait_for_ready=True,
            warmup_timeout=120.0,  # Wait up to 2 minutes
        )
    except WarmupTimeoutError as e:
        print(f" Model {e.model} didn't warm up in {e.waited_seconds:.1f}s\n")
    else:
        print(f" Response: {completion.choices[0].message.content}\n")

    # 3) wait_for_ready composes with streaming responses as well.
    print("3. Streaming with wait_for_ready:")
    try:
        stream = client.chat.completions.create(
            model="gpt-oss-20b",
            messages=[{"role": "user", "content": "Count from 1 to 5."}],
            stream=True,
            wait_for_ready=True,
        )
        print(" Response: ", end="")
        for chunk in stream:
            delta_text = chunk.choices[0].delta.content
            if delta_text:
                print(delta_text, end="", flush=True)
        print("\n")
    except WarmupTimeoutError as e:
        print(f" Model {e.model} didn't warm up in {e.waited_seconds:.1f}s\n")


async def async_example():
    """Demonstrate wait_for_ready with the async client."""
    print("=== Asynchronous Example ===\n")

    async with AsyncOpenAI() as client:
        # Same wait_for_ready flag, awaited instead of blocking.
        print("1. Async chat completion with wait_for_ready:")
        try:
            reply = await client.chat.completions.create(
                model="gpt-oss-20b",
                messages=[{"role": "user", "content": "What is the capital of France?"}],
                wait_for_ready=True,
            )
        except WarmupTimeoutError as e:
            print(f" Model {e.model} didn't warm up in {e.waited_seconds:.1f}s\n")
        else:
            print(f" Response: {reply.choices[0].message.content}\n")


def audio_example():
    """Show (as printed sample code) wait_for_ready with audio transcription."""
    print("=== Audio Transcription Example ===\n")

    # Client is constructed to mirror the other examples; the transcription
    # call itself is only printed because it needs a real audio file.
    client = OpenAI()  # noqa: F841

    print("Audio transcription with wait_for_ready:")
    print(" (Requires an audio file to run)")
    snippet = """
    # Example code:
    with open("audio.mp3", "rb") as f:
        transcript = client.audio.transcriptions.create(
            file=f,
            model="whisper-large-v3",
            wait_for_ready=True,  # Wait for whisper model to load
            warmup_timeout=180.0,  # ASR models may take longer
        )
        print(transcript.text)
    """
    print(snippet)


def main():
    """Run every example in sequence."""
    banner = "=" * 60
    print(banner)
    print("Kafeido SDK - wait_for_ready Example")
    print(banner)
    print()

    # Blocking client examples.
    sync_example()

    # Async client example, driven on a fresh event loop.
    asyncio.run(async_example())

    # Audio example only prints sample code (needs a real audio file to run).
    audio_example()

    print(banner)
    print("Done!")


if __name__ == "__main__":
    main()
2 changes: 2 additions & 0 deletions kafeido/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from kafeido.version import __version__
from kafeido.client import OpenAI
from kafeido._async_client import AsyncOpenAI
from kafeido._warmup import WarmupTimeoutError
from kafeido.types import (
# Errors
OpenAIError,
Expand Down Expand Up @@ -95,6 +96,7 @@
"UnprocessableEntityError",
"RateLimitError",
"InternalServerError",
"WarmupTimeoutError",
# Chat
"ChatCompletion",
"ChatCompletionChunk",
Expand Down
19 changes: 14 additions & 5 deletions kafeido/_async_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from kafeido._auth import get_api_key
from kafeido._http_client import AsyncHTTPClient
from kafeido._warmup import AsyncWarmupHelper
from kafeido.resources._async_chat import AsyncChat
from kafeido.resources._async_audio import AsyncAudio
from kafeido.resources._async_models import AsyncModels
Expand Down Expand Up @@ -76,13 +77,21 @@ def __init__(
max_retries=max_retries,
)

# Initialize resources
self._chat = AsyncChat(self._http_client)
self._audio = AsyncAudio(self._http_client)
# Initialize models resource first (needed for warmup helper)
self._models = AsyncModels(self._http_client)

# Initialize warmup helper for cold start handling
self._warmup_helper = AsyncWarmupHelper(
status_fn=self._models.status,
warmup_fn=lambda m: self._models.warmup(model=m),
)

# Initialize resources with warmup helper
self._chat = AsyncChat(self._http_client, self._warmup_helper)
self._audio = AsyncAudio(self._http_client, self._warmup_helper)
self._files = AsyncFiles(self._http_client)
self._ocr = AsyncOCR(self._http_client)
self._vision = AsyncVision(self._http_client)
self._ocr = AsyncOCR(self._http_client, self._warmup_helper)
self._vision = AsyncVision(self._http_client, self._warmup_helper)
self._jobs = AsyncJobs(self._http_client)

@property
Expand Down
181 changes: 181 additions & 0 deletions kafeido/_warmup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
"""Cold start waiting / warmup helpers.

This module provides helpers for handling model cold starts by automatically
triggering warmup and polling until the model is ready before making requests.
"""

from __future__ import annotations

import asyncio
import time
from typing import TYPE_CHECKING, Awaitable, Callable, Optional

if TYPE_CHECKING:
from kafeido.types.models import ModelStatus, WarmupResponse


# Default configuration shared by WarmupHelper and AsyncWarmupHelper.
DEFAULT_POLL_INTERVAL = 2.0  # seconds between status checks
DEFAULT_MAX_WAIT_TIME = 300.0  # 5 minutes max wait before WarmupTimeoutError
# Status string that marks a model as ready; compared against
# ModelStatus.status.status inside wait_for_ready.
HEALTHY_STATUS = "healthy"


class WarmupTimeoutError(Exception):
    """Raised when a model fails to become ready within the allowed wait time.

    Attributes:
        model: The model ID that timed out.
        waited_seconds: How long we waited before timing out.
    """

    def __init__(self, model: str, waited_seconds: float) -> None:
        message = (
            f"Model '{model}' did not become ready within {waited_seconds:.1f}s"
        )
        super().__init__(message)
        self.model = model
        self.waited_seconds = waited_seconds


class WarmupHelper:
    """Synchronous warmup helper for cold start waiting.

    This helper triggers model warmup and polls until the model becomes healthy.
    """

    def __init__(
        self,
        status_fn: Callable[[str], "ModelStatus"],
        warmup_fn: Callable[[str], "WarmupResponse"],
        poll_interval: float = DEFAULT_POLL_INTERVAL,
        max_wait_time: float = DEFAULT_MAX_WAIT_TIME,
    ) -> None:
        """Initialize warmup helper.

        Args:
            status_fn: Function to get model status (typically models.status).
            warmup_fn: Function to trigger warmup (typically models.warmup).
            poll_interval: Seconds between status checks.
            max_wait_time: Maximum seconds to wait before timeout.
        """
        self._status_fn = status_fn
        self._warmup_fn = warmup_fn
        self._poll_interval = poll_interval
        self._max_wait_time = max_wait_time

    def wait_for_ready(
        self, model: str, timeout: Optional[float] = None
    ) -> None:
        """Wait for model to be ready, triggering warmup if needed.

        This method will:
        1. Trigger a warmup request to start loading the model
        2. If model is already warm, return immediately
        3. Otherwise, poll the status endpoint until the model is healthy
        4. Raise WarmupTimeoutError if the model doesn't become ready in time

        Args:
            model: The model ID to wait for.
            timeout: Optional timeout override in seconds. If None, uses
                the default max_wait_time from initialization.

        Raises:
            WarmupTimeoutError: If model doesn't become ready within timeout.
        """
        max_wait = timeout if timeout is not None else self._max_wait_time

        # Trigger warmup so the backend starts loading the model; if it is
        # already warm there is nothing to poll for.
        warmup_response = self._warmup_fn(model)
        if warmup_response.already_warm:
            return

        # Use a monotonic clock so wall-clock adjustments can't corrupt the
        # elapsed-time measurement.
        start_time = time.monotonic()

        while True:
            elapsed = time.monotonic() - start_time
            if elapsed >= max_wait:
                raise WarmupTimeoutError(model, elapsed)

            status = self._status_fn(model)
            if status.status and status.status.status == HEALTHY_STATUS:
                return  # Model is ready

            # Bug fix: never sleep past the deadline. The previous
            # unconditional sleep(poll_interval) could overshoot a short
            # timeout by up to a full poll interval. The max(..., 0.0) clamp
            # also keeps time.sleep from receiving a negative value.
            remaining = max_wait - (time.monotonic() - start_time)
            time.sleep(min(self._poll_interval, max(remaining, 0.0)))


class AsyncWarmupHelper:
    """Asynchronous warmup helper for cold start waiting.

    This helper triggers model warmup and polls until the model becomes healthy,
    using async/await for non-blocking operation.
    """

    def __init__(
        self,
        status_fn: Callable[[str], Awaitable["ModelStatus"]],
        warmup_fn: Callable[[str], Awaitable["WarmupResponse"]],
        poll_interval: float = DEFAULT_POLL_INTERVAL,
        max_wait_time: float = DEFAULT_MAX_WAIT_TIME,
    ) -> None:
        """Initialize async warmup helper.

        Args:
            status_fn: Async function to get model status.
            warmup_fn: Async function to trigger warmup.
            poll_interval: Seconds between status checks.
            max_wait_time: Maximum seconds to wait before timeout.
        """
        self._status_fn = status_fn
        self._warmup_fn = warmup_fn
        self._poll_interval = poll_interval
        self._max_wait_time = max_wait_time

    async def wait_for_ready(
        self, model: str, timeout: Optional[float] = None
    ) -> None:
        """Wait for model to be ready asynchronously.

        This method will:
        1. Trigger a warmup request to start loading the model
        2. If model is already warm, return immediately
        3. Otherwise, poll the status endpoint until the model is healthy
        4. Raise WarmupTimeoutError if the model doesn't become ready in time

        Args:
            model: The model ID to wait for.
            timeout: Optional timeout override in seconds. If None, uses
                the default max_wait_time from initialization.

        Raises:
            WarmupTimeoutError: If model doesn't become ready within timeout.
        """
        max_wait = timeout if timeout is not None else self._max_wait_time

        # Trigger warmup so the backend starts loading the model; if it is
        # already warm there is nothing to poll for.
        warmup_response = await self._warmup_fn(model)
        if warmup_response.already_warm:
            return

        # Monotonic clock: immune to wall-clock adjustments.
        start_time = time.monotonic()

        while True:
            elapsed = time.monotonic() - start_time
            if elapsed >= max_wait:
                raise WarmupTimeoutError(model, elapsed)

            status = await self._status_fn(model)
            if status.status and status.status.status == HEALTHY_STATUS:
                return  # Model is ready

            # Bug fix: never sleep past the deadline. The previous
            # unconditional asyncio.sleep(poll_interval) could overshoot a
            # short timeout by up to a full poll interval; clamp at 0.0 so
            # the sleep duration is never negative.
            remaining = max_wait - (time.monotonic() - start_time)
            await asyncio.sleep(min(self._poll_interval, max(remaining, 0.0)))
Loading