roboflow · hansent · Feb 27, 2026 · Feb 27, 2026 · Feb 27, 2026
@@ -669,6 +669,19 @@
     os.getenv("PRELOAD_MODELS").split(",") if os.getenv("PRELOAD_MODELS") else None
 )
 
+# API key used exclusively for model preloading. Use this instead of API_KEY on
+# user-facing deployments where setting API_KEY globally would affect per-request
+# auth, billing attribution, and model-access fallback behaviour.
+# Falls back to API_KEY if unset or empty.
+PRELOAD_API_KEY = os.getenv("PRELOAD_API_KEY") or API_KEY
+
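+# Models to load at startup and pin so that they are never evicted from the model cache.
+# Unlike PRELOAD_MODELS, this bypasses the LAMBDA/GCP_SERVERLESS gate (PRELOAD_API_KEY is still required).
+# Comma-separated list of model IDs; entries are not stripped, so avoid spaces around commas.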
+PINNED_MODELS = (
+    os.getenv("PINNED_MODELS").split(",") if os.getenv("PINNED_MODELS") else None
+)
+
 LOAD_ENTERPRISE_BLOCKS = str2bool(os.getenv("LOAD_ENTERPRISE_BLOCKS", "False"))
 TRANSIENT_ROBOFLOW_API_ERRORS = set(
     int(e)

@@ -131,7 +131,6 @@
 from inference.core.env import (
     ALLOW_ORIGINS,
     API_BASE_URL,
-    API_KEY,
     API_LOGGING_ENABLED,
     BUILDER_ORIGIN,
     CONFIDENCE_LOWER_BOUND_OOM_PREVENTION,
@@ -169,6 +168,8 @@
     NOTEBOOK_ENABLED,
     NOTEBOOK_PASSWORD,
     NOTEBOOK_PORT,
+    PINNED_MODELS,
+    PRELOAD_API_KEY,
     PRELOAD_MODELS,
     PROFILE,
     ROBOFLOW_INTERNAL_SERVICE_NAME,
@@ -1781,9 +1782,9 @@ async def consume(
 
         # Enable preloading models at startup
         if (
-            (PRELOAD_MODELS or DEDICATED_DEPLOYMENT_WORKSPACE_URL)
-            and API_KEY
-            and not (LAMBDA or GCP_SERVERLESS)
+            (PRELOAD_MODELS or PINNED_MODELS or DEDICATED_DEPLOYMENT_WORKSPACE_URL)
+            and PRELOAD_API_KEY
+            and (PINNED_MODELS or not (LAMBDA or GCP_SERVERLESS))
         ):
 
             class ModelInitState:
@@ -1798,32 +1799,46 @@ def __init__(self):
 
             def initialize_models(state: ModelInitState):
                 """Perform asynchronous initialization tasks to load models."""
-                # Limit the number of concurrent tasks to prevent resource exhaustion
 
                 def load_model(model_id):
-                    logger.debug(f"load_model({model_id}) - starting")
+                    t_start = time.perf_counter()
+                    de_aliased = resolve_roboflow_model_alias(model_id=model_id)
+                    logger.info(
+                        f"Preload: starting model load for '{model_id}' (resolved: '{de_aliased}')"
+                    )
                     try:
-                        # TODO: how to add timeout here? Probably best to timeout model loading?
-                        model_add(
-                            AddModelRequest(
-                                model_id=model_id,
-                                model_type=None,
-                                api_key=API_KEY,
-                            )
+                        self.model_manager.add_model(
+                            de_aliased,
+                            PRELOAD_API_KEY,
+                        )
+                        load_time = time.perf_counter() - t_start
+                        logger.info(
+                            f"Preload: model '{model_id}' loaded successfully in {load_time:.1f}s"
                         )
-                        logger.info(f"Model {model_id} loaded successfully.")
                     except Exception as e:
-                        error_msg = f"Error loading model {model_id}: {e}"
+                        load_time = time.perf_counter() - t_start
+                        error_msg = f"Preload: error loading model '{model_id}' after {load_time:.1f}s: {e}"
                         logger.error(error_msg)
                         with state.lock:
                             state.initialization_errors.append((model_id, str(e)))
-                    logger.debug(f"load_model({model_id}) - finished")
+                        return
 
-                if PRELOAD_MODELS:
+                    # Pin if this model is in PINNED_MODELS
+                    if (
+                        PINNED_MODELS
+                        and model_id in PINNED_MODELS
+                        and hasattr(self.model_manager, "pin_model")
+                    ):
+                        self.model_manager.pin_model(de_aliased)
+
+                all_models = list(
+                    dict.fromkeys((PRELOAD_MODELS or []) + (PINNED_MODELS or []))
+                )
+                if all_models:
                     # Create tasks for each model to be loaded
                     model_loading_executor = ThreadPoolExecutor(max_workers=2)
                     loaded_futures: List[Tuple[str, Future]] = []
-                    for model_id in PRELOAD_MODELS:
+                    for model_id in all_models:
                         future = model_loading_executor.submit(
                             load_model, model_id=model_id
                         )

@@ -38,6 +38,16 @@ def __init__(self, model_manager: ModelManager, max_size: int = 8):
         self.max_size = max_size
         self._key_queue = deque(self.model_manager.keys())
         self._queue_lock = Lock()
+        self._pinned_models: set = set()
+
+    def pin_model(self, model_id: str) -> None:
+        """Mark a model as pinned so it won't be evicted by the LRU cache.
+
+        Pinned models (typically preloaded models) are skipped by eviction when the cache is
+        full or under memory pressure. Only records the ID, which must match its `_key_queue` key.
+        """
+        self._pinned_models.add(model_id)
+        logger.debug(f"Model '{model_id}' pinned — will not be evicted from cache.")
 
     def add_model(
         self,
@@ -89,7 +99,9 @@ def add_model(
                 len(self) >= self.max_size
                 or (MEMORY_FREE_THRESHOLD and self.memory_pressure_detected())
             ):
-                # To prevent flapping around the threshold, remove 3 models to make some space.
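+                # To prevent flapping around the threshold, pop up to 3 of the oldest entries and evict the unpinned ones. NOTE(review): if all 3 are pinned nothing is evicted, even if unpinned models sit further back in the queue.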
+                evicted_count = 0
+                skipped_pinned = []
                 for _ in range(3):
                     if not self._key_queue:
                         logger.error(
@@ -101,10 +113,23 @@ def add_model(
                         )
                         break
                     to_remove_model_id = self._key_queue.popleft()
+                    if to_remove_model_id in self._pinned_models:
+                        skipped_pinned.append(to_remove_model_id)
+                        continue
                     super().remove(
                         to_remove_model_id, delete_from_disk=DISK_CACHE_CLEANUP
                     )  # LRU model overflow cleanup may or maynot need the weights removed from disk
                     logger.debug(f"Model {to_remove_model_id} successfully unloaded.")
+                    evicted_count += 1
+                # Put the skipped pinned models back at the front of the queue, preserving their order.
+                for mid in reversed(skipped_pinned):
+                    self._key_queue.appendleft(mid)
+                if evicted_count == 0:
+                    logger.warning(
+                        "Cannot free model cache space — all remaining models are pinned (preloaded). "
+                        "Proceeding with cache exceeding max_size."
+                    )
+                    break
                 gc.collect()
             logger.debug(f"Marking new model {queue_id} as most recently used.")
             self._key_queue.append(queue_id)