lightspeed-core
diff --git a/‎examples/lightspeed-stack-rlsapi-cla.yaml‎
Lines changed: 11 additions & 0 deletions b/‎examples/lightspeed-stack-rlsapi-cla.yaml‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎src/app/endpoints/rlsapi_v1.py‎
Lines changed: 65 additions & 6 deletions b/‎src/app/endpoints/rlsapi_v1.py‎
Lines changed: 65 additions & 6 deletions
diff --git a/‎src/configuration.py‎
Lines changed: 16 additions & 0 deletions b/‎src/configuration.py‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎src/models/config.py‎
Lines changed: 78 additions & 15 deletions b/‎src/models/config.py‎
Lines changed: 78 additions & 15 deletions
@@ -20,6 +20,17 @@ inference:
   default_provider: google-vertex
   default_model: gemini-2.5-flash
 
+# rlsapi v1 endpoint settings (CLA-specific)
+rlsapi_v1:
+  # Quota enforcement: which identity field to track token usage by.
+  # Options: "user_id", "org_id", "system_id"
+  # Requires quota_handlers to be configured. Omit to disable quotas.
+  # quota_subject: "org_id"
+
+  # Allow /v1/infer to return extended metadata when clients send
+  # "include_metadata": true. Should NOT be enabled in production.
+  # Set explicitly (to its default) so that this section is a mapping;
+  # a key whose children are all commented out is loaded as null.
+  allow_verbose_infer: false
+
 # Red Hat Identity authentication (typical for CLA deployments)
 authentication:
   module: "rh-identity"
 
@@ -40,9 +40,11 @@
 from observability import InferenceEventData, build_inference_event, send_splunk_event
 from utils.endpoints import check_configuration_loaded
 from utils.query import (
+    consume_query_tokens,
     extract_provider_and_model_from_model_id,
     handle_known_apistatus_errors,
 )
+from utils.quota import check_tokens_available
 from utils.responses import (
     build_turn_summary,
     extract_text_from_response_items,
@@ -456,12 +458,57 @@ def _is_verbose_enabled(infer_request: RlsapiV1InferRequest) -> bool:
         True if both server config and client request enable verbose mode.
     """
     return (
-        configuration.customization is not None
-        and configuration.customization.allow_verbose_infer
-        and infer_request.include_metadata
+        configuration.rlsapi_v1.allow_verbose_infer and infer_request.include_metadata
     )
 
 
+def _resolve_quota_subject(request: Request, auth: AuthTuple) -> str | None:
+    """Pick the identifier that /v1/infer token quota is tracked against.
+
+    The choice is driven by ``configuration.rlsapi_v1.quota_subject``:
+
+    - not set: quota enforcement is disabled and None is returned so the
+      caller skips quota checks entirely.
+    - "user_id": the first element of the auth tuple is returned.
+    - "org_id" / "system_id": the matching value from
+      ``_get_rh_identity_context`` is returned. If that value equals
+      ``AUTH_DISABLED`` a warning is logged and the user id from the auth
+      tuple is returned instead, so quota enforcement still applies.
+
+    Args:
+        request: The FastAPI request object (for accessing rh-identity state).
+        auth: Authentication tuple from the configured auth provider; its
+            first element is used as the user id.
+
+    Returns:
+        The resolved subject identifier string, or None if quota is disabled.
+    """
+    subject_kind = configuration.rlsapi_v1.quota_subject
+    if subject_kind is None:
+        return None
+
+    fallback_user_id = auth[0]
+    if subject_kind == "user_id":
+        return fallback_user_id
+
+    # Only the org/system subjects need the rh-identity request context.
+    org_id, system_id = _get_rh_identity_context(request)
+
+    if subject_kind == "org_id":
+        candidate = org_id
+        unavailable_message = (
+            "quota_subject is 'org_id' but rh-identity data is unavailable, "
+            "falling back to user_id"
+        )
+    else:
+        # The only remaining configured value is "system_id".
+        candidate = system_id
+        unavailable_message = (
+            "quota_subject is 'system_id' but rh-identity data is unavailable, "
+            "falling back to user_id"
+        )
+
+    if candidate != AUTH_DISABLED:
+        return candidate
+
+    logger.warning(unavailable_message)
+    return fallback_user_id
+
+
 def _build_infer_response(
     response_text: str,
     request_id: str,
@@ -594,10 +641,14 @@ async def infer_endpoint(  # pylint: disable=R0914
         HTTPException: 503 if the LLM service is unavailable.
     """
     # Authentication enforced by get_auth_dependency(), authorization by @authorize decorator.
-    _ = auth
-
     check_configuration_loaded(configuration)
 
+    # Quota enforcement: resolve subject and check availability before any work.
+    # No-op when quota_subject is not configured or no quota limiters exist.
+    quota_id = _resolve_quota_subject(request, auth)
+    if quota_id is not None:
+        check_tokens_available(configuration.quota_limiters, quota_id)
+
     request_id = get_suid()
 
     logger.info("Processing rlsapi v1 /infer request %s", request_id)
@@ -634,7 +685,7 @@ async def infer_endpoint(  # pylint: disable=R0914
             model_id=model_id,
         )
         response_text = extract_text_from_response_items(response.output)
-        extract_token_usage(response.usage, model_id)
+        token_usage = extract_token_usage(response.usage, model_id)
         inference_time = time.monotonic() - start_time
     except _INFER_HANDLED_EXCEPTIONS as error:
         if response is not None:
@@ -662,6 +713,14 @@ async def infer_endpoint(  # pylint: disable=R0914
         logger.warning("Empty response from LLM for request %s", request_id)
         response_text = constants.UNABLE_TO_PROCESS_RESPONSE
 
+    # Consume quota tokens after successful inference.
+    if quota_id is not None:
+        consume_query_tokens(
+            user_id=quota_id,
+            model_id=model_id,
+            token_usage=token_usage,
+        )
+
     _queue_splunk_event(
         background_tasks,
         infer_request,
 
@@ -27,6 +27,7 @@
     OkpConfiguration,
     QuotaHandlersConfiguration,
     RagConfiguration,
+    RlsapiV1Configuration,
     ServiceConfiguration,
     SplunkConfiguration,
     UserDataCollection,
@@ -286,6 +287,21 @@ def customization(self) -> Optional[Customization]:
             raise LogicError("logic error: configuration is not loaded")
         return self._configuration.customization
 
+    @property
+    def rlsapi_v1(self) -> RlsapiV1Configuration:
+        """Expose the ``rlsapi_v1`` section of the loaded configuration.
+
+        Returns:
+            RlsapiV1Configuration: Settings for the rlsapi v1 /infer
+            endpoint (CLA-specific).
+
+        Raises:
+            LogicError: If no configuration has been loaded yet.
+        """
+        loaded = self._configuration
+        if loaded is None:
+            raise LogicError("logic error: configuration is not loaded")
+        return loaded.rlsapi_v1
+
     @property
     def inference(self) -> InferenceConfiguration:
         """Return inference configuration.
 
@@ -1326,21 +1326,6 @@ class Customization(ConfigurationBase):
     agent_card_config: Optional[dict[str, Any]] = None
     custom_profile: Optional[CustomProfile] = Field(default=None, init=False)
 
-    # Debugging: Allow /v1/infer to return extended metadata
-    # WARNING: This should NOT be enabled in production environments.
-    # Setting this to True allows clients to request extended response data
-    # (tool_calls, rag_chunks, token_usage, etc.) from the /v1/infer endpoint
-    # by including "include_metadata": true in the request body.
-    #
-    # If this feature were wanted in production, consider RBAC-based access control instead:
-    # 1. Add Action.RLSAPI_V1_INFER_VERBOSE to models/config.py Action enum
-    # 2. Check authorization in infer_endpoint:
-    #    if infer_request.include_metadata:
-    #        if Action.RLSAPI_V1_INFER_VERBOSE not in request.state.authorized_actions:
-    #            raise HTTPException(status_code=403, detail="Verbose infer not authorized")
-    # 3. Add the action to authorization rules for specific users/roles
-    allow_verbose_infer: bool = False
-
     @model_validator(mode="after")
     def check_customization_model(self) -> Self:
         """
@@ -1381,6 +1366,35 @@ def check_customization_model(self) -> Self:
         return self
 
 
+class RlsapiV1Configuration(ConfigurationBase):
+    """Configuration for the rlsapi v1 /infer endpoint.
+
+    Settings specific to the RHEL Lightspeed Command Line Assistant (CLA)
+    stateless inference endpoint. Kept separate from shared configuration
+    sections so that CLA-specific options do not affect other endpoints.
+    """
+
+    # Server-side gate only: the endpoint returns extended metadata when this
+    # flag is true AND the request itself sets include_metadata.
+    allow_verbose_infer: bool = Field(
+        default=False,
+        title="Allow verbose infer",
+        description="Allow /v1/infer to return extended metadata "
+        "(tool_calls, rag_chunks, token_usage) when the client sends "
+        '"include_metadata": true. Should NOT be enabled in production. '
+        "If production use is needed, consider RBAC-based access control "
+        "via an Action.RLSAPI_V1_INFER_VERBOSE authorization rule.",
+    )
+
+    # None (the default) turns quota handling off for /v1/infer. The Literal
+    # limits the value to the three identity fields the endpoint can resolve.
+    quota_subject: Optional[Literal["user_id", "org_id", "system_id"]] = Field(
+        default=None,
+        title="Quota subject",
+        description="Identity field used as the quota subject for /v1/infer. "
+        "When set, token quota enforcement is enabled for this endpoint. "
+        "Requires quota_handlers to be configured. "
+        '"org_id" and "system_id" require rh-identity authentication; '
+        "falls back to user_id when rh-identity data is unavailable.",
+    )
+
+
 class InferenceConfiguration(ConfigurationBase):
     """Inference configuration."""
 
@@ -1911,6 +1925,13 @@ class Configuration(ConfigurationBase):
     )
     azure_entra_id: Optional[AzureEntraIdConfiguration] = None
 
+    # Not Optional: when the section is omitted, default_factory builds a
+    # RlsapiV1Configuration with its defaults (verbose off, no quota subject).
+    rlsapi_v1: RlsapiV1Configuration = Field(
+        default_factory=RlsapiV1Configuration,
+        title="rlsapi v1 configuration",
+        description="Configuration for the rlsapi v1 /infer endpoint used by "
+        "the RHEL Lightspeed Command Line Assistant (CLA).",
+    )
+
     splunk: Optional[SplunkConfiguration] = Field(
         default=None,
         title="Splunk configuration",
@@ -1993,6 +2014,48 @@ def validate_mcp_auth_headers(self) -> Self:
         self.mcp_servers = valid_mcp_servers
         return self
 
+    @model_validator(mode="after")
+    def validate_rlsapi_v1_quota_configuration(self) -> Self:
+        """Cross-check rlsapi_v1.quota_subject with auth and quota handler settings.
+
+        Nothing is checked when quota_subject is unset. Otherwise:
+
+        - "org_id" and "system_id" are rejected unless authentication.module
+          is the rh-identity module, since those subjects need rh-identity data.
+        - A warning is logged when no quota limiters are configured, because
+          quota enforcement would then be a silent no-op.
+
+        Returns:
+            Self: The validated configuration instance.
+
+        Raises:
+            ValueError: If quota_subject is "org_id" or "system_id" while a
+                different authentication module is configured.
+        """
+        subject = self.rlsapi_v1.quota_subject  # pylint: disable=no-member
+        if subject is not None:
+            active_auth = self.authentication.module  # pylint: disable=no-member
+
+            if subject in ("org_id", "system_id"):
+                rh_identity = constants.AUTH_MOD_RH_IDENTITY
+                if active_auth != rh_identity:
+                    raise ValueError(
+                        f"rlsapi_v1.quota_subject='{subject}' requires "
+                        f"authentication.module='{rh_identity}', "
+                        f"but got '{active_auth}'. Use quota_subject='user_id' or "
+                        f"switch authentication to '{rh_identity}'."
+                    )
+
+            # A subject without limiters is allowed, but worth flagging.
+            if not self.quota_handlers.limiters:  # pylint: disable=no-member
+                logger.warning(
+                    "rlsapi_v1.quota_subject is '%s' but no quota limiters are "
+                    "configured in quota_handlers.limiters. Token quota enforcement "
+                    "will not take effect until limiters are added.",
+                    subject,
+                )
+
+        return self
+
     def dump(self, filename: str | Path = "configuration.json") -> None:
         """
         Write the current Configuration model to a JSON file.