Skip to content

Commit 3c2c22a

Browse files
committed
feat: add rlsapi_v1 config section with quota enforcement
Move allow_verbose_infer from the shared Customization config into a new dedicated rlsapi_v1 config section so CLA-specific settings are consolidated in one place and don't clutter shared configuration. Add configurable quota enforcement for /v1/infer via quota_subject, which selects the identity field (user_id, org_id, or system_id) used as the quota subject. Disabled by default (quota_subject: null). Startup validation rejects org_id/system_id quota_subject when the authentication module is not rh-identity, and warns when quota_subject is set but no quota limiters are configured. Falls back to user_id at runtime if a specific request lacks rh-identity data. No changes to the core quota system (utils/quota.py, quota limiters, factory, or scheduler). All existing endpoints are unaffected. Signed-off-by: Major Hayden <major@redhat.com>
1 parent 9626f91 commit 3c2c22a

File tree

7 files changed

+594
-40
lines changed

7 files changed

+594
-40
lines changed

examples/lightspeed-stack-rlsapi-cla.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,17 @@ inference:
2020
default_provider: google-vertex
2121
default_model: gemini-2.5-flash
2222

23+
# rlsapi v1 endpoint settings (CLA-specific)
24+
rlsapi_v1:
25+
# Quota enforcement: which identity field to track token usage by.
26+
# Options: "user_id", "org_id", "system_id"
27+
# Requires quota_handlers to be configured. Omit to disable quotas.
28+
# quota_subject: "org_id"
29+
30+
# Allow /v1/infer to return extended metadata when clients send
31+
# "include_metadata": true. Should NOT be enabled in production.
32+
# allow_verbose_infer: false
33+
2334
# Red Hat Identity authentication (typical for CLA deployments)
2435
authentication:
2536
module: "rh-identity"

src/app/endpoints/rlsapi_v1.py

Lines changed: 65 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,11 @@
4040
from observability import InferenceEventData, build_inference_event, send_splunk_event
4141
from utils.endpoints import check_configuration_loaded
4242
from utils.query import (
43+
consume_query_tokens,
4344
extract_provider_and_model_from_model_id,
4445
handle_known_apistatus_errors,
4546
)
47+
from utils.quota import check_tokens_available
4648
from utils.responses import (
4749
build_turn_summary,
4850
extract_text_from_response_items,
@@ -456,12 +458,57 @@ def _is_verbose_enabled(infer_request: RlsapiV1InferRequest) -> bool:
456458
True if both server config and client request enable verbose mode.
457459
"""
458460
return (
459-
configuration.customization is not None
460-
and configuration.customization.allow_verbose_infer
461-
and infer_request.include_metadata
461+
configuration.rlsapi_v1.allow_verbose_infer and infer_request.include_metadata
462462
)
463463

464464

465+
def _resolve_quota_subject(request: Request, auth: AuthTuple) -> str | None:
    """Pick the identifier that /v1/infer quota usage is tracked against.

    The choice is driven by ``configuration.rlsapi_v1.quota_subject``:

    * ``None``: quota enforcement is off; ``None`` is returned so the
      caller can skip quota checks entirely.
    * ``"user_id"``: the first element of the auth tuple is used.
    * ``"org_id"`` / ``"system_id"``: the matching value obtained from
      ``_get_rh_identity_context(request)`` is used. If that value equals
      ``AUTH_DISABLED`` a warning is logged and the user id from the auth
      tuple is used instead, so quota enforcement still applies.

    Args:
        request: The FastAPI request object, passed to
            ``_get_rh_identity_context`` to obtain org and system ids.
        auth: Authentication tuple from the configured auth provider; its
            first element is treated as the user id.

    Returns:
        The resolved subject identifier, or None if quota is disabled.
    """
    subject_kind = configuration.rlsapi_v1.quota_subject
    if subject_kind is None:
        return None

    fallback_user_id = auth[0]
    if subject_kind == "user_id":
        return fallback_user_id

    org_id, system_id = _get_rh_identity_context(request)

    if subject_kind == "org_id":
        if org_id != AUTH_DISABLED:
            return org_id
        logger.warning(
            "quota_subject is 'org_id' but rh-identity data is unavailable, "
            "falling back to user_id"
        )
        return fallback_user_id

    # Only "system_id" remains among the allowed quota_subject values.
    if system_id != AUTH_DISABLED:
        return system_id
    logger.warning(
        "quota_subject is 'system_id' but rh-identity data is unavailable, "
        "falling back to user_id"
    )
    return fallback_user_id
510+
511+
465512
def _build_infer_response(
466513
response_text: str,
467514
request_id: str,
@@ -594,10 +641,14 @@ async def infer_endpoint( # pylint: disable=R0914
594641
HTTPException: 503 if the LLM service is unavailable.
595642
"""
596643
# Authentication enforced by get_auth_dependency(), authorization by @authorize decorator.
597-
_ = auth
598-
599644
check_configuration_loaded(configuration)
600645

646+
# Quota enforcement: resolve subject and check availability before any work.
647+
# No-op when quota_subject is not configured or no quota limiters exist.
648+
quota_id = _resolve_quota_subject(request, auth)
649+
if quota_id is not None:
650+
check_tokens_available(configuration.quota_limiters, quota_id)
651+
601652
request_id = get_suid()
602653

603654
logger.info("Processing rlsapi v1 /infer request %s", request_id)
@@ -634,7 +685,7 @@ async def infer_endpoint( # pylint: disable=R0914
634685
model_id=model_id,
635686
)
636687
response_text = extract_text_from_response_items(response.output)
637-
extract_token_usage(response.usage, model_id)
688+
token_usage = extract_token_usage(response.usage, model_id)
638689
inference_time = time.monotonic() - start_time
639690
except _INFER_HANDLED_EXCEPTIONS as error:
640691
if response is not None:
@@ -662,6 +713,14 @@ async def infer_endpoint( # pylint: disable=R0914
662713
logger.warning("Empty response from LLM for request %s", request_id)
663714
response_text = constants.UNABLE_TO_PROCESS_RESPONSE
664715

716+
# Consume quota tokens after successful inference.
717+
if quota_id is not None:
718+
consume_query_tokens(
719+
user_id=quota_id,
720+
model_id=model_id,
721+
token_usage=token_usage,
722+
)
723+
665724
_queue_splunk_event(
666725
background_tasks,
667726
infer_request,

src/configuration.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
OkpConfiguration,
2828
QuotaHandlersConfiguration,
2929
RagConfiguration,
30+
RlsapiV1Configuration,
3031
ServiceConfiguration,
3132
SplunkConfiguration,
3233
UserDataCollection,
@@ -286,6 +287,21 @@ def customization(self) -> Optional[Customization]:
286287
raise LogicError("logic error: configuration is not loaded")
287288
return self._configuration.customization
288289

290+
@property
def rlsapi_v1(self) -> RlsapiV1Configuration:
    """Expose the ``rlsapi_v1`` section of the loaded configuration.

    Returns:
        RlsapiV1Configuration: Settings for the rlsapi v1 /infer endpoint
        (CLA-specific settings).

    Raises:
        LogicError: If the configuration has not been loaded.
    """
    loaded = self._configuration
    if loaded is None:
        raise LogicError("logic error: configuration is not loaded")
    return loaded.rlsapi_v1
304+
289305
@property
290306
def inference(self) -> InferenceConfiguration:
291307
"""Return inference configuration.

src/models/config.py

Lines changed: 78 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1326,21 +1326,6 @@ class Customization(ConfigurationBase):
13261326
agent_card_config: Optional[dict[str, Any]] = None
13271327
custom_profile: Optional[CustomProfile] = Field(default=None, init=False)
13281328

1329-
# Debugging: Allow /v1/infer to return extended metadata
1330-
# WARNING: This should NOT be enabled in production environments.
1331-
# Setting this to True allows clients to request extended response data
1332-
# (tool_calls, rag_chunks, token_usage, etc.) from the /v1/infer endpoint
1333-
# by including "include_metadata": true in the request body.
1334-
#
1335-
# If this feature were wanted in production, consider RBAC-based access control instead:
1336-
# 1. Add Action.RLSAPI_V1_INFER_VERBOSE to models/config.py Action enum
1337-
# 2. Check authorization in infer_endpoint:
1338-
# if infer_request.include_metadata:
1339-
# if Action.RLSAPI_V1_INFER_VERBOSE not in request.state.authorized_actions:
1340-
# raise HTTPException(status_code=403, detail="Verbose infer not authorized")
1341-
# 3. Add the action to authorization rules for specific users/roles
1342-
allow_verbose_infer: bool = False
1343-
13441329
@model_validator(mode="after")
13451330
def check_customization_model(self) -> Self:
13461331
"""
@@ -1381,6 +1366,35 @@ def check_customization_model(self) -> Self:
13811366
return self
13821367

13831368

1369+
class RlsapiV1Configuration(ConfigurationBase):
    """Settings that apply only to the rlsapi v1 /infer endpoint.

    This section groups options for the RHEL Lightspeed Command Line
    Assistant (CLA) stateless inference endpoint, keeping them out of the
    shared configuration sections so they do not affect other endpoints.
    """

    # Debugging switch: off by default. Extended metadata is only returned
    # when the client also asks for it in the request body.
    allow_verbose_infer: bool = Field(
        default=False,
        title="Allow verbose infer",
        description=(
            "Allow /v1/infer to return extended metadata "
            "(tool_calls, rag_chunks, token_usage) when the client sends "
            '"include_metadata": true. Should NOT be enabled in production. '
            "If production use is needed, consider RBAC-based access control "
            "via an Action.RLSAPI_V1_INFER_VERBOSE authorization rule."
        ),
    )

    # None (the default) leaves quota enforcement disabled for /v1/infer.
    quota_subject: Optional[Literal["user_id", "org_id", "system_id"]] = Field(
        default=None,
        title="Quota subject",
        description=(
            "Identity field used as the quota subject for /v1/infer. "
            "When set, token quota enforcement is enabled for this endpoint. "
            "Requires quota_handlers to be configured. "
            '"org_id" and "system_id" require rh-identity authentication; '
            "falls back to user_id when rh-identity data is unavailable."
        ),
    )
1396+
1397+
13841398
class InferenceConfiguration(ConfigurationBase):
13851399
"""Inference configuration."""
13861400

@@ -1911,6 +1925,13 @@ class Configuration(ConfigurationBase):
19111925
)
19121926
azure_entra_id: Optional[AzureEntraIdConfiguration] = None
19131927

1928+
rlsapi_v1: RlsapiV1Configuration = Field(
1929+
default_factory=RlsapiV1Configuration,
1930+
title="rlsapi v1 configuration",
1931+
description="Configuration for the rlsapi v1 /infer endpoint used by "
1932+
"the RHEL Lightspeed Command Line Assistant (CLA).",
1933+
)
1934+
19141935
splunk: Optional[SplunkConfiguration] = Field(
19151936
default=None,
19161937
title="Splunk configuration",
@@ -1993,6 +2014,48 @@ def validate_mcp_auth_headers(self) -> Self:
19932014
self.mcp_servers = valid_mcp_servers
19942015
return self
19952016

2017+
@model_validator(mode="after")
def validate_rlsapi_v1_quota_configuration(self) -> Self:
    """Cross-check rlsapi_v1 quota settings with auth and quota handlers.

    Two checks run when ``rlsapi_v1.quota_subject`` is set:

    * "org_id" and "system_id" are rejected unless the authentication
      module is rh-identity.
    * A warning is logged when ``quota_handlers.limiters`` is empty,
      because quota enforcement would then be a silent no-op.

    Returns:
        Self: The validated configuration instance.

    Raises:
        ValueError: If quota_subject requires rh-identity but a different
            authentication module is configured.
    """
    quota_subject = self.rlsapi_v1.quota_subject  # pylint: disable=no-member
    if quota_subject is None:
        # Quota enforcement is disabled for /v1/infer; nothing to validate.
        return self

    auth_module = self.authentication.module  # pylint: disable=no-member

    if quota_subject in ("org_id", "system_id"):
        if auth_module != constants.AUTH_MOD_RH_IDENTITY:
            message = (
                f"rlsapi_v1.quota_subject='{quota_subject}' requires "
                f"authentication.module='{constants.AUTH_MOD_RH_IDENTITY}', "
                f"but got '{auth_module}'. Use quota_subject='user_id' or "
                f"switch authentication to '{constants.AUTH_MOD_RH_IDENTITY}'."
            )
            raise ValueError(message)

    limiters = self.quota_handlers.limiters  # pylint: disable=no-member
    if not limiters:
        logger.warning(
            "rlsapi_v1.quota_subject is '%s' but no quota limiters are "
            "configured in quota_handlers.limiters. Token quota enforcement "
            "will not take effect until limiters are added.",
            quota_subject,
        )

    return self
2058+
19962059
def dump(self, filename: str | Path = "configuration.json") -> None:
19972060
"""
19982061
Write the current Configuration model to a JSON file.

0 commit comments

Comments
 (0)