-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathguardrails.py
More file actions
284 lines (220 loc) · 8.68 KB
/
guardrails.py
File metadata and controls
284 lines (220 loc) · 8.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
"""Guardrails for the research navigator agents.
Implements input and output guardrails following OpenAI Agent SDK patterns:
- Input guardrails: Check user input before agent execution
- Output guardrails: Check agent output before returning to user
"""
from pydantic import BaseModel
from src.shared.config import MODEL_LIGHT
from agents import (
Agent,
GuardrailFunctionOutput,
RunContextWrapper,
Runner,
TResponseInputItem,
input_guardrail,
output_guardrail,
)
from main import VisualOutput
# =============================================================================
# Guardrail Output Types
# =============================================================================
class OffTopicCheck(BaseModel):
    """Structured verdict from the OffTopicGuardrail agent.

    `is_off_topic` drives the input-guardrail tripwire; the other fields
    exist for logging/inspection only.
    """
    # True when the query is not research-related and should be blocked.
    is_off_topic: bool
    # Model's explanation for the verdict (for observability/debugging).
    reasoning: str
    detected_category: str  # "research", "homework", "chat", "coding", "other"
class UnethicalResearchCheck(BaseModel):
    """Structured verdict from the UnethicalResearchGuardrail agent.

    `is_unethical` drives the input-guardrail tripwire.
    """
    # True when the query concerns a blocked research area.
    is_unethical: bool
    # Model's explanation for the verdict (for observability/debugging).
    reasoning: str
    concern_type: str  # "weapons", "drugs", "hacking", "harm", "none"
class PIICheck(BaseModel):
    """Structured verdict from the PIIGuardrail agent.

    `contains_pii` drives the input-guardrail tripwire.
    """
    # True when the input appears to contain sensitive personal data.
    contains_pii: bool
    # Model's explanation for the verdict (for observability/debugging).
    reasoning: str
    pii_types: list[str]  # ["api_key", "password", "ssn", "email", etc.]
class HallucinationCheck(BaseModel):
    """Structured verdict from the HallucinationGuardrail agent.

    `has_unsupported_claims` drives the output-guardrail tripwire.
    """
    # True when the output makes claims without supporting citations.
    has_unsupported_claims: bool
    # Model's explanation for the verdict (for observability/debugging).
    reasoning: str
    # Specific offending statements, surfaced for transparency.
    flagged_statements: list[str]
class OverconfidenceCheck(BaseModel):
    """Structured verdict from the OverconfidenceGuardrail agent.

    `is_overconfident` drives the output-guardrail tripwire.
    """
    # True when the output uses absolute/overconfident language.
    is_overconfident: bool
    # Model's explanation for the verdict (for observability/debugging).
    reasoning: str
    # Specific offending phrases, surfaced for transparency.
    flagged_phrases: list[str]
# =============================================================================
# Guardrail Agents (lightweight models for fast checking)
# =============================================================================
# Lightweight classifier agent: labels a query as research-appropriate or not.
# Runs on MODEL_LIGHT because guardrails execute on every request and must be fast.
off_topic_agent = Agent(
    name="OffTopicGuardrail",
    model=MODEL_LIGHT,
    output_type=OffTopicCheck,  # forces structured output matching the schema above
    instructions="""You are a guardrail that checks if a user query is appropriate for a research assistant.
APPROPRIATE (is_off_topic=false):
- Academic research questions
- Literature review requests
- Scientific topic exploration
- Paper summarization requests
- Research methodology questions
NOT APPROPRIATE (is_off_topic=true):
- Homework or assignment completion requests (detected_category="homework")
- General chat or small talk (detected_category="chat")
- Code writing requests (detected_category="coding")
- Non-research tasks (detected_category="other")
Be lenient - if it could reasonably be research-related, allow it.""",
)
# Lightweight classifier agent: flags research queries in prohibited areas
# (weapons, drugs, hacking, harm) while allowing legitimate academic work.
unethical_research_agent = Agent(
    name="UnethicalResearchGuardrail",
    model=MODEL_LIGHT,
    output_type=UnethicalResearchCheck,  # forces structured output matching the schema above
    instructions="""You are a guardrail that checks if a research query involves unethical topics.
BLOCK (is_unethical=true) queries about:
- Weapons development or manufacturing (concern_type="weapons")
- Drug synthesis or illegal substances (concern_type="drugs")
- Hacking, malware, or cyberattacks (concern_type="hacking")
- Causing harm to individuals or groups (concern_type="harm")
ALLOW (is_unethical=false):
- Legitimate security research
- Historical analysis of conflicts
- Medical/pharmaceutical research
- Defensive cybersecurity
Set concern_type="none" if the query is ethical.
Be balanced - don't block legitimate academic research.""",
)
# Lightweight classifier agent: detects secrets/PII (keys, passwords, SSNs,
# card numbers) in user input before it reaches the main pipeline.
pii_agent = Agent(
    name="PIIGuardrail",
    model=MODEL_LIGHT,
    output_type=PIICheck,  # forces structured output matching the schema above
    instructions="""You detect if user input contains sensitive personal information.
CHECK FOR:
- API keys or tokens (patterns like sk-*, pk_*, etc.)
- Passwords or secrets
- Social security numbers
- Credit card numbers
- Personal email addresses in sensitive contexts
- Private keys or certificates
DO NOT flag:
- Public email domains in academic contexts
- Example/placeholder data
- General names or affiliations""",
)
# Lightweight checker agent: scans final research output for claims made
# without citations or supporting evidence.
hallucination_agent = Agent(
    name="HallucinationGuardrail",
    model=MODEL_LIGHT,
    output_type=HallucinationCheck,  # forces structured output matching the schema above
    instructions="""You check if research output contains unsupported claims.
FLAG (has_unsupported_claims=true) if:
- Claims are made without citing sources
- Statistics are presented without references
- Strong conclusions lack supporting evidence
- Statements like "studies show" without specific citations
ALLOW (has_unsupported_claims=false) if:
- Claims reference specific papers or sources
- Statements are clearly marked as interpretations
- Limitations are acknowledged
- Uncertainty is expressed appropriately
Add flagged statements to the list for transparency.""",
)
# Lightweight checker agent: scans final research output for absolute or
# overconfident phrasing that research writing should avoid.
overconfidence_agent = Agent(
    name="OverconfidenceGuardrail",
    model=MODEL_LIGHT,
    output_type=OverconfidenceCheck,  # forces structured output matching the schema above
    instructions="""You detect overconfident language in research output.
FLAG (is_overconfident=true) phrases like:
- "definitely", "certainly", "absolutely"
- "proves conclusively", "without doubt"
- "100%", "always", "never" (in absolute contexts)
- "the only way", "must be"
ALLOW (is_overconfident=false):
- Hedged language: "suggests", "indicates", "may"
- Qualified statements: "based on available evidence"
- Acknowledged uncertainty: "further research needed"
Research should express appropriate uncertainty.""",
)
# =============================================================================
# Input Guardrails
# =============================================================================
@input_guardrail
async def off_topic_guardrail(
    ctx: RunContextWrapper[None],
    agent: Agent,
    input: str | list[TResponseInputItem],
) -> GuardrailFunctionOutput:
    """Block off-topic requests that aren't research-related.

    Runs the lightweight off-topic classifier over the raw input and trips
    the guardrail when it labels the query as non-research.
    """
    # Item lists are coerced to their string form so the checker agent
    # always receives plain text.
    query_text = str(input) if not isinstance(input, str) else input
    run = await Runner.run(off_topic_agent, query_text, context=ctx.context)
    verdict: OffTopicCheck = run.final_output
    return GuardrailFunctionOutput(
        output_info=verdict,
        tripwire_triggered=verdict.is_off_topic,
    )
@input_guardrail
async def unethical_research_guardrail(
    ctx: RunContextWrapper[None],
    agent: Agent,
    input: str | list[TResponseInputItem],
) -> GuardrailFunctionOutput:
    """Block unethical research requests.

    Delegates to the ethics classifier agent; trips when the query falls
    into a prohibited research area.
    """
    # Item lists are coerced to their string form so the checker agent
    # always receives plain text.
    query_text = str(input) if not isinstance(input, str) else input
    run = await Runner.run(
        unethical_research_agent, query_text, context=ctx.context
    )
    verdict: UnethicalResearchCheck = run.final_output
    return GuardrailFunctionOutput(
        output_info=verdict,
        tripwire_triggered=verdict.is_unethical,
    )
@input_guardrail
async def pii_guardrail(
    ctx: RunContextWrapper[None],
    agent: Agent,
    input: str | list[TResponseInputItem],
) -> GuardrailFunctionOutput:
    """Block inputs containing sensitive personal information.

    Delegates to the PII detector agent; trips when secrets or personal
    data are found in the input.
    """
    # Item lists are coerced to their string form so the checker agent
    # always receives plain text.
    query_text = str(input) if not isinstance(input, str) else input
    run = await Runner.run(pii_agent, query_text, context=ctx.context)
    verdict: PIICheck = run.final_output
    return GuardrailFunctionOutput(
        output_info=verdict,
        tripwire_triggered=verdict.contains_pii,
    )
# =============================================================================
# Output Guardrails
# =============================================================================
@output_guardrail
async def hallucination_guardrail(
    ctx: RunContextWrapper,
    agent: Agent,
    output: VisualOutput,
) -> GuardrailFunctionOutput:
    """Check for unsupported claims in the output.

    Only the output's free-text `rationale` is examined; outputs without
    one fall back to their string form.
    """
    try:
        check_text = output.rationale
    except AttributeError:
        check_text = str(output)
    run = await Runner.run(hallucination_agent, check_text, context=ctx.context)
    verdict: HallucinationCheck = run.final_output
    return GuardrailFunctionOutput(
        output_info=verdict,
        tripwire_triggered=verdict.has_unsupported_claims,
    )
@output_guardrail
async def overconfidence_guardrail(
    ctx: RunContextWrapper,
    agent: Agent,
    output: VisualOutput,
) -> GuardrailFunctionOutput:
    """Check for overconfident language in the output.

    Only the output's free-text `rationale` is examined; outputs without
    one fall back to their string form.
    """
    try:
        check_text = output.rationale
    except AttributeError:
        check_text = str(output)
    run = await Runner.run(overconfidence_agent, check_text, context=ctx.context)
    verdict: OverconfidenceCheck = run.final_output
    return GuardrailFunctionOutput(
        output_info=verdict,
        tripwire_triggered=verdict.is_overconfident,
    )
# =============================================================================
# Exported guardrail lists for easy import
# =============================================================================
# Input guardrails for the first agent (Clarifier)
# All input-side checks, in execution order; attach to the pipeline's
# first agent (the Clarifier) so bad input is rejected before any work runs.
INPUT_GUARDRAILS = [off_topic_guardrail, unethical_research_guardrail, pii_guardrail]
# Output guardrails for the last agent (Visualizer)
# All output-side checks; attach to the pipeline's last agent (the
# Visualizer) so the final answer is vetted before it reaches the user.
OUTPUT_GUARDRAILS = [hallucination_guardrail, overconfidence_guardrail]