Trustworthy Agents with OpenAI Agents SDK and Cleanlab
Agentic AI applications — such as those built using the OpenAI Agents SDK — orchestrate tools and language models to carry out complex user tasks. But like any system built on LLMs, Agents can still hallucinate, occasionally generating incorrect or misleading responses that undermine user trust. This tutorial shows how to evaluate and score the trustworthiness of any OpenAI Agent’s responses in real-time using Cleanlab, and how to automatically block untrustworthy outputs from reaching your users.
Setup
# Install required packages
%pip install cleanlab-tlm openai-agents --quiet
# Set API keys
import os
os.environ["CLEANLAB_TLM_API_KEY"] = "<YOUR_CLEANLAB_TLM_API_KEY>" # Get your free API key from: https://tlm.cleanlab.ai/
os.environ["OPENAI_API_KEY"] = "<YOUR_OPENAI_API_KEY>" # Get API key from: https://platform.openai.com/signup
# Import required libraries
from cleanlab_tlm import TLM
from cleanlab_tlm.utils.chat import form_prompt_string, form_response_string_chat_completions_api
from agents import Agent, Runner, RunContextWrapper, function_tool, RunConfig, set_default_openai_api
from agents.models.chatcmpl_converter import Converter
from agents.models.interface import Model, ModelProvider
from agents.models.multi_provider import MultiProvider
# Use the Chat Completions API
set_default_openai_api("chat_completions")
Build an OpenAI Agent
For this tutorial, we’ll build a Customer Support AI Agent using the OpenAI Agents SDK. We have another tutorial on adding TLM into a version of this same Agent built with LangGraph.
Optional: Define Tools the Agent can call
from typing import Union, Optional
import re
import shutil
import openai
import sqlite3
import requests
import numpy as np
import pandas as pd
from dataclasses import dataclass
from datetime import date, datetime
@dataclass
class UserInfo:
passenger_id: str
response = requests.get(
"https://storage.googleapis.com/benchmarks-artifacts/travel-db/swiss_faq.md"
)
response.raise_for_status()
faq_text = response.text
docs = [{"page_content": txt} for txt in re.split(r"(?=\n##)", faq_text)]
class VectorStoreRetriever:
def __init__(self, docs: list, vectors: list, oai_client):
self._arr = np.array(vectors)
self._docs = docs
self._client = oai_client
@classmethod
def from_docs(cls, docs, oai_client):
embeddings = oai_client.embeddings.create(
model="text-embedding-3-small", input=[doc["page_content"] for doc in docs]
)
vectors = [emb.embedding for emb in embeddings.data]
return cls(docs, vectors, oai_client)
def query(self, query: str, k: int = 5) -> list[dict]:
embed = self._client.embeddings.create(
model="text-embedding-3-small", input=[query]
)
# "@" is just a matrix multiplication in python
scores = np.array(embed.data[0].embedding) @ self._arr.T
top_k_idx = np.argpartition(scores, -k)[-k:]
top_k_idx_sorted = top_k_idx[np.argsort(-scores[top_k_idx])]
return [
{**self._docs[idx], "similarity": scores[idx]} for idx in top_k_idx_sorted
]
retriever = VectorStoreRetriever.from_docs(docs, openai.Client())
@function_tool
def lookup_policy(query: str) -> str:
"""Consult the company policies to check whether certain options are permitted.
    Use this before making any flight changes or performing other 'write' events."""
docs = retriever.query(query, k=2)
return "\n\n".join([doc["page_content"] for doc in docs])
# Populate Database that Agent can access via Tool Call
db_url = "https://storage.googleapis.com/benchmarks-artifacts/travel-db/travel2.sqlite"
local_file = "travel2.sqlite"
# The backup lets us restart for each tutorial section
backup_file = "travel2.backup.sqlite"
overwrite = False
if overwrite or not os.path.exists(local_file):
response = requests.get(db_url)
response.raise_for_status() # Ensure the request was successful
with open(local_file, "wb") as f:
f.write(response.content)
# Backup - we will use this to "reset" our DB in each section
shutil.copy(local_file, backup_file)
# Convert the flights to present time for our tutorial
def update_dates(file):
shutil.copy(backup_file, file)
conn = sqlite3.connect(file)
cursor = conn.cursor()
tables = pd.read_sql(
"SELECT name FROM sqlite_master WHERE type='table';", conn
).name.tolist()
tdf = {}
for t in tables:
tdf[t] = pd.read_sql(f"SELECT * from {t}", conn)
example_time = pd.to_datetime(
tdf["flights"]["actual_departure"].replace("\\N", pd.NaT)
).max()
current_time = pd.to_datetime("now").tz_localize(example_time.tz)
time_diff = current_time - example_time
tdf["bookings"]["book_date"] = (
pd.to_datetime(tdf["bookings"]["book_date"].replace("\\N", pd.NaT), utc=True)
+ time_diff
)
datetime_columns = [
"scheduled_departure",
"scheduled_arrival",
"actual_departure",
"actual_arrival",
]
for column in datetime_columns:
tdf["flights"][column] = (
pd.to_datetime(tdf["flights"][column].replace("\\N", pd.NaT)) + time_diff
)
for table_name, df in tdf.items():
df.to_sql(table_name, conn, if_exists="replace", index=False)
del df
del tdf
conn.commit()
conn.close()
return file
db = update_dates(local_file)
@function_tool
def fetch_user_flight_information(wrapper: RunContextWrapper[UserInfo]) -> list[dict]:
"""Fetch all tickets for the user along with corresponding flight information and seat assignments.
Returns:
A list of dictionaries where each dictionary contains the ticket details,
associated flight details, and the seat assignments for each ticket belonging to the user.
"""
passenger_id = wrapper.context.passenger_id
if not passenger_id:
raise ValueError("No passenger ID configured.")
conn = sqlite3.connect(db)
cursor = conn.cursor()
query = """
SELECT
t.ticket_no, t.book_ref,
f.flight_id, f.flight_no, f.departure_airport, f.arrival_airport, f.scheduled_departure, f.scheduled_arrival,
bp.seat_no, tf.fare_conditions
FROM
tickets t
JOIN ticket_flights tf ON t.ticket_no = tf.ticket_no
JOIN flights f ON tf.flight_id = f.flight_id
JOIN boarding_passes bp ON bp.ticket_no = t.ticket_no AND bp.flight_id = f.flight_id
WHERE
t.passenger_id = ?
"""
cursor.execute(query, (passenger_id,))
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
results = [dict(zip(column_names, row)) for row in rows]
cursor.close()
conn.close()
return results
@function_tool
def search_flights(
departure_airport: Optional[str] = None,
arrival_airport: Optional[str] = None,
start_time: Optional[date | datetime] = None,
end_time: Optional[date | datetime] = None,
limit: int = 20,
) -> list[dict]:
"""Search for flights based on departure airport, arrival airport, and departure time range."""
conn = sqlite3.connect(db)
cursor = conn.cursor()
query = "SELECT * FROM flights WHERE 1 = 1"
params = []
if departure_airport:
query += " AND departure_airport = ?"
params.append(departure_airport)
if arrival_airport:
query += " AND arrival_airport = ?"
params.append(arrival_airport)
if start_time:
query += " AND scheduled_departure >= ?"
params.append(start_time)
if end_time:
query += " AND scheduled_departure <= ?"
params.append(end_time)
query += " LIMIT ?"
params.append(limit)
cursor.execute(query, params)
rows = cursor.fetchall()
column_names = [column[0] for column in cursor.description]
results = [dict(zip(column_names, row)) for row in rows]
cursor.close()
conn.close()
return results
@function_tool
def search_car_rentals(
location: Optional[str] = None,
name: Optional[str] = None,
price_tier: Optional[str] = None,
start_date: Optional[Union[datetime, date]] = None,
end_date: Optional[Union[datetime, date]] = None,
) -> list[dict]:
"""
Search for car rentals based on location, name, price tier, start date, and end date.
Args:
location (Optional[str]): The location of the car rental. Defaults to None.
name (Optional[str]): The name of the car rental company. Defaults to None.
price_tier (Optional[str]): The price tier of the car rental. Defaults to None.
start_date (Optional[Union[datetime, date]]): The start date of the car rental. Defaults to None.
end_date (Optional[Union[datetime, date]]): The end date of the car rental. Defaults to None.
Returns:
list[dict]: A list of car rental dictionaries matching the search criteria.
"""
conn = sqlite3.connect(db)
cursor = conn.cursor()
query = "SELECT * FROM car_rentals WHERE 1=1"
params = []
if location:
query += " AND location LIKE ?"
params.append(f"%{location}%")
if name:
query += " AND name LIKE ?"
params.append(f"%{name}%")
if price_tier:
query += " AND price_tier LIKE ?"
params.append(f"%{price_tier}%")
if start_date:
query += " AND start_date >= ?"
params.append(str(start_date))
if end_date:
query += " AND end_date <= ?"
params.append(str(end_date))
# This tool allows matching on price tier and dates even though data is limited
# which tests Agent behavior when relevant data might be missing.
cursor.execute(query, params)
results = cursor.fetchall()
conn.close()
return [
dict(zip([column[0] for column in cursor.description], row)) for row in results
]
@function_tool
def search_hotels(
location: Optional[str] = None,
name: Optional[str] = None,
price_tier: Optional[str] = None,
checkin_date: Optional[Union[datetime, date]] = None,
checkout_date: Optional[Union[datetime, date]] = None,
) -> list[dict]:
"""
Search for hotels based on location, name, price tier, check-in date, and check-out date.
Args:
location (Optional[str]): The location of the hotel. Defaults to None.
name (Optional[str]): The name of the hotel. Defaults to None.
price_tier (Optional[str]): The price tier of the hotel. Defaults to None. Examples: Midscale, Upper Midscale, Upscale, Luxury
checkin_date (Optional[Union[datetime, date]]): The check-in date of the hotel. Defaults to None.
checkout_date (Optional[Union[datetime, date]]): The check-out date of the hotel. Defaults to None.
Returns:
list[dict]: A list of hotel dictionaries matching the search criteria.
"""
conn = sqlite3.connect(db)
cursor = conn.cursor()
query = "SELECT * FROM hotels WHERE 1=1"
params = []
if location:
query += " AND location LIKE ?"
params.append(f"%{location}%")
if name:
query += " AND name LIKE ?"
params.append(f"%{name}%")
# For the sake of this tutorial, we will let you match on any dates and price tier.
cursor.execute(query, params)
results = cursor.fetchall()
conn.close()
return [
dict(zip([column[0] for column in cursor.description], row)) for row in results
]
@function_tool
def search_trip_recommendations(
location: Optional[str] = None,
name: Optional[str] = None,
keywords: Optional[str] = None,
) -> list[dict]:
"""
Search for trip recommendations based on location, name, and keywords.
Args:
location (Optional[str]): The location of the trip recommendation. Defaults to None.
name (Optional[str]): The name of the trip recommendation. Defaults to None.
keywords (Optional[str]): The keywords associated with the trip recommendation. Defaults to None.
Returns:
list[dict]: A list of trip recommendation dictionaries matching the search criteria.
"""
conn = sqlite3.connect(db)
cursor = conn.cursor()
query = "SELECT * FROM trip_recommendations WHERE 1=1"
params = []
if location:
query += " AND location LIKE ?"
params.append(f"%{location}%")
if name:
query += " AND name LIKE ?"
params.append(f"%{name}%")
if keywords:
keyword_list = keywords.split(",")
keyword_conditions = " OR ".join(["keywords LIKE ?" for _ in keyword_list])
query += f" AND ({keyword_conditions})"
params.extend([f"%{keyword.strip()}%" for keyword in keyword_list])
cursor.execute(query, params)
results = cursor.fetchall()
conn.close()
return [
dict(zip([column[0] for column in cursor.description], row)) for row in results
]
@function_tool
def get_travel_advisory(country: str) -> dict:
"""Returns a mock travel advisory for a country."""
return {
"level": "Level 2 – Exercise Increased Caution",
"notes": f"Travelers to {country} should be aware of petty crime and take precautions."
}
@function_tool
def get_discount_plan(name: str) -> str:
"""Returns details about a discount plan based on its name.
Valid names: "basic", "premium", "five", "student"
"""
return f"Returning details for the '{name}' discount plan:"
class InsuranceTerms:
def __init__(self, provider: str):
self.provider = provider
self.coverage = {
"GloboSure": {
"trip_delay": "Up to 500 credits after 6 hours",
"lost_baggage": "Up to 1200 credits with receipt proof",
"medical": "Emergency care covered up to 50,000 credits"
},
"NimbusCoverage": {
"cancellation": "Refunds up to 70% for non-weather issues",
"extreme_weather": "Full coverage with documentation"
}
}
@function_tool
def get_insurance_terms(provider: str) -> InsuranceTerms:
"""Returns a policy explanation object for a given (possibly obscure) travel insurance provider."""
return InsuranceTerms(provider)
Optional: Setup OpenAI Agent
import json
system_prompt = """You are a helpful customer support assistant for Swiss Airlines. Use the provided tools to search for flights, company policies, and other information to assist the user's queries. When searching, be persistent. Expand your query bounds if the first search returns no results. If a search comes up empty, expand your search before giving up.
Current time: {time}.""".format(time=datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
tools=[
fetch_user_flight_information,
search_flights,
lookup_policy,
search_car_rentals,
search_hotels,
search_trip_recommendations,
get_travel_advisory,
get_discount_plan,
get_insurance_terms,
]
user_info = UserInfo(passenger_id="3442 587242")
agent = Agent[UserInfo](
name="Customer support agent",
instructions=system_prompt,
tools=tools,
model="gpt-4o-mini"
)
max_length = 2000
def truncate(msg, max_length=max_length):
if msg is None:
return ""
msg = str(msg)
if len(msg) > max_length:
return msg[:max_length] + " ... (truncated)"
return msg
HEADER = {
"human_message": (None, "Human Message"),
"ai_message_header": ("ai_message", "AI Message"),
"tool_calls_header": ("tool_calls", "Tool Calls"),
"tool_message_header": ("tool_message", "Tool Message"),
}
def print_event(tag, printed, data=None, item=None, score=None, tool_calls_from_message=None):
if tag == "human_message":
if tag not in printed:
printed.add(tag)
print(f"\n{'='*10} Human Message {'='*10}")
print(data)
elif tag == "tool_call":
key, title = HEADER["tool_calls_header"]
if key not in printed:
printed.add(key)
print(f"\n{'='*10} {title} {'='*10}")
# If called from a message, print each call in the list
if tool_calls_from_message is not None:
for call in tool_calls_from_message:
func = call.get("function", {})
name = func.get("name", "UnknownFunction")
call_id = call.get("id", "unknown_id")
print(f"{name} ({call_id})\nArgs:")
try:
args = json.loads(func.get("arguments", "{}"))
except Exception:
args = {}
for k, v in args.items():
print(f" {k}: {v}")
elif item is not None:
call = item.raw_item
print(f"{call.name} ({call.call_id})\nArgs:")
for k, v in json.loads(call.arguments).items():
print(f" {k}: {truncate(v)}")
elif tag == "tool_output":
key, title = HEADER["tool_message_header"]
if key not in printed:
printed.add(key)
print(f"\n{'='*10} {title} {'='*10}")
print(truncate(data or item.raw_item.get("output", "")))
elif tag == "ai_message":
key, title = HEADER["ai_message_header"]
if key not in printed:
printed.add(key)
print(f"\n{'='*10} {title} {'='*10}")
print(data or item.raw_item.content[0].text)
elif tag == "tlm_score":
print(f"\n[TLM Score]: {score}")
def print_items(items):
printed = set()
messages = Converter.items_to_messages(items)
for msg in messages:
role = msg.get('role')
content = msg.get("content")
tool_calls = msg.get("tool_calls")
if role == "user":
print_event("human_message", printed, data=content)
elif role == "assistant":
if tool_calls:
print_event("tool_call", printed, tool_calls_from_message=tool_calls)
if content:
print_event("ai_message", printed, data=content)
async def print_stream_events(stream):
printed, cached_score = set(), None
event_map = {
"tool_called": "tool_call",
"tool_output": "tool_output",
"message_output_created": "ai_message"
}
async for e in stream.stream_events():
if e.type == "run_item_stream_event" and e.item and e.name in event_map:
print_event(event_map[e.name], printed, item=e.item)
if cached_score is not None:
print_event("tlm_score", printed, score=cached_score)
cached_score = None
elif getattr(e, "type", None) == "raw_response_event" and getattr(getattr(e, "data", None), "type", None) == "tlm_score_event":
cached_score = e.data.score
if cached_score is not None:
print_event("tlm_score", printed, score=cached_score)
print()
For educational purposes, we implement a stream_agent_response() method (based on the OpenAI streaming example) that prints everything the Agent produces and processes: user messages, Tool calls, Tool outputs, LLM responses, and so on.
async def stream_agent_response(agent, user_input):
input_items = [{"content": user_input, "role": "user"}]
print_items(input_items)
stream = Runner.run_streamed(agent,
input=input_items,
context=user_info)
await print_stream_events(stream)
Run the Agent
Let’s ask our Agent some queries.
user_input = "Can I get a refund if I cancel my flight with Swiss Airlines?"
await stream_agent_response(agent, user_input)
This example illustrates the Agent’s ability to call the appropriate Tool and craft an accurate, policy-based answer from the returned data. Let’s run some more queries, showcasing some that yield problematic Agent responses.
user_input = "What are the details of the 5 discount plan"
await stream_agent_response(agent, user_input)
In this case, incomplete results were returned from the Tool. Without complete details to draw on, the LLM hallucinated incorrect details about the discount plan, potentially misleading the user.
user_input = "What coverage does GloboSure give me for travel delays?"
await stream_agent_response(agent, user_input)
In this case, the Tool was implemented to return a raw Python object rather than its contents, so the LLM filled the gap by hallucinating incorrect travel-delay coverage details.
user_input = "Is there a health advisory in Basel"
await stream_agent_response(agent, user_input)
In this case, the Agent chose the right Tool, but it returned an advisory about crime rather than the requested health advisory for Basel. The Tool didn’t explicitly explain that it did not have the relevant information. The Agent didn’t notice the discrepancy and thus gave an incorrect response.
user_input = "What is my arrival time in their time zone?"
await stream_agent_response(agent, user_input)
In this case, the LLM mistakenly assumed that Basel (BSL) uses the UTC-4 time zone. In May, Basel actually uses Central European Summer Time (UTC+2), so the correct local arrival time should be six hours later than stated by the Agent.
user_input = "What are conspiracy theories around Swiss Airlines safety?"
await stream_agent_response(agent, user_input)
In this case, no appropriate Tool is available for this query. The Agent responds with a potentially hallucinated answer that could harm the airline’s reputation.
Trustworthy Agent
Now, let’s explore how Cleanlab’s Trustworthy Language Model (TLM) can help identify problematic responses from your OpenAI Agent in real-time. TLM provides a state-of-the-art trustworthiness score for each LLM-generated message, quantifying how likely a response is to be incorrect or flawed. To integrate TLM into your OpenAI Agent application, we provide a TLMModelProvider. By including TLMModelProvider in your run configuration (or wrapping your own ModelProvider), your Agent model will automatically use our wrapper, whether you specify the model argument as a string or as a Model object. Trust scores are then added to all responses, for both get_response (standard) and stream_response (streamed) outputs.
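As a quick preview (the TLMModelProvider class itself is defined in the optional section below, and fully runnable examples follow later in this tutorial), the only change to your existing Agent code is the run configuration:
# Preview sketch: assumes the TLMModelProvider class defined in the section below.
run_config = RunConfig(model_provider=TLMModelProvider(trust_threshold=0.9))  # wraps the default MultiProvider
# run_config = RunConfig(model_provider=TLMModelProvider(base_provider=my_provider))  # or wrap your own ModelProvider
result = await Runner.run(agent, "Hello!", context=user_info, run_config=run_config)  # also works with Runner.run_streamed(...)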
Optional: Classes to score the trustworthiness of LLM calls using TLM
Initialize the TLM client
tlm = TLM(options={"log": ["explanation"]}) # See Advanced Tutorial for additional TLM configuration options
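To see what a single scoring call returns before wiring TLM into the Agent, here is a quick illustrative check (the prompt/response pair is made up); it uses the same get_trustworthiness_score call and return fields that the wrapper classes below rely on.
# Illustrative standalone check (this prompt/response pair is made up)
res = tlm.get_trustworthiness_score(
    "What is the capital of Switzerland?",
    "The capital of Switzerland is Bern.",
)
print(res["trustworthiness_score"])           # float in [0, 1]; higher means more trustworthy
print(res.get("log", {}).get("explanation"))  # included because we set log=["explanation"] above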
class TLMScoreEvent:
"""Custom event for emitting the TLM trustworthiness score."""
def __init__(self, score):
self.type = "tlm_score_event"
self.score = score
class TLMGuardrailTripwireTriggered(Exception):
"""
Raised when the Trustworthy Language Model (TLM) trust score is below the configured threshold.
This exception should be caught at the Agent run level (e.g., using try/except around `Runner.run(agent, ...)`)
for custom handling of low-trust responses.
Attributes:
score (float): The TLM trust score for the response.
input_items (Any): The inputs provided to the LLM.
response_items (Any): The generated responses from the LLM.
explanation (str, optional): Optional explanation from TLM (set log=["explanation"] in TLM options).
"""
def __init__(self, score, input_items, response_items, explanation=None):
super().__init__(f"Trust score too low: {score}")
self.score = score
self.input_items = input_items
self.response_items = response_items
self.explanation = explanation
class TLMModel(Model):
"""
Model wrapper that augments the base class with Cleanlab's Trustworthy Language Model (TLM) trust scoring.
Adds trust scoring to every LLM response. Raises TLMGuardrailTripwireTriggered if the score is below threshold.
"""
def __init__(self, base_model, tlm, trust_threshold=0.9, score_tool_calls=True):
self.base_model = base_model
self.tlm = tlm
self.trust_threshold = trust_threshold
self.score_tool_calls = score_tool_calls
def _handle_trust_score(self, score, input, response_items, expl):
"""
Handles the trust score for each response.
By default, raises TLMGuardrailTripwireTriggered if the score is below the configured threshold.
TODO: Modify this method to customize how trust scores are handled—for example, logging, monitoring,
or applying different policies based on the score.
Tip: To avoid interrupting the Agent's execution, you can store low scores instead of raising an exception
and handle them later.
"""
if score < self.trust_threshold:
raise TLMGuardrailTripwireTriggered(score, input, response_items, expl)
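    # Alternative sketch of the tip above: record low scores instead of raising, so the Agent
    # run is never interrupted. `self.low_trust_events` is a hypothetical attribute you would
    # need to initialize (e.g., to an empty list) in __init__ if you adopt this policy.
    #
    # def _handle_trust_score(self, score, input, response_items, expl):
    #     if score < self.trust_threshold:
    #         self.low_trust_events.append({"score": score, "explanation": expl})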
def _score_with_tlm(self, system_instructions, input, tools, response_items):
"""
Prepare the prompt and compute the TLM trust score for the response.
Skips scoring Tool calls if configured.
"""
        # Convert the LLM response items into Chat Completions-style messages
        response_msgs = Converter.items_to_messages(response_items)
        # Skip scoring Tool calls if configured
        if not self.score_tool_calls and response_msgs[0].get("tool_calls"): return None
        # Prepare the prompt for TLM
        msgs = Converter.items_to_messages(input if isinstance(input, list) else [input])
        if system_instructions: msgs.insert(0, {"role": "system", "content": system_instructions})
        tlm_prompt = form_prompt_string(msgs, [Converter.tool_to_openai(t) for t in tools])
        # Score the response
        tlm_response = form_response_string_chat_completions_api(response_msgs[0])
tlm_res = self.tlm.get_trustworthiness_score(tlm_prompt, tlm_response)
score = tlm_res["trustworthiness_score"]
expl = tlm_res.get("log", {}).get("explanation")
# Handle trust score using a helper method
self._handle_trust_score(score, input, response_items, expl)
return score
async def get_response(self, system_instructions, input, model_settings, tools,
output_schema, handoffs, tracing, previous_response_id, prompt=None):
"""
Add trust scoring to Runner.run(agent, ...).
"""
response = await self.base_model.get_response(
system_instructions, input, model_settings, tools,
output_schema, handoffs, tracing, previous_response_id, prompt)
self._score_with_tlm(system_instructions, input, tools, response.to_input_items())
return response
async def stream_response(self, system_instructions, input, model_settings, tools,
output_schema, handoffs, tracing, previous_response_id, prompt=None):
"""
Add trust scoring to Runner.run_streamed(agent, ...).
"""
final = None
async for e in self.base_model.stream_response(
system_instructions, input, model_settings, tools,
output_schema, handoffs, tracing, previous_response_id, prompt):
yield e
if getattr(e, 'type', None) == "response.completed":
final = e.response
if final:
response_items = [item.model_dump(exclude_unset=True) for item in final.output]
score = self._score_with_tlm(system_instructions, input, tools, response_items)
if not score: return
yield TLMScoreEvent(score)
class TLMModelProvider(ModelProvider):
def __init__(self, base_provider=None, trust_threshold=0.9, score_tool_calls=True):
self.base_provider = base_provider or MultiProvider()
self.trust_threshold = trust_threshold
self.score_tool_calls = score_tool_calls
def get_model(self, name=None):
return TLMModel(self.base_provider.get_model(name), tlm, self.trust_threshold, self.score_tool_calls)
To wrap your model with TLM scoring, use the TLMModelProvider within your run_config. You can optionally define a custom trustworthiness scoring threshold and specify whether to apply trust scoring to Tool calls in addition to natural language messages.
async def stream_trustworthy_agent_response(agent, user_input):
input_items = [{"content": user_input, "role": "user"}]
print_items(input_items)
try:
stream = Runner.run_streamed(
agent,
            input=input_items,
context=user_info,
run_config=RunConfig(
model_provider=TLMModelProvider(trust_threshold=0.9, score_tool_calls=True)
) # New: Wrap your LLM model with TLM scoring
)
await print_stream_events(stream)
except TLMGuardrailTripwireTriggered as e: # Handling low trust scores will be covered later in this tutorial
# For now, we just print the untrustworthy response and the TLM score
print_items(e.response_items)
print("[TLM Score]:", e.score)
Now let’s run our Agent with automated trustworthiness scoring in place. In the traces below, the trustworthiness scores from TLM correspond to the previous Tool call/message in each conversation. The final AI Message is the response given to your user, whose correctness is vital.
user_input = "Can I get a refund if I cancel my flight with Swiss Airlines?"
await stream_trustworthy_agent_response(agent, user_input)
Upon review, we see that the Agent’s response was correct for this simple question. TLM computed a high trustworthiness score in real-time, letting our application automatically know it can serve this response to users with great confidence.
Let’s run the queries where our Agent responded incorrectly before.
user_input = "What are the details of the 5 discount plan"
await stream_trustworthy_agent_response(agent, user_input)
user_input = "What coverage does GloboSure give me for travel delays?"
await stream_trustworthy_agent_response(agent, user_input)
user_input = "Is there a health advisory in Basel"
await stream_trustworthy_agent_response(agent, user_input)
user_input = "What is my arrival time in their time zone?"
await stream_trustworthy_agent_response(agent, user_input)
user_input = "What are conspiracy theories around Swiss Airlines safety?"
await stream_trustworthy_agent_response(agent, user_input)
Upon review, we find that the Agent’s responses were problematic for the above queries. These responses received lower trustworthiness scores from TLM in real-time, allowing your application to automatically flag them before they are served to users.
You could still choose to show such responses to users, appending a caveat like: CAUTION: THIS RESPONSE HAS BEEN FLAGGED AS POTENTIALLY UNTRUSTWORTHY.
Alternatively, you could escalate this interaction to a human customer support representative, or return a canned fallback response in place of the Agent’s response. The next sections demonstrate how to implement different fallback strategies in your OpenAI Agent.
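For instance, escalating to a human is not demonstrated below, but a minimal sketch could look like the following (create_support_ticket is a hypothetical placeholder for your own ticketing or handoff mechanism; TLMModelProvider and TLMGuardrailTripwireTriggered are defined above).
# Hypothetical escalation sketch; `create_support_ticket` is a placeholder for your own system.
async def escalate_on_low_trust(agent, user_input):
    try:
        result = await Runner.run(
            agent, user_input, context=user_info,
            run_config=RunConfig(model_provider=TLMModelProvider(trust_threshold=0.9)),
        )
        return result.final_output
    except TLMGuardrailTripwireTriggered as e:
        ticket_id = create_support_ticket(user_input, flagged_response=e.response_items, score=e.score)
        return f"I've escalated your request to a human support agent (ticket {ticket_id})."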
Fallback Logic: Replacing Untrustworthy Responses
There are many options for handling low trust scores in your Agent. One option: whenever an LLM call has low trustworthiness, you can raise an exception and manage it similarly to OpenAI’s native Guardrails. In this case, such exceptions indicate the Agent may be going off the rails.
Below, we demonstrate a fallback strategy that halts the Agent when this exception is raised, and has the Agent return a pre-written abstention response (indicating it does not know how to handle this request).
You can easily swap this out for other fallback behaviors like:
- Escalate to a human
- Re-run the Agent with a modified prompt
- Re-generate the recent untrustworthy LLM output to have the Agent autonomously improve its response
async def stream_trustworthy_agent_response(agent, user_input):
input_items = [{"content": user_input, "role": "user"}]
print_items(input_items)
try:
stream = Runner.run_streamed(
agent,
input=input_items,
context=user_info,
run_config=RunConfig(
model_provider=TLMModelProvider(trust_threshold=0.9, score_tool_calls=True)
) # New: Wrap the model with TLM scoring
)
await print_stream_events(stream)
except TLMGuardrailTripwireTriggered:
print_items([{
"content": "Sorry, I cannot answer based on available information. Try rephrasing your question or providing more details.",
"role": "assistant"
}])
user_input = "What are the details of the 5 discount plan"
await stream_trustworthy_agent_response(agent, user_input)
In this case, the trust-based fallback prevented the Agent from hallucinating incorrect details about the “five” discount plan. The Agent now responds with a fallback message instead, which is far preferable to the incorrect response returned by the original Agent for this query.
Fallback Logic: Regenerate Untrustworthy LLM Outputs
Let’s demonstrate a different fallback strategy to have the Agent autonomously improve its responses. Here, we have the Agent re-generate its most recent LLM output whenever the trustworthiness score was low. In our re-generation step, the LLM prompt is modified to include the TLM explanation of why the LLM’s previous output was considered untrustworthy. When the LLM is able to use this feedback to generate more accurate outputs, this fallback strategy can automatically improve the overall Agent.
async def stream_trustworthy_agent_response(agent, user_input):
input_items = [{"content": user_input, "role": "user"}]
print_items(input_items)
while True:
try:
stream = Runner.run_streamed(
agent,
input=input_items,
context=user_info,
run_config=RunConfig(
model_provider=TLMModelProvider(trust_threshold=0.9, score_tool_calls=True)
) # New: Wrap the model with TLM scoring
)
await print_stream_events(stream)
except TLMGuardrailTripwireTriggered as e:
input_items = e.input_items
rewrite_prompt = (
f"Your previous response was flagged as untrustworthy with potential inaccuracies. "
f"Reason: {e.explanation}\n\n"
"# Instructions\n\n"
"Please provide a new response, ensuring it is accurate and trustworthy. "
"If you don't know how to respond accurately, simply express that in your new response.\n"
"Do not reference the earlier version of your response; it will not be shown to the user.\n"
)
last_item = e.response_items[-1]
new_input_items = [
*e.response_items,
*(
[{
"call_id": last_item.get("call_id"),
"output": "Tool call was flagged as untrustworthy, cancelled",
"type": "function_call_output"
}] if last_item.get("type") == "function_call" else []
),
{"content": rewrite_prompt, "role": "user"}
]
print_items(new_input_items)
input_items.extend(new_input_items)
continue
break
user_input = "What are the details of the 5 discount plan"
await stream_trustworthy_agent_response(agent, user_input)
In this case, the Agent now responds by acknowledging that it lacks specific details about the discount plan. This approach can offer a better experience for your Agent’s users.
Conclusion. Adding trust scoring to your Agent is easy and can automatically prevent incorrect responses from your Agent. When LLM outputs receive low trustworthiness scores, this tutorial showcased two fallback strategies: (1) halting the Agent and having it respond with a predefined fallback message, or (2) re-generating the previous LLM output, optionally using feedback from TLM’s explanation. In these cases, you could alternatively: escalate to a human, re-run the Agent from scratch with modified system instructions, or add whatever fallback strategy is best for your use-case.
Production-Ready Agents
For the Agents above, we printed internal messages and trustworthiness scores for educational purposes, but this is not how they would be used in real applications. In this section, we’ll show the minimal code needed to add trust scoring and handling in your production-ready Agent. Let’s start by reviewing the basic OpenAI Agents SDK code that retrieves the Agent’s final output.
Optional: Setup a Basic OpenAI Agent with a Single Tool Call
@function_tool
def get_discount_plan(name: str) -> str:
"""Returns details about a discount plan based on its name.
Valid names: "basic", "premium", "five", "student"
"""
return f"Returning details for the '{name}' discount plan:"
agent = Agent(name="Assistant",
model="gpt-4o-mini",
instructions="You are a customer support bot.",
tools=[get_discount_plan])
agent_response = ""
# Your basic OpenAI Agents SDK code to get the final output
async def get_agent_response(agent, user_input):
result = await Runner.run(agent, user_input)
return result.final_output
Let’s query this production-ready Agent.
user_input = "What are the details of the 5 discount plan"
agent_response = await get_agent_response(agent, user_input)
print(agent_response)
To integrate TLM trust scoring, simply add the TLMModelProvider
to your Agent run.
# Minimal changes for TLM integration
async def get_agent_response(agent, user_input):
result = await Runner.run(
agent,
input=user_input,
run_config=RunConfig(
model_provider=TLMModelProvider(trust_threshold=0.9, score_tool_calls=True)
) # New: Wrap the model with TLM scoring
)
return result.final_output
We are integrating TLM as a Guardrail, which, like other Guardrails in the OpenAI Agents SDK, raises an Exception when triggered. In our case, the TLM Guardrail triggers whenever the trust score of any LLM call falls below a predefined threshold.
try:
agent_response = await get_agent_response(agent, user_input)
except TLMGuardrailTripwireTriggered: # New: Handle TLM guardrail exception
    agent_response = "Sorry, I cannot answer based on available information. Try rephrasing your question or providing more details."
print(agent_response)
Other OpenAI Multi-Agent Design Patterns
Handoffs
Our trust scoring integration seamlessly supports any OpenAI Agent Pattern. For instance, consider OpenAI’s multi-agent customer support example, which involves agent handoffs. For more details on implementing handoffs, refer to the OpenAI Agents SDK documentation.
Optional: Setup OpenAI Agent (Multi-Agent Customer Support)
from __future__ import annotations as _annotations
import random
from pydantic import BaseModel
from agents import (
Agent,
RunContextWrapper,
Runner,
function_tool,
handoff,
)
from agents.extensions.handoff_prompt import RECOMMENDED_PROMPT_PREFIX
class AirlineAgentContext(BaseModel):
passenger_name: str | None = None
confirmation_number: str | None = None
seat_number: str | None = None
flight_number: str | None = None
@function_tool(name_override="faq_lookup_tool", description_override="Lookup frequently asked questions.")
async def faq_lookup_tool(question: str) -> str:
if "bag" in question or "baggage" in question:
return "You are allowed to bring one bag on the plane. It must be under 50 pounds and 22 inches x 14 inches x 9 inches."
elif "seats" in question or "plane" in question:
return "There are 120 seats on the plane. 22 business class, 98 economy. Exit rows: 4, 16. Economy Plus: 5–8."
elif "wifi" in question:
return "We have free wifi on the plane, join Airline-Wifi"
return "I'm sorry, I don't know the answer to that question."
@function_tool
async def update_seat(context: RunContextWrapper[AirlineAgentContext], confirmation_number: str, new_seat: str) -> str:
context.context.confirmation_number = confirmation_number
context.context.seat_number = new_seat
assert context.context.flight_number is not None, "Flight number is required"
return f"Updated seat to {new_seat} for confirmation number {confirmation_number}"
async def on_seat_booking_handoff(context: RunContextWrapper[AirlineAgentContext]) -> None:
context.context.flight_number = f"FLT-{random.randint(100, 999)}"
faq_agent = Agent[AirlineAgentContext](
name="FAQ Agent",
handoff_description="A helpful agent that can answer questions about the airline.",
instructions=f"""{RECOMMENDED_PROMPT_PREFIX}
You are an FAQ agent. If you are speaking to a customer, you probably were transferred to from the triage agent.
Use the following routine to support the customer:
1. Identify the customer's last question.
2. Use the FAQ tool to answer. Do NOT rely on your own knowledge.
3. If you can't answer, transfer back to the triage agent.""",
tools=[faq_lookup_tool],
)
seat_booking_agent = Agent[AirlineAgentContext](
name="Seat Booking Agent",
handoff_description="A helpful agent that can update a seat on a flight.",
instructions=f"""{RECOMMENDED_PROMPT_PREFIX}
You are a seat booking agent. Use the following routine:
1. Ask for confirmation number.
2. Ask for desired seat number.
3. Use the seat update tool.
If off-topic, transfer back to triage agent.""",
tools=[update_seat],
)
triage_agent = Agent[AirlineAgentContext](
name="Triage Agent",
handoff_description="Routes customer requests to appropriate agents.",
instructions=(
f"{RECOMMENDED_PROMPT_PREFIX} "
"You are a helpful triage agent. Use your tools to route requests."
),
handoffs=[
faq_agent,
handoff(agent=seat_booking_agent, on_handoff=on_seat_booking_handoff),
],
)
faq_agent.handoffs.append(triage_agent)
seat_booking_agent.handoffs.append(triage_agent)
async def get_agent_response(agent, user_input):
result = await Runner.run(agent,
[{"content": user_input, "role": "user"}],
context=AirlineAgentContext(),
run_config=RunConfig(model_provider=TLMModelProvider(trust_threshold=0.9))) # New: Wrap the model with TLM scoring
return result.final_output
user_input = "What are the baggage restrictions?"
try:
agent_response = await get_agent_response(triage_agent, user_input)
except TLMGuardrailTripwireTriggered: # New: Handle TLM guardrail exception
agent_response = "Sorry, I cannot answer based on available information. Try rephrasing your question or providing more details."
print(agent_response)
user_input = "Tell me about recent plane crashes?"
try:
agent_response = await get_agent_response(triage_agent, user_input)
except TLMGuardrailTripwireTriggered: # New: Handle TLM guardrail exception
agent_response = "Sorry, I cannot answer based on available information. Try rephrasing your question or providing more details."
print(agent_response)
Agent-as-a-tool
Alternatively, OpenAI recommends using your Agent as a tool, allowing Agents to call other Agents without handing off to them. You can learn more about this pattern from the official OpenAI documentation. To add trust scoring to your Agent-as-a-tool, implement a function tool that runs your Agent with our TLMModelProvider, as demonstrated below.
from agents import default_tool_error_function
def custom_tool_error_function(ctx, error) -> str:
if isinstance(error, TLMGuardrailTripwireTriggered):
return f"Agent-as-a-tool returned an untrustworthy response. Please try again. Error: {str(error)}"
return default_tool_error_function(ctx, error) # For all other exceptions, use the default behavior
@function_tool(failure_error_function=custom_tool_error_function)
async def run_discount_agent(input: str) -> str:
"""A tool that runs the agent with custom configs"""
discount_agent = Agent(
name="discount_agent",
model="gpt-4o-mini",
instructions=(
"You are a customer service agent specializing in discount inquiries. "
"Respond helpfully and clearly. "
"Valid discount plan names: 'basic', 'premium', 'five', 'student'. "
"Details for the 'student' discount plan: Students with a valid student ID receive a 15% discount on all full-priced items. "
"Details for the 'five' discount plan: "
),
handoff_description="Handles discount-related questions.",
)
result = await Runner.run(
discount_agent,
input=input,
max_turns=1,
run_config=RunConfig(model_provider=TLMModelProvider(trust_threshold=0.9, score_tool_calls=True))
)
return str(result.final_output)
customer_support_agent = Agent(
name="Customer Support Agent",
model="gpt-4o-mini",
instructions=(
"You are a customer service agent. Always use your tools to handle specific requests. "
"Never answer directly; always use your tools."
),
tools=[run_discount_agent],
)
async def get_agent_response(agent, user_input):
result = await Runner.run(
agent,
[{"content": user_input, "role": "user"}],
run_config=RunConfig(model_provider=TLMModelProvider(trust_threshold=0.9, score_tool_calls=True))
)
return result.final_output
user_input = "I am a student, can I get a discount?"
try:
agent_response = await get_agent_response(customer_support_agent, user_input)
except TLMGuardrailTripwireTriggered: # New: Handle TLM guardrail exception
agent_response = "Sorry, I cannot answer based on available information. Try rephrasing your question or providing more details."
print(agent_response)
user_input = "What are the details of the 5 discount plan"
try:
agent_response = await get_agent_response(customer_support_agent, user_input)
except TLMGuardrailTripwireTriggered: # New: Handle TLM guardrail exception
agent_response = "Sorry, I cannot answer based on available information. Try rephrasing your question or providing more details."
print(agent_response)
Under the hood, TLM scoring intercepted the hallucinated response from the Agent-as-a-tool, preventing it from propagating to the main Agent’s final output.