from langsmith import evaluate, wrappers
from langsmith.schemas import Run, Example
from openai import AsyncOpenAI
# Assumes you've installed pydantic.
from pydantic import BaseModel

# We can still pass in Run and Example objects if we'd like
def correct_old_signature(run: Run, example: Example) -> dict:
    """Check if the answer exactly matches the expected answer."""
    return {"key": "correct", "score": run.outputs["answer"] == example.outputs["answer"]}

# Just evaluate actual outputs
def concision(outputs: dict) -> int:
    """Score how concise the answer is. 1 is the most concise, 5 is the least concise."""
    return min(len(outputs["answer"]) // 1000, 4) + 1
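# e.g. a 250-character answer scores 1 (most concise); anything 4,000 characters or longer caps at 5.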

# Use an LLM-as-a-judge
oai_client = wrappers.wrap_openai(AsyncOpenAI())

async def valid_reasoning(inputs: dict, outputs: dict) -> bool:
    """Use an LLM to judge if the reasoning and the answer are consistent."""
    instructions = """\
Given the following question, answer, and reasoning, determine if the reasoning for the \
answer is logically valid and consistent with the question and the answer."""

    class Response(BaseModel):
        reasoning_is_valid: bool

    msg = f"Question: {inputs['question']}\nAnswer: {outputs['answer']}\nReasoning: {outputs['reasoning']}"
    response = await oai_client.beta.chat.completions.parse(
        model="gpt-4o-mini",
        messages=[{"role": "system", "content": instructions}, {"role": "user", "content": msg}],
        response_format=Response,
    )
    return response.choices[0].message.parsed.reasoning_is_valid
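
# Quick local smoke test of the judge, if you want one (hypothetical example inputs,
# not tied to any dataset; makes a real OpenAI call):
# import asyncio
# asyncio.run(valid_reasoning(
#     {"question": "What is 2 + 2?"},
#     {"answer": "4", "reasoning": "2 + 2 equals 4"},
# ))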

# The target function being evaluated; here just a stub that returns fixed outputs.
def dummy_app(inputs: dict) -> dict:
    return {"answer": "hmm i'm not sure", "reasoning": "i didn't understand the question"}

results = evaluate(
    dummy_app,
    data="dataset_name",
    evaluators=[correct_old_signature, concision, valid_reasoning],
)
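
# To inspect per-example scores locally (assumes a recent langsmith version where
# ExperimentResults exposes to_pandas(), and that pandas is installed):
# df = results.to_pandas()
# print(df.head())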