Pydantic Evals

https://ai.pydantic.dev/evals/

Ed Yang was recently recommending keeping your own benchmark of LLM evals, so you can test newer models on problems they have struggled with in the past. I have recommended similar things to people, but there is a barrier to entry in knowing where to start. Ed references (and forks) Nicholas Carlini's personal benchmark repo, but it's nice to have some light(ish)-weight options too.

Pydantic Evals is a powerful evaluation framework designed to help you systematically test and evaluate the performance and accuracy of the systems you build, especially when working with LLMs.

You can install the library with uv or pip:

uv add pydantic-evals
pip install pydantic-evals
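The basic shape is small: a Case bundles inputs with evaluators, a Dataset groups the cases, and you point it at the task function you want to score. Here is a minimal sketch of that shape, using the built-in Contains evaluator and a canned answer function that just stands in for a real model call (both purely for illustration):

from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import Contains


# Stand-in for the system under test: a real task would call a model here.
async def answer(question: str) -> str:
    return "The capital of France is Paris."


dataset = Dataset(
    cases=[
        Case(
            name="capital",
            inputs="What is the capital of France?",
            evaluators=[Contains(value="Paris")],
        )
    ]
)

report = dataset.evaluate_sync(answer)
report.print(include_input=True, include_output=True)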

I tried it out with a strawberry test, calling OpenRouter with different models. I needed a custom evaluator, as the built-in Contains is a bit rigid, but the approach seems nice!

import os
import asyncio
from dataclasses import dataclass
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import Evaluator, EvaluationReason, EvaluatorContext
from pydantic_evals.evaluators.common import _truncated_repr
from openai import OpenAI
from typing import Any, Optional


@dataclass
class FlexibleContains(Evaluator[object, object, object]):
    """
    Check if the output contains any one of the expected options.
    """
    value: Any
    case_sensitive: bool = False
    def evaluate(
        self, ctx: EvaluatorContext[object, object, object]
    ) -> EvaluationReason:
        failure_reason: Optional[str] = None
        # Normalize value into a list of options if it isn't already a list or tuple.
        options = self.value if isinstance(self.value, (list, tuple)) else [self.value]
        output_str = str(ctx.output)
        if not self.case_sensitive:
            output_str = output_str.lower()
        match_found = False
        for opt in options:
            opt_str = str(opt)
            if not self.case_sensitive:
                opt_str = opt_str.lower()
            if opt_str in output_str:
                match_found = True
                break
        if not match_found:
            failure_reason = (
                f"Output string {_truncated_repr(output_str, max_length=100)} does not contain "
                f"any of expected strings: {[str(opt) for opt in options]}"
            )
        return EvaluationReason(value=match_found, reason=failure_reason)


strawberry = Case(
    name="strawberry",
    inputs="How many rs are in strawberry?",
    evaluators=[FlexibleContains(value=["3", "three"])],
    metadata={"difficulty": "easy"},
)

dataset = Dataset(cases=[strawberry])

MODELS = [
    "anthropic/claude-3.5-sonnet",
    "openai/gpt-4o",
    "meta-llama/llama-4-maverick:free",
    "meta-llama/llama-4-scout:free",
    "openrouter/optimus-alpha",  # secret model!
]


def generate_completion(inputs: str, model: str) -> str:
    """Generate a completion using OpenRouter with specified model"""
    client = OpenAI(
        base_url="https://openrouter.ai/api/v1",
        api_key=os.getenv("OPENROUTER_API_KEY"),
    )
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful AI assistant."},
            {"role": "user", "content": inputs},
        ],
        max_tokens=50,
        temperature=0.7,
    )
    return response.choices[0].message.content.strip()


def evaluate_models():
    """Run evaluations across multiple models"""
    for model in MODELS:
        print(f"\nResults for model: {model}")
        print("=" * 50)
        # Wrap the synchronous generate_completion in an async function:
        async def model_specific_generate(inputs: str) -> str:
            loop = asyncio.get_running_loop()
            return await loop.run_in_executor(None, generate_completion, inputs, model)
        # Run evaluation for this model
        report = dataset.evaluate_sync(model_specific_generate)
        # Print results for this model
        report.print(include_input=True, include_output=True, include_durations=False)


def main():
    evaluate_models()


if __name__ == "__main__":
    main()
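To run it you just need OPENROUTER_API_KEY set in the environment; each model in MODELS then gets its own little summary table.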

Here's a trimmed version of the output:

Results for model: openrouter/optimus-alpha
==================================================
                                      Evaluation Summary: model_specific_generate
┏━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┓
┃ Case ID    ┃ Inputs                         ┃ Outputs                                                   ┃ Assertions ┃
┡━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━┩
│ strawberry │ How many rs are in strawberry? │ The word **"strawberry"** contains **three** letter "r"s. │ ✔          │
├────────────┼────────────────────────────────┼───────────────────────────────────────────────────────────┼────────────┤
│ Averages   │                                │                                                           │ 100.0% ✔   │
└────────────┴────────────────────────────────┴───────────────────────────────────────────────────────────┴────────────┘
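One nice extra for the "keep your own benchmark" idea: the docs describe saving and reloading datasets as YAML (or JSON) via to_file / from_file, so the cases can live in a file you keep adding to whenever a model trips up. A sketch along those lines, with an arbitrary filename; the custom_evaluator_types parameter name follows the docs' saving/loading example, so worth double-checking against the current release:

# Save the cases to YAML so future models can be run against the same benchmark.
dataset.to_file("personal_benchmark.yaml")

# Reload later (e.g. when a new model comes out); custom evaluator classes are
# passed in so they can be resolved by name, then evaluate as before.
reloaded = Dataset[str, str, dict].from_file(
    "personal_benchmark.yaml",
    custom_evaluator_types=[FlexibleContains],
)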
