inspect_ai.log

Eval Log Files

list_eval_logs

List all eval logs in a directory.

def list_eval_logs(
    log_dir: str = os.environ.get("INSPECT_LOG_DIR", "./logs"),
    formats: list[Literal["eval", "json"]] | None = None,
    filter: Callable[[EvalLog], bool] | None = None,
    recursive: bool = True,
    descending: bool = True,
    fs_options: dict[str, Any] = {},
) -> list[EvalLogInfo]
log_dir str

Log directory (defaults to INSPECT_LOG_DIR)

formats list[Literal['eval', 'json']] | None

Formats to list (defaults to listing all formats)

filter Callable[[EvalLog], bool] | None

Filter to limit logs returned. Note that the EvalLog instance passed to the filter has only the EvalLog header (i.e. does not have the samples or logging output).

recursive bool

List log files recursively (defaults to True).

descending bool

List in descending order.

fs_options dict[str, Any]

Optional. Additional arguments to pass through to the filesystem provider (e.g. S3FileSystem).
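
For example, a minimal sketch of listing only successful logs (the log directory is illustrative; the filter sees only the EvalLog header, not samples or logging output):

from inspect_ai.log import list_eval_logs

# list logs under ./logs (the default INSPECT_LOG_DIR) whose header
# reports a successful run
success_logs = list_eval_logs(
    "./logs",
    filter=lambda log: log.status == "success",
)
for info in success_logs:
    print(info.name, info.task, info.task_id)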

write_eval_log

Write an evaluation log.

def write_eval_log(
    log: EvalLog,
    location: str | FileInfo | None = None,
    format: Literal["eval", "json", "auto"] = "auto",
) -> None
log EvalLog

Evaluation log to write.

location str | FileInfo | None

Location to write log to.

format Literal['eval', 'json', 'auto']

Write to format (defaults to ‘auto’ based on log_file extension)
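
A minimal sketch of re-writing an existing log in the binary eval format (the file paths are illustrative):

from inspect_ai.log import read_eval_log, write_eval_log

# read an existing JSON log and write it back out as .eval
# (format is inferred from the extension when left as "auto")
log = read_eval_log("./logs/2025-01-01_task_id123.json")
write_eval_log(log, "./logs/2025-01-01_task_id123.eval")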

read_eval_log

Read an evaluation log.

def read_eval_log(
    log_file: str | EvalLogInfo,
    header_only: bool = False,
    resolve_attachments: bool = False,
    format: Literal["eval", "json", "auto"] = "auto",
) -> EvalLog
log_file str | EvalLogInfo

Log file to read.

header_only bool

Read only the header (i.e. exclude the “samples” and “logging” fields). Defaults to False.

resolve_attachments bool

Resolve attachments (e.g. images) to their full content.

format Literal['eval', 'json', 'auto']

Read from format (defaults to ‘auto’ based on log_file extension)
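
A minimal sketch of a header-only read, which excludes samples and is therefore fast for large logs (the path is illustrative):

from inspect_ai.log import read_eval_log

# read only the header: samples and logging output are excluded
header = read_eval_log("./logs/task.eval", header_only=True)
print(header.status, header.eval.model)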

read_eval_log_sample

Read a sample from an evaluation log.

def read_eval_log_sample(
    log_file: str | EvalLogInfo,
    id: int | str,
    epoch: int = 1,
    resolve_attachments: bool = False,
    format: Literal["eval", "json", "auto"] = "auto",
) -> EvalSample
log_file str | EvalLogInfo

Log file to read.

id int | str

Sample id to read.

epoch int

Epoch for sample id (defaults to 1)

resolve_attachments bool

Resolve attachments (e.g. images) to their full content.

format Literal['eval', 'json', 'auto']

Read from format (defaults to ‘auto’ based on log_file extension)
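
A minimal sketch of reading a single sample by id (the path and id are illustrative):

from inspect_ai.log import read_eval_log_sample

# read sample id 1 from the first epoch
sample = read_eval_log_sample("./logs/task.eval", id=1, epoch=1)
print(sample.target, sample.scores)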

read_eval_log_samples

Read all samples from an evaluation log incrementally.

Generator for samples in a log file. Only one sample at a time will be read into memory and yielded to the caller.

def read_eval_log_samples(
    log_file: str | EvalLogInfo,
    all_samples_required: bool = True,
    resolve_attachments: bool = False,
    format: Literal["eval", "json", "auto"] = "auto",
) -> Generator[EvalSample, None, None]
log_file str | EvalLogInfo

Log file to read.

all_samples_required bool

All samples must be included in the file or an IndexError is raised.

resolve_attachments bool

Resolve attachments (e.g. images) to their full content.

format Literal['eval', 'json', 'auto']

Read from format (defaults to ‘auto’ based on log_file extension)
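
A minimal sketch of streaming samples from a completed log (the path is illustrative; pass all_samples_required=False to iterate over a log that is still incomplete):

from inspect_ai.log import read_eval_log_samples

# samples are yielded one at a time, keeping memory usage bounded
for sample in read_eval_log_samples("./logs/task.eval"):
    print(sample.id, sample.epoch)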

convert_eval_logs

Convert between log file formats.

Convert log file(s) to a target format. If a file is already in the target format it will just be copied to the output dir.

def convert_eval_logs(
    path: str, to: Literal["eval", "json"], output_dir: str, overwrite: bool = False
) -> None
path str

Path to source log file(s). Should be either a single log file or a directory containing log files.

to Literal['eval', 'json']

Format to convert to. If a file is already in the target format it will just be copied to the output dir.

output_dir str

Output directory to write converted log file(s) to.

overwrite bool

Overwrite existing log files (defaults to False, raising an error if the output file path already exists).
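
A minimal sketch of converting a directory of logs (the directories are illustrative):

from inspect_ai.log import convert_eval_logs

# convert every log under ./logs to the .eval format;
# files already in .eval format are copied through unchanged
convert_eval_logs("./logs", to="eval", output_dir="./logs-eval", overwrite=True)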

bundle_log_dir

Bundle a log_dir into a statically deployable viewer

def bundle_log_dir(
    log_dir: str | None = None,
    output_dir: str | None = None,
    overwrite: bool = False,
    fs_options: dict[str, Any] = {},
) -> None
log_dir str | None

The log_dir to bundle.

output_dir str | None

The directory to place bundled output. If no directory is specified, the INSPECT_VIEW_BUNDLE_OUTPUT_DIR environment variable will be used.

overwrite bool

Optional. Whether to overwrite files in the output directory. Defaults to False.

fs_options dict[str, Any]

Optional. Additional arguments to pass through to the filesystem provider (e.g. S3FileSystem).
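
A minimal sketch of bundling a log directory for static hosting (the directories are illustrative):

from inspect_ai.log import bundle_log_dir

# produce a self-contained, statically deployable viewer in ./logs-www
bundle_log_dir(log_dir="./logs", output_dir="./logs-www", overwrite=True)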

write_log_dir_manifest

Write a manifest for a log directory.

A log directory manifest is a dictionary of EvalLog headers (EvalLog w/o samples) keyed by log file names (names are relative to the log directory)

def write_log_dir_manifest(
    log_dir: str,
    *,
    filename: str = "logs.json",
    output_dir: str | None = None,
    fs_options: dict[str, Any] = {},
) -> None
log_dir str

Log directory to write manifest for.

filename str

Manifest filename (defaults to “logs.json”)

output_dir str | None

Output directory for manifest (defaults to log_dir)

fs_options dict[str, Any]

Optional. Additional arguments to pass through to the filesystem provider (e.g. S3FileSystem).
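
A minimal sketch of writing a manifest for a log directory (the directory is illustrative):

from inspect_ai.log import write_log_dir_manifest

# writes logs.json (EvalLog headers keyed by relative file name) into ./logs
write_log_dir_manifest("./logs")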

retryable_eval_logs

Extract the list of retryable logs from a list of logs.

Retryable logs are logs with status “error” or “cancelled” that do not have a corresponding log with status “success” (indicating they were subsequently retried and completed)

def retryable_eval_logs(logs: list[EvalLogInfo]) -> list[EvalLogInfo]
logs list[EvalLogInfo]

List of logs to examine.
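
A minimal sketch of finding logs that are candidates for a retry (the log directory is illustrative):

from inspect_ai.log import list_eval_logs, retryable_eval_logs

# error/cancelled logs with no subsequent successful run
logs = list_eval_logs("./logs")
for info in retryable_eval_logs(logs):
    print("retry candidate:", info.name)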

EvalLogInfo

File info and task identifiers for eval log.

class EvalLogInfo(BaseModel)

Attributes

name str

Name of file.

type str

Type of file (file or directory)

size int

File size in bytes.

mtime float | None

File modification time (None if the file is a directory on S3).

task str

Task name.

task_id str

Task id.

suffix str | None

Log file suffix (e.g. “-scored”)

Eval Log API

EvalLog

Evaluation log.

class EvalLog(BaseModel)

Attributes

version int

Eval log file format version.

status Literal['started', 'success', 'cancelled', 'error']

Status of evaluation (did it succeed or fail).

eval EvalSpec

Eval identity and configuration.

plan EvalPlan

Eval plan (solvers and config)

results EvalResults | None

Eval results (scores and metrics).

stats EvalStats

Eval stats (runtime, model usage)

error EvalError | None

Error that halted eval (if status==“error”)

samples list[EvalSample] | None

Samples processed by eval.

reductions list[EvalSampleReductions] | None

Reduced sample values

location str

Location that the log file was read from.
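
A minimal sketch of walking the scores and metrics recorded in a log (the path is illustrative; results is None for logs that did not complete scoring):

from inspect_ai.log import read_eval_log

log = read_eval_log("./logs/task.eval", header_only=True)
if log.status == "success" and log.results:
    for score in log.results.scores:
        for name, metric in score.metrics.items():
            print(score.scorer, name, metric.value)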

EvalSpec

Eval target and configuration.

class EvalSpec(BaseModel)

Attributes

run_id str

Unique run id

created str

Time created.

task str

Task name.

task_id str

Unique task id.

task_version int

Task version.

task_file str | None

Task source file.

task_attribs dict[str, Any]

Attributes of the @task decorator.

task_args dict[str, Any]

Arguments used for invoking the task.

solver str | None

Solver name.

solver_args dict[str, Any] | None

Arguments used for invoking the solver.

tags list[str] | None

Tags associated with evaluation run.

dataset EvalDataset

Dataset used for eval.

sandbox SandboxEnvironmentSpec | None

Sandbox environment type and optional config file.

model str

Model used for eval.

model_base_url str | None

Optional override of model base url

model_args dict[str, Any]

Model specific arguments.

config EvalConfig

Configuration values for eval.

revision EvalRevision | None

Source revision of eval.

packages dict[str, str]

Package versions for eval.

metadata dict[str, Any] | None

Additional eval metadata.

scorers list[EvalScorer] | None

Scorers and args for this eval

metrics list[EvalMetricDefinition] | dict[str, list[EvalMetricDefinition]] | None

Metrics and args for this eval.

EvalDataset

Dataset used for evaluation.

class EvalDataset(BaseModel)

Attributes

name str | None

Dataset name.

location str | None

Dataset location (file path or remote URL)

samples int | None

Number of samples in the dataset.

sample_ids list[int | str] | None

IDs of samples in the dataset.

shuffled bool | None

Was the dataset shuffled after reading.

EvalConfig

Configuration used for evaluation.

class EvalConfig(BaseModel)

Attributes

limit int | tuple[int, int] | None

Sample limit (number of samples or range of samples).

sample_id str | int | list[str | int] | None

Evaluate specific sample(s).

epochs int | None

Number of epochs to run samples over.

epochs_reducer list[str] | None

Reducers for aggregating per-sample scores.

approval ApprovalPolicyConfig | None

Approval policy for tool use.

fail_on_error bool | float | None

Fail eval when sample errors occur.

True to fail on the first sample error (default); False to never fail on sample errors; a value between 0 and 1 to fail if that proportion of total samples errors; a value greater than 1 to fail the eval if that many samples error.

message_limit int | None

Maximum messages to allow in a chat conversation.

token_limit int | None

Maximum tokens to allow in a chat conversation.

time_limit int | None

Maximum seconds for chat conversation.

max_samples int | None

Maximum number of samples to run in parallel.

max_tasks int | None

Maximum number of tasks to run in parallel.

max_subprocesses int | None

Maximum number of subprocesses to run concurrently.

max_sandboxes int | None

Maximum number of sandboxes to run concurrently.

sandbox_cleanup bool | None

Cleanup sandbox environments after task completes.

log_samples bool | None

Log detailed information on each sample.

log_images bool | None

Log base64 encoded versions of images.

log_buffer int | None

Number of samples to buffer before writing log file.

score_display bool | None

Display scoring metrics in realtime.

EvalRevision

Git revision for evaluation.

class EvalRevision(BaseModel)

Attributes

type Literal['git']

Type of revision (currently only “git”)

origin str

Revision origin server

commit str

Revision commit.

EvalPlan

Plan (solvers) used in evaluation.

class EvalPlan(BaseModel)

Attributes

name str

Plan name.

steps list[EvalPlanStep]

Steps in plan.

finish EvalPlanStep | None

Step to always run at the end.

config GenerateConfig

Generation config.

EvalPlanStep

Solver step.

class EvalPlanStep(BaseModel)

Attributes

solver str

Name of solver.

params dict[str, Any]

Parameters used to instantiate solver.

EvalResults

Scoring results from evaluation.

class EvalResults(BaseModel)

Attributes

total_samples int

Total samples in eval (dataset samples * epochs)

completed_samples int

Samples completed without error.

Will be equal to total_samples except when --fail-on-error is enabled.

scores list[EvalScore]

Scorers used to compute results

metadata dict[str, Any] | None

Additional results metadata.

sample_reductions list[EvalSampleReductions] | None

List of per sample scores reduced across epochs

EvalScore

Score for evaluation task.

class EvalScore(BaseModel)

Attributes

name str

Score name.

scorer str

Scorer name.

reducer str | None

Reducer name.

params dict[str, Any]

Parameters specified when creating scorer.

metrics dict[str, EvalMetric]

Metrics computed for this scorer.

metadata dict[str, Any] | None

Additional scorer metadata.

EvalMetric

Metric for evaluation score.

class EvalMetric(BaseModel)

Attributes

name str

Metric name.

value int | float

Metric value.

params dict[str, Any]

Params specified when creating metric.

metadata dict[str, Any] | None

Additional metadata associated with metric.

EvalSampleReductions

Score reductions.

class EvalSampleReductions(BaseModel)

Attributes

scorer str

Name of the scorer.

reducer str | None

Name of the reducer.

samples list[EvalSampleScore]

List of reduced scores

EvalStats

Timing and usage statistics.

class EvalStats(BaseModel)

Attributes

started_at str

Evaluation start time.

completed_at str

Evaluation completion time.

model_usage dict[str, ModelUsage]

Model token usage for evaluation.

EvalError

Eval error details.

class EvalError(BaseModel)

Attributes

message str

Error message.

traceback str

Error traceback.

traceback_ansi str

Error traceback with ANSI color codes.

EvalSample

Sample from evaluation task.

class EvalSample(BaseModel)

Attributes

id int | str

Unique id for sample.

epoch int

Epoch number for sample.

input str | list[ChatMessage]

Sample input.

choices list[str] | None

Sample choices.

target str | list[str]

Sample target value(s)

sandbox SandboxEnvironmentSpec | None

Sandbox environment type and optional config file.

files list[str] | None

Files that go along with the sample (copied to SandboxEnvironment)

setup str | None

Setup script to run for sample (run within default SandboxEnvironment).

messages list[ChatMessage]

Chat conversation history for sample.

output ModelOutput

Model output from sample.

scores dict[str, Score] | None

Scores for sample.

metadata dict[str, Any]

Additional sample metadata.

store dict[str, Any]

State at end of sample execution.

events list[Event]

Events that occurred during sample execution.

model_usage dict[str, ModelUsage]

Model token usage for sample.

error EvalError | None

Error that halted sample.

attachments dict[str, str]

Attachments referenced from messages and events.

Resolve attachments for a sample (replacing attachment://* references with attachment content) by passing resolve_attachments=True to log reading functions.

limit EvalSampleLimit | None

The limit that halted the sample

Methods

metadata_as

Pydantic model interface to metadata.

def metadata_as(self, metadata_cls: Type[MT]) -> MT
metadata_cls Type[MT]

Pydantic model type

store_as

Pydantic model interface to the store.

def store_as(self, model_cls: Type[SMT]) -> SMT
model_cls Type[SMT]

Pydantic model type (must derive from StoreModel)
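
A minimal sketch of metadata_as with a hypothetical Pydantic schema (the TaskMetadata model and log path are illustrative; store_as works the same way but requires a model derived from StoreModel):

from pydantic import BaseModel
from inspect_ai.log import read_eval_log

class TaskMetadata(BaseModel):
    # illustrative field; your metadata schema will differ
    difficulty: str = "unknown"

log = read_eval_log("./logs/task.eval")
if log.samples:
    meta = log.samples[0].metadata_as(TaskMetadata)
    print(meta.difficulty)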

EvalSampleLimit

Limit encountered by sample.

class EvalSampleLimit(BaseModel)

Attributes

type Literal['context', 'time', 'message', 'token', 'operator', 'custom']

The type of limit

limit int

The limit value

EvalSampleScore

Score and sample_id scored.

class EvalSampleScore(Score)

Attributes

sample_id str | int | None

Sample ID.

Transcript API

transcript

Get the current Transcript.

def transcript() -> Transcript

Transcript

Transcript of events.

class Transcript

Methods

info

Add an InfoEvent to the transcript.

def info(self, data: JsonValue, *, source: str | None = None) -> None
data JsonValue

Data associated with the event.

source str | None

Optional event source.

step

Context manager for recording StepEvent.

@contextlib.contextmanager
def step(self, name: str, type: str | None = None) -> Iterator[None]
name str

Step name.

type str | None

Optional step type.
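
A minimal sketch of recording custom transcript events from within a running solver, tool, or scorer (the step name, type, and data are illustrative):

from inspect_ai.log import transcript

# group related events under a named step and attach custom info
with transcript().step("postprocess", type="cleanup"):
    transcript().info({"rows_dropped": 3}, source="my_solver")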

Event

Event in a transcript.

Event: TypeAlias = Union[
    SampleInitEvent
    | SampleLimitEvent
    | StateEvent
    | StoreEvent
    | ModelEvent
    | ToolEvent
    | ApprovalEvent
    | InputEvent
    | ScoreEvent
    | ErrorEvent
    | LoggerEvent
    | InfoEvent
    | StepEvent
    | SubtaskEvent,
]

SampleInitEvent

Beginning of processing a Sample.

class SampleInitEvent(BaseEvent)

Attributes

event Literal['sample_init']

Event type.

sample Sample

Sample.

state JsonValue

Initial state.

SampleLimitEvent

The sample was unable to finish processing due to a limit

class SampleLimitEvent(BaseEvent)

Attributes

event Literal['sample_limit']

Event type.

type Literal['message', 'time', 'token', 'operator', 'custom']

Type of limit that halted processing

message str

A message associated with this limit

limit int | None

The limit value (if any)

StateEvent

Change to the current TaskState

class StateEvent(BaseEvent)

Attributes

event Literal['state']

Event type.

changes list[JsonChange]

List of changes to the TaskState

StoreEvent

Change to data within the current Store.

class StoreEvent(BaseEvent)

Attributes

event Literal['store']

Event type.

changes list[JsonChange]

List of changes to the Store.

ModelEvent

Call to a language model.

class ModelEvent(BaseEvent)

Attributes

event Literal['model']

Event type.

model str

Model name.

input list[ChatMessage]

Model input (list of messages).

tools list[ToolInfo]

Tools available to the model.

tool_choice ToolChoice

Directive to the model which tools to prefer.

config GenerateConfig

Generate config used for call to model.

output ModelOutput

Output from model.

error str | None

Error which occurred during model call.

cache Literal['read', 'write'] | None

Was this a cache read or write.

call ModelCall | None

Raw call made to model API.

ToolEvent

Call to a tool.

class ToolEvent(BaseEvent)

Attributes

event Literal['tool']

Event type.

type Literal['function']

Type of tool call (currently only ‘function’)

id str

Unique identifier for tool call.

function str

Function called.

arguments dict[str, JsonValue]

Arguments to function.

view ToolCallContent | None

Custom view of tool call input.

result ToolResult

Function return value.

truncated tuple[int, int] | None

Bytes truncated (from,to) if truncation occurred

error ToolCallError | None

Error that occurred during tool call.

events list[Event]

Transcript of events for tool.

cancelled bool

Was the task cancelled?

ApprovalEvent

Tool approval.

class ApprovalEvent(BaseEvent)

Attributes

event Literal['approval']

Event type

message str

Message generated by model along with tool call.

call ToolCall

Tool call being approved.

view ToolCallView | None

View presented for approval.

approver str

Approver name.

decision Literal['approve', 'modify', 'reject', 'escalate', 'terminate']

Decision of approver.

modified ToolCall | None

Modified tool call for decision ‘modify’.

explanation str | None

Explanation for decision.

InputEvent

Input screen interaction.

class InputEvent(BaseEvent)

Attributes

event Literal['input']

Event type.

input str

Input interaction (plain text).

input_ansi str

Input interaction (ANSI).

ErrorEvent

Event with sample error.

class ErrorEvent(BaseEvent)

Attributes

event Literal['error']

Event type.

error EvalError

Sample error

LoggerEvent

Log message recorded with Python logger.

class LoggerEvent(BaseEvent)

Attributes

event Literal['logger']

Event type.

message LoggingMessage

Logging message

LoggingLevel

Logging level.

LoggingLevel = Literal[
    "debug", "trace", "http", "sandbox", "info", "warning", "error", "critical"
]

LoggingMessage

Message written to Python log.

class LoggingMessage(BaseModel)

Attributes

name str | None

Logger name (e.g. ‘httpx’)

level LoggingLevel

Logging level.

message str

Log message.

created float

Message created time.

filename str

Logged from filename.

module str

Logged from module.

lineno int

Logged from line number.

InfoEvent

Event with custom info/data.

class InfoEvent(BaseEvent)

Attributes

event Literal['info']

Event type.

source str | None

Optional source for info event.

data JsonValue

Data provided with event.

StepEvent

Step within current sample or subtask.

class StepEvent(BaseEvent)

Attributes

event Literal['step']

Event type.

action Literal['begin', 'end']

Designates beginning or end of event.

type str | None

Optional ‘type’ field for events

name str

Event name.

SubtaskEvent

Subtask spawned.

class SubtaskEvent(BaseEvent)

Attributes

event Literal['subtask']

Event type.

name str

Name of subtask function.

type str | None

Type of subtask

input dict[str, Any]

Subtask function inputs.

result Any

Subtask function result.

events list[Event]

Transcript of events for subtask.