Skip to content

Commit

Permalink
Evals - cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
ddebowczyk committed Oct 23, 2024
1 parent a57863c commit d19cc4e
Show file tree
Hide file tree
Showing 24 changed files with 314 additions and 121 deletions.
10 changes: 5 additions & 5 deletions evals/ComplexExtraction/ProjectsEval.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
use Cognesy\Instructor\Extras\Evals\Contracts\CanObserveExecution;
use Cognesy\Instructor\Extras\Evals\Execution;
use Cognesy\Instructor\Extras\Evals\Observation;
use Cognesy\Instructor\Extras\Sequence\Sequence;

class ProjectsEval implements CanObserveExecution
{
Expand All @@ -20,16 +19,17 @@ public function __construct(array $expectations) {
*/
public function observe(Execution $execution): Observation {
$expectedEvents = $this->expectations['events'];
/** @var Sequence $events */
/** @var ProjectEvents $events */
$events = $execution->get('response')?->value();
$result = ($expectedEvents - count($events->list)) / $expectedEvents;
$result = ($expectedEvents - count($events->events)) / $expectedEvents;
return Observation::make(
type: 'metric',
key: 'execution.percentFound',
key: 'execution.fractionFound',
value: $result,
metadata: [
'executionId' => $execution->id(),
'unit' => 'percentage',
'unit' => 'fraction',
'format' => '%.2f',
],
);
}
Expand Down
22 changes: 16 additions & 6 deletions evals/ComplexExtraction/run.php
Original file line number Diff line number Diff line change
@@ -1,20 +1,23 @@
<?php

use Cognesy\Evals\ComplexExtraction\ProjectsEval;
use Cognesy\Evals\ComplexExtraction\ProjectEvents;
use Cognesy\Instructor\Enums\Mode;
use Cognesy\Instructor\Extras\Evals\Aggregators\AggregateExperimentObservation;
use Cognesy\Instructor\Extras\Evals\Enums\NumberAggregationMethod;
use Cognesy\Instructor\Extras\Evals\Evaluators\ArrayMatchEval;
use Cognesy\Instructor\Extras\Evals\Executors\Data\InferenceCases;
use Cognesy\Instructor\Extras\Evals\Executors\Data\InstructorData;
use Cognesy\Instructor\Extras\Evals\Executors\RunInstructor;
use Cognesy\Instructor\Extras\Evals\Experiment;
use Cognesy\Instructor\Extras\Sequence\Sequence;
use Cognesy\Instructor\Utils\Debug\Debug;

// Load the autoloader from vendor/ and additionally map the Cognesy\Instructor\
// prefix onto the src/ directory two levels above this script.
// __DIR__ never ends with a directory separator, so the relative part must
// start with '/'; without it the path becomes ".../ComplexExtraction../../src/".
// NOTE(review): $loader is presumably Composer's ClassLoader - confirm.
$loader = require 'vendor/autoload.php';
$loader->add('Cognesy\\Instructor\\', __DIR__ . '/../../src/');

$data = new InstructorData(
responseModel: Sequence::of(ProjectEvent::class),
responseModel: ProjectEvents::class,
maxTokens: 4096,
prompt: 'Extract a list of project events with all the details from the provided input in JSON format using schema: <|json_schema|>',
input: file_get_contents(__DIR__ . '/report.txt'),
examples: require 'examples.php',
Expand All @@ -24,7 +27,7 @@
cases: InferenceCases::only(
connections: ['openai', 'anthropic', 'gemini', 'cohere'],
modes: [Mode::Tools],
stream: [true, false]
stream: [false]
),
executor: new RunInstructor($data),
processors: [
Expand All @@ -34,10 +37,17 @@
],
postprocessors: [
new AggregateExperimentObservation(
name: 'reliability',
observationKey: 'execution.percentFound',
name: 'experiment.mean_completeness',
observationKey: 'execution.fractionFound',
params: ['unit' => 'fraction', 'format' => '%.2f'],
method: NumberAggregationMethod::Mean,
)
),
new AggregateExperimentObservation(
name: 'experiment.latency_p95',
observationKey: 'execution.timeElapsed',
params: ['percentile' => 95, 'unit' => 'seconds'],
method: NumberAggregationMethod::Percentile,
),
],
);

Expand Down
25 changes: 17 additions & 8 deletions evals/LLMModes/CompanyEval.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,31 @@
namespace Cognesy\Evals\LLMModes;

use Cognesy\Instructor\Enums\Mode;
use Cognesy\Instructor\Extras\Evals\Contracts\CanObserveExecution;
use Cognesy\Instructor\Extras\Evals\Contracts\CanProvideExecutionObservations;
use Cognesy\Instructor\Extras\Evals\Execution;
use Cognesy\Instructor\Extras\Evals\Observation;
use Cognesy\Instructor\Utils\Str;

class CompanyEval implements CanObserveExecution
class CompanyEval implements CanProvideExecutionObservations
{
private string $key;
private array $expectations;

public function __construct(array $expectations) {
public function __construct(
string $key,
array $expectations
) {
$this->key = $key;
$this->expectations = $expectations;
}

public function observe(Execution $execution): Observation {
public function observations(Execution $subject): iterable {
yield $this->correctness($subject);
}

// INTERNAL /////////////////////////////////////////////////

public function correctness(Execution $execution): Observation {
$mode = $execution->get('case.mode');
$isCorrect = match ($mode) {
Mode::Text => $this->validateText($execution),
Expand All @@ -25,7 +36,7 @@ public function observe(Execution $execution): Observation {
};
return Observation::make(
type: 'metric',
key: 'execution.is_correct',
key: $this->key,
value: $isCorrect ? 1 : 0,
metadata: [
'executionId' => $execution->id(),
Expand All @@ -34,10 +45,8 @@ public function observe(Execution $execution): Observation {
);
}

// INTERNAL /////////////////////////////////////////////////

private function validateToolsData(Execution $execution) : bool {
$data = $execution->get('response')->toolsData[0];
$data = $execution->get('response')->toolsData[0] ?? [];
return 'store_company' === ($data['name'] ?? '')
&& 'ACME' === ($data['arguments']['name'] ?? '')
&& 2020 === (int) ($data['arguments']['year'] ?? 0);
Expand Down
21 changes: 13 additions & 8 deletions evals/LLMModes/run.php
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@
use Cognesy\Instructor\Enums\Mode;
use Cognesy\Instructor\Extras\Evals\Aggregators\AggregateExperimentObservation;
use Cognesy\Instructor\Extras\Evals\Enums\NumberAggregationMethod;
use Cognesy\Instructor\Extras\Evals\Evaluators\ArrayMatchEval;
use Cognesy\Instructor\Extras\Evals\Experiment;
use Cognesy\Instructor\Extras\Evals\Executors\Data\InferenceCases;
use Cognesy\Instructor\Extras\Evals\Executors\Data\InferenceData;
use Cognesy\Instructor\Extras\Evals\Executors\Data\InferenceSchema;
use Cognesy\Instructor\Extras\Evals\Executors\RunInference;
use Cognesy\Instructor\Utils\Debug\Debug;

$data = new InferenceData(
messages: [
Expand Down Expand Up @@ -44,22 +46,25 @@
);

$experiment = new Experiment(
cases: InferenceCases::only(
connections: ['openai'],
modes: [Mode::Tools, Mode::Text],
stream: [false],
cases: InferenceCases::except(
connections: [],
modes: [Mode::Json, Mode::JsonSchema, Mode::Text, Mode::MdJson],
stream: [true],
),
executor: new RunInference($data),
processors: [
new CompanyEval(expectations: [
'name' => 'ACME',
'year' => 2020
]),
new CompanyEval(
key: 'execution.is_correct',
expectations: [
'name' => 'ACME',
'year' => 2020
]),
],
postprocessors: [
new AggregateExperimentObservation(
name: 'experiment.reliability',
observationKey: 'execution.is_correct',
params: ['unit' => 'fraction', 'format' => '%.2f'],
method: NumberAggregationMethod::Mean,
),
]
Expand Down
10 changes: 6 additions & 4 deletions evals/SimpleExtraction/CompanyEval.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,11 @@

class CompanyEval implements CanObserveExecution
{
private string $key;
private array $expectations;

public function __construct(array $expectations) {
public function __construct(string $key, array $expectations) {
$this->key = $key;
$this->expectations = $expectations;
}

Expand All @@ -21,11 +23,11 @@ public function observe(Execution $execution): Observation {

return Observation::make(
type: 'metric',
key: 'execution.is_correct',
value: $isCorrect,
key: $this->key,
value: $isCorrect ? 1 : 0,
metadata: [
'executionId' => $execution->id(),
'data' => $company->toArray(),
'data' => json_encode($company),
],
);
}
Expand Down
39 changes: 31 additions & 8 deletions evals/SimpleExtraction/run.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@
use Cognesy\Instructor\Enums\Mode;
use Cognesy\Instructor\Extras\Evals\Aggregators\AggregateExperimentObservation;
use Cognesy\Instructor\Extras\Evals\Enums\NumberAggregationMethod;
use Cognesy\Instructor\Extras\Evals\Evaluators\ArrayMatchEval;
use Cognesy\Instructor\Extras\Evals\Experiment;
use Cognesy\Instructor\Extras\Evals\Executors\Data\InferenceCases;
use Cognesy\Instructor\Extras\Evals\Executors\Data\InstructorData;
use Cognesy\Instructor\Extras\Evals\Executors\RunInstructor;
use Cognesy\Instructor\Utils\Debug\Debug;

// Load the autoloader from vendor/ and additionally map the Cognesy\Instructor\
// prefix onto the src/ directory two levels above this script.
// __DIR__ never ends with a directory separator, so the relative part must
// start with '/'; without it the path becomes ".../SimpleExtraction../../src/".
// NOTE(review): $loader is presumably Composer's ClassLoader - confirm.
$loader = require 'vendor/autoload.php';
$loader->add('Cognesy\\Instructor\\', __DIR__ . '/../../src/');
Expand All @@ -22,29 +24,50 @@
responseModel: Company::class,
);

//Debug::enable();

$experiment = new Experiment(
cases: InferenceCases::only(
connections: ['openai', 'anthropic'],
modes: [Mode::Tools],
stream: [false]
cases: InferenceCases::except(
connections: ['ollama'],
modes: [Mode::JsonSchema, Mode::Text],
stream: [true]
),
executor: new RunInstructor($data),
processors: [
new CompanyEval(expectations: [
new CompanyEval(
key: 'execution.is_correct',
expectations: [
'name' => 'ACME',
'year' => 2020
]),
new ArrayMatchEval(expected: [
'name' => 'ACME',
'year' => 2020
'year' => 2020,
]),
],
postprocessors: [
new AggregateExperimentObservation(
name: 'experiment.reliability',
observationKey: 'execution.is_correct',
params: ['unit' => 'fraction', 'format' => '%.2f'],
method: NumberAggregationMethod::Mean,
),
new AggregateExperimentObservation(
name: 'experiment.mean_precision',
observationKey: 'execution.precision',
params: ['unit' => 'fraction', 'format' => '%.2f'],
method: NumberAggregationMethod::Mean,
),
new AggregateExperimentObservation(
name: 'experiment.mean_recall',
observationKey: 'execution.recall',
params: ['unit' => 'fraction', 'format' => '%.2f'],
method: NumberAggregationMethod::Mean,
),
new AggregateExperimentObservation(
name: 'latency',
name: 'experiment.latency_p95',
observationKey: 'execution.timeElapsed',
params: ['percentile' => 95],
params: ['percentile' => 95, 'unit' => 'seconds'],
method: NumberAggregationMethod::Percentile,
),
],
Expand Down
2 changes: 1 addition & 1 deletion examples/A04_APISupport/LLMSupportCohere/run.php
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ class User {

// Get Instructor with specified LLM client connection
// See: /config/llm.php to check or change LLM client connection configuration details
$instructor = (new Instructor)->withConnection('cohere1');
$instructor = (new Instructor)->withConnection('cohere2');

$user = $instructor->respond(
messages: "Jason (@jxnlco) is 25 years old and is the admin of this project. He likes playing football and reading books.",
Expand Down
1 change: 1 addition & 0 deletions examples/A04_APISupport/LLMSupportTogetherAI/run.php
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@

use Cognesy\Instructor\Enums\Mode;
use Cognesy\Instructor\Instructor;
use Cognesy\Instructor\Utils\Debug\Debug;

enum UserType : string {
case Guest = 'guest';
Expand Down
6 changes: 3 additions & 3 deletions src-hub/Views/RunnerView.php
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,9 @@ public function stats(int $correct, int $incorrect, int $total) : void {
public function renderOutput(array $errors, float $timeElapsed) : void {
Cli::grid([[1, ">", STR_PAD_RIGHT, Color::DARK_GRAY]]);
if (!empty($errors)) {
Cli::grid([[7, " ERROR", STR_PAD_RIGHT, [Color::WHITE, Color::BG_RED]]]);
Cli::grid([[8, "ERROR", STR_PAD_BOTH, [Color::WHITE, Color::BG_RED]]]);
} else {
Cli::grid([[7, "OK ", STR_PAD_LEFT, [Color::WHITE, Color::BG_GREEN]]]);
Cli::grid([[8, "OK", STR_PAD_BOTH, [Color::WHITE, Color::BG_GREEN]]]);
}
$this->printTimeElapsed($timeElapsed);
Cli::outln();
Expand Down Expand Up @@ -82,7 +82,7 @@ public function displayErrors(array $errors, bool $displayErrors) : void {
}

public function printTimeElapsed(float $totalTime) {
Cli::out(" (", [Color::DARK_GRAY, Color::BG_BLACK]);
Cli::out("(", [Color::DARK_GRAY, Color::BG_BLACK]);
Cli::grid([[10, (round($totalTime, 2) . " sec"), STR_PAD_LEFT, Color::DARK_GRAY]]);
Cli::out(")", [Color::DARK_GRAY]);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ private function calculate(Experiment $experiment) : float|int {
$experiment->executionObservations(),
])->withKey($this->observationKey)->get();

if (empty($observations)) {
throw new InvalidArgumentException("No observations found for key: {$this->observationKey}");
}

$values = array_map(
callback: fn($observation) => $observation->toFloat(),
array: $observations,
Expand Down
Loading

0 comments on commit d19cc4e

Please sign in to comment.