From d4d6d69099eb4f9511f0a2c250dd0a933269a9cf Mon Sep 17 00:00:00 2001 From: ddebowczyk Date: Sun, 29 Sep 2024 15:34:09 +0200 Subject: [PATCH] Fixes in JSON extraction --- README.md | 18 +- composer.json | 2 - examples/A02_Advanced/ContextCaching/run.php | 23 +- src/Core/RequestHandler.php | 6 + src/Core/StreamResponse/PartialsGenerator.php | 2 +- src/Extras/Debug/Debug.php | 1 + src/Extras/Http/Drivers/GuzzleHttpClient.php | 6 +- src/Extras/LLM/InferenceResponse.php | 8 +- src/Utils/Json/Json.php | 31 ++- .../{JsonParser.php => PartialJsonParser.php} | 2 +- src/Utils/Json/ResilientJsonParser.php | 202 ++++++++++++++++++ tests/Feature/Utils/JsonParserTest.php | 4 +- tests/Feature/Utils/PartialJsonTest.php | 4 +- 13 files changed, 273 insertions(+), 36 deletions(-) rename src/Utils/Json/{JsonParser.php => PartialJsonParser.php} (96%) create mode 100644 src/Utils/Json/ResilientJsonParser.php diff --git a/README.md b/README.md index 0d291496..656f2f75 100644 --- a/README.md +++ b/README.md @@ -904,9 +904,21 @@ To provide an essential functionality we needed here Instructor for PHP leverage Instructor for PHP is compatible with PHP 8.2 or later and, due to minimal dependencies, should work with any framework of your choice. - - [SaloonPHP](https://docs.saloon.dev/) - for handling communication with LLM API providers - - [Symfony components](https://symfony.com/) - for validation, serialization and other utilities - + - [Guzzle](https://docs.guzzlephp.org/) + - [Symfony components](https://symfony.com/) + * symfony/property-access + * symfony/property-info + * symfony/serializer + * symfony/type-info + * symfony/validator + - adbario/php-dot-notation + - phpdocumentor/reflection-docblock + - phpstan/phpdoc-parser + - vlucas/phpdotenv + +Additional dependencies are required for some extras: + - spatie/array-to-xml + - gioni06/gpt3-tokenizer ## TODOs diff --git a/composer.json b/composer.json index 7ea61ae1..e0d9d0d0 100644 --- a/composer.json +++ b/composer.json @@ -61,8 +61,6 @@ "ext-fileinfo": "*", "gioni06/gpt3-tokenizer": "^1.2", "guzzlehttp/guzzle": "^7.8", - "league/flysystem": "^3.0", - "nyholm/psr7": "^1.8", "phpdocumentor/reflection-docblock": "^5.4", "phpstan/phpdoc-parser": "^1.29", "psr/log": "^3.0", diff --git a/examples/A02_Advanced/ContextCaching/run.php b/examples/A02_Advanced/ContextCaching/run.php index 8c5b2c7f..1040201a 100644 --- a/examples/A02_Advanced/ContextCaching/run.php +++ b/examples/A02_Advanced/ContextCaching/run.php @@ -30,10 +30,11 @@ use Cognesy\Instructor\Enums\Mode; use Cognesy\Instructor\Instructor; use Cognesy\Instructor\Schema\Attributes\Description; -use Cognesy\Instructor\Utils\Env; +use Cognesy\Instructor\Utils\Str; class Project { public string $name; + public string $targetAudience; /** @var string[] */ #[Description('Technology platform and libraries used in the project')] public array $technologies; @@ -43,9 +44,9 @@ class Project { /** @var string[] */ #[Description('Applications and potential use cases of the project')] public array $applications; - #[Description('Explain the purpose of the project and the problems it solves')] + #[Description('Explain the purpose of the project and the domain specific problems it solves')] public string $description; - #[Description('Example code in Markdown demonstrating the application of the library')] + #[Description('Example code in Markdown demonstrating domain specific application of the library')] public string $code; } ?> @@ -59,9 +60,9 @@ class Project { $content = file_get_contents(__DIR__ . '/../../../README.md'); $cached = (new Instructor)->withConnection('anthropic')->cacheContext( - system: 'Your goal is to respond questions about the project described in the README.md file', + system: 'Your goal is to respond questions about the project described in the README.md file' + . "\n\n# README.md\n\n" . $content, prompt: 'Respond to the user with a description of the project with JSON using schema:\n<|json_schema|>', - input: "# README.md\n\n" . $content, ); ?> ``` @@ -73,12 +74,14 @@ class Project { ```php respond( - messages: 'Describe the project - my audience is P&C insurance CIOs', - mode: Mode::Json, + messages: 'Describe the project in a way compelling to my audience: P&C insurance CIOs.', responseModel: Project::class, options: ['max_tokens' => 4096], + mode: Mode::Json, ); dump($project); +assert($project instanceof Project); +assert(Str::contains($project->name, 'Instructor')); ?> ``` Now we can use the same context to ask the user to describe the project for a different @@ -90,11 +93,13 @@ class Project { ```php respond( - messages: 'Describe the project - my audience is boutique CMS consulting company owner', - mode: Mode::Json, + messages: "Describe the project in a way compelling to my audience: boutique CMS consulting company owner.", responseModel: Project::class, options: ['max_tokens' => 4096], + mode: Mode::Json, ); dump($project); +assert($project instanceof Project); +assert(Str::contains($project->name, 'Instructor')); ?> ``` diff --git a/src/Core/RequestHandler.php b/src/Core/RequestHandler.php index df71126d..163b5558 100644 --- a/src/Core/RequestHandler.php +++ b/src/Core/RequestHandler.php @@ -7,6 +7,7 @@ use Cognesy\Instructor\Contracts\CanHandleSyncRequest; use Cognesy\Instructor\Data\Request; use Cognesy\Instructor\Data\ResponseModel; +use Cognesy\Instructor\Enums\Mode; use Cognesy\Instructor\Events\EventDispatcher; use Cognesy\Instructor\Events\Instructor\ResponseGenerated; use Cognesy\Instructor\Events\Request\NewValidationRecoveryAttempt; @@ -17,6 +18,7 @@ use Cognesy\Instructor\Extras\LLM\Data\ApiResponse; use Cognesy\Instructor\Extras\LLM\Inference; use Cognesy\Instructor\Extras\LLM\InferenceResponse; +use Cognesy\Instructor\Utils\Json\Json; use Cognesy\Instructor\Utils\Result\Result; use Exception; use Generator; @@ -92,6 +94,10 @@ protected function getApiResponse(Request $request) : ApiResponse { try { $this->events->dispatch(new RequestSentToLLM($request)); $apiResponse = $this->makeInference($request)->toApiResponse(); + $apiResponse->content = match($request->mode()) { + Mode::Text => $apiResponse->content, + default => Json::find($apiResponse->content), + }; } catch (Exception $e) { $this->events->dispatch(new RequestToLLMFailed($request, $e->getMessage())); throw $e; diff --git a/src/Core/StreamResponse/PartialsGenerator.php b/src/Core/StreamResponse/PartialsGenerator.php index cd789277..a95650b3 100644 --- a/src/Core/StreamResponse/PartialsGenerator.php +++ b/src/Core/StreamResponse/PartialsGenerator.php @@ -146,7 +146,7 @@ protected function tryGetPartialObject( string $partialJsonData, ResponseModel $responseModel, ) : Result { - return Chain::from(fn() => Json::fix($partialJsonData)) + return Chain::from(fn() => Json::fix(Json::find($partialJsonData))) ->through(fn($jsonData) => $this->responseDeserializer->deserialize($jsonData, $responseModel, $this?->toolCalls->last()->name)) ->through(fn($object) => $this->responseTransformer->transform($object)) ->result(); diff --git a/src/Extras/Debug/Debug.php b/src/Extras/Debug/Debug.php index 8eb35e44..f4a7fa08 100644 --- a/src/Extras/Debug/Debug.php +++ b/src/Extras/Debug/Debug.php @@ -94,6 +94,7 @@ private static function printHeaders(array $headers) : void { } private static function printBody(string $body) : void { + /** @noinspection ForgottenDebugOutputInspection */ dump(json_decode($body)); } } \ No newline at end of file diff --git a/src/Extras/Http/Drivers/GuzzleHttpClient.php b/src/Extras/Http/Drivers/GuzzleHttpClient.php index 2f9fa39e..32fc2ffb 100644 --- a/src/Extras/Http/Drivers/GuzzleHttpClient.php +++ b/src/Extras/Http/Drivers/GuzzleHttpClient.php @@ -2,7 +2,6 @@ namespace Cognesy\Instructor\Extras\Http\Drivers; -use Cognesy\Instructor\Events\EventDispatcher; use Cognesy\Instructor\Extras\Debug\Debug; use Cognesy\Instructor\Extras\Http\Contracts\CanHandleHttp; use Cognesy\Instructor\Extras\Http\Data\HttpClientConfig; @@ -22,14 +21,13 @@ class GuzzleHttpClient implements CanHandleHttp public function __construct( protected HttpClientConfig $config, - protected ?EventDispatcher $events = null, + protected ?Client $httpClient = null, ) { - $this->events = $events ?? new EventDispatcher(); if (isset($this->httpClient) && Debug::isEnabled()) { throw new InvalidArgumentException("Guzzle does not allow to inject debugging stack into existing client. Turn off debug or use default client."); } $this->client = match(Debug::isEnabled()) { - false => new Client(), + false => $httpClient ?? new Client(), true => new Client(['handler' => $this->addDebugStack(HandlerStack::create())]), }; } diff --git a/src/Extras/LLM/InferenceResponse.php b/src/Extras/LLM/InferenceResponse.php index ca58f915..f0837d40 100644 --- a/src/Extras/LLM/InferenceResponse.php +++ b/src/Extras/LLM/InferenceResponse.php @@ -5,8 +5,6 @@ use Cognesy\Instructor\Events\ApiClient\ApiResponseReceived; use Cognesy\Instructor\Events\ApiClient\PartialApiResponseReceived; use Cognesy\Instructor\Events\EventDispatcher; -use Cognesy\Instructor\Events\Inference\InferenceResponseGenerated; -use Cognesy\Instructor\Events\Inference\PartialInferenceResponseGenerated; use Cognesy\Instructor\Extras\Http\StreamReader; use Cognesy\Instructor\Extras\LLM\Contracts\CanHandleInference; use Cognesy\Instructor\Extras\LLM\Data\ApiResponse; @@ -21,6 +19,7 @@ class InferenceResponse { protected EventDispatcher $events; protected StreamReader $streamReader; + protected string $responseContent = ''; public function __construct( protected ResponseInterface $response, @@ -101,7 +100,10 @@ public function psrStream() : StreamInterface { // INTERNAL ///////////////////////////////////////////////// protected function responseData() : array { - return Json::parse($this->response->getBody()->getContents()) ?? []; + if (empty($this->responseContent)) { + $this->responseContent = $this->response->getBody()->getContents(); + } + return Json::parse($this->responseContent); } /** diff --git a/src/Utils/Json/Json.php b/src/Utils/Json/Json.php index ac0eac02..8985aace 100644 --- a/src/Utils/Json/Json.php +++ b/src/Utils/Json/Json.php @@ -8,8 +8,7 @@ public static function find(string $text) : string { if (empty($text)) { return ''; } - $candidates = (new Json)->extractJSONStrings($text); - return empty($candidates) ? '' : $candidates[0]; + return (new Json)->tryExtractJson($text); } public static function findPartial(string $text) : string { @@ -25,7 +24,7 @@ public static function findPartial(string $text) : string { } public static function fix(string $text) : string { - return (new JsonParser)->fix($text); + return (new PartialJsonParser)->fix($text); } public static function parse(string $text, mixed $default = null) : mixed { @@ -34,7 +33,7 @@ public static function parse(string $text, mixed $default = null) : mixed { } public static function parsePartial(string $text, bool $associative = true) : mixed { - return (new JsonParser)->parse($text, $associative); + return (new PartialJsonParser)->parse($text, $associative); } public static function encode(mixed $json, int $options = 0) : string { @@ -43,16 +42,31 @@ public static function encode(mixed $json, int $options = 0) : string { // INTERNAL //////////////////////////////////////////////////////////////// + private function tryExtractJson(string $text) : string { + // approach 1 + $candidates = $this->extractJSONStrings($text); + $json = empty($candidates) ? '' : $candidates[0] ?? ''; + if (!empty($json)) { + return $json; + } + // approach 2 + $maybeJson = $this->naiveExtract($text); + $json = (new ResilientJsonParser($maybeJson))->parse(); + if (!empty($json)) { + return json_encode($json); + } + // failed to find JSON + return ''; + } + private function naiveExtract(string $text) : string { if (empty($text)) { return ''; } - $firstOpenBracket = strpos($text, '{'); - if ($firstOpenBracket === false) { + if (($firstOpenBracket = strpos($text, '{')) === false) { return ''; } - $lastCloseBracket = strrpos($text, '}'); - if ($lastCloseBracket === false) { + if (($lastCloseBracket = strrpos($text, '}')) === false) { return ''; } return substr($text, $firstOpenBracket, $lastCloseBracket - $firstOpenBracket + 1); @@ -95,7 +109,6 @@ private function extractJSONStrings(string $text): array { $currentCandidate .= $char; } } - return $this->validateJSONStrings($candidates); } diff --git a/src/Utils/Json/JsonParser.php b/src/Utils/Json/PartialJsonParser.php similarity index 96% rename from src/Utils/Json/JsonParser.php rename to src/Utils/Json/PartialJsonParser.php index 605229b0..fce8c283 100644 --- a/src/Utils/Json/JsonParser.php +++ b/src/Utils/Json/PartialJsonParser.php @@ -11,7 +11,7 @@ * Original source: https://github.com/greghunt/partial-json/ * License: MIT */ -class JsonParser +class PartialJsonParser { private $parsers = []; private string $lastParseReminding = ''; diff --git a/src/Utils/Json/ResilientJsonParser.php b/src/Utils/Json/ResilientJsonParser.php new file mode 100644 index 00000000..d8aa427a --- /dev/null +++ b/src/Utils/Json/ResilientJsonParser.php @@ -0,0 +1,202 @@ +input = $input; + $this->length = strlen($input); + } + + // PUBLIC ///////////////////////////////////////////////////////////////// + + public function parse(): mixed { + $this->skipWhitespace(); + return $this->parseValue(); + } + + // INTERNAL //////////////////////////////////////////////////////////////// + + private function parseValue(): mixed { + $char = $this->getCurrentChar(); + return match ($char) { + '{' => $this->parseObject(), + '[' => $this->parseArray(), + '"' => $this->parseString(), + 't' => $this->parseTrue(), + 'f' => $this->parseFalse(), + 'n' => $this->parseNull(), + default => $this->parseNumber(), + }; + } + + private function parseObject(): array { + $result = []; + $this->consume('{'); + $this->skipWhitespace(); + + while ($this->getCurrentChar() !== '}') { + $key = $this->parseString(); + $this->skipWhitespace(); + $this->consume(':'); + $this->skipWhitespace(); + $value = $this->parseValue(); + $result[$key] = $value; + + $this->skipWhitespace(); + if ($this->getCurrentChar() === ',') { + $this->consume(','); + $this->skipWhitespace(); + } + } + + $this->consume('}'); + return $result; + } + + private function parseArray(): array { + $result = []; + $this->consume('['); + $this->skipWhitespace(); + + while ($this->getCurrentChar() !== ']') { + $value = $this->parseValue(); + $result[] = $value; + + $this->skipWhitespace(); + if ($this->getCurrentChar() === ',') { + $this->consume(','); + $this->skipWhitespace(); + } + } + + $this->consume(']'); + return $result; + } + +// private function parseString(): string { +// $result = ''; +// $this->consume('"'); +// +// while (true) { +// $char = $this->getCurrentChar(); +// if ($char === '"' && $this->getPreviousChar() !== '\\') { +// break; +// } +// if ($char === "\n" || $char === "\r") { +// $result .= '\n'; +// $this->position++; +// } elseif ($char === '\\') { +// $result .= $char . $this->getNextChar(); +// $this->position += 2; +// } else { +// $result .= $char; +// $this->position++; +// } +// } +// +// $this->consume('"'); +// return $result; +// } + + private function parseString(): string + { + $result = ''; + $this->consume('"'); + + while (true) { + $char = $this->getCurrentChar(); + if ($char === '`' && $this->getNextChar() === '`' && $this->getNextNextChar() === '`') { + $this->inCodeBlock = !$this->inCodeBlock; + $result .= '```'; + $this->position += 3; + continue; + } + if ($char === '"' && $this->getPreviousChar() !== '\\' && !$this->inCodeBlock) { + break; + } + if ($char === "\n" || $char === "\r") { + $result .= '\n'; + $this->position++; + } elseif ($char === '\\') { + $result .= $char . $this->getNextChar(); + $this->position += 2; + } else { + $result .= $char; + $this->position++; + } + } + + $this->consume('"'); + return $result; + } + + private function parseNumber(): float|int { + $start = $this->position; + while (preg_match('/[\d.+-e]/i', $this->getCurrentChar())) { + $this->position++; + } + $numberString = substr($this->input, $start, $this->position - $start); + return is_numeric($numberString) + ? $this->toNumber($numberString) + : 0; + } + + private function parseTrue(): bool { + $this->consume('true'); + return true; + } + + private function parseFalse(): bool { + $this->consume('false'); + return false; + } + + private function parseNull(): ?string { + $this->consume('null'); + return null; + } + + private function skipWhitespace(): void { + while ($this->position < $this->length && ctype_space($this->getCurrentChar())) { + $this->position++; + } + } + + private function consume(string $expected): void { + $length = strlen($expected); + if (substr($this->input, $this->position, $length) !== $expected) { + throw new \RuntimeException("Expected '$expected' at position {$this->position}"); + } + $this->position += $length; + } + + private function getCurrentChar(): string { + return $this->position < $this->length ? $this->input[$this->position] : ''; + } + + private function getNextChar(): string { + return $this->position + 1 < $this->length ? $this->input[$this->position + 1] : ''; + } + + private function getPreviousChar(): string { + return $this->position > 0 ? $this->input[$this->position - 1] : ''; + } + + private function getNextNextChar(): string + { + return $this->position + 2 < $this->length ? $this->input[$this->position + 2] : ''; + } + + private function toNumber(float|int|string $numberString) : float|int { + return strpos($numberString, '.') !== false + ? (float) $numberString + : (int) $numberString; + } +} \ No newline at end of file diff --git a/tests/Feature/Utils/JsonParserTest.php b/tests/Feature/Utils/JsonParserTest.php index 20e478cd..524bc388 100644 --- a/tests/Feature/Utils/JsonParserTest.php +++ b/tests/Feature/Utils/JsonParserTest.php @@ -1,11 +1,11 @@ parse($json, true); + $parsed = (new PartialJsonParser())->parse($json, true); expect($parsed)->toBe($result); })->with([ [ diff --git a/tests/Feature/Utils/PartialJsonTest.php b/tests/Feature/Utils/PartialJsonTest.php index c5b6ec22..b2b49f37 100644 --- a/tests/Feature/Utils/PartialJsonTest.php +++ b/tests/Feature/Utils/PartialJsonTest.php @@ -1,6 +1,6 @@ ['field-a'=>'str-1', 'field-b'=>1, 'field-c'=>['str-2']], '{"field-a":"str-1", "field-b":1, "field-c":["str-2", 2, true]}' => ['field-a'=>'str-1', 'field-b'=>1, 'field-c'=>['str-2', 2, true]], ]; - $parser = new JsonParser(); + $parser = new PartialJsonParser(); foreach ($examples as $src => $dest) { $partial = $parser->parse($src);