diff --git a/.gitignore b/.gitignore index 696c39e8..9ec8e77c 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,4 @@ experimental/ .env vendor php_errors.log -NOTES.md +composer.lock diff --git a/docs/img/concept.png b/docs/img/concept.png new file mode 100644 index 00000000..c3d11e89 Binary files /dev/null and b/docs/img/concept.png differ diff --git a/docs/index.md b/docs/index.md index 0c4fcc86..3939d069 100644 --- a/docs/index.md +++ b/docs/index.md @@ -12,6 +12,7 @@ Instructor is a library that allows you to extract structured, validated data fr Instructor for PHP is inspired by the [Instructor](https://jxnl.github.io/instructor/) library for Python created by [Jason Liu](https://twitter.com/jxnlco). +![image](./img/concept.png) ## Instructor in Other Languages diff --git a/notes/NOTES.md b/notes/NOTES.md new file mode 100644 index 00000000..525782e3 --- /dev/null +++ b/notes/NOTES.md @@ -0,0 +1,26 @@ +# NOTES + + +## Public vs private/protected fields + +Document and write tests around the behavior of public vs private/protected fields. + + +## Research + +- Queue-based load leveling +- Throttling +- Circuit breaker +- Producer-consumer / queue-worker +- Rate limiting +- Retry on service failure +- Backpressure +- Batch stage chain +- Request aggregator +- Rolling poller window +- Sparse task scheduler +- Marker and sweeper +- Actor model + + + diff --git a/notes/api/00_API.md b/notes/api/00_API.md new file mode 100644 index 00000000..adf7bb5a --- /dev/null +++ b/notes/api/00_API.md @@ -0,0 +1,3 @@ +# API design + +> This is an exploration of the ideas for Instructor API. Those examples are not final, most are not working and the concepts are subject to change. 
diff --git a/notes/api/async.md b/notes/api/async.md new file mode 100644 index 00000000..c3d01d3a --- /dev/null +++ b/notes/api/async.md @@ -0,0 +1,42 @@ +# Async + +## No streaming + +```php +$instructor = new Instructor(); +$async = $instructor->request( + messages: "Jason is 35 years old", + responseModel: Task::class, + onDone: function (Task $task) { + // Completed model + $this->saveTask($task); + }, + onError: function (Exception $e) { + // Handle error + }, +)->async(); +// continue execution +``` + +## With streaming / partials + +```php +$instructor = new Instructor(); +$async = $instructor->request( + messages: "Jason is 35 years old", + responseModel: Task::class, + onEachUpdate: function (Task $task) { + // Partially updated model + $this->updateTask($task); + }, + onDone: function (Task $task) { + // Completed model + $this->saveTask($task); + }, + onError: function (Exception $e) { + // Handle error + }, +)->async(); +// continue execution +``` + diff --git a/notes/api/inference.md b/notes/api/inference.md new file mode 100644 index 00000000..53c408f9 --- /dev/null +++ b/notes/api/inference.md @@ -0,0 +1,86 @@ +# Inference + +Get the task. + +```php +$instructor = new Instructor(); +$task = $instructor->respond( + messages: "Jason is 35 years old", + responseModel: Task::class, +); + +$this->updateView($task); +``` +or + +```php +$instructor = new Instructor(); +$task = $instructor->request( + messages: "Jason is 35 years old", + responseModel: Task::class, +)->get(); + +$this->updateView($task); +``` +or + +```php +$instructor = new Instructor(); +$task = $instructor->withRequest(new Request( + messages: "Jason is 35 years old", + responseModel: Task::class, + partials: true +))->get(); +``` + +Get partial updates of task. 
+ +```php +$instructor = new Instructor(); +$stream = $instructor->request( + messages: "Jason is 35 years old", + responseModel: Task::class, +)->stream(); + +foreach($stream->partial as $taskUpdate) { + // Partially updated model + $this->updateView($taskUpdate); + // Complete model is null until done + // $stream->complete == null +} +// Only now $stream->complete is set & validated +if($stream->complete) { + $task = $stream->complete; +} +``` + +Get the list of tasks, one by one. + +```php +$instructor = new Instructor(); +$stream = $instructor->request( + messages: "Jason is 35 years old", + responseModel: Sequence::of(Task::class), +)->get(); + +foreach($stream as $taskUpdate) { + // Partially updated model + $this->updateView($taskUpdate); +} +``` + +Get the list of tasks, one by one, with partial updates. + +```php +$instructor = new Instructor(); +$stream = $instructor->request( + messages: "Jason is 35 years old", + responseModel: Sequence::of(Task::class), + partials: true +)->stream(); + +foreach($stream as $taskUpdate) { + // Partially updated model + $this->updateView($taskUpdate); +} +``` diff --git a/notes/api/iterables.md b/notes/api/iterables.md new file mode 100644 index 00000000..ca8914a6 --- /dev/null +++ b/notes/api/iterables.md @@ -0,0 +1,44 @@ +# Iterable results + + +## Separate endpoint which returns Iterable + +Client iterates over it and receives partial updates until iterator is exhausted. +If the model implements iterable, it can be used to return partial updates. 
+ +```php +$instructor = new Instructor(); +$taskUpdates = $instructor->respond( + messages: "Notify Jason about the upcoming meeting on Thursday at 10:00 AM", + responseModel: Task::class, + stream: true +); +foreach($taskUpdates as $partial) { + // Partially updated model + $this->updateView($partial); +} +// do something with task +TaskStore::save($partial); +``` + + + +## Separate, optional callback parameter + +Client receives partially updated model via callback, while `response()` will still return complete answer when done. + +```php +$instructor = new Instructor(); +$task = $instructor->respond( + messages: "Jason is 35 years old", + responseModel: Task::class, + onEachUpdate: function (Task $partial) { + // Partially updated model + $this->updateView($partial); + }, + stream: true +); +// do something with task +TaskStore::save($task); +``` + diff --git a/notes/done/00_DONE.md b/notes/done/00_DONE.md new file mode 100644 index 00000000..6e5b88f8 --- /dev/null +++ b/notes/done/00_DONE.md @@ -0,0 +1,3 @@ +# DONE + +> Those are ideas that have been implemented or problems that have been solved. diff --git a/notes/done/custom-schema.md b/notes/done/custom-schema.md new file mode 100644 index 00000000..8990c7f6 --- /dev/null +++ b/notes/done/custom-schema.md @@ -0,0 +1,31 @@ +## Custom schema generation - not based on class reflection & PHPDoc + +### Problem and ideas + +Model classes could implement HasSchemaProvider interface, which would allow for custom schema generation - rendering logic would skip reflection and use the provided schema instead. + +SchemaProvider could be a trait, which would allow for easy implementation. 
+ +Example SchemaProvider: +class SchemaProvider { +public function schema(): Schema { +return new Schema([ +'type' => 'object', +'properties' => [ +'id' => ['type' => 'integer', 'description' => 'Description'], +'name' => ['type' => 'string', 'description' => 'Description'], +], +'required' => ['id', 'name'], +]); +} +} + +### Solution + +If model implements CanProvideSchema interface it can fully customize schema generation. + +It usually requires to also implement custom deserialization logic via CanDeserializeJson interface, so you can control how LLM response JSON is turned into data (and fed into model fields). + +You may also need to implement CanTransformResponse to control what you ultimately send back to the caller (e.g. you can return completely different data than the input model). + +This is used for the implementation of Scalar class, which is a universal adapter for scalar values. diff --git a/notes/done/custom-validation.md b/notes/done/custom-validation.md new file mode 100644 index 00000000..45ab7404 --- /dev/null +++ b/notes/done/custom-validation.md @@ -0,0 +1,13 @@ +# Validation + + +### Problem and ideas + +What about validation in such case? we can already have ```validate()``` method in the schema, +Is it enough? + + +## Solution + +Validation can be also customized by implementing CanSelfValidate interface. It allows you to fully control how the data is validated. At the moment it skips built in Symfony Validator logic, so you have to deal with Symfony validation constraints manually. + diff --git a/notes/done/observability.md b/notes/done/observability.md new file mode 100644 index 00000000..db26a086 --- /dev/null +++ b/notes/done/observability.md @@ -0,0 +1,26 @@ +# Observability + + +## Problem and ideas + +> Priority: must have + +Requirements and solution - to be analyzed + +- How to track regular vs streamed responses? Streamed responses are unreadable / meaningless individually. Higher abstraction layer is needed to handle them - eg. 
"folder" with individual chunks of data. Completion ID allows to track incoming chunks under a single context. +- Completion, if streamed, needs extra info on whether it has been completed or disrupted for any reason. + + +## Solution + +You can: +- wiretap() to get stream of all internal events +- connect to specific events via onEvent() + +This allows you plug in your preferred logging / monitoring system. + +- Performance - timestamps are available on events, which allows you to record performance of either full flow or individual steps. +- Errors - can be done via onError() +- Validation errors - can be done via onEvent() +- Generated data models - can be done via onEvent() + diff --git a/notes/done/partial-updates.md b/notes/done/partial-updates.md new file mode 100644 index 00000000..829bdd50 --- /dev/null +++ b/notes/done/partial-updates.md @@ -0,0 +1,83 @@ +## Partial updates + +> Priority: should have + +If callback is on, we should be able to provide partial updates to the object + send +notifications about the changes. + +To achieve this I need a way to generate a skeleton JSON, send it back to the client and then send changes or new versions of the whole object back to the client. + +Question: How to make partial updates and streaming / iterables compatible? + +### Using events + +Library currently dispatches events on every chunk received from LLM in streaming mode and on every partial update of the response model. + +Questions: +1. How does the client receive partially updated data model? What's the API? Do we want separate endpoint for regular `response()` method vs partial / streamed one? +2. How do we distinguish between partial updates and collection streaming (getting a stream of instances of the same model)? +3. Can the streamed collections models be partially updated? +4. Is there a need for a separate event on property completed, not just updated? 
+ + +### IDEA: Denormalization of model structure + +It may make sense to denormalize the model - instead of nested structure, split it into a series of individual objects with references. Then generate them in a sequence individually (while providing object context). To be tested if this would result in better or worse inference quality, which is ultimately the most important thing. + +Splitting into objects would also allow for partial updates. + +Further - splitting objects to properties and generating them individually would make streaming partial updates easier. + +To be tested: maybe it could work for less capable models with no function calling. + +##### Model now + +Conceptually, the model is a tree of objects, which is generated in a single pass. + +``` +Issues[] { + Issue { + title: string + description: string + type: IssueType { + value: [technical, commercial, collaboration, other] + } + related_quotes: Quote[] { + Quote { + text: string + source: string + date: ?date + } + } + } +} +``` + +##### Flattened model + +The alternative is treating the model as a series of items - each item is a property of an object, following prescribed structure. + +``` +issues.issue[0].title +issues.issue[0].description +issues.issue[0].type +issues.issue[0].related_quotes +issues.issue[0].related_quotes.quote[0].text +issues.issue[0].related_quotes.quote[0].source +issues.issue[0].related_quotes.quote[0].date +issues.issue[0].related_quotes.quote[1].text +issues.issue[0].related_quotes.quote[1].source +issues.issue[0].related_quotes.quote[1].date +... +issues.issue[1].title +issues.issue[1].description +issues.issue[1].type +issues.issue[1].related_quotes +issues.issue[1].related_quotes.quote[2].text +issues.issue[1].related_quotes.quote[2].source +issues.issue[1].related_quotes.quote[2].date +issues.issue[1].related_quotes.quote[3].text +issues.issue[1].related_quotes.quote[3].source +issues.issue[1].related_quotes.quote[3].date +... 
+``` diff --git a/notes/done/scalar-values.md b/notes/done/scalar-values.md new file mode 100644 index 00000000..49a3f072 --- /dev/null +++ b/notes/done/scalar-values.md @@ -0,0 +1,21 @@ +# Support scalar types as response_model + + +## Problem and ideas + +Have universal scalar value adapter with HasSchemaProvider interface +HasSchemaProvider = schema() : Schema, which, if present, will be used to generate schema +Instead of the default schema generation mechanism +This will allow for custom schema generation + + +## Solution + +Ultimately the implemented solution has much nicer DX: + +```php +$isAdult = (new Instructor)->respond( + messages: "Jason is 35 years old", + responseModel: Scalar::bool('isAdult') +); +``` diff --git a/notes/ideas/00_IDEAS.md b/notes/ideas/00_IDEAS.md new file mode 100644 index 00000000..cb4a86b9 --- /dev/null +++ b/notes/ideas/00_IDEAS.md @@ -0,0 +1,3 @@ +# Ideas + +> This is an exploration of gaps, ideas for new features, capabilities, problems and potential future changes in Instructor. Those ideas are not final and the concepts are subject to change. diff --git a/notes/ideas/async.md b/notes/ideas/async.md new file mode 100644 index 00000000..9d21b20d --- /dev/null +++ b/notes/ideas/async.md @@ -0,0 +1,3 @@ +# Async / parallel processing + +Identify capabilities of the engine that could be parallelized, so we can speed up processing of the results, esp. for large data sets. diff --git a/notes/ideas/attributes.md b/notes/ideas/attributes.md new file mode 100644 index 00000000..da9deccc --- /dev/null +++ b/notes/ideas/attributes.md @@ -0,0 +1,18 @@ +# Attributes + +Use PHP attributes to define the model's class / field metadata. It has been done previously, but reverted due to Symfony validation use. 
+ +```php +use Cognesy\Instructor\Attributes\Description; +use Cognesy\Instructor\Attributes\Examples; + +class User { + #[Description("User's name")] + public string $name; + #[Description("User's age")] + public int $age; + #[Description("User's role")] + #[Examples("admin", "user", "guest")] + public string $role; +} +``` diff --git a/notes/ideas/caching-schema.md b/notes/ideas/caching-schema.md new file mode 100644 index 00000000..4e0c1df8 --- /dev/null +++ b/notes/ideas/caching-schema.md @@ -0,0 +1,5 @@ +# Caching schema + +It may not be worth it purely for performance reasons, but it would be useful for debugging or schema optimization (DSPy like). + +Schema could be saved in version controlled, versioned JSON files and loaded from there. In development mode it would be read from JSON file, unless class file is newer than schema file. diff --git a/notes/ideas/cli-tool.md b/notes/ideas/cli-tool.md new file mode 100644 index 00000000..b7f69de6 --- /dev/null +++ b/notes/ideas/cli-tool.md @@ -0,0 +1,19 @@ +# CLI + +> Priority: nice to have + +## Simple example + +```cli +instruct --messages "Jason is 35 years old" --respond-with UserDetails --response-format yaml +``` +It will search for UserDetails.php (PHP class) or UserDetails.json (JSONSchema) in current dir. +We should be able to provide a path to class code / schema definitions directory. +Default response format is JSON, we can render it to YAML (or other supported formats). 
+ + +## Scalar example + +```cli +instruct --messages "Jason is 35 years old" --respond-with Scalar::bool('isAdult') +``` diff --git a/notes/ideas/common-datatypes.md b/notes/ideas/common-datatypes.md new file mode 100644 index 00000000..172e9b18 --- /dev/null +++ b/notes/ideas/common-datatypes.md @@ -0,0 +1,13 @@ +# Handling useful, common data types + +Currently, there is no special treatment for common data types, such as: + +- Date +- Time +- DateTime +- Period +- Duration +- Money +- Currency + +There are no tests around those types of data, nor support for parsing that Pydantic has. diff --git a/notes/ideas/deserialization.md b/notes/ideas/deserialization.md new file mode 100644 index 00000000..2b4fd94b --- /dev/null +++ b/notes/ideas/deserialization.md @@ -0,0 +1,12 @@ +# Better control over deserialization + +> Priority: must have + +We need custom deserializer or easier way of customizing existing one. +Specific need is #[Description] attribute, which should be used to generate description. + +Another reason is that we need to handle custom types, such as Money, Date, etc. Some of them may not be supported by Symfony Serializer out of the box. (COMMENT: this can be achieved by writing custom Symfony deserializers). + +Need to document how to write and plug in custom field / object deserializer into Instructor. + +Custom deserialization strategy is also needed for partial updates, maybe for streaming too. diff --git a/notes/ideas/examples-for-llm.md b/notes/ideas/examples-for-llm.md new file mode 100644 index 00000000..48f6760b --- /dev/null +++ b/notes/ideas/examples-for-llm.md @@ -0,0 +1,11 @@ +# Examples for LLM + +We need a way to inject examples in a more structured way than as a text in PHPDocs. + +- It mixes instructions with examples. +- It's not easy to extract examples from PHPDocs and manage them separately (e.g. 
using larger external source of examples) +- PHPDocs cannot be easily manipulated - it's not easy to inject / replace examples in PHPDocs. + +## Questions + +Do examples need to be provided at a class level or at a property level? diff --git a/notes/ideas/instant-rest-api.md b/notes/ideas/instant-rest-api.md new file mode 100644 index 00000000..456c1f04 --- /dev/null +++ b/notes/ideas/instant-rest-api.md @@ -0,0 +1,7 @@ +# Instant REST API and docs + +> Priority: nice to have + +Can we serve the models via REST API and generate Swagger documentation automatically? +FrankenPHP could be used as default, fast server. + diff --git a/notes/ideas/instructor-port-interop.md b/notes/ideas/instructor-port-interop.md new file mode 100644 index 00000000..c8d58845 --- /dev/null +++ b/notes/ideas/instructor-port-interop.md @@ -0,0 +1,5 @@ +# Interoperability with Python/JS/Elixir versions + +> Priority: nice to have + +Can we make it easy to automatically convert models between Python, JS and PHP versions of Instructor? diff --git a/notes/ideas/metadata.md b/notes/ideas/metadata.md new file mode 100644 index 00000000..945d728b --- /dev/null +++ b/notes/ideas/metadata.md @@ -0,0 +1,20 @@ +# Metadata + +Need a better way to handle model metadata. Currently, we rely on 3 building blocks: + +- PHPDocs +- Type information +- Attributes (limited - validation) + +Redesigned engine does not offer an easy way to handle custom Attributes. + +Not sure if Attributes are the ultimate answer, as they are static and cannot be easily manipulated at runtime. + +Pydantic approach is to take over the whole model definition via Field() calls, but PHP does not allow us to do something similar, at least in a clean way. 
+ +```php +class User { + public string $name; + public string $email = new Field(description: 'Email address'); // This is not possible in PHP +} +``` diff --git a/notes/ideas/other-extraction-modes.md b/notes/ideas/other-extraction-modes.md new file mode 100644 index 00000000..84ffe17b --- /dev/null +++ b/notes/ideas/other-extraction-modes.md @@ -0,0 +1,20 @@ +# Other modes of extraction + +> Priority: should have + +It is related to compatibility with other LLMs, as some of them may not directly support function calling or support it in a different way (see: Claude). + + +## JSON_MODE vs function calling + +Add JSON_MODE to the LLM class, so it can handle both modes. + + +## MISTRAL_MODE + +Review Jason's Python code to understand how to handle function calling for Mistral. + + +## YAML + +For models not supporting function calling YAML might be an easier way to get structured outputs. diff --git a/notes/ideas/other-llms.md b/notes/ideas/other-llms.md new file mode 100644 index 00000000..32f1b778 --- /dev/null +++ b/notes/ideas/other-llms.md @@ -0,0 +1,18 @@ +# Other LLMs + +> Priority: must have + +1) Via custom BASE_URI - via existing OpenAI client +2) Custom LLM classes. + LLM class is the one that needs to handle all model / API specific stuff (e.g. function calling - vide: Claude's FC XML "API", streaming, modes, etc.). + +We MUST support models which offer OpenAI compatible API and function calling (step 1 above). +Most likely we do already, but it should be tested and documented, so anybody can do it easily. + +Things missing currently: +- Tests +- Documentation +- Examples + +Next steps: +- Implement custom LLM class - for Claude? 
diff --git a/notes/ideas/parallel-tools.md b/notes/ideas/parallel-tools.md new file mode 100644 index 00000000..77fd78ba --- /dev/null +++ b/notes/ideas/parallel-tools.md @@ -0,0 +1,15 @@ +# Parallel function calling + +> Priority: nice to have + +GPT-4-turbo can handle parallel function calling, which allows to return multiple models in a single API call. We do not yet support it, but Python Instructor does. + +The benefit is that you can reduce the number of function calls and get extra "intelligence", for example asking LLM to return a series of "operations" it considers relevant to the input. + +Need to test it further to understand how it is different from constructing a more complex model that is composed out of other models (or sequences of other models). + +One obvious benefit could be that they are returned separately, can be processed separately and, potentially, acted upon in parallel. + +It is doable with composite models via custom deserialization, but would be nice not to be forced to do it manually. + + diff --git a/notes/ideas/processing-pipeline.md b/notes/ideas/processing-pipeline.md new file mode 100644 index 00000000..80ba2a25 --- /dev/null +++ b/notes/ideas/processing-pipeline.md @@ -0,0 +1,5 @@ +# Processing pipeline + +Introduce manageable and configurable processing pipeline for request processing, so we can easily add, remove, or modify processing steps. + +It would allow to build 'middleware' type components focused on single action, so we could simplify the code and make it more readable / maintainable. diff --git a/notes/ideas/prompt-optimization.md b/notes/ideas/prompt-optimization.md new file mode 100644 index 00000000..860ef3d5 --- /dev/null +++ b/notes/ideas/prompt-optimization.md @@ -0,0 +1,45 @@ +# Prompt optimization + +## Optimization of instructions + +Quality is highly dependent on the instructions. We need a better way to generate instructions and examples (e.g. similar to DSPy). 
+ +## Stages of processing + +- define: Define processing architecture from modules (layers). What are modules (layers)? +- process: Process the messages through the flow defined in define(). + +## Stages of optimization + +- evaluator() - evaluate results (e.g. vs golden data set) +- CanOptimize::optimize() - check results from evaluator and modify the instructions + +```php +inputs(Scalar::STRING) + ->outputs(Sequence::of(Issue::class)); + +class Signature {} +class Transformer {} + +class IdentifySpam extends Transformer { +} + +class IdentifyProjectIssues extends Transformer { + public function define() { + $this->input('emails')->type(Collection::of(Scalar::string())); + $this->output('issues')->type(Collection::of(Issue::class)); + } + + public function process() { + foreach($this->inputs['emails'] as $email) { + $this->outputs['issues'][] = + } + } +} + +?> +``` diff --git a/notes/ideas/streaming-collections.md b/notes/ideas/streaming-collections.md new file mode 100644 index 00000000..b56b6925 --- /dev/null +++ b/notes/ideas/streaming-collections.md @@ -0,0 +1,9 @@ +# Streaming arrays / collections / iterables + +> Priority: should have + +Callback approach - provide callback to Instructor, which will be called for each +token received (?). It does not make sense for structured outputs, only if the result +is iterable / array. + +Streamed responses require special handling in Instructor core - checking for "finishReason", and handling other than "stop". diff --git a/notes/ideas/validation.md b/notes/ideas/validation.md new file mode 100644 index 00000000..eaba07d8 --- /dev/null +++ b/notes/ideas/validation.md @@ -0,0 +1,21 @@ +# Validation + +> Priority: must have + + +## Returning errors - array vs typed object + +Array is simple and straightforward, but it's not type safe and does not provide a way to add custom methods to the error object. + +Typed object is less flexible, but actually might be better for DX. 
+ +If the switch to typed object error is decided, current CanSelfValidate needs changes as it currently returns an array. + + +## Validation for custom deserializers + +> **Observation:** Symfony Validator does not care whether it validates full / big, complex model or individual objects. It's a good thing, as it allows for partial validation - not property by property, but at least object by object (and separately for nested objects). + +Idea: we could have multiple validators connected to the model and executed in a sequence. + +