From 734ca10563101aa1848d8f788141b6c4d6bb5673 Mon Sep 17 00:00:00 2001 From: Vasilije <8619304+Vasilije1990@users.noreply.github.com> Date: Mon, 3 Jun 2024 22:00:07 +0200 Subject: [PATCH 1/4] updating data --- cognee/api/v1/topology/add_topology.py | 40 ++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/cognee/api/v1/topology/add_topology.py b/cognee/api/v1/topology/add_topology.py index deb074b2..f2da0013 100644 --- a/cognee/api/v1/topology/add_topology.py +++ b/cognee/api/v1/topology/add_topology.py @@ -1,10 +1,43 @@ import pandas as pd from pydantic import BaseModel + from typing import List, Dict, Any, Union, Optional from cognee.infrastructure.databases.graph.get_graph_client import get_graph_client from cognee.modules.topology.topology import TopologyEngine, GitHubRepositoryModel from cognee.infrastructure.databases.graph.config import get_graph_config +import os +import pandas as pd +import json +from pydantic import BaseModel, Field +from typing import Dict, List, Optional, Union, Type, Any +from cognee.infrastructure.databases.graph.get_graph_client import get_graph_client + + + + +class Relationship(BaseModel): + type: str = Field(..., description="The type of relationship, e.g., 'belongs_to'.") + source: Optional[str] = Field(None, description="The identifier of the source id of in the relationship being a directory or subdirectory") + target: Optional[str] = Field(None, description="The identifier of the target id in the relationship being the directory, subdirectory or file") + properties: Optional[Dict[str, Any]] = Field(None, description="A dictionary of additional properties and values related to the relationship.") + +class JSONEntity(BaseModel): + name: str + set_type_as: Optional[str] = None + property_columns: List[str] + description: Optional[str] = None + +class JSONPattern(BaseModel): + head: str + relation: str + tail: str + description: Optional[str] = None + +class JSONModel(BaseModel): + node_id: str + entities: List[JSONEntity] + patterns: List[JSONPattern] USER_ID = "default_user" async def add_topology(directory: str = "example", model: BaseModel = GitHubRepositoryModel) -> Any: @@ -44,11 +77,12 @@ def flatten_repository(repo_model: BaseModel) -> List[Dict[str, Any]]: """ Flatten the entire repository model, starting with the top-level model """ return recursive_flatten(repo_model) - flt_topology = flatten_repository(topology) + async def add_graph_topology(): + + flt_topology = flatten_repository(topology) - df = pd.DataFrame(flt_topology) + df = pd.DataFrame(flt_topology) - print(df.head(10)) for _, row in df.iterrows(): node_data = row.to_dict() From e69571b306f6daddb5503a284c66175ec863137d Mon Sep 17 00:00:00 2001 From: Vasilije <8619304+Vasilije1990@users.noreply.github.com> Date: Wed, 5 Jun 2024 20:31:47 +0200 Subject: [PATCH 2/4] added some docs updates --- docs/research.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/research.md b/docs/research.md index 27f736fa..15f06d44 100644 --- a/docs/research.md +++ b/docs/research.md @@ -5,6 +5,7 @@ The page is dedicated to collecting all research that was collected in the past This is not an exhaustive list, and any PRs would be welcome ### Research Papers +- [2024/06/04] [Transformers and episodic memory](https://arxiv.org/abs/2405.14992) - [2024/03/24] [Graph Chain-of-Thought: Augmenting Large Language Models by Reasoning on Graphs](https://arxiv.org/abs/2404.07103) - [2024/03/24] [Leave No Context Behind: Efficient Infinite Context Transformers with Infini-attention](https://arxiv.org/abs/2404.07143) - [2024/03/24] [Compound AI systems](https://bair.berkeley.edu/blog/2024/02/18/compound-ai-systems/) From 6bf38e07849c64e6775de6ae1fe530d5eaaaf208 Mon Sep 17 00:00:00 2001 From: Vasilije <8619304+Vasilije1990@users.noreply.github.com> Date: Fri, 7 Jun 2024 09:53:20 +0200 Subject: [PATCH 3/4] extend langchain splitter with character splitter --- .../data/chunking/LangchainChunkingEngine.py | 12 ++++++++++++ cognee/shared/data_models.py | 1 + docs/blog/posts/llmops-and-knowledge-graphs.md | 0 3 files changed, 13 insertions(+) create mode 100644 docs/blog/posts/llmops-and-knowledge-graphs.md diff --git a/cognee/infrastructure/data/chunking/LangchainChunkingEngine.py b/cognee/infrastructure/data/chunking/LangchainChunkingEngine.py index c936bbe6..3b821df9 100644 --- a/cognee/infrastructure/data/chunking/LangchainChunkingEngine.py +++ b/cognee/infrastructure/data/chunking/LangchainChunkingEngine.py @@ -29,6 +29,9 @@ def chunk_data( if chunk_strategy == ChunkStrategy.CODE: chunked_data = LangchainChunkEngine.chunk_data_by_code(source_data,chunk_size, chunk_overlap) + + elif chunk_strategy == ChunkStrategy.LANGCHAIN_CHARACTER: + chunked_data = LangchainChunkEngine.chunk_data_by_character(source_data,chunk_size, chunk_overlap) else: chunked_data = DefaultChunkEngine.chunk_data_by_paragraph(source_data,chunk_size, chunk_overlap) return chunked_data @@ -50,3 +53,12 @@ def chunk_data_by_code(data_chunks, chunk_size, chunk_overlap, language=None): return only_content + def chunk_data_by_character(self, data_chunks, chunk_size, chunk_overlap): + from langchain_text_splitters import RecursiveCharacterTextSplitter + splitter = RecursiveCharacterTextSplitter(chunk_size, chunk_overlap) + data = splitter.split(data_chunks) + + only_content = [chunk.page_content for chunk in data] + + return only_content + diff --git a/cognee/shared/data_models.py b/cognee/shared/data_models.py index 7e228bf8..906cd537 100644 --- a/cognee/shared/data_models.py +++ b/cognee/shared/data_models.py @@ -35,6 +35,7 @@ class ChunkStrategy(Enum): PARAGRAPH = "paragraph" SENTENCE = "sentence" CODE = "code" + LANGCHAIN_CHARACTER = "langchain_character" class MemorySummary(BaseModel): """ Memory summary. """ diff --git a/docs/blog/posts/llmops-and-knowledge-graphs.md b/docs/blog/posts/llmops-and-knowledge-graphs.md new file mode 100644 index 00000000..e69de29b From 1ea6824d1b29709fc706adaf7ee1e68c3e0d168b Mon Sep 17 00:00:00 2001 From: Vasilije <8619304+Vasilije1990@users.noreply.github.com> Date: Sun, 9 Jun 2024 19:52:14 +0200 Subject: [PATCH 4/4] commented out missing docs --- docs/blog/index.md | 22 +++++++++++++++++-- .../blog/posts/llmops-and-knowledge-graphs.md | 1 + 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/docs/blog/index.md b/docs/blog/index.md index b7a2ddae..0b0320ad 100644 --- a/docs/blog/index.md +++ b/docs/blog/index.md @@ -2,13 +2,31 @@ The goal of the blog is to discuss broader topics around the cognee project, including the motivation behind the project, the technical details, and the future of the project. -## cognee library announcements + +## knowledge graphs + rags + +In progress + +[//]: # (1. [LLMOps stack + Graphs](posts/llmops-and-knowledge-graphs.md)) + +[//]: # (2. [Where do knowledge graphs fit, and where do they not? A case study with dynamo.fyi](posts/where-do-knowledge-graphs-fit.md)) + +[//]: # (3. [Knowledge Graphs vs basic RAGs, some metrics](posts/knowledge-graphs-vs-basic-rags.md)) + +[//]: # () + + +## product announcements This section covers the release notes for the cognee library. It includes the new features, bug fixes, and improvements in each release. 1. [Cognee - library release](posts/cognee-library-release.md) +2. [Cognee - v0.1.11 announcement](posts/cognee-v0.1.1.md) +3. [New website for cognee](posts/new-website-for-cognee.md) + + + -[//]: # (2. [Cognee - v0.1.4 announcement](posts/cognee-v0.1.4.md)) ## Towards deterministic data pipelines for LLMs step by step This series mostly deals with product discovery, data engineering, and the development of robust AI memory data pipelines. diff --git a/docs/blog/posts/llmops-and-knowledge-graphs.md b/docs/blog/posts/llmops-and-knowledge-graphs.md index e69de29b..8318c86b 100644 --- a/docs/blog/posts/llmops-and-knowledge-graphs.md +++ b/docs/blog/posts/llmops-and-knowledge-graphs.md @@ -0,0 +1 @@ +Test \ No newline at end of file