Merge branch 'main' into COG-546-local-script-dependencies

topoteretes · Nov 18, 2024 · 2bfaec4 · 2bfaec4
2 parents 769524b + ced5385
commit 2bfaec4
Show file tree

Hide file tree

Showing 18 changed files with 762 additions and 157 deletions.
diff --git a/cognee/infrastructure/engine/models/DataPoint.py b/cognee/infrastructure/engine/models/DataPoint.py
@@ -20,5 +20,8 @@ class DataPoint(BaseModel):
     def get_embeddable_data(self):
         if self._metadata and len(self._metadata["index_fields"]) > 0 \
             and hasattr(self, self._metadata["index_fields"][0]):
-
-            return getattr(self, self._metadata["index_fields"][0])
+            attribute = getattr(self, self._metadata["index_fields"][0])
+            if isinstance(attribute, str):
+                return(attribute.strip())
+            else:
+                return (attribute)
diff --git a/cognee/modules/chunking/TextChunker.py b/cognee/modules/chunking/TextChunker.py
@@ -9,26 +9,25 @@ class TextChunker():
 
     chunk_index = 0
     chunk_size = 0
-    paragraph_chunks = []
 
     def __init__(self, document, get_text: callable, chunk_size: int = 1024):
         self.document = document
         self.max_chunk_size = chunk_size
         self.get_text = get_text
 
     def read(self):
-        self.paragraph_chunks = []
+        paragraph_chunks = []
         for content_text in self.get_text():
             for chunk_data in chunk_by_paragraph(
                 content_text,
                 self.max_chunk_size,
                 batch_paragraphs = True,
             ):
                 if self.chunk_size + chunk_data["word_count"] <= self.max_chunk_size:
-                    self.paragraph_chunks.append(chunk_data)
+                    paragraph_chunks.append(chunk_data)
                     self.chunk_size += chunk_data["word_count"]
                 else:
-                    if len(self.paragraph_chunks) == 0:
+                    if len(paragraph_chunks) == 0:
                         yield DocumentChunk(
                             id = chunk_data["chunk_id"],
                             text = chunk_data["text"],
@@ -37,35 +36,35 @@ def read(self):
                             chunk_index = self.chunk_index,
                             cut_type = chunk_data["cut_type"],
                         )
-                        self.paragraph_chunks = []
+                        paragraph_chunks = []
                         self.chunk_size = 0
                     else:
-                        chunk_text = " ".join(chunk["text"] for chunk in self.paragraph_chunks)
+                        chunk_text = " ".join(chunk["text"] for chunk in paragraph_chunks)
                         try:
                             yield DocumentChunk(
                                 id = uuid5(NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}"),
                                 text = chunk_text,
                                 word_count = self.chunk_size,
                                 is_part_of = self.document,
                                 chunk_index = self.chunk_index,
-                                cut_type = self.paragraph_chunks[len(self.paragraph_chunks) - 1]["cut_type"],
+                                cut_type = paragraph_chunks[len(paragraph_chunks) - 1]["cut_type"],
                             )
                         except Exception as e:
                             print(e)
-                        self.paragraph_chunks = [chunk_data]
+                        paragraph_chunks = [chunk_data]
                         self.chunk_size = chunk_data["word_count"]
 
                     self.chunk_index += 1
 
-        if len(self.paragraph_chunks) > 0:
+        if len(paragraph_chunks) > 0:
             try:
                 yield DocumentChunk(
                     id = uuid5(NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}"),
-                    text = " ".join(chunk["text"] for chunk in self.paragraph_chunks),
+                    text = " ".join(chunk["text"] for chunk in paragraph_chunks),
                     word_count = self.chunk_size,
                     is_part_of = self.document,
                     chunk_index = self.chunk_index,
-                    cut_type = self.paragraph_chunks[len(self.paragraph_chunks) - 1]["cut_type"],
+                    cut_type = paragraph_chunks[len(paragraph_chunks) - 1]["cut_type"],
                 )
             except Exception as e:
                 print(e)
diff --git a/cognee/modules/data/processing/document_types/AudioDocument.py b/cognee/modules/data/processing/document_types/AudioDocument.py
@@ -5,11 +5,15 @@
 class AudioDocument(Document):
     type: str = "audio"
 
+    def create_transcript(self):
+        result = get_llm_client().create_transcript(self.raw_data_location)
+        return(result.text)
+
     def read(self, chunk_size: int):
         # Transcribe the audio file
-        result = get_llm_client().create_transcript(self.raw_data_location)
-        text = result.text
+
+        text = self.create_transcript()
 
-        chunker = TextChunker(self, chunk_size = chunk_size, get_text = lambda: text)
+        chunker = TextChunker(self, chunk_size = chunk_size, get_text = lambda: [text])
 
         yield from chunker.read()
diff --git a/cognee/modules/data/processing/document_types/ImageDocument.py b/cognee/modules/data/processing/document_types/ImageDocument.py
@@ -5,11 +5,15 @@
 class ImageDocument(Document):
     type: str = "image"
 
+
+    def transcribe_image(self):
+        result = get_llm_client().transcribe_image(self.raw_data_location)
+        return(result.choices[0].message.content)
+
     def read(self, chunk_size: int):
         # Transcribe the image file
-        result = get_llm_client().transcribe_image(self.raw_data_location)
-        text = result.choices[0].message.content
+        text = self.transcribe_image()
 
-        chunker = TextChunker(self, chunk_size = chunk_size, get_text = lambda: text)
+        chunker = TextChunker(self, chunk_size = chunk_size, get_text = lambda: [text])
 
         yield from chunker.read()
diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py
@@ -1,69 +1,72 @@
 from uuid import uuid5, NAMESPACE_OID
+from typing import Dict, Any, Iterator
 from .chunk_by_sentence import chunk_by_sentence
 
-def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs = True):
-    paragraph = ""
+def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs: bool = True) -> Iterator[Dict[str, Any]]:
+    """
+    Chunks text by paragraph while preserving exact text reconstruction capability.
+    When chunks are joined with empty string "", they reproduce the original text exactly.
+    """
+    current_chunk = ""
+    current_word_count = 0
+    chunk_index = 0
+    paragraph_ids = []
     last_cut_type = None
-    last_paragraph_id = None
-    paragraph_word_count = 0
-    paragraph_chunk_index = 0
-
-    for (paragraph_id, __, sentence, word_count, end_type) in chunk_by_sentence(data):
-        if paragraph_word_count > 0 and paragraph_word_count + word_count > paragraph_length:
-            if batch_paragraphs is True:
-                chunk_id = uuid5(NAMESPACE_OID, paragraph)
-                yield dict(
-                    text = paragraph.strip(),
-                    word_count = paragraph_word_count,
-                    id = chunk_id, # When batching paragraphs, the paragraph_id is the same as chunk_id.
-                                   # paragraph_id doens't mean anything since multiple paragraphs are merged.
-                    chunk_id = chunk_id,
-                    chunk_index = paragraph_chunk_index,
-                    cut_type = last_cut_type,
-                )
-            else:
-                yield dict(
-                    text = paragraph.strip(),
-                    word_count = paragraph_word_count,
-                    id = last_paragraph_id,
-                    chunk_id = uuid5(NAMESPACE_OID, paragraph),
-                    chunk_index = paragraph_chunk_index,
-                    cut_type = last_cut_type,
-                )
-
-            paragraph_chunk_index += 1
-            paragraph_word_count = 0
-            paragraph = ""
-
-        paragraph += (" " if len(paragraph) > 0 else "") + sentence
-        paragraph_word_count += word_count
-
-        if end_type == "paragraph_end" or end_type == "sentence_cut":
-            if batch_paragraphs is True:
-                paragraph += "\n\n" if end_type == "paragraph_end" else ""
-            else:
-                yield dict(
-                    text = paragraph.strip(),
-                    word_count = paragraph_word_count,
-                    paragraph_id = paragraph_id,
-                    chunk_id = uuid5(NAMESPACE_OID, paragraph),
-                    chunk_index = paragraph_chunk_index,
-                    cut_type = end_type,
-                )
-
-                paragraph_chunk_index = 0
-                paragraph_word_count = 0
-                paragraph = ""
+
+    for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length):
+        # Check if this sentence would exceed length limit
+        if current_word_count > 0 and current_word_count + word_count > paragraph_length:
+            # Yield current chunk
+            chunk_dict = {
+                "text": current_chunk,
+                "word_count": current_word_count,
+                "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
+                "paragraph_ids": paragraph_ids,
+                "chunk_index": chunk_index,
+                "cut_type": last_cut_type,
+            }
+
+            yield chunk_dict
+
+            # Start new chunk with current sentence
+            paragraph_ids = []
+            current_chunk = ""
+            current_word_count = 0
+            chunk_index += 1
 
+        paragraph_ids.append(paragraph_id)
+        current_chunk += sentence
+        current_word_count += word_count
+
+        # Handle end of paragraph
+        if end_type in ("paragraph_end", "sentence_cut") and not batch_paragraphs:
+            # For non-batch mode, yield each paragraph separately
+            chunk_dict = {
+                "text": current_chunk,
+                "word_count": current_word_count,
+                "paragraph_ids": paragraph_ids,
+                "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
+                "chunk_index": chunk_index,
+                "cut_type": end_type
+            }
+            yield chunk_dict
+            paragraph_ids = []
+            current_chunk = ""
+            current_word_count = 0
+            chunk_index += 1
+
         last_cut_type = end_type
-        last_paragraph_id = paragraph_id
-
-    if len(paragraph) > 0:
-        yield dict(
-            chunk_id = uuid5(NAMESPACE_OID, paragraph),
-            text = paragraph,
-            word_count = paragraph_word_count,
-            paragraph_id = last_paragraph_id,
-            chunk_index = paragraph_chunk_index,
-            cut_type = last_cut_type,
-        )
+
+    # Yield any remaining text
+    if current_chunk:
+        chunk_dict = {
+            "text": current_chunk,
+            "word_count": current_word_count,
+            "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
+            "paragraph_ids": paragraph_ids,
+            "chunk_index": chunk_index,
+            "cut_type": "sentence_cut" if last_cut_type == "word" else last_cut_type
+        }
+
+
+        yield chunk_dict
diff --git a/cognee/tasks/chunks/chunk_by_sentence.py b/cognee/tasks/chunks/chunk_by_sentence.py
@@ -2,30 +2,43 @@
 
 
 from uuid import uuid4
+from typing import Optional
 from .chunk_by_word import chunk_by_word
 
-def chunk_by_sentence(data: str):
+def chunk_by_sentence(data: str, maximum_length: Optional[int] = None):
     sentence = ""
     paragraph_id = uuid4()
-    chunk_index = 0
     word_count = 0
+    section_end = False
+    word_type_state = None
 
+    # The yielded word_type_state mirrors word_type, except that a token
+    # which is not a 'paragraph_end' or 'sentence_end' and contains no
+    # letters leaves the state unchanged, so such tokens stay attached to
+    # the preceding token's state (e.g. 'paragraph_end' or 'sentence_end').
     for (word, word_type) in chunk_by_word(data):
-        sentence += (" " if len(sentence) > 0 else "") + word
+        sentence += word
         word_count += 1
 
-        if word_type == "paragraph_end" or word_type == "sentence_end":
-            yield (paragraph_id, chunk_index, sentence, word_count, word_type)
+        if word_type in ["paragraph_end", "sentence_end"]:
+            word_type_state = word_type
+        else:
+            for character in word:
+                if character.isalpha():
+                    word_type_state = word_type
+                    break
+
+        if word_type in ["paragraph_end", "sentence_end"] or (maximum_length and (word_count == maximum_length)):
+            yield (paragraph_id, sentence, word_count, word_type_state)
             sentence = ""
             word_count = 0
             paragraph_id = uuid4() if word_type == "paragraph_end" else paragraph_id
-            chunk_index = 0 if word_type == "paragraph_end" else chunk_index + 1
 
     if len(sentence) > 0:
+        section_end = "sentence_cut" if word_type_state == "word" else word_type_state
         yield (
             paragraph_id,
-            chunk_index,
             sentence,
             word_count,
-            "sentence_cut",
+            section_end,
         )