Cog 417 chunking unit tests #205
```diff
@@ -9,25 +9,25 @@ class TextChunker():

     chunk_index = 0
     chunk_size = 0
-    paragraph_chunks = []

     def __init__(self, document, get_text: callable, chunk_size: int = 1024):
         self.document = document
         self.max_chunk_size = chunk_size
         self.get_text = get_text

     def read(self):
+        paragraph_chunks = []
         for content_text in self.get_text():
             for chunk_data in chunk_by_paragraph(
                 content_text,
                 self.max_chunk_size,
                 batch_paragraphs = True,
             ):
                 if self.chunk_size + chunk_data["word_count"] <= self.max_chunk_size:
-                    self.paragraph_chunks.append(chunk_data)
+                    paragraph_chunks.append(chunk_data)
                     self.chunk_size += chunk_data["word_count"]
                 else:
-                    if len(self.paragraph_chunks) == 0:
+                    if len(paragraph_chunks) == 0:
                         yield DocumentChunk(
                             id = chunk_data["chunk_id"],
                             text = chunk_data["text"],
```
```diff
@@ -36,35 +36,35 @@ def read(self):
                             chunk_index = self.chunk_index,
                             cut_type = chunk_data["cut_type"],
                         )
-                        self.paragraph_chunks = []
+                        paragraph_chunks = []
                         self.chunk_size = 0
                     else:
-                        chunk_text = " ".join(chunk["text"] for chunk in self.paragraph_chunks)
+                        chunk_text = " ".join(chunk["text"] for chunk in paragraph_chunks)
                         try:
                             yield DocumentChunk(
                                 id = uuid5(NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}"),
                                 text = chunk_text,
                                 word_count = self.chunk_size,
                                 is_part_of = self.document,
                                 chunk_index = self.chunk_index,
-                                cut_type = self.paragraph_chunks[len(self.paragraph_chunks) - 1]["cut_type"],
+                                cut_type = paragraph_chunks[len(paragraph_chunks) - 1]["cut_type"],
                             )
                         except Exception as e:
                             print(e)
-                        self.paragraph_chunks = [chunk_data]
+                        paragraph_chunks = [chunk_data]
                         self.chunk_size = chunk_data["word_count"]

                     self.chunk_index += 1

-        if len(self.paragraph_chunks) > 0:
+        if len(paragraph_chunks) > 0:
             try:
                 yield DocumentChunk(
                     id = uuid5(NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}"),
-                    text = " ".join(chunk["text"] for chunk in self.paragraph_chunks),
+                    text = " ".join(chunk["text"] for chunk in paragraph_chunks),
                     word_count = self.chunk_size,
                     is_part_of = self.document,
                     chunk_index = self.chunk_index,
-                    cut_type = self.paragraph_chunks[len(self.paragraph_chunks) - 1]["cut_type"],
+                    cut_type = paragraph_chunks[len(paragraph_chunks) - 1]["cut_type"],
                 )
             except Exception as e:
                 print(e)
```

Comment on lines +59 to +67:

🛠️ Refactor suggestion: Reduce code duplication by extracting common chunk processing logic.

The chunk processing logic is duplicated between the main loop and the final chunk handling. This makes maintenance harder and increases the chance of inconsistencies. Consider extracting the common logic into a helper method:

```diff
+    def _create_document_chunk(self, chunks, chunk_text):
+        try:
+            return DocumentChunk(
+                id = uuid5(NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}"),
+                text = chunk_text,
+                word_count = self.chunk_size,
+                is_part_of = self.document,
+                chunk_index = self.chunk_index,
+                cut_type = chunks[-1]["cut_type"],
+            )
+        except Exception as e:
+            import logging
+            logging.error(f"Failed to create DocumentChunk: {e}")
+            return None

 # Then use it in both places:
         if len(paragraph_chunks) > 0:
             chunk_text = " ".join(chunk["text"] for chunk in paragraph_chunks)
-            try:
-                yield DocumentChunk(...)
-            except Exception as e:
-                print(e)
+            chunk = self._create_document_chunk(paragraph_chunks, chunk_text)
+            if chunk:
+                yield chunk
```
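Since this PR is about unit tests, a sketch of how TextChunker.read might be exercised in one. The import path, the FakeDocument stand-in, and DocumentChunk's tolerance of it are assumptions for illustration, not taken from this diff:

```python
# Hypothetical test sketch - import path and FakeDocument are assumptions.
from cognee.modules.chunking import TextChunker  # assumed location

class FakeDocument:
    """Minimal stand-in; the real DocumentChunk may require a Document model."""
    id = "test-document-1"

def get_text():
    # The chunker pulls text through this callable, one piece at a time.
    yield "First sentence. Second sentence.\n\nAnother paragraph follows here."

chunks = list(TextChunker(FakeDocument(), get_text, chunk_size=64).read())

# Every produced chunk should respect the configured maximum size.
assert all(chunk.word_count <= 64 for chunk in chunks)
```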
```diff
@@ -1,69 +1,79 @@
 from uuid import uuid5, NAMESPACE_OID
+from typing import Dict, Any, Iterator
 from .chunk_by_sentence import chunk_by_sentence

-def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs = True):
-    paragraph = ""
-    last_cut_type = None
-    last_paragraph_id = None
-    paragraph_word_count = 0
-    paragraph_chunk_index = 0
-
-    for (paragraph_id, __, sentence, word_count, end_type) in chunk_by_sentence(data):
-        if paragraph_word_count > 0 and paragraph_word_count + word_count > paragraph_length:
-            if batch_paragraphs is True:
-                chunk_id = uuid5(NAMESPACE_OID, paragraph)
-                yield dict(
-                    text = paragraph.strip(),
-                    word_count = paragraph_word_count,
-                    id = chunk_id, # When batching paragraphs, the paragraph_id is the same as chunk_id.
-                                   # paragraph_id doesn't mean anything since multiple paragraphs are merged.
-                    chunk_id = chunk_id,
-                    chunk_index = paragraph_chunk_index,
-                    cut_type = last_cut_type,
-                )
-            else:
-                yield dict(
-                    text = paragraph.strip(),
-                    word_count = paragraph_word_count,
-                    id = last_paragraph_id,
-                    chunk_id = uuid5(NAMESPACE_OID, paragraph),
-                    chunk_index = paragraph_chunk_index,
-                    cut_type = last_cut_type,
-                )
-
-            paragraph_chunk_index += 1
-            paragraph_word_count = 0
-            paragraph = ""
-
-        paragraph += (" " if len(paragraph) > 0 else "") + sentence
-        paragraph_word_count += word_count
-
-        if end_type == "paragraph_end" or end_type == "sentence_cut":
-            if batch_paragraphs is True:
-                paragraph += "\n\n" if end_type == "paragraph_end" else ""
-            else:
-                yield dict(
-                    text = paragraph.strip(),
-                    word_count = paragraph_word_count,
-                    paragraph_id = paragraph_id,
-                    chunk_id = uuid5(NAMESPACE_OID, paragraph),
-                    chunk_index = paragraph_chunk_index,
-                    cut_type = end_type,
-                )
-
-                paragraph_chunk_index = 0
-                paragraph_word_count = 0
-                paragraph = ""
-
-        last_cut_type = end_type
-        last_paragraph_id = paragraph_id
-
-    if len(paragraph) > 0:
-        yield dict(
-            chunk_id = uuid5(NAMESPACE_OID, paragraph),
-            text = paragraph,
-            word_count = paragraph_word_count,
-            paragraph_id = last_paragraph_id,
-            chunk_index = paragraph_chunk_index,
-            cut_type = last_cut_type,
-        )
+def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs: bool = True) -> Iterator[Dict[str, Any]]:
+    """
+    Chunks text by paragraph while preserving exact text reconstruction capability.
+    When chunks are joined with empty string "", they reproduce the original text exactly.
+    """
+    current_chunk = ""
+    current_word_count = 0
+    chunk_index = 0
+    last_cut_type = None
+    last_paragraph_id = None
+
+    for paragraph_id, _, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length):
+        assert word_count <= paragraph_length, f"{paragraph_length = } is smaller than {word_count = }"
+
+        # Check if this sentence would exceed length limit
+        if current_word_count > 0 and current_word_count + word_count > paragraph_length:
+            # Yield current chunk
+            chunk_dict = {
+                "text": current_chunk,
+                "word_count": current_word_count,
+                "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
+                "chunk_index": chunk_index,
+                "cut_type": last_cut_type
+            }
+
+            if batch_paragraphs:
+                chunk_dict["id"] = chunk_dict["chunk_id"]
+            else:
+                chunk_dict["id"] = last_paragraph_id
+
+            yield chunk_dict
+
+            # Start new chunk with current sentence
+            current_chunk = sentence
+            current_word_count = word_count
+            chunk_index += 1
+        else:
+            # Just concatenate directly - no space handling
+            current_chunk += sentence
+            current_word_count += word_count
+
+        # Handle end of paragraph
+        if end_type in ("paragraph_end", "sentence_cut") and not batch_paragraphs:
+            # For non-batch mode, yield each paragraph separately
+            chunk_dict = {
+                "text": current_chunk,
+                "word_count": current_word_count,
+                "id": paragraph_id,
+                "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
+                "chunk_index": chunk_index,
+                "cut_type": end_type
+            }
+            yield chunk_dict
+            current_chunk = ""
+            current_word_count = 0
+            chunk_index = 0
+
+        last_cut_type = end_type
+        last_paragraph_id = paragraph_id
+
+    # Yield any remaining text
+    if current_chunk:
+        chunk_dict = {
+            "text": current_chunk,
+            "word_count": current_word_count,
+            "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
+            "chunk_index": chunk_index,
+            "cut_type": last_cut_type
+        }
+
+        if batch_paragraphs:
+            chunk_dict["id"] = chunk_dict["chunk_id"]
+        else:
+            chunk_dict["id"] = last_paragraph_id
+
+        yield chunk_dict
```
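The new docstring's reconstruction guarantee is exactly the kind of property the unit tests in this PR can pin down. A minimal sketch of such a test; the import path and the sample text are assumptions, not part of this diff:

```python
# Property-test sketch - the import path is an assumption.
from cognee.tasks.chunks import chunk_by_paragraph  # assumed location

def test_chunk_by_paragraph_reconstructs_input():
    text = "First sentence. Second one follows.\n\nA new paragraph starts here."
    chunks = chunk_by_paragraph(text, paragraph_length=12, batch_paragraphs=False)
    # Per the docstring: joining chunk texts with "" reproduces the input exactly.
    assert "".join(chunk["text"] for chunk in chunks) == text
```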
```diff
@@ -1,60 +1,77 @@
 import re

+SENTENCE_ENDINGS = r"[.;!?…]"
+PARAGRAPH_ENDINGS = r"[\n\r]"
+
 def chunk_by_word(data: str):
-    sentence_endings = r"[.;!?…]"
-    paragraph_endings = r"[\n\r]"
+    """
+    Chunks text into words and endings while preserving whitespace.
+    Whitespace is included with the preceding word.
+    Outputs can be joined with "" to recreate the original input.
+    """
     last_processed_character = ""

-    word = ""
+    current_chunk = ""
     i = 0

+    # Handle leading whitespace if any
+    while i < len(data) and (re.match(PARAGRAPH_ENDINGS, data[i]) or data[i] == " "):
+        current_chunk += data[i]
+        i += 1
+    if current_chunk:
+        yield (current_chunk, "word")
+        current_chunk = ""
+
     while i < len(data):
         character = data[i]

-        if word == "" and (re.match(paragraph_endings, character) or character == " "):
-            i = i + 1
-            continue
-
         def is_real_paragraph_end():
-            if re.match(sentence_endings, last_processed_character):
+            if re.match(SENTENCE_ENDINGS, last_processed_character):
                 return True
             j = i + 1
             next_character = data[j] if j < len(data) else None
-            while next_character is not None and (re.match(paragraph_endings, next_character) or next_character == " "):
+            while next_character is not None and (re.match(PARAGRAPH_ENDINGS, next_character) or next_character == " "):
                 j += 1
                 next_character = data[j] if j < len(data) else None
             if next_character and next_character.isupper():
                 return True
             return False

-        if re.match(paragraph_endings, character):
-            yield (word, "paragraph_end" if is_real_paragraph_end() else "word")
-            word = ""
-            i = i + 1
+        if re.match(PARAGRAPH_ENDINGS, character):
+            if current_chunk:
+                yield (current_chunk, "word")
+                current_chunk = ""
+            yield (character, "paragraph_end" if is_real_paragraph_end() else "word")
+            i += 1
             continue

-        if character == " ":
-            yield [word, "word"]
-            word = ""
-            i = i + 1
-            continue
-
-        word += character
+        current_chunk += character
         last_processed_character = character

-        if re.match(sentence_endings, character):
-            # Check for ellipses.
-            if i + 2 <= len(data) and data[i] == "." and data[i + 1] == "." and data[i + 2] == ".":
-                word += ".."
-                i = i + 2
+        if character == " ":
+            yield (current_chunk, "word")
+            current_chunk = ""
+            i += 1
+            continue

-            is_paragraph_end = i + 1 < len(data) and re.match(paragraph_endings, data[i + 1])
-            yield (word, "paragraph_end" if is_paragraph_end else "sentence_end")
-            word = ""
+        if re.match(SENTENCE_ENDINGS, character):
+            # Check for ellipses
+            if i + 2 < len(data) and data[i:i+3] == "...":
+                current_chunk += ".."
+                i += 2
+
+            # Look ahead for whitespace
+            next_i = i + 1
+            while next_i < len(data) and data[next_i] == " ":
+                current_chunk += data[next_i]
+                next_i += 1
+
+            is_paragraph_end = next_i < len(data) and re.match(PARAGRAPH_ENDINGS, data[next_i])
+            yield (current_chunk, "paragraph_end" if is_paragraph_end else "sentence_end")
+            current_chunk = ""
+            i = next_i
+            continue

         i += 1

-    if len(word) > 0:
-        yield (word, "word")
+    if current_chunk:
+        yield (current_chunk, "word")
```
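chunk_by_word makes the same join-with-"" promise in its new docstring, so an analogous round-trip check applies. A sketch, with the import path assumed:

```python
# Round-trip sketch for chunk_by_word - the import path is an assumption.
from cognee.tasks.chunks import chunk_by_word  # assumed location

def test_chunk_by_word_round_trip():
    text = "Hello world. This is a test...\nNew paragraph here."
    pieces = [piece for piece, _ in chunk_by_word(text)]
    # Whitespace rides along with the preceding word, so "" joining is lossless.
    assert "".join(pieces) == text
```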
Comment: I don't know if paragraph_chunks is used somewhere else. I mean, it can be deleted, but I think it's good to be able to see what the chunks were from a debugging point of view, because this way it's going to be local and lost once the method finishes. But maybe @borisarzentar has more context here.

Reply: Within TextChunker, paragraph_chunks is only used within read. The attribute is never accessed from outside the class anywhere in the repo, and for a dynamic attribute like this one it would also not be good practice to do so, since that would break encapsulation.
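For readers outside the thread, the encapsulation point rests on a standard Python pitfall: a mutable class attribute like the old paragraph_chunks = [] is shared across every instance of the class. A minimal illustration with a toy class, not repo code:

```python
class Chunker:
    buffer = []  # class attribute: one shared list for ALL instances

a = Chunker()
b = Chunker()
a.buffer.append("chunk from a")
print(b.buffer)  # ['chunk from a'] - b sees a's data

# A local variable inside the method (as this PR does) sidesteps the sharing,
# at the cost of losing the buffer for post-hoc debugging once read() returns.
```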