Skip to content

Commit

Permalink
Merge branch 'main' into COG-546-local-script-dependencies
Browse files Browse the repository at this point in the history
  • Loading branch information
borisarzentar authored Nov 18, 2024
2 parents 769524b + ced5385 commit 2bfaec4
Show file tree
Hide file tree
Showing 18 changed files with 762 additions and 157 deletions.
7 changes: 5 additions & 2 deletions cognee/infrastructure/engine/models/DataPoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,8 @@ class DataPoint(BaseModel):
def get_embeddable_data(self):
    """
    Return the value of this data point's first index field, for embedding.

    Reads ``self._metadata["index_fields"][0]`` and returns the attribute of
    that name. String values are stripped of surrounding whitespace so that
    embeddings are not skewed by stray leading/trailing spaces; non-string
    values are returned unchanged. Returns None when no index field is
    configured or the attribute is missing.
    """
    if self._metadata and len(self._metadata["index_fields"]) > 0 \
        and hasattr(self, self._metadata["index_fields"][0]):
        attribute = getattr(self, self._metadata["index_fields"][0])
        # `return(x)` call-style parens replaced with the idiomatic form.
        return attribute.strip() if isinstance(attribute, str) else attribute
21 changes: 10 additions & 11 deletions cognee/modules/chunking/TextChunker.py
Original file line number Diff line number Diff line change
class TextChunker():
    """
    Packs paragraph-level chunks of a document into DocumentChunk objects of
    at most ``max_chunk_size`` words.

    Args:
        document: the source document; must expose an ``id`` attribute.
        get_text: zero-argument callable yielding the document's text pieces.
        chunk_size: maximum number of words per emitted DocumentChunk.
    """

    def __init__(self, document, get_text: callable, chunk_size: int = 1024):
        self.document = document
        self.max_chunk_size = chunk_size
        self.get_text = get_text
        # Counters moved from class attributes to instance state so that each
        # chunker starts from zero without relying on class-level defaults.
        self.chunk_index = 0
        self.chunk_size = 0

    def read(self):
        """Yield DocumentChunk objects built from the document's text."""
        paragraph_chunks = []
        for content_text in self.get_text():
            for chunk_data in chunk_by_paragraph(
                content_text,
                self.max_chunk_size,
                batch_paragraphs = True,
            ):
                if self.chunk_size + chunk_data["word_count"] <= self.max_chunk_size:
                    # Still room: accumulate this paragraph into the pending chunk.
                    paragraph_chunks.append(chunk_data)
                    self.chunk_size += chunk_data["word_count"]
                else:
                    if len(paragraph_chunks) == 0:
                        # Single paragraph already exceeds the limit: emit it alone.
                        yield DocumentChunk(
                            id = chunk_data["chunk_id"],
                            text = chunk_data["text"],
                            word_count = chunk_data["word_count"],
                            is_part_of = self.document,
                            chunk_index = self.chunk_index,
                            cut_type = chunk_data["cut_type"],
                        )
                        paragraph_chunks = []
                        self.chunk_size = 0
                    else:
                        # Flush the accumulated paragraphs as one chunk, then
                        # start a fresh chunk with the current paragraph.
                        chunk_text = " ".join(chunk["text"] for chunk in paragraph_chunks)
                        try:
                            yield DocumentChunk(
                                id = uuid5(NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}"),
                                text = chunk_text,
                                word_count = self.chunk_size,
                                is_part_of = self.document,
                                chunk_index = self.chunk_index,
                                cut_type = paragraph_chunks[-1]["cut_type"],
                            )
                        except Exception as e:
                            # NOTE(review): errors are swallowed after printing —
                            # consider logging; kept as-is to preserve behavior.
                            print(e)
                        paragraph_chunks = [chunk_data]
                        self.chunk_size = chunk_data["word_count"]

                    self.chunk_index += 1

        # Flush whatever is still pending after the input is exhausted.
        if len(paragraph_chunks) > 0:
            try:
                yield DocumentChunk(
                    id = uuid5(NAMESPACE_OID, f"{str(self.document.id)}-{self.chunk_index}"),
                    text = " ".join(chunk["text"] for chunk in paragraph_chunks),
                    word_count = self.chunk_size,
                    is_part_of = self.document,
                    chunk_index = self.chunk_index,
                    cut_type = paragraph_chunks[-1]["cut_type"],
                )
            except Exception as e:
                print(e)
10 changes: 7 additions & 3 deletions cognee/modules/data/processing/document_types/AudioDocument.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,15 @@
class AudioDocument(Document):
    type: str = "audio"

    def create_transcript(self):
        """Transcribe the audio file at ``raw_data_location`` via the LLM client."""
        result = get_llm_client().create_transcript(self.raw_data_location)
        return result.text

    def read(self, chunk_size: int):
        """Yield DocumentChunk objects built from the audio transcript."""
        text = self.create_transcript()

        # Wrap in a one-element list: TextChunker iterates get_text(), and a
        # bare string would be consumed character by character.
        chunker = TextChunker(self, chunk_size = chunk_size, get_text = lambda: [text])

        yield from chunker.read()
10 changes: 7 additions & 3 deletions cognee/modules/data/processing/document_types/ImageDocument.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,15 @@
class ImageDocument(Document):
    type: str = "image"

    def transcribe_image(self):
        """Transcribe the image at ``raw_data_location`` via the LLM client."""
        result = get_llm_client().transcribe_image(self.raw_data_location)
        return result.choices[0].message.content

    def read(self, chunk_size: int):
        """Yield DocumentChunk objects built from the image transcription."""
        text = self.transcribe_image()

        # Wrap in a one-element list: TextChunker iterates get_text(), and a
        # bare string would be consumed character by character.
        chunker = TextChunker(self, chunk_size = chunk_size, get_text = lambda: [text])

        yield from chunker.read()
129 changes: 66 additions & 63 deletions cognee/tasks/chunks/chunk_by_paragraph.py
Original file line number Diff line number Diff line change
@@ -1,69 +1,72 @@
from uuid import uuid5, NAMESPACE_OID
from typing import Dict, Any, Iterator
from .chunk_by_sentence import chunk_by_sentence

def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs: bool = True) -> Iterator[Dict[str, Any]]:
    """
    Chunk text by paragraph while preserving exact text reconstruction capability.

    When the yielded chunk texts are joined with the empty string "", they
    reproduce the original text exactly (the underlying sentence splitter
    emits sentences with their original whitespace attached).

    Args:
        data: the text to chunk.
        paragraph_length: maximum word count per chunk.
        batch_paragraphs: when False, every paragraph end (or forced cut)
            closes its own chunk; when True, paragraphs are packed together
            up to ``paragraph_length`` words.

    Yields:
        Dicts with keys: text, word_count, chunk_id (uuid5 of the text),
        paragraph_ids, chunk_index and cut_type.
    """
    current_chunk = ""
    current_word_count = 0
    chunk_index = 0
    paragraph_ids = []
    last_cut_type = None

    for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length = paragraph_length):
        # Flush the pending chunk before this sentence would exceed the limit.
        if current_word_count > 0 and current_word_count + word_count > paragraph_length:
            yield {
                "text": current_chunk,
                "word_count": current_word_count,
                "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
                "paragraph_ids": paragraph_ids,
                "chunk_index": chunk_index,
                "cut_type": last_cut_type,
            }

            # Start a new chunk that will begin with the current sentence.
            paragraph_ids = []
            current_chunk = ""
            current_word_count = 0
            chunk_index += 1

        paragraph_ids.append(paragraph_id)
        current_chunk += sentence
        current_word_count += word_count

        # In non-batch mode, each completed paragraph is its own chunk.
        if end_type in ("paragraph_end", "sentence_cut") and not batch_paragraphs:
            yield {
                "text": current_chunk,
                "word_count": current_word_count,
                "paragraph_ids": paragraph_ids,
                "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
                "chunk_index": chunk_index,
                "cut_type": end_type,
            }
            paragraph_ids = []
            current_chunk = ""
            current_word_count = 0
            chunk_index += 1

        last_cut_type = end_type

    # Flush any remaining text; a trailing cut mid-word is reported as a
    # sentence cut rather than the raw "word" type.
    if current_chunk:
        yield {
            "text": current_chunk,
            "word_count": current_word_count,
            "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
            "paragraph_ids": paragraph_ids,
            "chunk_index": chunk_index,
            "cut_type": "sentence_cut" if last_cut_type == "word" else last_cut_type,
        }
29 changes: 21 additions & 8 deletions cognee/tasks/chunks/chunk_by_sentence.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,30 +2,43 @@


from uuid import uuid4
from typing import Optional
from .chunk_by_word import chunk_by_word

def chunk_by_sentence(data: str, maximum_length: Optional[int] = None):
    """
    Split ``data`` into sentences using word-level tokenization.

    Args:
        data: the text to split.
        maximum_length: optional hard cap on words per sentence; when reached,
            the sentence is cut and yielded even without an end marker.

    Yields:
        (paragraph_id, sentence, word_count, word_type_state) tuples.
        Words are concatenated without an added separator, so joined sentences
        reproduce the original text exactly.
    """
    sentence = ""
    paragraph_id = uuid4()
    word_count = 0
    word_type_state = None

    # The yielded word_type_state is identical to word_type, except when the
    # word type is "word" but the token contains no letters; such tokens keep
    # the state of a preceding "paragraph_end" or "sentence_end" word.
    for word, word_type in chunk_by_word(data):
        sentence += word
        word_count += 1

        if word_type in ("paragraph_end", "sentence_end"):
            word_type_state = word_type
        else:
            for character in word:
                if character.isalpha():
                    word_type_state = word_type
                    break

        if word_type in ("paragraph_end", "sentence_end") or (maximum_length and word_count == maximum_length):
            yield (paragraph_id, sentence, word_count, word_type_state)
            sentence = ""
            word_count = 0
            # A new paragraph gets a fresh id; a length cut keeps the same one.
            paragraph_id = uuid4() if word_type == "paragraph_end" else paragraph_id

    # Emit any trailing partial sentence; a cut mid-word is reported as a
    # "sentence_cut" rather than the raw "word" type.
    if len(sentence) > 0:
        section_end = "sentence_cut" if word_type_state == "word" else word_type_state
        yield (
            paragraph_id,
            sentence,
            word_count,
            section_end,
        )
Loading

0 comments on commit 2bfaec4

Please sign in to comment.