ingest_data.py
import os
import tempfile

from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from db import VectorDB
from llm import LLM


class IngestData(VectorDB, LLM):
    """
    A class used to ingest data from PDF files, summarize the content, split the text
    into chunks, and load the data into a database.

    Attributes
    ----------
    pdf_paths : list
        A list of PDF files to be ingested (file-like objects exposing ``read()``,
        e.g. uploaded files).
    text : str, optional
        Text content to be processed (default is None).
    chunks_size : int
        The size of the chunks to split the text into (default is 500).
    overlap_size : int
        The size of the overlap between chunks (default is 50).

    Methods
    -------
    summary(pdf_content)
        Summarizes the content of the PDF files.
    splittext(pdf_content)
        Splits the text content of the PDF files into chunks.
    load_data_to_db()
        Loads the data from the PDF files into the database.
    """

    def __init__(self, pdf_paths, text=None):
        self.pdf_paths = pdf_paths
        self.text = text
        self.chunks_size = 500
        self.overlap_size = 50
        super().__init__(text)

    def summary(self, pdf_content):
        """
        Summarizes the content of the PDF files.

        Parameters
        ----------
        pdf_content : list
            A list of PDF pages (Document objects) to be summarized.

        Returns
        -------
        dict
            The chain output; the generated summary is stored under the
            "output_text" key.
        """
        documents = [Document(page_content=data.page_content) for data in pdf_content]
        llm = LLM().init_llm()
        chain = load_summarize_chain(
            llm,
            chain_type="stuff",
            verbose=False,
        )
        summaries = chain.invoke(documents)
        return summaries
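
    # Illustrative note, not in the original file: with LangChain's "stuff"
    # summarize chain, the mapping returned by summary() typically carries the
    # generated summary under the "output_text" key, e.g.
    #
    #     result = ingestor.summary(pdf_loader.load())   # "ingestor" is hypothetical
    #     print(result["output_text"])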

    def splittext(self, pdf_content):
        """
        Splits the text content of the PDF files into chunks.

        Parameters
        ----------
        pdf_content : list
            A list of PDF pages (Document objects) to be split into chunks.

        Returns
        -------
        list
            A list of text chunks (Document objects).
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunks_size,
            chunk_overlap=self.overlap_size,
        )
        chunks = splitter.split_documents(pdf_content)
        return chunks
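
    # Illustrative note, not in the original file: splittext() yields Document
    # chunks whose page_content is at most roughly chunks_size characters, with
    # neighbouring chunks sharing about overlap_size characters of context, e.g.
    #
    #     chunks = ingestor.splittext(pdf_loader.load())   # "ingestor" is hypothetical
    #     print(len(chunks), len(chunks[0].page_content))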

    def load_data_to_db(self):
        """
        Loads the data from the PDF files into the database.

        This method processes each document, creates a temporary file to store the
        content, loads the PDF, summarizes the content, splits the text, and stores
        the data in the database.
        """
        # Verify that there are PDF files to process
        if self.pdf_paths is not None:
            # Process each document
            for document in self.pdf_paths:
                # Create a temporary file to store the content of the uploaded file
                with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
                    temp_file.write(document.read())  # Write the content of the uploaded file
                    doc = temp_file.name
                try:
                    # Load and read the PDF
                    pdf_loader = PyPDFLoader(doc)
                    pdf_read = pdf_loader.load()
                    # Get the summary and store it ("output_text" holds the generated summary)
                    summary_result = self.summary(pdf_read)
                    VectorDB(
                        text=summary_result["output_text"],
                        type_collection="Summary",
                    ).create_and_store_embedding()
                    # Split the text and store the chunks
                    split_result = self.splittext(pdf_read)
                    VectorDB(
                        text=split_result,
                        type_collection="Splited_text",
                    ).create_and_store_embedding()
                    # Store the full document
                    VectorDB(
                        text=pdf_read,
                        type_collection="Documents",
                    ).create_and_store_embedding()
                finally:
                    # Make sure the temporary file is removed
                    os.unlink(doc)
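

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module. It assumes that the
    # local db.VectorDB and llm.LLM classes are already configured (API keys,
    # connection settings, etc.), and that pdf_paths holds file-like objects
    # exposing read(), since load_data_to_db() reads each item before writing it
    # to a temporary PDF. The file name "example.pdf" is only a placeholder.
    with open("example.pdf", "rb") as uploaded_pdf:
        ingestor = IngestData(pdf_paths=[uploaded_pdf])
        ingestor.load_data_to_db()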