-
Notifications
You must be signed in to change notification settings - Fork 0
/
ingestion.py
26 lines (22 loc) · 897 Bytes
/
ingestion.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
load_dotenv()
# Environment variables the ingestion run reads; checked before any work starts.
REQUIRED_ENV_VARS = ("OPENAI_API_KEY", "INDEX_NAME")


def missing_env_vars(names):
    """Return the entries of *names* that are unset or empty in the environment.

    Args:
        names: Iterable of environment variable names to check.

    Returns:
        A list of the missing names, in the order they were given. The list
        is empty when every variable has a non-empty value.
    """
    return [name for name in names if not os.environ.get(name)]


if __name__ == "__main__":
    # Fail fast with one readable message instead of a late KeyError (or a
    # silently passed ``None`` API key) after loading and splitting are done.
    missing = missing_env_vars(REQUIRED_ENV_VARS)
    if missing:
        raise SystemExit(
            f"Missing required environment variables: {', '.join(missing)}"
        )

    print("Ingesting...")
    # Pin the encoding so decoding does not depend on the platform locale.
    # NOTE(review): assumes mediumblog1.txt is UTF-8 -- confirm.
    loader = TextLoader("mediumblog1.txt", encoding="utf-8")
    documents = loader.load()

    print("Splitting...")
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    # `texts` holds the chunks produced by splitting the loaded document(s).
    texts = text_splitter.split_documents(documents)
    print(f"Created {len(texts)} chunks")

    embeddings = OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"])

    print("Storing into Vector DB...")
    # Hand the chunks and the embedding model to the vector store, targeting
    # the index named by INDEX_NAME.
    PineconeVectorStore.from_documents(
        texts, embeddings, index_name=os.environ["INDEX_NAME"]
    )
    print("Finished")