Skip to content

Commit

Permalink
Merge pull request #8 from labrijisaad/update_setup_semantic_database
Browse files Browse the repository at this point in the history
Updated setup semantic database
  • Loading branch information
labrijisaad authored Apr 13, 2024
2 parents 7b909ee + a46fb30 commit 233ae0b
Show file tree
Hide file tree
Showing 8 changed files with 151 additions and 11 deletions.
10 changes: 10 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,14 @@ app:
@$(VENV_ACTIVATE)
@python app.py

# Run the app2.py entry point with Python.
# The leading "@" on each recipe line stops make from echoing the command.
# NOTE(review): VENV_ACTIVATE is not defined in this excerpt; presumably it is
# set near the top of the Makefile - confirm.
app2:
@$(VENV_ACTIVATE)
@python app2.py

# Launch the Streamlit UI defined in streamlit_app.py.
# The script path is written without the Windows-only ".\" prefix: under a
# POSIX shell the backslash is consumed as an escape character, so the old
# form resolved to ".streamlit_app.py" and failed. A bare relative path works
# on every platform.
stream:
	@$(VENV_ACTIVATE)
	@streamlit run streamlit_app.py

test:
@$(VENV_ACTIVATE)
@pytest tests/
Expand All @@ -60,4 +68,6 @@ help:
@echo make jupy - Activate the virtual environment and run Jupyter Lab
@echo make test - Tests the code using pytest
@echo make app - Runs the App
@echo make app2 - Runs the App2
@echo make stream - Runs the streamlit app
@echo Author: $(AUTHOR)
1 change: 1 addition & 0 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def main():
embedding_model="text-embedding-3-small",
save_index=True,
directory_path=output_directory,
markdown_content=""
)
print(f"Total cost for setting up the semantic database: ${total_cost}")

Expand Down
2 changes: 1 addition & 1 deletion app2.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def main():

# Proceed with the rest of the querying process
similar_docs = query_pipeline.find_similar_documents(
query_text=user_query, num_results=2
query_text=user_query, num_results=4
)

context_enhanced_prompt, expertise_area_cost = (
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ openai
tqdm
faiss-cpu
pandas
pyyaml
pyyaml
streamlit
19 changes: 10 additions & 9 deletions src/pipelines/query_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,26 +20,27 @@ def set_model(self, model_name):
# Assuming you might also need to set the model in ModelInferenceManager if necessary.

def setup_semantic_database(
self, markdown_path, embedding_model, save_index=False, directory_path=None
self, markdown_path=None,embedding_model=None, save_index=False, directory_path=None, markdown_content=None
):
# Ensure the embedding model is set
self.embedder.set_model(embedding_model)

# Process markdown to extract texts
self.embedder.read_and_process_markdown(markdown_path)
# Check if content is directly provided, otherwise read from the path
if markdown_content:
# Directly use provided markdown content
self.embedder.texts = [self.embedder.preprocess_text(text) for text in markdown_content.split("\n\n")]
elif markdown_path:
# Read and process markdown file if path is provided
self.embedder.read_and_process_markdown(markdown_path)

# Generate unique filenames for saving the index and texts
timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
index_filename = f"faiss_db_{timestamp}.bin"
texts_filename = f"faiss_db_{timestamp}.json"

# Generate full file paths
index_path = (
os.path.join(directory_path, index_filename) if directory_path else None
)
texts_path = (
os.path.join(directory_path, texts_filename) if directory_path else None
)
index_path = os.path.join(directory_path, index_filename) if directory_path else None
texts_path = os.path.join(directory_path, texts_filename) if directory_path else None

# Generate embeddings and calculate the total cost
total_cost = self.embedder.generate_embeddings()
Expand Down
127 changes: 127 additions & 0 deletions streamlit_app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
import streamlit as st
from src.pipelines.query_pipeline import QueryPipeline
from src.utils.utils import load_models_config, load_credentials


def read_file_content(uploaded_file):
    """Return the text of an uploaded file decoded as UTF-8.

    Args:
        uploaded_file: File-like object exposing ``getvalue()`` that returns
            the raw bytes of the upload.

    Returns:
        The decoded text. A leading UTF-8 byte-order mark, if present, is
        dropped so it does not end up inside the first text chunk.

    Raises:
        UnicodeDecodeError: If the bytes are not valid UTF-8.
    """
    # "utf-8-sig" decodes plain UTF-8 identically but also strips a BOM,
    # which some editors prepend to saved files.
    return uploaded_file.getvalue().decode("utf-8-sig")


def get_embedding_models(models_config):
    """
    List the embedding models declared in the models configuration.

    Walks ``models_config['models']`` and, for every group whose ``'name'`` is
    exactly ``'Embedding models'``, reads each entry of its ``'variants'``.

    Returns a list of ``(model, usage_price_per_token)`` tuples in
    configuration order.
    """
    name_price_pairs = []
    for family in models_config['models']:
        # Only the dedicated embedding group contributes entries
        if family['name'] != 'Embedding models':
            continue
        for variant in family['variants']:
            name_price_pairs.append(
                (variant['model'], variant['usage_price_per_token'])
            )
    return name_price_pairs

def get_llm_models(models_config):
    """
    List the LLM models declared in the models configuration.

    A group counts as an LLM group when its ``'name'`` contains the substring
    ``'GPT'``. Each entry of such a group's ``'variants'`` is collected.

    Returns a list of ``(model, input_price_per_token, output_price_per_token)``
    tuples in configuration order.
    """
    model_triples = []
    for family in models_config['models']:
        # LLM groups are recognised purely by 'GPT' appearing in the group name
        if 'GPT' not in family['name']:
            continue
        model_triples.extend(
            (
                variant['model'],
                variant['input_price_per_token'],
                variant['output_price_per_token'],
            )
            for variant in family['variants']
        )
    return model_triples



def main():
    """Render the Streamlit page for the LLM RAG application.

    Loads credentials and the models configuration from disk, draws the
    sidebar (embedding model, LLM, temperature and max-token selectors plus
    links), and shows two tabs:

    * "Database Setup" - upload markdown files, preview them, and build the
      semantic database through ``QueryPipeline.setup_semantic_database``.
    * "RAG Query" - header only for now.
    """
    st.title("LLM RAG Application")

    # Global parameters and config/secrets loading
    credentials = load_credentials("secrets/credentials.yml")
    openai_api_key = credentials["OPENAI_CREDENTIALS"]
    models_config = load_models_config("config/models_config.yml")

    # Sidebar configuration - Embedding Models Settings
    st.sidebar.markdown("## ✔️ OpenAI *Embedding Model* Settings")
    embedding_model_info = get_embedding_models(models_config)
    model_names = [model[0] for model in embedding_model_info]
    model_prices = {model[0]: model[1] for model in embedding_model_info}
    selected_model = st.sidebar.selectbox("Choose the embedding model", model_names)
    selected_model_price = model_prices[selected_model]
    st.sidebar.markdown(f"Selected Model: **`{selected_model}`**")
    st.sidebar.markdown(f"Price per **1M token**: **`{selected_model_price*1000000:.2f} $`**")
    st.sidebar.markdown("---")

    # Sidebar configuration - LLMs Settings
    st.sidebar.markdown("## ⚙️ OpenAI *LLMs* Settings")
    llm_model_info = get_llm_models(models_config)
    llm_model_names = [model[0] for model in llm_model_info]
    llm_input_prices = {model[0]: model[1] for model in llm_model_info}
    llm_output_prices = {model[0]: model[2] for model in llm_model_info}
    selected_llm_model = st.sidebar.selectbox("Choose the LLM model", llm_model_names)
    st.sidebar.markdown(f"Selected Model: **`{selected_llm_model}`**")
    st.sidebar.markdown(f"Input price per **1K tokens**: **`{llm_input_prices[selected_llm_model]*1000:.4f} $`**")
    st.sidebar.markdown(f"Output price per **1K tokens**: **`{llm_output_prices[selected_llm_model]*1000:.4f} $`**")
    st.sidebar.markdown("## 🔥 Model Temperature")
    temperature = st.sidebar.slider("Select the LLM temperature", min_value=0.0, max_value=1.0, value=0.7, step=0.01)
    st.sidebar.markdown(f"Selected Temperature: **`{temperature}`**")
    st.sidebar.markdown("## ⚡ Max Completion Tokens")
    max_tokens = st.sidebar.slider("Select the LLM Max Completion Tokens", min_value=50, value=500, max_value=1500)
    st.sidebar.markdown(f"Selected Max Completion Tokens: **`{max_tokens}`**")
    st.sidebar.markdown("---")

    # Quick Links
    st.sidebar.markdown("## 🌐 Connect with Me")
    st.sidebar.markdown("[![LinkedIn](https://img.shields.io/badge/LinkedIn-%230077B5.svg?&style=for-the-badge&logo=linkedin&logoColor=white)](https://www.linkedin.com/in/labrijisaad/) [![GitHub](https://img.shields.io/badge/GitHub-100000?style=for-the-badge&logo=github&logoColor=white)](https://github.com/labrijisaad)")
    st.sidebar.markdown("## 🔗 Quick Links")
    st.sidebar.markdown("[View on GitHub](https://github.com/labrijisaad/LLM-RAG)", unsafe_allow_html=True)

    tab1, tab2 = st.tabs(["Database Setup", "RAG Query"])

    with tab1:
        st.header("Setup Database")
        uploaded_files = st.file_uploader("Upload markdown files:", type=['md'], accept_multiple_files=True, help="Upload markdown files for processing.")

        if uploaded_files:
            uploaded_file_names = ", ".join(f"'{f.name}'" for f in uploaded_files)
            st.success(f"**Uploaded Files:** **`{uploaded_file_names}`**")

            file_contents = []
            for uploaded_file in uploaded_files:
                file_content = read_file_content(uploaded_file)
                file_contents.append(file_content)
                with st.expander(f"View Content of **`{uploaded_file.name}`**"):
                    st.markdown("### File Content Preview 👀")
                    st.code(file_content, language='markdown')

            # The pipeline splits the text into chunks on blank lines ("\n\n").
            # Join files with exactly one blank line so the last paragraph of
            # one file never merges with the first paragraph of the next, and
            # skip empty files so no empty chunk is produced between them.
            markdown_content = "\n\n".join(
                content.strip("\r\n") for content in file_contents if content.strip()
            )

            query_pipeline = QueryPipeline(openai_api_key, models_config)
            # Directory to save index and texts
            output_directory = "data/processed"
            if st.button("Create Database", key="create_db"):
                if not markdown_content:
                    # With no text and no path the pipeline would load nothing
                    # and still try to generate embeddings.
                    st.error("The uploaded files contain no text to index.")
                else:
                    with st.spinner("Creating database from files..."):
                        total_cost = query_pipeline.setup_semantic_database(
                            markdown_path="",
                            embedding_model=selected_model,
                            save_index=True,
                            directory_path=output_directory,
                            markdown_content=markdown_content,
                        )
                    st.success(f"Database created successfully! Total cost: ${total_cost}")

        else:
            st.info("Upload markdown files to proceed with database setup.")

    with tab2:
        st.header("Perform RAG Query")

if __name__ == "__main__":
main()
Empty file removed streamlit_app/__init__.py
Empty file.
Empty file removed streamlit_app/app.py
Empty file.

0 comments on commit 233ae0b

Please sign in to comment.