Merge pull request #8 from labrijisaad/update_setup_semantic_database

Updated setup semantic database
labrijisaad · Apr 13, 2024 · 233ae0b · 233ae0b
2 parents 7b909ee + a46fb30
commit 233ae0b
Show file tree

Hide file tree

Showing 8 changed files with 151 additions and 11 deletions.
diff --git a/Makefile b/Makefile
@@ -50,6 +50,14 @@ app:
 	@$(VENV_ACTIVATE)
 	@python app.py
 
+app2:  # Run app2.py after the $(VENV_ACTIVATE) step
+	@$(VENV_ACTIVATE)
+	@python app2.py
+
+stream:  # Launch the Streamlit UI; a plain relative path works under both cmd and POSIX shells
+	@$(VENV_ACTIVATE)
+	@streamlit run streamlit_app.py
+
 test:
 	@$(VENV_ACTIVATE)
 	@pytest tests/
@@ -60,4 +68,6 @@ help:
 	@echo   make jupy                                             - Activate the virtual environment and run Jupyter Lab
 	@echo   make test                                             - Tests the code using pytest
 	@echo   make app                                              - Runs the App
+	@echo   make app2                                             - Runs the App2
+	@echo   make stream                                           - Runs the streamlit app
 	@echo Author: $(AUTHOR)
diff --git a/app.py b/app.py
@@ -18,6 +18,7 @@ def main():
         embedding_model="text-embedding-3-small",
         save_index=True,
         directory_path=output_directory,
+        markdown_content=""
     )
     print(f"Total cost for setting up the semantic database: ${total_cost}")
 

diff --git a/app2.py b/app2.py
@@ -25,7 +25,7 @@ def main():
 
     # Proceed with the rest of the querying process
     similar_docs = query_pipeline.find_similar_documents(
-        query_text=user_query, num_results=2
+        query_text=user_query, num_results=4
     )
 
     context_enhanced_prompt, expertise_area_cost = (

diff --git a/requirements.txt b/requirements.txt
@@ -6,4 +6,5 @@ openai
 tqdm
 faiss-cpu
 pandas
-pyyaml
+pyyaml
+streamlit
diff --git a/src/pipelines/query_pipeline.py b/src/pipelines/query_pipeline.py
@@ -20,26 +20,27 @@ def set_model(self, model_name):
         # Assuming you might also need to set the model in ModelInferenceManager if necessary.
 
     def setup_semantic_database(
-        self, markdown_path, embedding_model, save_index=False, directory_path=None
+        self, markdown_path=None,embedding_model=None, save_index=False, directory_path=None, markdown_content=None
     ):
         # Ensure the embedding model is set
         self.embedder.set_model(embedding_model)
 
-        # Process markdown to extract texts
-        self.embedder.read_and_process_markdown(markdown_path)
+        # Check if content is directly provided, otherwise read from the path
+        if markdown_content:
+            # Directly use provided markdown content
+            self.embedder.texts = [self.embedder.preprocess_text(text) for text in markdown_content.split("\n\n")]
+        elif markdown_path:
+            # Read and process markdown file if path is provided
+            self.embedder.read_and_process_markdown(markdown_path)
 
         # Generate unique filenames for saving the index and texts
         timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
         index_filename = f"faiss_db_{timestamp}.bin"
         texts_filename = f"faiss_db_{timestamp}.json"
 
         # Generate full file paths
-        index_path = (
-            os.path.join(directory_path, index_filename) if directory_path else None
-        )
-        texts_path = (
-            os.path.join(directory_path, texts_filename) if directory_path else None
-        )
+        index_path = os.path.join(directory_path, index_filename) if directory_path else None
+        texts_path = os.path.join(directory_path, texts_filename) if directory_path else None
 
         # Generate embeddings and calculate the total cost
         total_cost = self.embedder.generate_embeddings()

diff --git a/streamlit_app.py b/streamlit_app.py
@@ -0,0 +1,127 @@
+import streamlit as st
+from src.pipelines.query_pipeline import QueryPipeline
+from src.utils.utils import load_models_config, load_credentials
+
+
+def read_file_content(uploaded_file):
+    """Return the uploaded file's raw bytes decoded as UTF-8 text."""
+    return str(uploaded_file.getvalue(), encoding="utf-8")
+
+
+def get_embedding_models(models_config):
+    """Return (model name, usage price per token) pairs for each variant of the 'Embedding models' groups."""
+    pairs = []
+    for group in models_config['models']:
+        if group['name'] == 'Embedding models':
+            pairs.extend((variant['model'], variant['usage_price_per_token']) for variant in group['variants'])
+    return pairs
+
+def get_llm_models(models_config):
+    """
+    Collect (model name, input price per token, output price per token) triples for the LLM groups.
+
+    A group is treated as an LLM group when its name contains 'GPT'.
+    """
+    triples = []
+    for group in models_config['models']:
+        if 'GPT' in group['name']:
+            triples.extend((variant['model'], variant['input_price_per_token'], variant['output_price_per_token']) for variant in group['variants'])
+    return triples
+
+
+
+def main():
+    """Render the Streamlit UI: sidebar model settings plus the database-setup and RAG-query tabs."""
+    st.title("LLM RAG Application")
+
+    # Global parameters and config/secrets loading
+    credentials = load_credentials("secrets/credentials.yml")
+    openai_api_key = credentials["OPENAI_CREDENTIALS"]
+    models_config = load_models_config("config/models_config.yml")
+
+    # Sidebar configuration - Embedding Models Settings
+    st.sidebar.markdown("## ✔️ OpenAI *Embedding Model* Settings")
+    embedding_model_info = get_embedding_models(models_config)
+    model_names = [model[0] for model in embedding_model_info]
+    model_prices = {model[0]: model[1] for model in embedding_model_info}
+    selected_model = st.sidebar.selectbox("Choose the embedding model", model_names)
+    selected_model_price = model_prices[selected_model]
+    st.sidebar.markdown(f"Selected Model: **`{selected_model}`**")
+    st.sidebar.markdown(f"Price per **1M token**: **`{selected_model_price*1000000:.2f} $`**")
+    st.sidebar.markdown("---")
+
+    # Sidebar configuration - LLMs Settings
+    st.sidebar.markdown("## ⚙️ OpenAI *LLMs* Settings")
+    llm_model_info = get_llm_models(models_config)
+    llm_model_names = [model[0] for model in llm_model_info]
+    llm_input_prices = {model[0]: model[1] for model in llm_model_info}
+    llm_output_prices = {model[0]: model[2] for model in llm_model_info}
+    selected_llm_model = st.sidebar.selectbox("Choose the LLM model", llm_model_names)
+    st.sidebar.markdown(f"Selected Model: **`{selected_llm_model}`**")
+    st.sidebar.markdown(f"Input price per **1K tokens**: **`{llm_input_prices[selected_llm_model]*1000:.4f} $`**")
+    st.sidebar.markdown(f"Output price per **1K tokens**: **`{llm_output_prices[selected_llm_model]*1000:.4f} $`**")
+    st.sidebar.markdown("## 🔥 Model Temperature")
+    temperature = st.sidebar.slider("Select the LLM temperature", min_value=0.0, max_value=1.0, value=0.7, step=0.01)
+    st.sidebar.markdown(f"Selected Temperature: **`{temperature}`**")
+    st.sidebar.markdown("## ⚡ Max Completion Tokens")
+    max_tokens = st.sidebar.slider("Select the LLM Max Completion Tokens", min_value=50, value=500, max_value=1500)
+    st.sidebar.markdown(f"Selected Max Completion Tokens: **`{max_tokens}`**")
+    st.sidebar.markdown("---")
+
+    # Quick Links
+    st.sidebar.markdown("## 🌐 Connect with Me")
+    st.sidebar.markdown("[![LinkedIn](https://img.shields.io/badge/LinkedIn-%230077B5.svg?&style=for-the-badge&logo=linkedin&logoColor=white)](https://www.linkedin.com/in/labrijisaad/) [![GitHub](https://img.shields.io/badge/GitHub-100000?style=for-the-badge&logo=github&logoColor=white)](https://github.com/labrijisaad)")
+    st.sidebar.markdown("## 🔗 Quick Links")
+    st.sidebar.markdown("[View on GitHub](https://github.com/labrijisaad/LLM-RAG)", unsafe_allow_html=True)
+
+    # Main area: one tab builds the semantic database, the other is reserved for querying.
+    tab1, tab2 = st.tabs(["Database Setup", "RAG Query"])
+
+    with tab1:
+        st.header("Setup Database")
+        uploaded_files = st.file_uploader("Upload markdown files:", type=['md'], accept_multiple_files=True, help="Upload markdown files for processing.")
+
+        if uploaded_files:
+            uploaded_file_names = ", ".join(["'"+f.name+"'" for f in uploaded_files])
+            st.success(f"**Uploaded Files:** **`{uploaded_file_names}`**")
+
+            file_contents = []
+            for uploaded_file in uploaded_files:
+                with st.expander(f"View Content of **`{uploaded_file.name}`**"):
+                    file_content = read_file_content(uploaded_file)
+                    # Trailing line breaks are dropped so that each file ends up separated
+                    # from the next one by exactly one blank line (see the join below).
+                    file_contents.append(file_content.rstrip("\r\n"))
+                    st.markdown("### File Content Preview 👀")
+                    st.code(file_content, language='markdown')
+
+            # setup_semantic_database() splits the content into chunks on blank lines
+            # ("\n\n"); joining the files with a blank line keeps the last paragraph of
+            # one file from being merged with the first paragraph of the next file.
+            markdown_content = "\n\n".join(file_contents)
+
+            query_pipeline = QueryPipeline(openai_api_key, models_config)
+            # Directory to save index and texts
+            output_directory = "data/processed"
+            if st.button("Create Database", key="create_db"):
+                with st.spinner("Creating database from files..."):
+
+                    total_cost = query_pipeline.setup_semantic_database(
+                        markdown_path="",
+                        embedding_model=selected_model,
+                        save_index=True,
+                        directory_path=output_directory,
+                        markdown_content=markdown_content
+                    )
+                    st.success(f"Database created successfully! Total cost: ${total_cost}")
+
+        else:
+            st.info("Upload markdown files to proceed with database setup.")
+
+    with tab2:
+        st.header("Perform RAG Query")
+        # Placeholder: no query UI is implemented in this tab yet.
+        pass
+
+if __name__ == "__main__":  # Build the UI only when this file is executed as a script, not on import
+    main()
diff --git a/streamlit_app/__init__.py b/streamlit_app/__init__.py
diff --git a/streamlit_app/app.py b/streamlit_app/app.py