
Merge pull request #4 from labrijisaad/dev
Created the App Logic.
labrijisaad authored Apr 7, 2024
2 parents 5eeb145 + e73804a commit 6eb1f08
Showing 13 changed files with 488 additions and 46 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/code_quality.yml
@@ -4,9 +4,11 @@ on:
push:
branches:
- main
- dev
pull_request:
branches:
- main
- dev

jobs:
code-quality-check:
44 changes: 3 additions & 41 deletions Makefile
@@ -41,55 +41,17 @@ else
POWERSHELL_CMD := pwsh
endif

# Initialization of project structure and files
init:
@mkdir "$(DATA_RAW_DIR)" "$(DATA_PROCESSED_DIR)" "$(DATA_EXTERNAL_DIR)" "$(NOTEBOOKS_DIR)" "$(DOCS_DIR)" 2> NUL || echo "Directories already exist."
@echo. 2> "$(DATA_RAW_DIR)\$(GITKEEP_FILE)" || echo "File $(DATA_RAW_DIR)\$(GITKEEP_FILE) already exists."
@echo. 2> "$(DATA_PROCESSED_DIR)\$(GITKEEP_FILE)" || echo "File $(DATA_PROCESSED_DIR)\$(GITKEEP_FILE) already exists."
@echo. 2> "$(DATA_EXTERNAL_DIR)\$(GITKEEP_FILE)" || echo "File $(DATA_EXTERNAL_DIR)\$(GITKEEP_FILE) already exists."
@echo. 2> "$(NOTEBOOKS_DIR)\$(GITKEEP_FILE)" || echo "File $(NOTEBOOKS_DIR)\$(GITKEEP_FILE) already exists."
@echo. 2> "$(DOCS_DIR)\$(GITKEEP_FILE)" || echo "File $(DOCS_DIR)\$(GITKEEP_FILE) already exists."
@echo # Project Title > "$(README_FILE)"
@echo. >> "$(README_FILE)"
@echo ## Connect >> "$(README_FILE)"
@echo - 🔗 Feel free to connect with me on [LinkedIn](https://www.linkedin.com/in/labrijisaad/) >> "$(README_FILE)"
@echo author: labriji saad > "$(CONFIG_FILE)"
@echo # Add your environment variables here > "$(ENV_FILE)"
@echo # Add files and directories to ignore in version control > "$(GITIGNORE_FILE)"
@echo # Add your project dependencies here > "$(REQUIREMENTS_FILE)"
@echo jupyterlab >> "$(REQUIREMENTS_FILE)" # Add jupyterlab as a default requirement
@echo ipywidgets >> "$(REQUIREMENTS_FILE)" # Add ipywidgets as a default requirement
@echo ">>>>>> Project structure initialized successfully <<<<<<"

# Setup the virtual environment and install dependencies
setup:
@python -m venv $(VENV_NAME)
@$(VENV_ACTIVATE)
@python.exe -m pip install --upgrade pip
@pip install -r $(REQUIREMENTS_FILE)
@echo ">>>>>> Environment is ready <<<<<<"

# Update dependencies in the virtual environment
update:
@$(VENV_ACTIVATE) && python.exe -m pip install --upgrade pip && pip install -r $(REQUIREMENTS_FILE)
@echo ">>>>>> Dependencies updated <<<<<<"

# Activate the virtual environment and run Jupyter Lab
jupy:
@$(VENV_ACTIVATE) && jupyter lab
@echo ">>>>>> Jupyter Lab is running <<<<<<"

# Clean up the virtual environment and generated files
clean:
@$(DELETE_CMD) $(VENV_NAME)
@echo ">>>>>> Cleaned up environment <<<<<<"
app:
@python app.py

# Display available make targets
help:
@echo Available targets:
@echo make init - Initialize the project's structure and essential files
@echo make setup - Create a virtual environment and install dependencies
@echo make update - Update dependencies in the virtual environment
@echo make clean - Clean up the virtual environment and generated files
@echo make jupy - Activate the virtual environment and run Jupyter Lab
@echo make app - Run the app
@echo Author: $(AUTHOR)
41 changes: 41 additions & 0 deletions README.md
@@ -30,6 +30,47 @@ graph TD
style K fill:#e07b53,stroke:#fff,stroke-width:2px
```


## Project Architecture
The project is organized as follows, for modularity and ease of maintenance:

```
LLM-RAG/
├── src/ # Source code for the application
│ │
│ ├── models/
│ │ ├── inference.py # ModelInferenceManager class
│ │ └── vectorization.py # SemanticVectorizer class
│ │
│ ├── pipelines/ # Pipeline for processing queries
│ │ └── query_pipeline.py # QueryPipeline class
│ │
│ ├── utils/ # Utility functions and classes
│ │ └── utils.py # Helper functions, e.g., for loading configs
│ │
│ └── __init__.py # Makes src a Python module
├── configs/ # Configuration files
│ └── models_config.yml # Model configurations
├── data/ # Data used by the application
│ ├── raw/ # Raw data like markdown files
│ ├── processed/ # Processed data like embeddings
│ └── faiss_index/ # FAISS indices
├── notebooks/ # Jupyter notebooks for experiments
│ └── rag_llm_experiments.ipynb
├── secrets/ # Secret keys and credentials
│ └── credentials.yml # OpenAI API credentials
├── app.py # Main Streamlit application script
├── requirements.txt # Python dependencies for the project
├── README.md # Project documentation
└── .gitignore # Specifies files to ignore in git
```
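The two loaders imported by `app.py` and `app2.py` (`load_credentials`, `load_models_config`) live in `src/utils/utils.py`, which is not shown in this diff. A minimal sketch of what they might look like, assuming both files are plain YAML:

```python
# Hypothetical sketch of src/utils/utils.py — these helpers are not part
# of this diff; the real implementations may differ.
import yaml


def load_credentials(path):
    """Load API credentials (e.g. secrets/credentials.yml) into a dict."""
    with open(path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)


def load_models_config(path):
    """Load model configurations (e.g. models_config.yml) into a dict."""
    with open(path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)
```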

## Connect with me 🌐
<div align="center">
<a href="https://www.linkedin.com/in/labrijisaad/">
59 changes: 59 additions & 0 deletions app.py
@@ -0,0 +1,59 @@
from src.pipelines.query_pipeline import QueryPipeline
from src.utils.utils import load_models_config, load_credentials


def main():
# Load OpenAI API Key and Model Configurations
credentials = load_credentials("secrets/credentials.yml")
openai_api_key = credentials["OPENAI_CREDENTIALS"]
models_config = load_models_config("config/models_config.yml")

# Initialize the QueryPipeline
query_pipeline = QueryPipeline(openai_api_key, models_config)

# Set up the semantic database (example path and model)
total_cost = query_pipeline.setup_semantic_database(
markdown_path="data/raw/mock_markdown.md",
embedding_model="text-embedding-3-small",
save_index=True,
index_path="data/processed/faiss_index.bin",
)
print(f"Total cost for setting up the semantic database: ${total_cost}")

# Example query
user_query = input("Enter your query: ")

# Find similar documents
similar_docs = query_pipeline.find_similar_documents(
query_text=user_query, num_results=3
)

# Determine expertise area and prepare the prompt
context_enhanced_prompt, expertise_area_cost = (
query_pipeline.determine_expertise_and_prepare_prompt(
user_query=user_query,
similar_docs=similar_docs,
inference_model="gpt-3.5-turbo-0125",
max_completion_tokens=150,
temperature=0.2,
)
)
print(f"Cost for determining expertise area: ${expertise_area_cost}")

# Query the model for a response
contextual_response, response_cost = query_pipeline.query_model_for_response(
context_enhanced_prompt=context_enhanced_prompt,
max_completion_tokens=1500,
temperature=0.7,
)
print(f"Cost for querying the model for a response: ${response_cost}")

# Output the response
print("--------\nContextual Prompt:\n--------")
print(context_enhanced_prompt)
print("--------\nResponse:\n--------")
print(contextual_response)


if __name__ == "__main__":
main()
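The `credentials["OPENAI_CREDENTIALS"]` lookup above implies a `secrets/credentials.yml` of roughly this shape (the key name comes from the code; the value is a placeholder):

```yaml
# secrets/credentials.yml — assumed layout; keep this file out of version control
OPENAI_CREDENTIALS: sk-your-openai-api-key
```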
55 changes: 55 additions & 0 deletions app2.py
@@ -0,0 +1,55 @@
from src.pipelines.query_pipeline import QueryPipeline
from src.utils.utils import load_models_config, load_credentials


def main():
# Load OpenAI API Key and Model Configurations
credentials = load_credentials("secrets/credentials.yml")
openai_api_key = credentials["OPENAI_CREDENTIALS"]
models_config = load_models_config("config/models_config.yml")

# Initialize the QueryPipeline
query_pipeline = QueryPipeline(openai_api_key, models_config)

# Set the model
query_pipeline.set_model("text-embedding-3-small")

# Load the semantic database FAISS index
index_path = "data/processed/faiss_index.bin"
query_pipeline.load_faiss_index(index_path)

# Example query
user_query = input("Enter your query: ")

# Proceed with the rest of the querying process
similar_docs = query_pipeline.find_similar_documents(
query_text=user_query, num_results=3
)

context_enhanced_prompt, expertise_area_cost = (
query_pipeline.determine_expertise_and_prepare_prompt(
user_query=user_query,
similar_docs=similar_docs,
inference_model="gpt-3.5-turbo-0125",
max_completion_tokens=150,
temperature=0.2,
)
)
print(f"Cost for determining expertise area: ${expertise_area_cost}")

contextual_response, response_cost = query_pipeline.query_model_for_response(
context_enhanced_prompt=context_enhanced_prompt,
max_completion_tokens=1500,
temperature=0.7,
)
print(f"Cost for querying the model for a response: ${response_cost}")

# Output the response
print("--------\nContextual Prompt:\n--------")
print(context_enhanced_prompt)
print("--------\nResponse:\n--------")
print(contextual_response)


if __name__ == "__main__":
main()
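Taken together, `app.py` builds the semantic database from the raw markdown (paying the embedding cost once) and persists the FAISS index, while `app2.py` skips that step by calling `load_faiss_index` on the saved index, so repeat runs avoid re-embedding.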
1 change: 0 additions & 1 deletion config.yaml

This file was deleted.

Binary file renamed faiss_index.bin → data/processed/faiss_index.bin
15 changes: 11 additions & 4 deletions notebooks/RAG_LLM_experiments.ipynb
@@ -230,7 +230,7 @@
" )\n",
" return None, None\n",
"\n",
" def generate_embeddings(self):\n",
" def generate_embeddings(self, save_index=False, index_path=None):\n",
" total_cost = 0\n",
" for text in tqdm(self.texts):\n",
" embedding, usage = self.query_openai_embedding(text)\n",
@@ -239,6 +239,10 @@
" total_cost += self.calculate_cost(usage)\n",
" self.embeddings = np.array(self.embeddings)\n",
" self.create_faiss_index()\n",
"\n",
" if save_index and index_path:\n",
" self.save_faiss_index(index_path)\n",
"\n",
" return total_cost\n",
"\n",
" def create_faiss_index(self):\n",
@@ -299,11 +303,14 @@
" openai_api_key, models_config\n",
" )\n",
"\n",
" def setup_semantic_database(self, markdown_path, embedding_model):\n",
" def setup_semantic_database(\n",
" self, markdown_path, embedding_model, save_index=False, index_path=None\n",
" ):\n",
" self.embedder.set_model(embedding_model)\n",
" self.embedder.read_and_process_markdown(markdown_path)\n",
" total_cost = self.embedder.generate_embeddings()\n",
" self.embedder.create_faiss_index()\n",
" total_cost = self.embedder.generate_embeddings(\n",
" save_index=save_index, index_path=index_path\n",
" )\n",
" return total_cost\n",
"\n",
" def find_similar_documents(self, query_text, num_results):\n",
File renamed without changes.
86 changes: 86 additions & 0 deletions src/models/inference.py
@@ -0,0 +1,86 @@
import requests


class ModelInferenceManager:
def __init__(self, api_key, models_config):
self.api_key = api_key
self.models_config = models_config
self.model = None
self.input_token_price = None
self.output_token_price = None

def set_model(self, model_name):
for group in self.models_config["models"]:
for variant in group["variants"]:
if variant["model"] == model_name:
self.model = model_name
self.input_token_price = variant["input_price_per_token"]
self.output_token_price = variant["output_price_per_token"]
return
raise ValueError(f"Model {model_name} not found in configuration.")

def query_openai(self, prompt_text, max_completion_tokens=100, temperature=0.7):
if not self.model:
raise ValueError(
"Model not set. Please use set_model() to set a model before querying."
)
url = "https://api.openai.com/v1/chat/completions"
headers = {"Authorization": f"Bearer {self.api_key}"}
payload = {
"model": self.model,
"messages": [{"role": "user", "content": prompt_text}],
"max_tokens": max_completion_tokens,
"temperature": temperature,
}

try:
response = requests.post(url, headers=headers, json=payload, timeout=60)
if response.status_code == 200:
data = response.json()
content = data["choices"][0]["message"]["content"]
usage = data["usage"]
return content, usage
else:
return (
f"HTTP Error {response.status_code}: {response.json().get('error', {}).get('message', 'An unspecified error occurred')}",
None,
)
except requests.RequestException as e:
return f"Connection error: {e}", None

def calculate_cost(self, usage):
if usage:
total_price = (usage["prompt_tokens"] * self.input_token_price) + (
usage["completion_tokens"] * self.output_token_price
)
return total_price
else:
return None

def determine_expertise_area(
self, user_question, max_completion_tokens, temperature
):
prompt_text = f"""Based on the question provided, identify the relevant expertise area(s). Return your answer in the format:
'expertise1, expertise2, ...'. Provide only the expertise areas as a comma-separated list, no additional explanations are needed.
Here is the user Question:
{user_question}
"""
response, usage = self.query_openai(
prompt_text, max_completion_tokens, temperature
)
return response.strip(), (
usage if response else "Error determining expertise area."
)

def prepare_prompt_for_llm(self, expertise_area, user_question, context_documents):
prompt = (
f"You are an expert in '{expertise_area}'. A user has asked for help with the following question: "
f"'{user_question}'. Please provide insights using only the information from the provided documents. "
"If certain aspects are ambiguous or the documents do not fully address the question, please make educated inferences based on your expertise.\n\n"
"Here are the documents provided:\n\n"
)
for i, document in enumerate(context_documents, start=1):
prompt += f'Document {i}:\n"""\n{document}\n"""\n\n'
prompt += "Given your expertise and the information provided in these documents, synthesize the key insights to craft a detailed and relevant response to the above question.\n\n"
prompt += "Start your response below:\n\n"
return prompt
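`set_model` iterates over `models_config["models"]` as a list of groups, each holding `variants` with a model name and per-token prices. A `models_config.yml` satisfying that loop might look like the following (structure inferred from the code; group labels and prices are illustrative placeholders, not actual OpenAI rates):

```yaml
# Assumed shape of models_config.yml, inferred from set_model();
# prices are placeholders, not real OpenAI pricing.
models:
  - name: chat            # group label (assumed; set_model only reads "variants")
    variants:
      - model: gpt-3.5-turbo-0125
        input_price_per_token: 0.0000005
        output_price_per_token: 0.0000015
  - name: embeddings
    variants:
      - model: text-embedding-3-small
        input_price_per_token: 0.00000002
        output_price_per_token: 0.0
```

Under these placeholder prices, `calculate_cost` for a call using 1,000 prompt tokens and 200 completion tokens with `gpt-3.5-turbo-0125` would return 1000 × 0.0000005 + 200 × 0.0000015 = $0.0008.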