
Merge pull request #4 from labrijisaad/dev
Created the App Logic.
labrijisaad authored Apr 7, 2024
2 parents 5eeb145 + e73804a commit 6eb1f08
Showing 13 changed files with 488 additions and 46 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/code_quality.yml
@@ -4,9 +4,11 @@ on:
push:
branches:
- main
- dev
pull_request:
branches:
- main
- dev

jobs:
code-quality-check:
44 changes: 3 additions & 41 deletions Makefile
@@ -41,55 +41,17 @@ else
POWERSHELL_CMD := pwsh
endif

# Initialization of project structure and files
init:
@mkdir "$(DATA_RAW_DIR)" "$(DATA_PROCESSED_DIR)" "$(DATA_EXTERNAL_DIR)" "$(NOTEBOOKS_DIR)" "$(DOCS_DIR)" 2> NUL || echo "Directories already exist."
@echo. 2> "$(DATA_RAW_DIR)\$(GITKEEP_FILE)" || echo "File $(DATA_RAW_DIR)\$(GITKEEP_FILE) already exists."
@echo. 2> "$(DATA_PROCESSED_DIR)\$(GITKEEP_FILE)" || echo "File $(DATA_PROCESSED_DIR)\$(GITKEEP_FILE) already exists."
@echo. 2> "$(DATA_EXTERNAL_DIR)\$(GITKEEP_FILE)" || echo "File $(DATA_EXTERNAL_DIR)\$(GITKEEP_FILE) already exists."
@echo. 2> "$(NOTEBOOKS_DIR)\$(GITKEEP_FILE)" || echo "File $(NOTEBOOKS_DIR)\$(GITKEEP_FILE) already exists."
@echo. 2> "$(DOCS_DIR)\$(GITKEEP_FILE)" || echo "File $(DOCS_DIR)\$(GITKEEP_FILE) already exists."
@echo # Project Title > "$(README_FILE)"
@echo. >> "$(README_FILE)"
@echo ## Connect >> "$(README_FILE)"
@echo - 🔗 Feel free to connect with me on [LinkedIn](https://www.linkedin.com/in/labrijisaad/) >> "$(README_FILE)"
@echo author: labriji saad > "$(CONFIG_FILE)"
@echo # Add your environment variables here > "$(ENV_FILE)"
@echo # Add files and directories to ignore in version control > "$(GITIGNORE_FILE)"
@echo # Add your project dependencies here > "$(REQUIREMENTS_FILE)"
@echo jupyterlab >> "$(REQUIREMENTS_FILE)" # Add jupyterlab as a default requirement
@echo ipywidgets >> "$(REQUIREMENTS_FILE)" # Add ipywidgets as a default requirement
@echo ">>>>>> Project structure initialized successfully <<<<<<"

# Setup the virtual environment and install dependencies
setup:
@python -m venv $(VENV_NAME)
@$(VENV_ACTIVATE)
@python.exe -m pip install --upgrade pip
@pip install -r $(REQUIREMENTS_FILE)
@echo ">>>>>> Environment is ready <<<<<<"

# Update dependencies in the virtual environment
update:
@$(VENV_ACTIVATE) && python.exe -m pip install --upgrade pip && pip install -r $(REQUIREMENTS_FILE)
@echo ">>>>>> Dependencies updated <<<<<<"

# Activate the virtual environment and run Jupyter Lab
jupy:
@$(VENV_ACTIVATE) && jupyter lab
@echo ">>>>>> Jupyter Lab is running <<<<<<"

# Clean up the virtual environment and generated files
clean:
@$(DELETE_CMD) $(VENV_NAME)
@echo ">>>>>> Cleaned up environment <<<<<<"
app:
@python app.py

# Display available make targets
help:
@echo Available targets:
@echo make init - Initialize the project's structure and essential files
@echo make setup - Create a virtual environment and install dependencies
@echo make update - Update dependencies in the virtual environment
@echo make clean - Clean up the virtual environment and generated files
@echo make jupy - Activate the virtual environment and run Jupyter Lab
@echo make app - Run the app
@echo Author: $(AUTHOR)
41 changes: 41 additions & 0 deletions README.md
@@ -30,6 +30,47 @@ graph TD
style K fill:#e07b53,stroke:#fff,stroke-width:2px
```


## Project Architecture
The project is organized as follows, for modularity and ease of maintenance:

```
LLM-RAG/
├── src/ # Source code for the application
│ │
│ ├── models/
│ │ ├── inference.py # ModelInferenceManager class
│ │ └── vectorization.py # SemanticVectorizer class
│ │
│ ├── pipelines/ # Pipeline for processing queries
│ │ └── query_pipeline.py # QueryPipeline class
│ │
│ ├── utils/ # Utility functions and classes
│ │ └── utils.py # Helper functions, e.g., for loading configs
│ │
│ └── __init__.py # Makes src a Python module
├── configs/ # Configuration files
│ └── models_config.yml # Model configurations
├── data/ # Data used by the application
│ ├── raw/ # Raw data like markdown files
│ ├── processed/ # Processed data like embeddings
│ └── faiss_index/ # FAISS indices
├── notebooks/ # Jupyter notebooks for experiments
│ └── rag_llm_experiments.ipynb
├── secrets/ # Secret keys and credentials
│ └── credentials.yml # OpenAI API credentials
├── app.py # Main Streamlit application script
├── requirements.txt # Python dependencies for the project
├── README.md # Project documentation
└── .gitignore # Specifies files to ignore in git
```
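The two loaders imported by `app.py` and `app2.py` (`load_credentials`, `load_models_config`) live in `src/utils/utils.py`, which is not shown in this diff. A minimal sketch of what they might look like, assuming both files are plain YAML:

```python
# Hypothetical sketch of src/utils/utils.py — these helpers are not part
# of this diff; the real implementations may differ.
import yaml


def load_credentials(path):
    """Load API credentials (e.g. secrets/credentials.yml) into a dict."""
    with open(path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)


def load_models_config(path):
    """Load model configurations (e.g. models_config.yml) into a dict."""
    with open(path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)
```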

## Connect with me 🌐
<div align="center">
<a href="https://www.linkedin.com/in/labrijisaad/">
59 changes: 59 additions & 0 deletions app.py
@@ -0,0 +1,59 @@
from src.pipelines.query_pipeline import QueryPipeline
from src.utils.utils import load_models_config, load_credentials


def main():
# Load OpenAI API Key and Model Configurations
credentials = load_credentials("secrets/credentials.yml")
openai_api_key = credentials["OPENAI_CREDENTIALS"]
models_config = load_models_config("config/models_config.yml")

# Initialize the QueryPipeline
query_pipeline = QueryPipeline(openai_api_key, models_config)

# Set up the semantic database (example path and model)
total_cost = query_pipeline.setup_semantic_database(
markdown_path="data/raw/mock_markdown.md",
embedding_model="text-embedding-3-small",
save_index=True,
index_path="data/processed/faiss_index.bin",
)
print(f"Total cost for setting up the semantic database: ${total_cost}")

# Example query
user_query = input("Enter your query: ")

# Find similar documents
similar_docs = query_pipeline.find_similar_documents(
query_text=user_query, num_results=3
)

# Determine expertise area and prepare the prompt
context_enhanced_prompt, expertise_area_cost = (
query_pipeline.determine_expertise_and_prepare_prompt(
user_query=user_query,
similar_docs=similar_docs,
inference_model="gpt-3.5-turbo-0125",
max_completion_tokens=150,
temperature=0.2,
)
)
print(f"Cost for determining expertise area: ${expertise_area_cost}")

# Query the model for a response
contextual_response, response_cost = query_pipeline.query_model_for_response(
context_enhanced_prompt=context_enhanced_prompt,
max_completion_tokens=1500,
temperature=0.7,
)
print(f"Cost for querying the model for a response: ${response_cost}")

# Output the response
print("--------\nContextual Prompt:\n--------")
print(context_enhanced_prompt)
print("--------\nResponse:\n--------")
print(contextual_response)


if __name__ == "__main__":
main()
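The `credentials["OPENAI_CREDENTIALS"]` lookup above implies a `secrets/credentials.yml` of roughly this shape (the key name comes from the code; the value is a placeholder):

```yaml
# secrets/credentials.yml — assumed layout; keep this file out of version control
OPENAI_CREDENTIALS: sk-your-openai-api-key
```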
55 changes: 55 additions & 0 deletions app2.py
@@ -0,0 +1,55 @@
from src.pipelines.query_pipeline import QueryPipeline
from src.utils.utils import load_models_config, load_credentials


def main():
# Load OpenAI API Key and Model Configurations
credentials = load_credentials("secrets/credentials.yml")
openai_api_key = credentials["OPENAI_CREDENTIALS"]
models_config = load_models_config("config/models_config.yml")

# Initialize the QueryPipeline
query_pipeline = QueryPipeline(openai_api_key, models_config)

# Set the model
query_pipeline.set_model("text-embedding-3-small")

# Load the semantic database FAISS index
index_path = "data/processed/faiss_index.bin"
query_pipeline.load_faiss_index(index_path)

# Example query
user_query = input("Enter your query: ")

# Proceed with the rest of the querying process
similar_docs = query_pipeline.find_similar_documents(
query_text=user_query, num_results=3
)

context_enhanced_prompt, expertise_area_cost = (
query_pipeline.determine_expertise_and_prepare_prompt(
user_query=user_query,
similar_docs=similar_docs,
inference_model="gpt-3.5-turbo-0125",
max_completion_tokens=150,
temperature=0.2,
)
)
print(f"Cost for determining expertise area: ${expertise_area_cost}")

contextual_response, response_cost = query_pipeline.query_model_for_response(
context_enhanced_prompt=context_enhanced_prompt,
max_completion_tokens=1500,
temperature=0.7,
)
print(f"Cost for querying the model for a response: ${response_cost}")

# Output the response
print("--------\nContextual Prompt:\n--------")
print(context_enhanced_prompt)
print("--------\nResponse:\n--------")
print(contextual_response)


if __name__ == "__main__":
main()
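Taken together, `app.py` builds the semantic database from the raw markdown (paying the embedding cost once) and persists the FAISS index, while `app2.py` skips that step by calling `load_faiss_index` on the saved index, so repeat runs avoid re-embedding.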
1 change: 0 additions & 1 deletion config.yaml

This file was deleted.

Binary file renamed faiss_index.bin → data/processed/faiss_index.bin
15 changes: 11 additions & 4 deletions notebooks/RAG_LLM_experiments.ipynb
@@ -230,7 +230,7 @@
" )\n",
" return None, None\n",
"\n",
" def generate_embeddings(self):\n",
" def generate_embeddings(self, save_index=False, index_path=None):\n",
" total_cost = 0\n",
" for text in tqdm(self.texts):\n",
" embedding, usage = self.query_openai_embedding(text)\n",
@@ -239,6 +239,10 @@
" total_cost += self.calculate_cost(usage)\n",
" self.embeddings = np.array(self.embeddings)\n",
" self.create_faiss_index()\n",
"\n",
" if save_index and index_path:\n",
" self.save_faiss_index(index_path)\n",
"\n",
" return total_cost\n",
"\n",
" def create_faiss_index(self):\n",
@@ -299,11 +303,14 @@
" openai_api_key, models_config\n",
" )\n",
"\n",
" def setup_semantic_database(self, markdown_path, embedding_model):\n",
" def setup_semantic_database(\n",
" self, markdown_path, embedding_model, save_index=False, index_path=None\n",
" ):\n",
" self.embedder.set_model(embedding_model)\n",
" self.embedder.read_and_process_markdown(markdown_path)\n",
" total_cost = self.embedder.generate_embeddings()\n",
" self.embedder.create_faiss_index()\n",
" total_cost = self.embedder.generate_embeddings(\n",
" save_index=save_index, index_path=index_path\n",
" )\n",
" return total_cost\n",
"\n",
" def find_similar_documents(self, query_text, num_results):\n",
File renamed without changes.
86 changes: 86 additions & 0 deletions src/models/inference.py
@@ -0,0 +1,86 @@
import requests


class ModelInferenceManager:
def __init__(self, api_key, models_config):
self.api_key = api_key
self.models_config = models_config
self.model = None
self.input_token_price = None
self.output_token_price = None

def set_model(self, model_name):
for group in self.models_config["models"]:
for variant in group["variants"]:
if variant["model"] == model_name:
self.model = model_name
self.input_token_price = variant["input_price_per_token"]
self.output_token_price = variant["output_price_per_token"]
return
raise ValueError(f"Model {model_name} not found in configuration.")

def query_openai(self, prompt_text, max_completion_tokens=100, temperature=0.7):
if not self.model:
raise ValueError(
"Model not set. Please use set_model() to set a model before querying."
)
url = "https://api.openai.com/v1/chat/completions"
headers = {"Authorization": f"Bearer {self.api_key}"}
payload = {
"model": self.model,
"messages": [{"role": "user", "content": prompt_text}],
"max_tokens": max_completion_tokens,
"temperature": temperature,
}

try:
response = requests.post(url, headers=headers, json=payload, timeout=60)
if response.status_code == 200:
data = response.json()
content = data["choices"][0]["message"]["content"]
usage = data["usage"]
return content, usage
else:
return (
f"HTTP Error {response.status_code}: {response.json().get('error', {}).get('message', 'An unspecified error occurred')}",
None,
)
except requests.RequestException as e:
return f"Connection error: {e}", None

def calculate_cost(self, usage):
if usage:
total_price = (usage["prompt_tokens"] * self.input_token_price) + (
usage["completion_tokens"] * self.output_token_price
)
return total_price
else:
return None

def determine_expertise_area(
self, user_question, max_completion_tokens, temperature
):
prompt_text = f"""Based on the question provided, identify the relevant expertise area(s). Return your answer in the format:
'expertise1, expertise2, ...'. Provide only the expertise areas as a comma-separated list, no additional explanations are needed.
Here is the user Question:
{user_question}
"""
response, usage = self.query_openai(
prompt_text, max_completion_tokens, temperature
)
return response.strip(), (
usage if response else "Error determining expertise area."
)

def prepare_prompt_for_llm(self, expertise_area, user_question, context_documents):
prompt = (
f"You are an expert in '{expertise_area}'. A user has asked for help with the following question: "
f"'{user_question}'. Please provide insights using only the information from the provided documents. "
"If certain aspects are ambiguous or the documents do not fully address the question, please make educated inferences based on your expertise.\n\n"
"Here are the documents provided:\n\n"
)
for i, document in enumerate(context_documents, start=1):
prompt += f'Document {i}:\n"""\n{document}\n"""\n\n'
prompt += "Given your expertise and the information provided in these documents, synthesize the key insights to craft a detailed and relevant response to the above question.\n\n"
prompt += "Start your response below:\n\n"
return prompt
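`set_model` iterates over `models_config["models"]` as a list of groups, each holding `variants` with a model name and per-token prices. A `models_config.yml` satisfying that loop might look like the following (structure inferred from the code; group labels and prices are illustrative placeholders, not actual OpenAI rates):

```yaml
# Assumed shape of models_config.yml, inferred from set_model();
# prices are placeholders, not real OpenAI pricing.
models:
  - name: chat            # group label (assumed; set_model only reads "variants")
    variants:
      - model: gpt-3.5-turbo-0125
        input_price_per_token: 0.0000005
        output_price_per_token: 0.0000015
  - name: embeddings
    variants:
      - model: text-embedding-3-small
        input_price_per_token: 0.00000002
        output_price_per_token: 0.0
```

Under these placeholder prices, `calculate_cost` for a call using 1,000 prompt tokens and 200 completion tokens with `gpt-3.5-turbo-0125` would return 1000 × 0.0000005 + 200 × 0.0000015 = $0.0008.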