
Commit 3145542
chore: autopublish 2024-04-06T16:52:20Z
github-actions[bot] committed Apr 6, 2024
1 parent 210e786 commit 3145542
Showing 2 changed files with 66 additions and 27 deletions.
notebooks/llm_inference_experiments.ipynb: 64 changes (43 additions & 21 deletions)
@@ -377,6 +377,7 @@
 "source": [
 "import requests\n",
 "\n",
+"\n",
 "class PromptPreparator:\n",
 "    def __init__(self, api_key, models_config):\n",
 "        self.api_key = api_key\n",
@@ -386,18 +387,20 @@
 "        self.output_token_price = 0\n",
 "\n",
 "    def set_model(self, model_name):\n",
-"        for group in self.models_config['models']:\n",
-"            for variant in group['variants']:\n",
-"                if variant['model'] == model_name:\n",
+"        for group in self.models_config[\"models\"]:\n",
+"            for variant in group[\"variants\"]:\n",
+"                if variant[\"model\"] == model_name:\n",
 "                    self.model = model_name\n",
-"                    self.input_token_price = variant.get('input_price_per_token', 0)\n",
-"                    self.output_token_price = variant.get('output_price_per_token', 0)\n",
+"                    self.input_token_price = variant.get(\"input_price_per_token\", 0)\n",
+"                    self.output_token_price = variant.get(\"output_price_per_token\", 0)\n",
 "                    return\n",
 "        raise ValueError(f\"Model {model_name} not found in configuration.\")\n",
 "\n",
 "    def query_openai(self, prompt_text, max_completion_tokens=100, temperature=0.7):\n",
 "        if not self.model:\n",
-"            raise ValueError(\"Model not set. Please use set_model() to set a model before querying.\")\n",
+"            raise ValueError(\n",
+"                \"Model not set. Please use set_model() to set a model before querying.\"\n",
+"            )\n",
 "        url = \"https://api.openai.com/v1/chat/completions\"\n",
 "        headers = {\"Authorization\": f\"Bearer {self.api_key}\"}\n",
 "        payload = {\n",
@@ -415,36 +418,49 @@
 "                usage = data[\"usage\"]\n",
 "                return content, usage\n",
 "            else:\n",
-"                return f\"HTTP Error {response.status_code}: {response.json().get('error', {}).get('message', 'An unspecified error occurred')}\", None\n",
+"                return (\n",
+"                    f\"HTTP Error {response.status_code}: {response.json().get('error', {}).get('message', 'An unspecified error occurred')}\",\n",
+"                    None,\n",
+"                )\n",
 "        except requests.RequestException as e:\n",
 "            return f\"Connection error: {e}\", None\n",
 "\n",
 "    def calculate_inference_price(self, usage):\n",
 "        if usage:\n",
-"            total_price = (usage[\"prompt_tokens\"] * self.input_token_price) + (usage[\"completion_tokens\"] * self.output_token_price)\n",
+"            total_price = (usage[\"prompt_tokens\"] * self.input_token_price) + (\n",
+"                usage[\"completion_tokens\"] * self.output_token_price\n",
+"            )\n",
 "            return total_price\n",
 "        else:\n",
 "            return None\n",
-"    \n",
-"    def determine_expertise_area(self, user_question, max_completion_tokens=150, temperature=0.2):\n",
+"\n",
+"    def determine_expertise_area(\n",
+"        self, user_question, max_completion_tokens=150, temperature=0.2\n",
+"    ):\n",
 "        prompt_text = f\"\"\"Based on the question provided, identify the relevant expertise area(s). Return your answer in the format: \n",
 "        'expertise1, expertise2, ...'. Provide only the expertise areas as a comma-separated list, no additional explanations are needed.\n",
 "        Here is the user Question:\n",
 "        {user_question}\n",
 "        \"\"\"\n",
-"        response, usage = self.query_openai(prompt_text, max_completion_tokens, temperature)\n",
-"        return response.strip(), usage if response else \"Error determining expertise area.\"\n",
+"        response, usage = self.query_openai(\n",
+"            prompt_text, max_completion_tokens, temperature\n",
+"        )\n",
+"        return response.strip(), (\n",
+"            usage if response else \"Error determining expertise area.\"\n",
+"        )\n",
 "\n",
 "    def prepare_prompt_for_llm(self, expertise_area, user_question, context_documents):\n",
-"        prompt = (f\"You are an expert in '{expertise_area}'. A user has asked for help with the following question: \"\n",
-"                  f\"'{user_question}'. Please provide insights using only the information from the provided documents. \"\n",
-"                  \"If certain aspects are ambiguous or the documents do not fully address the question, please make educated inferences based on your expertise.\\n\\n\"\n",
-"                  \"Here are the documents provided:\\n\\n\")\n",
+"        prompt = (\n",
+"            f\"You are an expert in '{expertise_area}'. A user has asked for help with the following question: \"\n",
+"            f\"'{user_question}'. Please provide insights using only the information from the provided documents. \"\n",
+"            \"If certain aspects are ambiguous or the documents do not fully address the question, please make educated inferences based on your expertise.\\n\\n\"\n",
+"            \"Here are the documents provided:\\n\\n\"\n",
+"        )\n",
 "        for i, document in enumerate(context_documents, start=1):\n",
 "            prompt += f'Document {i}:\\n\"\"\"\\n{document}\\n\"\"\"\\n\\n'\n",
 "        prompt += \"Given your expertise and the information provided in these documents, synthesize the key insights to craft a detailed and relevant response to the above question.\\n\\n\"\n",
 "        prompt += \"Start your response below:\\n\\n\"\n",
-"        return prompt "
+"        return prompt"
 ]
 },
 {
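For reference, `set_model` walks a nested config: a top-level `models` list whose entries each hold a `variants` list, and each variant carries `model`, `input_price_per_token`, and `output_price_per_token`. A minimal sketch of that shape, with hypothetical model names and prices (the structure is inferred from the lookup above; the values are purely illustrative):

# Hypothetical models_config; shape inferred from set_model's lookup, values illustrative.
models_config = {
    "models": [
        {
            "variants": [
                {
                    "model": "gpt-3.5-turbo",
                    "input_price_per_token": 0.0000005,
                    "output_price_per_token": 0.0000015,
                }
            ]
        }
    ]
}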
@@ -524,17 +540,19 @@
 "source": [
 "import yaml\n",
 "\n",
+"\n",
 "def load_models_config(file_path):\n",
-"    with open(file_path, 'r') as stream:\n",
+"    with open(file_path, \"r\") as stream:\n",
 "        try:\n",
 "            models_config = yaml.safe_load(stream)\n",
 "            return models_config\n",
 "        except yaml.YAMLError as exc:\n",
 "            print(exc)\n",
 "            return None\n",
 "\n",
+"\n",
 "# Example usage\n",
-"file_path = '../config/models_config.yml'\n",
+"file_path = \"../config/models_config.yml\"\n",
 "models_config = load_models_config(file_path)\n",
 "models_config"
 ]
@@ -655,7 +673,9 @@
 "    \"Document 1 content about RAG...\",\n",
 "    \"Document 2 content about LLMs & RAGs...\",\n",
 "]\n",
-"prompt = inference.prepare_prompt_for_llm(expertise_area, user_question, context_documents)\n",
+"prompt = inference.prepare_prompt_for_llm(\n",
+"    expertise_area, user_question, context_documents\n",
+")\n",
 "print(prompt)"
 ]
 },
@@ -688,7 +708,9 @@
 }
 ],
 "source": [
-"final_response, final_usage = inference.query_openai(prompt, max_completion_tokens=1500, temperature=0.7)\n",
+"final_response, final_usage = inference.query_openai(\n",
+"    prompt, max_completion_tokens=1500, temperature=0.7\n",
+")\n",
 "print(final_response)"
 ]
 },
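`calculate_inference_price` is defined above but never exercised in this diff. Pairing it with the usage dict returned by the final call would look roughly like this (a sketch, not a cell from the notebook):

# Hypothetical follow-up: price the final call from its token usage.
if final_usage is not None:
    cost = inference.calculate_inference_price(final_usage)
    print(f"Estimated cost: ${cost:.6f}")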
notebooks/vectorization_experiments.ipynb: 29 changes (23 additions & 6 deletions)
@@ -491,8 +491,13 @@
 "import re\n",
 "from tqdm.auto import tqdm\n",
 "\n",
+"\n",
 "class OpenAIEmbeddings:\n",
-"    def __init__(self, credentials_path=\"../secrets/credentials.yml\", embedding_model=\"text-embedding-ada-002\"):\n",
+"    def __init__(\n",
+"        self,\n",
+"        credentials_path=\"../secrets/credentials.yml\",\n",
+"        embedding_model=\"text-embedding-ada-002\",\n",
+"    ):\n",
 "        self.credentials_path = credentials_path\n",
 "        self.embedding_model = embedding_model\n",
 "        self.api_key = self.load_credentials()\n",
@@ -509,7 +514,9 @@
 "        with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
 "            text = file.read()\n",
 "        sections = re.split(r\"\\n(#{1,3} .*)\\n\", text)\n",
-"        self.texts = [sections[0]] + [sections[i] + sections[i + 1] for i in range(1, len(sections), 2)]\n",
+"        self.texts = [sections[0]] + [\n",
+"            sections[i] + sections[i + 1] for i in range(1, len(sections), 2)\n",
+"        ]\n",
 "        return self.texts\n",
 "\n",
 "    def query_openai_embedding(self, text):\n",
@@ -528,7 +535,13 @@
 "            return None\n",
 "\n",
 "    def generate_embeddings(self):\n",
-"        self.embeddings = np.array([self.query_openai_embedding(text) for text in tqdm(self.texts) if text is not None])\n",
+"        self.embeddings = np.array(\n",
+"            [\n",
+"                self.query_openai_embedding(text)\n",
+"                for text in tqdm(self.texts)\n",
+"                if text is not None\n",
+"            ]\n",
+"        )\n",
 "\n",
 "    def create_faiss_index(self):\n",
 "        dimension = self.embeddings.shape[1]\n",
@@ -537,7 +550,9 @@
 "\n",
 "    def search_similar_sections(self, query_text, num_results=2):\n",
 "        query_embedding = self.query_openai_embedding(query_text)\n",
-"        distances, indices = self.faiss_index.search(np.array([query_embedding], dtype=\"float32\"), num_results)\n",
+"        distances, indices = self.faiss_index.search(\n",
+"            np.array([query_embedding], dtype=\"float32\"), num_results\n",
+"        )\n",
 "        return [{\"index\": idx, \"text\": self.texts[idx]} for idx in indices[0]]\n",
 "\n",
 "    def save_faiss_index(self, index_path=\"../faiss_index.bin\"):\n",
@@ -555,8 +570,10 @@
 "outputs": [],
 "source": [
 "# Initialize the class\n",
-"embedder = OpenAIEmbeddings(credentials_path=\"../secrets/credentials.yml\", \n",
-"                            embedding_model=\"text-embedding-ada-002\")"
+"embedder = OpenAIEmbeddings(\n",
+"    credentials_path=\"../secrets/credentials.yml\",\n",
+"    embedding_model=\"text-embedding-ada-002\",\n",
+")"
 ]
 },
 {
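Taken together, the class sketches a load, embed, index, and search pipeline. A minimal end-to-end sketch follows; the loader's method name is an assumption (its `def` line falls outside the hunks above), and the file path and query are illustrative:

# Hypothetical usage of OpenAIEmbeddings. The name load_and_process_text is assumed,
# since its def line is not visible in this diff; the path and query are illustrative.
embedder.load_and_process_text("../data/example.md")  # read the file, split on markdown headings
embedder.generate_embeddings()  # one OpenAI embedding per section, collected into a numpy array
embedder.create_faiss_index()  # build a FAISS index over the embedding matrix
results = embedder.search_similar_sections("What are embeddings?", num_results=2)
for hit in results:
    print(hit["index"], hit["text"][:80])
embedder.save_faiss_index("../faiss_index.bin")  # default index_path from the signature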
