From ed04ddf23d8a6dfb7afc48023a05656afceabf6d Mon Sep 17 00:00:00 2001
From: Alexander 
Date: Fri, 18 Oct 2024 17:40:21 +0400
Subject: [PATCH 01/12] Added notebook to showcase quantization of a Sentence
 Transformers model

---
 .../sentence_transformer_quantization.ipynb   | 612 ++++++++++++++++++
 optimum/intel/openvino/modeling_diffusion.py  |   5 +
 2 files changed, 617 insertions(+)
 create mode 100644 notebooks/openvino/sentence_transformer_quantization.ipynb

diff --git a/notebooks/openvino/sentence_transformer_quantization.ipynb b/notebooks/openvino/sentence_transformer_quantization.ipynb
new file mode 100644
index 000000000..db5d20297
--- /dev/null
+++ b/notebooks/openvino/sentence_transformer_quantization.ipynb
@@ -0,0 +1,612 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Quantization of a Text Embedding Model from the Sentence Transformers Library"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install optimum[openvino]\n",
+    "%pip install evaluate"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Quantize the model statically to 8-bit with NNCF via the Optimum-Intel API"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "No OpenVINO files were found for sentence-transformers/all-MiniLM-L6-v2, setting `export=True` to convert the model to the OpenVINO IR. Don't forget to save the resulting model with `.save_pretrained()`\n",
+      "Framework not specified. Using pt to export the model.\n",
+      "Using framework PyTorch: 2.4.1+cpu\n",
+      "Overriding 1 configuration item(s)\n",
+      "\t- use_cache -> False\n",
+      "Compiling the model to CPU ...\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "fd8729d418f3453bb1d97a2b038ff072",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Output()"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "758502bfa6a142cc9078b8404a4b5d78",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Output()"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e82786c713694f0da616dee6164aa242",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Output()"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "344f6318563c4bed8dbedbdee08c9b59",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Output()"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Configuration saved in all-MiniLM-L6-v2_int8/openvino_config.json\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "('all-MiniLM-L6-v2_int8/tokenizer_config.json',\n",
+       " 'all-MiniLM-L6-v2_int8/special_tokens_map.json',\n",
+       " 'all-MiniLM-L6-v2_int8/vocab.txt',\n",
+       " 'all-MiniLM-L6-v2_int8/added_tokens.json',\n",
+       " 'all-MiniLM-L6-v2_int8/tokenizer.json')"
+      ]
+     },
+     "execution_count": 1,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from functools import partial\n",
+    "import datasets\n",
+    "from transformers import AutoTokenizer\n",
+    "from optimum.intel import OVModelForFeatureExtraction, OVQuantizer, OVQuantizationConfig, OVConfig\n",
+    "\n",
+    "MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
+    "\n",
+    "model = OVModelForFeatureExtraction.from_pretrained(MODEL_ID)\n",
+    "model.save_pretrained(\"all-MiniLM-L6-v2\")\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)\n",
+    "DATASET_NAME = \"squad\"\n",
+    "dataset = datasets.load_dataset(DATASET_NAME)\n",
+    "int8_ptq_model_path = \"all-MiniLM-L6-v2_int8\"\n",
+    "\n",
+    "quantizer = OVQuantizer.from_pretrained(model)\n",
+    "\n",
+    "def preprocess_function(examples, tokenizer):\n",
+    "    return tokenizer(examples[\"sentence\"], padding=\"max_length\", max_length=384, truncation=True)\n",
+    "\n",
+    "calibration_dataset = quantizer.get_calibration_dataset(\n",
+    "    \"glue\",\n",
+    "    dataset_config_name=\"sst2\",\n",
+    "    preprocess_function=partial(preprocess_function, tokenizer=tokenizer),\n",
+    "    num_samples=300,\n",
+    "    dataset_split=\"train\",\n",
+    ")\n",
+    "\n",
+    "ov_config = OVConfig(quantization_config=OVQuantizationConfig())\n",
+    "\n",
+    "quantizer.quantize(ov_config=ov_config, calibration_dataset=calibration_dataset, save_directory=int8_ptq_model_path)\n",
+    "tokenizer.save_pretrained(int8_ptq_model_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Benchmark model accuracy on GLUE STSB task"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import Pipeline\n",
+    "import torch.nn.functional as F\n",
+    "import torch \n",
+    "\n",
+    "# copied from the model card\n",
+    "def mean_pooling(model_output, attention_mask):\n",
+    "    token_embeddings = model_output[0] #First element of model_output contains all token embeddings\n",
+    "    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()\n",
+    "    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)\n",
+    "\n",
+    "\n",
+    "class SentenceEmbeddingPipeline(Pipeline):\n",
+    "    def _sanitize_parameters(self, **kwargs):\n",
+    "        # we don't have any hyperparameters to sanitize\n",
+    "        preprocess_kwargs = {}\n",
+    "        return preprocess_kwargs, {}, {}\n",
+    "      \n",
+    "    def preprocess(self, inputs):\n",
+    "        encoded_inputs = self.tokenizer(inputs, padding=True, truncation=True, return_tensors=\"pt\")\n",
+    "        return encoded_inputs\n",
+    "\n",
+    "    def _forward(self, model_inputs):\n",
+    "        outputs = self.model(**model_inputs)\n",
+    "        return {\"outputs\": outputs, \"attention_mask\": model_inputs[\"attention_mask\"]}\n",
+    "\n",
+    "    def postprocess(self, model_outputs):\n",
+    "        # Perform pooling\n",
+    "        sentence_embeddings = mean_pooling(model_outputs[\"outputs\"], model_outputs[\"attention_mask\"])\n",
+    "        # Normalize embeddings\n",
+    "        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)\n",
+    "        return sentence_embeddings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "No OpenVINO files were found for sentence-transformers/all-MiniLM-L6-v2, setting `export=True` to convert the model to the OpenVINO IR. Don't forget to save the resulting model with `.save_pretrained()`\n",
+      "Framework not specified. Using pt to export the model.\n",
+      "Using framework PyTorch: 2.4.1+cpu\n",
+      "Overriding 1 configuration item(s)\n",
+      "\t- use_cache -> False\n",
+      "Compiling the model to CPU ...\n",
+      "Compiling the model to CPU ...\n"
+     ]
+    }
+   ],
+   "source": [
+    "model = OVModelForFeatureExtraction.from_pretrained(MODEL_ID)\n",
+    "vanilla_emb = SentenceEmbeddingPipeline(model=model, tokenizer=tokenizer)\n",
+    "\n",
+    "q_model = OVModelForFeatureExtraction.from_pretrained(int8_ptq_model_path)\n",
+    "q8_emb = SentenceEmbeddingPipeline(model=model, tokenizer=tokenizer)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import load_dataset\n",
+    "from evaluate import load\n",
+    "\n",
+    "eval_dataset = load_dataset(\"glue\",\"stsb\",split=\"validation\")\n",
+    "metric = load(\"glue\", \"stsb\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Parameter 'function'= of the transform datasets.arrow_dataset.Dataset._map_single couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "6b39c9e0096a423bbcda949bede6a9cb",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Map:   0%|          | 0/1500 [00:00<?, ? examples/s]\n",
+      "[ INFO ]   PERFORMANCE_HINT: LATENCY\n",
+      "[ INFO ]   EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE\n",
+      "[ INFO ]   PERFORMANCE_HINT_NUM_REQUESTS: 0\n",
+      "[ INFO ]   ENABLE_CPU_PINNING: True\n",
+      "[ INFO ]   SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE\n",
+      "[ INFO ]   MODEL_DISTRIBUTION_POLICY: set()\n",
+      "[ INFO ]   ENABLE_HYPER_THREADING: False\n",
+      "[ INFO ]   EXECUTION_DEVICES: ['CPU']\n",
+      "[ INFO ]   CPU_DENORMALS_OPTIMIZATION: False\n",
+      "[ INFO ]   LOG_LEVEL: Level.NO\n",
+      "[ INFO ]   CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0\n",
+      "[ INFO ]   DYNAMIC_QUANTIZATION_GROUP_SIZE: 32\n",
+      "[ INFO ]   KV_CACHE_PRECISION: \n",
+      "[ INFO ]   AFFINITY: Affinity.CORE\n",
+      "[Step 9/11] Creating infer requests and preparing input tensors\n",
+      "[ WARNING ] No input files were given for input 'input_ids'!. This input will be filled with random values!\n",
+      "[ WARNING ] No input files were given for input 'attention_mask'!. This input will be filled with random values!\n",
+      "[ WARNING ] No input files were given for input 'token_type_ids'!. This input will be filled with random values!\n",
+      "[ INFO ] Fill input 'input_ids' with random values \n",
+      "[ INFO ] Fill input 'attention_mask' with random values \n",
+      "[ INFO ] Fill input 'token_type_ids' with random values \n",
+      "[Step 10/11] Measuring performance (Start inference synchronously, limits: 200 iterations)\n",
+      "[ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop).\n",
+      "[ INFO ] First inference took 14.89 ms\n",
+      "[Step 11/11] Dumping statistics report\n",
+      "[ INFO ] Execution Devices:['CPU']\n",
+      "[ INFO ] Count:            200 iterations\n",
+      "[ INFO ] Duration:         2067.30 ms\n",
+      "[ INFO ] Latency:\n",
+      "[ INFO ]    Median:        9.88 ms\n",
+      "[ INFO ]    Average:       10.15 ms\n",
+      "[ INFO ]    Min:           9.60 ms\n",
+      "[ INFO ]    Max:           11.37 ms\n",
+      "[ INFO ] Throughput:   96.74 FPS\n"
+     ]
+    }
+   ],
+   "source": [
+    "# FP32 baseline model\n",
+    "!benchmark_app -m all-MiniLM-L6-v2/openvino_model.xml -shape \"input_ids[1,384],attention_mask[1,384],token_type_ids[1,384]\" -api sync -niter 200"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+      "To disable this warning, you can either:\n",
+      "\t- Avoid using `tokenizers` before the fork if possible\n",
+      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[Step 1/11] Parsing and validating input arguments\n",
+      "[ INFO ] Parsing input parameters\n",
+      "[Step 2/11] Loading OpenVINO Runtime\n",
+      "[ INFO ] OpenVINO:\n",
+      "[ INFO ] Build ................................. 2024.5.0-16971-8a02b4c17bb\n",
+      "[ INFO ] \n",
+      "[ INFO ] Device info:\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[ INFO ] CPU\n",
+      "[ INFO ] Build ................................. 2024.5.0-16971-8a02b4c17bb\n",
+      "[ INFO ] \n",
+      "[ INFO ] \n",
+      "[Step 3/11] Setting device configuration\n",
+      "[ WARNING ] Performance hint was not explicitly specified in command line. Device(CPU) performance hint will be set to PerformanceMode.LATENCY.\n",
+      "[Step 4/11] Reading model files\n",
+      "[ INFO ] Loading model files\n",
+      "[ INFO ] Read model took 21.99 ms\n",
+      "[ INFO ] Original model I/O parameters:\n",
+      "[ INFO ] Model inputs:\n",
+      "[ INFO ]     input_ids (node: input_ids) : i64 / [...] / [?,?]\n",
+      "[ INFO ]     attention_mask (node: attention_mask) : i64 / [...] / [?,?]\n",
+      "[ INFO ]     token_type_ids (node: token_type_ids) : i64 / [...] / [?,?]\n",
+      "[ INFO ] Model outputs:\n",
+      "[ INFO ]     last_hidden_state (node: __module.encoder.layer.5.output.LayerNorm/aten::layer_norm/Add) : f32 / [...] / [?,?,384]\n",
+      "[Step 5/11] Resizing model to match image sizes and given batch\n",
+      "[ INFO ] Model batch size: 1\n",
+      "[ INFO ] Reshaping model: 'input_ids': [1,384], 'attention_mask': [1,384], 'token_type_ids': [1,384]\n",
+      "[ INFO ] Reshape model took 3.60 ms\n",
+      "[Step 6/11] Configuring input of the model\n",
+      "[ INFO ] Model inputs:\n",
+      "[ INFO ]     input_ids (node: input_ids) : i64 / [...] / [1,384]\n",
+      "[ INFO ]     attention_mask (node: attention_mask) : i64 / [...] / [1,384]\n",
+      "[ INFO ]     token_type_ids (node: token_type_ids) : i64 / [...] / [1,384]\n",
+      "[ INFO ] Model outputs:\n",
+      "[ INFO ]     last_hidden_state (node: __module.encoder.layer.5.output.LayerNorm/aten::layer_norm/Add) : f32 / [...] / [1,384,384]\n",
+      "[Step 7/11] Loading the model to the device\n",
+      "[ INFO ] Compile model took 324.67 ms\n",
+      "[Step 8/11] Querying optimal runtime parameters\n",
+      "[ INFO ] Model:\n",
+      "[ INFO ]   NETWORK_NAME: Model0\n",
+      "[ INFO ]   OPTIMAL_NUMBER_OF_INFER_REQUESTS: 1\n",
+      "[ INFO ]   NUM_STREAMS: 1\n",
+      "[ INFO ]   INFERENCE_NUM_THREADS: 18\n",
+      "[ INFO ]   PERF_COUNT: NO\n",
+      "[ INFO ]   INFERENCE_PRECISION_HINT: \n",
+      "[ INFO ]   PERFORMANCE_HINT: LATENCY\n",
+      "[ INFO ]   EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE\n",
+      "[ INFO ]   PERFORMANCE_HINT_NUM_REQUESTS: 0\n",
+      "[ INFO ]   ENABLE_CPU_PINNING: True\n",
+      "[ INFO ]   SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE\n",
+      "[ INFO ]   MODEL_DISTRIBUTION_POLICY: set()\n",
+      "[ INFO ]   ENABLE_HYPER_THREADING: False\n",
+      "[ INFO ]   EXECUTION_DEVICES: ['CPU']\n",
+      "[ INFO ]   CPU_DENORMALS_OPTIMIZATION: False\n",
+      "[ INFO ]   LOG_LEVEL: Level.NO\n",
+      "[ INFO ]   CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0\n",
+      "[ INFO ]   DYNAMIC_QUANTIZATION_GROUP_SIZE: 32\n",
+      "[ INFO ]   KV_CACHE_PRECISION: \n",
+      "[ INFO ]   AFFINITY: Affinity.CORE\n",
+      "[Step 9/11] Creating infer requests and preparing input tensors\n",
+      "[ WARNING ] No input files were given for input 'input_ids'!. This input will be filled with random values!\n",
+      "[ WARNING ] No input files were given for input 'attention_mask'!. This input will be filled with random values!\n",
+      "[ WARNING ] No input files were given for input 'token_type_ids'!. This input will be filled with random values!\n",
+      "[ INFO ] Fill input 'input_ids' with random values \n",
+      "[ INFO ] Fill input 'attention_mask' with random values \n",
+      "[ INFO ] Fill input 'token_type_ids' with random values \n",
+      "[Step 10/11] Measuring performance (Start inference synchronously, limits: 200 iterations)\n",
+      "[ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop).\n",
+      "[ INFO ] First inference took 9.54 ms\n",
+      "[Step 11/11] Dumping statistics report\n",
+      "[ INFO ] Execution Devices:['CPU']\n",
+      "[ INFO ] Count:            200 iterations\n",
+      "[ INFO ] Duration:         906.86 ms\n",
+      "[ INFO ] Latency:\n",
+      "[ INFO ]    Median:        4.19 ms\n",
+      "[ INFO ]    Average:       4.42 ms\n",
+      "[ INFO ]    Min:           4.09 ms\n",
+      "[ INFO ]    Max:           5.56 ms\n",
+      "[ INFO ] Throughput:   220.54 FPS\n"
+     ]
+    }
+   ],
+   "source": [
+    "# INT8 counterpart\n",
+    "!benchmark_app -m all-MiniLM-L6-v2_int8/openvino_model.xml -shape \"input_ids[1,384],attention_mask[1,384],token_type_ids[1,384]\" -api sync -niter 200"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "test3.11_cpu",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py
index 81dc085df..0367c20e4 100644
--- a/optimum/intel/openvino/modeling_diffusion.py
+++ b/optimum/intel/openvino/modeling_diffusion.py
@@ -765,6 +765,10 @@ def ov_config(self) -> OVConfig:
     def dtype(self) -> torch.dtype:
         return OV_TO_PT_TYPE[self.ov_config.get("dtype", "f32")]
 
+    def modules(self):
+        return {}
+
+
     def _compile(self):
         if self.request is None:
             if (
@@ -842,6 +846,7 @@ def forward(
         return ModelOutput(**model_outputs)
 
 
+
 class OVModelUnet(OVPipelinePart):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

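Note on the accuracy-evaluation cell this notebook adds: it maps a cosine-similarity scorer over the GLUE STSB validation split and compares the FP32 and INT8 pipelines. A minimal sketch of that cell, with names taken from the fragments visible in PATCH 04 below; the `F.cosine_similarity` call is an assumption, as the original similarity line is not shown:

```python
import torch.nn.functional as F


def compute_sentence_similarity(sentence_1, sentence_2, pipeline):
    embedding_1 = pipeline(sentence_1)
    embedding_2 = pipeline(sentence_2)
    # compute cosine similarity between two sentences
    return F.cosine_similarity(embedding_1, embedding_2, dim=1)


def evaluate_stsb(example):
    # vanilla_emb and q8_emb are the FP32 and INT8 SentenceEmbeddingPipeline objects
    default = compute_sentence_similarity(example["sentence1"], example["sentence2"], vanilla_emb)
    quantized = compute_sentence_similarity(example["sentence1"], example["sentence2"], q8_emb)
    return {
        "reference": (example["label"] - 1) / (5 - 1),  # rescale to [0,1]
        "default": float(default),
        "quantized": float(quantized),
    }


result = eval_dataset.map(evaluate_stsb)
```
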
From c3a3aea694f5eae27cf1985ae1786bb3a06a36c0 Mon Sep 17 00:00:00 2001
From: Alexander Kozlov 
Date: Fri, 18 Oct 2024 17:44:39 +0400
Subject: [PATCH 02/12] Update optimum/intel/openvino/modeling_diffusion.py

---
 optimum/intel/openvino/modeling_diffusion.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py
index 0367c20e4..3b89da759 100644
--- a/optimum/intel/openvino/modeling_diffusion.py
+++ b/optimum/intel/openvino/modeling_diffusion.py
@@ -765,10 +765,6 @@ def ov_config(self) -> OVConfig:
     def dtype(self) -> torch.dtype:
         return OV_TO_PT_TYPE[self.ov_config.get("dtype", "f32")]
 
-    def modules(self):
-        return {}
-
-
     def _compile(self):
         if self.request is None:
             if (

From 0bf2325089cae4dc396646d0b6f4315f2e6e9748 Mon Sep 17 00:00:00 2001
From: Alexander Kozlov 
Date: Fri, 18 Oct 2024 17:45:02 +0400
Subject: [PATCH 03/12] Update optimum/intel/openvino/modeling_diffusion.py

---
 optimum/intel/openvino/modeling_diffusion.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py
index 3b89da759..81dc085df 100644
--- a/optimum/intel/openvino/modeling_diffusion.py
+++ b/optimum/intel/openvino/modeling_diffusion.py
@@ -842,7 +842,6 @@ def forward(
         return ModelOutput(**model_outputs)
 
 
-
 class OVModelUnet(OVPipelinePart):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

From b6220d5f8f8f3eca73353b0bdc0af89cde339eb9 Mon Sep 17 00:00:00 2001
From: Alexander 
Date: Fri, 18 Oct 2024 17:48:37 +0400
Subject: [PATCH 04/12] Style

---
 .../sentence_transformer_quantization.ipynb   | 45 ++++++++++---------
 1 file changed, 23 insertions(+), 22 deletions(-)

diff --git a/notebooks/openvino/sentence_transformer_quantization.ipynb b/notebooks/openvino/sentence_transformer_quantization.ipynb
index db5d20297..2de76c6e4 100644
--- a/notebooks/openvino/sentence_transformer_quantization.ipynb
+++ b/notebooks/openvino/sentence_transformer_quantization.ipynb
@@ -177,9 +177,11 @@
     "\n",
     "quantizer = OVQuantizer.from_pretrained(model)\n",
     "\n",
+    "\n",
     "def preprocess_function(examples, tokenizer):\n",
     "    return tokenizer(examples[\"sentence\"], padding=\"max_length\", max_length=384, truncation=True)\n",
     "\n",
+    "\n",
     "calibration_dataset = quantizer.get_calibration_dataset(\n",
     "    \"glue\",\n",
     "    dataset_config_name=\"sst2\",\n",
@@ -194,13 +196,6 @@
     "tokenizer.save_pretrained(int8_ptq_model_path)"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -216,11 +211,12 @@
    "source": [
     "from transformers import Pipeline\n",
     "import torch.nn.functional as F\n",
-    "import torch \n",
+    "import torch\n",
+    "\n",
     "\n",
     "# copied from the model card\n",
     "def mean_pooling(model_output, attention_mask):\n",
-    "    token_embeddings = model_output[0] #First element of model_output contains all token embeddings\n",
+    "    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings\n",
     "    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()\n",
     "    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)\n",
     "\n",
@@ -230,7 +226,7 @@
     "        # we don't have any hyperparameters to sanitize\n",
     "        preprocess_kwargs = {}\n",
     "        return preprocess_kwargs, {}, {}\n",
-    "      \n",
+    "\n",
     "    def preprocess(self, inputs):\n",
     "        encoded_inputs = self.tokenizer(inputs, padding=True, truncation=True, return_tensors=\"pt\")\n",
     "        return encoded_inputs\n",
@@ -283,7 +279,7 @@
     "from datasets import load_dataset\n",
     "from evaluate import load\n",
     "\n",
-    "eval_dataset = load_dataset(\"glue\",\"stsb\",split=\"validation\")\n",
+    "eval_dataset = load_dataset(\"glue\", \"stsb\", split=\"validation\")\n",
     "metric = load(\"glue\", \"stsb\")"
    ]
   },
@@ -315,7 +311,7 @@
     }
    ],
    "source": [
-    "def compute_sentence_similarity(sentence_1, sentence_2,pipeline):\n",
+    "def compute_sentence_similarity(sentence_1, sentence_2, pipeline):\n",
     "    embedding_1 = pipeline(sentence_1)\n",
     "    embedding_2 = pipeline(sentence_2)\n",
     "    # compute cosine similarity between two sentences\n",
@@ -323,13 +319,14 @@
     "\n",
     "\n",
     "def evaluate_stsb(example):\n",
-    "  default = compute_sentence_similarity(example[\"sentence1\"], example[\"sentence2\"], vanilla_emb)\n",
-    "  quantized = compute_sentence_similarity(example[\"sentence1\"], example[\"sentence2\"], q8_emb)\n",
-    "  return {\n",
-    "      \"reference\": (example[\"label\"] - 1) / (5 - 1), # rescale to [0,1]\n",
-    "      \"default\": float(default),\n",
-    "      \"quantized\": float(quantized),\n",
-    "      }\n",
+    "    default = compute_sentence_similarity(example[\"sentence1\"], example[\"sentence2\"], vanilla_emb)\n",
+    "    quantized = compute_sentence_similarity(example[\"sentence1\"], example[\"sentence2\"], q8_emb)\n",
+    "    return {\n",
+    "        \"reference\": (example[\"label\"] - 1) / (5 - 1),  # rescale to [0,1]\n",
+    "        \"default\": float(default),\n",
+    "        \"quantized\": float(quantized),\n",
+    "    }\n",
+    "\n",
     "\n",
     "result = eval_dataset.map(evaluate_stsb)"
    ]
@@ -353,9 +350,13 @@
     "default_acc = metric.compute(predictions=result[\"default\"], references=result[\"reference\"])\n",
     "quantized = metric.compute(predictions=result[\"quantized\"], references=result[\"reference\"])\n",
     "\n",
-    "print(\"vanilla model: pearson=\", default_acc['pearson'])\n",
-    "print(\"quantized model: pearson=\", quantized['pearson'])\n",
-    "print(\"The quantized model achieves \", round(quantized[\"pearson\"]/default_acc[\"pearson\"],2)*100, \"% accuracy of the fp32 model\")"
+    "print(\"vanilla model: pearson=\", default_acc[\"pearson\"])\n",
+    "print(\"quantized model: pearson=\", quantized[\"pearson\"])\n",
+    "print(\n",
+    "    \"The quantized model achieves \",\n",
+    "    round(quantized[\"pearson\"] / default_acc[\"pearson\"], 2) * 100,\n",
+    "    \"% accuracy of the fp32 model\",\n",
+    ")"
    ]
   },
   {

From 6de610c1d8f57c7fc6595133a372f2796f8d503a Mon Sep 17 00:00:00 2001
From: Alexander 
Date: Sat, 19 Oct 2024 09:36:10 +0400
Subject: [PATCH 05/12] Fixed small issue. Results are the same.

---
 .../sentence_transformer_quantization.ipynb   | 82 +++++++++----------
 1 file changed, 38 insertions(+), 44 deletions(-)

diff --git a/notebooks/openvino/sentence_transformer_quantization.ipynb b/notebooks/openvino/sentence_transformer_quantization.ipynb
index 2de76c6e4..40cd7d4d7 100644
--- a/notebooks/openvino/sentence_transformer_quantization.ipynb
+++ b/notebooks/openvino/sentence_transformer_quantization.ipynb
@@ -44,7 +44,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "fd8729d418f3453bb1d97a2b038ff072",
+       "model_id": "53d4d1f1703a4e52812ea366c06f2d67",
        "version_major": 2,
        "version_minor": 0
       },
@@ -68,7 +68,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "758502bfa6a142cc9078b8404a4b5d78",
+       "model_id": "a3de9a9bbdd942069b85519c83267f83",
        "version_major": 2,
        "version_minor": 0
       },
@@ -92,7 +92,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "e82786c713694f0da616dee6164aa242",
+       "model_id": "ebc55f3ce3974aaa8861474699d5a15f",
        "version_major": 2,
        "version_minor": 0
       },
@@ -116,7 +116,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "344f6318563c4bed8dbedbdee08c9b59",
+       "model_id": "f206e4e8651f4f449f9dcb1fc11ef266",
        "version_major": 2,
        "version_minor": 0
       },
@@ -267,7 +267,7 @@
     "vanilla_emb = SentenceEmbeddingPipeline(model=model, tokenizer=tokenizer)\n",
     "\n",
     "q_model = OVModelForFeatureExtraction.from_pretrained(int8_ptq_model_path)\n",
-    "q8_emb = SentenceEmbeddingPipeline(model=model, tokenizer=tokenizer)"
+    "q8_emb = SentenceEmbeddingPipeline(model=q_model, tokenizer=tokenizer)"
    ]
   },
   {
@@ -292,13 +292,13 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Parameter 'function'= of the transform datasets.arrow_dataset.Dataset._map_single couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n"
+      "Parameter 'function'= of the transform datasets.arrow_dataset.Dataset._map_single couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n"
      ]
     },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "6b39c9e0096a423bbcda949bede6a9cb",
+       "model_id": "0f28df147f95484c955c3f20f2f954d2",
        "version_major": 2,
        "version_minor": 0
       },
@@ -341,7 +341,7 @@
      "output_type": "stream",
      "text": [
       "vanilla model: pearson= 0.869619439095004\n",
-      "quantized model: pearson= 0.869619439095004\n",
+      "quantized model: pearson= 0.869326218489249\n",
       "The quantized model achieves  100.0 % accuracy of the fp32 model\n"
      ]
     }
@@ -368,7 +368,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -391,13 +391,7 @@
       "[ INFO ] OpenVINO:\n",
       "[ INFO ] Build ................................. 2024.5.0-16971-8a02b4c17bb\n",
       "[ INFO ] \n",
-      "[ INFO ] Device info:\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
+      "[ INFO ] Device info:\n",
       "[ INFO ] CPU\n",
       "[ INFO ] Build ................................. 2024.5.0-16971-8a02b4c17bb\n",
       "[ INFO ] \n",
@@ -406,7 +400,7 @@
       "[ WARNING ] Performance hint was not explicitly specified in command line. Device(CPU) performance hint will be set to PerformanceMode.LATENCY.\n",
       "[Step 4/11] Reading model files\n",
       "[ INFO ] Loading model files\n",
-      "[ INFO ] Read model took 11.28 ms\n",
+      "[ INFO ] Read model took 10.87 ms\n",
       "[ INFO ] Original model I/O parameters:\n",
       "[ INFO ] Model inputs:\n",
       "[ INFO ]     input_ids (node: input_ids) : i64 / [...] / [?,?]\n",
@@ -417,7 +411,7 @@
       "[Step 5/11] Resizing model to match image sizes and given batch\n",
       "[ INFO ] Model batch size: 1\n",
       "[ INFO ] Reshaping model: 'input_ids': [1,384], 'attention_mask': [1,384], 'token_type_ids': [1,384]\n",
-      "[ INFO ] Reshape model took 2.58 ms\n",
+      "[ INFO ] Reshape model took 3.02 ms\n",
       "[Step 6/11] Configuring input of the model\n",
       "[ INFO ] Model inputs:\n",
       "[ INFO ]     input_ids (node: input_ids) : i64 / [...] / [1,384]\n",
@@ -426,7 +420,7 @@
       "[ INFO ] Model outputs:\n",
       "[ INFO ]     last_hidden_state (node: __module.encoder.layer.5.output.LayerNorm/aten::layer_norm/Add) : f32 / [...] / [1,384,384]\n",
       "[Step 7/11] Loading the model to the device\n",
-      "[ INFO ] Compile model took 132.33 ms\n",
+      "[ INFO ] Compile model took 125.14 ms\n",
       "[Step 8/11] Querying optimal runtime parameters\n",
       "[ INFO ] Model:\n",
       "[ INFO ]   NETWORK_NAME: Model0\n",
@@ -458,17 +452,23 @@
       "[ INFO ] Fill input 'token_type_ids' with random values \n",
       "[Step 10/11] Measuring performance (Start inference synchronously, limits: 200 iterations)\n",
       "[ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop).\n",
-      "[ INFO ] First inference took 14.89 ms\n",
+      "[ INFO ] First inference took 13.97 ms\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
       "[Step 11/11] Dumping statistics report\n",
       "[ INFO ] Execution Devices:['CPU']\n",
       "[ INFO ] Count:            200 iterations\n",
-      "[ INFO ] Duration:         2067.30 ms\n",
+      "[ INFO ] Duration:         1988.82 ms\n",
       "[ INFO ] Latency:\n",
-      "[ INFO ]    Median:        9.88 ms\n",
-      "[ INFO ]    Average:       10.15 ms\n",
-      "[ INFO ]    Min:           9.60 ms\n",
-      "[ INFO ]    Max:           11.37 ms\n",
-      "[ INFO ] Throughput:   96.74 FPS\n"
+      "[ INFO ]    Median:        9.70 ms\n",
+      "[ INFO ]    Average:       9.77 ms\n",
+      "[ INFO ]    Min:           9.54 ms\n",
+      "[ INFO ]    Max:           11.35 ms\n",
+      "[ INFO ] Throughput:   100.56 FPS\n"
      ]
     }
    ],
@@ -479,7 +479,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -502,13 +502,7 @@
       "[ INFO ] OpenVINO:\n",
       "[ INFO ] Build ................................. 2024.5.0-16971-8a02b4c17bb\n",
       "[ INFO ] \n",
-      "[ INFO ] Device info:\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
+      "[ INFO ] Device info:\n",
       "[ INFO ] CPU\n",
       "[ INFO ] Build ................................. 2024.5.0-16971-8a02b4c17bb\n",
       "[ INFO ] \n",
@@ -517,7 +511,7 @@
       "[ WARNING ] Performance hint was not explicitly specified in command line. Device(CPU) performance hint will be set to PerformanceMode.LATENCY.\n",
       "[Step 4/11] Reading model files\n",
       "[ INFO ] Loading model files\n",
-      "[ INFO ] Read model took 21.99 ms\n",
+      "[ INFO ] Read model took 15.46 ms\n",
       "[ INFO ] Original model I/O parameters:\n",
       "[ INFO ] Model inputs:\n",
       "[ INFO ]     input_ids (node: input_ids) : i64 / [...] / [?,?]\n",
@@ -528,7 +522,7 @@
       "[Step 5/11] Resizing model to match image sizes and given batch\n",
       "[ INFO ] Model batch size: 1\n",
       "[ INFO ] Reshaping model: 'input_ids': [1,384], 'attention_mask': [1,384], 'token_type_ids': [1,384]\n",
-      "[ INFO ] Reshape model took 3.60 ms\n",
+      "[ INFO ] Reshape model took 6.89 ms\n",
       "[Step 6/11] Configuring input of the model\n",
       "[ INFO ] Model inputs:\n",
       "[ INFO ]     input_ids (node: input_ids) : i64 / [...] / [1,384]\n",
@@ -537,7 +531,7 @@
       "[ INFO ] Model outputs:\n",
       "[ INFO ]     last_hidden_state (node: __module.encoder.layer.5.output.LayerNorm/aten::layer_norm/Add) : f32 / [...] / [1,384,384]\n",
       "[Step 7/11] Loading the model to the device\n",
-      "[ INFO ] Compile model took 324.67 ms\n",
+      "[ INFO ] Compile model took 325.40 ms\n",
       "[Step 8/11] Querying optimal runtime parameters\n",
       "[ INFO ] Model:\n",
       "[ INFO ]   NETWORK_NAME: Model0\n",
@@ -569,17 +563,17 @@
       "[ INFO ] Fill input 'token_type_ids' with random values \n",
       "[Step 10/11] Measuring performance (Start inference synchronously, limits: 200 iterations)\n",
       "[ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop).\n",
-      "[ INFO ] First inference took 9.54 ms\n",
+      "[ INFO ] First inference took 8.49 ms\n",
       "[Step 11/11] Dumping statistics report\n",
       "[ INFO ] Execution Devices:['CPU']\n",
       "[ INFO ] Count:            200 iterations\n",
-      "[ INFO ] Duration:         906.86 ms\n",
+      "[ INFO ] Duration:         869.96 ms\n",
       "[ INFO ] Latency:\n",
-      "[ INFO ]    Median:        4.19 ms\n",
-      "[ INFO ]    Average:       4.42 ms\n",
-      "[ INFO ]    Min:           4.09 ms\n",
-      "[ INFO ]    Max:           5.56 ms\n",
-      "[ INFO ] Throughput:   220.54 FPS\n"
+      "[ INFO ]    Median:        4.17 ms\n",
+      "[ INFO ]    Average:       4.23 ms\n",
+      "[ INFO ]    Min:           4.08 ms\n",
+      "[ INFO ]    Max:           6.04 ms\n",
+      "[ INFO ] Throughput:   229.89 FPS\n"
      ]
     }
    ],

From bae977207a8cf49c40902c04f4c410b1ecaf6b92 Mon Sep 17 00:00:00 2001
From: Alexander 
Date: Tue, 22 Oct 2024 10:35:06 +0400
Subject: [PATCH 06/12] Added description to the sections of the notebook

---
 notebooks/openvino/requirements.txt            |  1 -
 .../sentence_transformer_quantization.ipynb    | 18 +++++++++++++++++-
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/notebooks/openvino/requirements.txt b/notebooks/openvino/requirements.txt
index bb7a517cf..64ccd6d8c 100644
--- a/notebooks/openvino/requirements.txt
+++ b/notebooks/openvino/requirements.txt
@@ -4,4 +4,3 @@ evaluate[evaluator]
 ipywidgets
 pillow
 torchaudio
-
diff --git a/notebooks/openvino/sentence_transformer_quantization.ipynb b/notebooks/openvino/sentence_transformer_quantization.ipynb
index 40cd7d4d7..16992bc8a 100644
--- a/notebooks/openvino/sentence_transformer_quantization.ipynb
+++ b/notebooks/openvino/sentence_transformer_quantization.ipynb
@@ -24,6 +24,15 @@
     "## Quantize the model statically to 8-bit with NNCF via the Optimum-Intel API"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The code snippet below shows how to use the Optimum-Intel [Model Optimization API](https://huggingface.co/docs/optimum/en/intel/openvino/optimization#static-quantization) to quantize the model statically. It leverages [NNCF](https://github.com/openvinotoolkit/nncf) capabilities for static quantization of Transformer models, where a combination of a special quantization scheme, the SmoothQuant method, and the Bias Correction method is used to provide state-of-the-art accuracy.\n",
+    "\n",
+    "Static quantization requires some data to estimate the quantization parameters of activations, which means that a calibration dataset should be provided. The `OVQuantizer` class used for quantization provides an API to build such a dataset with the `.get_calibration_dataset()` method."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 1,
@@ -203,6 +212,13 @@
     "## Benchmark model accuracy on GLUE STSB task"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here we estimate the accuracy impact of model quantization. We evaluate the accuracy of both the baseline and the quantized model on the STSB task from the GLUE benchmark, which is different from the SST-2 task used for calibration."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 2,
@@ -214,7 +230,7 @@
     "import torch\n",
     "\n",
     "\n",
-    "# copied from the model card\n",
+    "# copied from the model card \"sentence-transformers/all-MiniLM-L6-v2\"\n",
     "def mean_pooling(model_output, attention_mask):\n",
     "    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings\n",
     "    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()\n",

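As a complement to the description above: once quantization finishes, the INT8 model directory loads like any other OpenVINO model. A minimal usage sketch, assuming the `all-MiniLM-L6-v2_int8` output path used in the notebook:

```python
from transformers import AutoTokenizer
from optimum.intel import OVModelForFeatureExtraction

# Load the statically quantized INT8 IR saved by OVQuantizer
tokenizer = AutoTokenizer.from_pretrained("all-MiniLM-L6-v2_int8")
model = OVModelForFeatureExtraction.from_pretrained("all-MiniLM-L6-v2_int8")

inputs = tokenizer(["This is a sample sentence."], padding=True, truncation=True, return_tensors="pt")
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # e.g. torch.Size([1, 8, 384])
```
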
From 8b2b912ae2877640892386be077f97382c48af9d Mon Sep 17 00:00:00 2001
From: Alexander Kozlov 
Date: Wed, 23 Oct 2024 13:20:46 +0400
Subject: [PATCH 07/12] Update
 notebooks/openvino/sentence_transformer_quantization.ipynb

Co-authored-by: Helena Kloosterman 
---
 notebooks/openvino/sentence_transformer_quantization.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/notebooks/openvino/sentence_transformer_quantization.ipynb b/notebooks/openvino/sentence_transformer_quantization.ipynb
index 16992bc8a..77775ec63 100644
--- a/notebooks/openvino/sentence_transformer_quantization.ipynb
+++ b/notebooks/openvino/sentence_transformer_quantization.ipynb
@@ -370,7 +370,7 @@
     "print(\"quantized model: pearson=\", quantized[\"pearson\"])\n",
     "print(\n",
     "    \"The quantized model achieves \",\n",
-    "    round(quantized[\"pearson\"] / default_acc[\"pearson\"], 2) * 100,\n",
+    "    round((quantized[\"pearson\"] / default_acc[\"pearson\"]) * 100, 2),\n",
     "    \"% accuracy of the fp32 model\",\n",
     ")"
    ]

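The suggestion above is more than a style tweak: `round(x, 2) * 100` rounds the accuracy ratio to two decimals before scaling, while `round(x * 100, 2)` keeps two decimals of the percentage. A quick check with the Pearson values reported later in this series:

```python
default_pearson = 0.869619439095004    # FP32 model, from the notebook output
quantized_pearson = 0.869415534480936  # INT8 model, from the notebook output

ratio = quantized_pearson / default_pearson  # ~0.999766
print(round(ratio, 2) * 100)   # 100.0 -- ratio rounded first, detail lost
print(round(ratio * 100, 2))   # 99.98 -- two decimals of the percentage kept
```
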
From 4d23b99457bd8647bd2251409cac8e0f0d189374 Mon Sep 17 00:00:00 2001
From: Alexander Kozlov 
Date: Wed, 23 Oct 2024 13:21:09 +0400
Subject: [PATCH 08/12] Update
 notebooks/openvino/sentence_transformer_quantization.ipynb

Co-authored-by: Helena Kloosterman 
---
 notebooks/openvino/sentence_transformer_quantization.ipynb | 1 +
 1 file changed, 1 insertion(+)

diff --git a/notebooks/openvino/sentence_transformer_quantization.ipynb b/notebooks/openvino/sentence_transformer_quantization.ipynb
index 77775ec63..5520ef757 100644
--- a/notebooks/openvino/sentence_transformer_quantization.ipynb
+++ b/notebooks/openvino/sentence_transformer_quantization.ipynb
@@ -180,6 +180,7 @@
     "model.save_pretrained(\"all-MiniLM-L6-v2\")\n",
     "\n",
     "tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)\n",
+    "tokenizer.save_pretrained(\"all-MiniLM-L6-v2\")\n",
     "DATASET_NAME = \"squad\"\n",
     "dataset = datasets.load_dataset(DATASET_NAME)\n",
     "int8_ptq_model_path = \"all-MiniLM-L6-v2_int8\"\n",

From 6f025b483248f263541ca61388e591a7e6852d15 Mon Sep 17 00:00:00 2001
From: Alexander Kozlov 
Date: Wed, 23 Oct 2024 13:21:17 +0400
Subject: [PATCH 09/12] Update
 notebooks/openvino/sentence_transformer_quantization.ipynb

Co-authored-by: Helena Kloosterman 
---
 notebooks/openvino/sentence_transformer_quantization.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/notebooks/openvino/sentence_transformer_quantization.ipynb b/notebooks/openvino/sentence_transformer_quantization.ipynb
index 5520ef757..6aa7cd066 100644
--- a/notebooks/openvino/sentence_transformer_quantization.ipynb
+++ b/notebooks/openvino/sentence_transformer_quantization.ipynb
@@ -380,7 +380,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Compare performance of the baselein and INT8 models"
+    "## Compare performance of the baseline and INT8 models"
    ]
   },
   {

From e279b70317566631da5e7b4be6610a81620ff018 Mon Sep 17 00:00:00 2001
From: Alexander 
Date: Wed, 23 Oct 2024 16:35:32 +0400
Subject: [PATCH 10/12] Fixed issue. Added info about benchmarking

---
 .../openvino/sentence_transformer_quantization.ipynb     | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/notebooks/openvino/sentence_transformer_quantization.ipynb b/notebooks/openvino/sentence_transformer_quantization.ipynb
index 6aa7cd066..0be35000a 100644
--- a/notebooks/openvino/sentence_transformer_quantization.ipynb
+++ b/notebooks/openvino/sentence_transformer_quantization.ipynb
@@ -371,7 +371,7 @@
     "print(\"quantized model: pearson=\", quantized[\"pearson\"])\n",
     "print(\n",
     "    \"The quantized model achieves \",\n",
-    "    round((quantized[\"pearson\"] / default_acc[\"pearson\"]) * 100, 2),\n",
+    "    round(quantized[\"pearson\"] / default_acc[\"pearson\"], 2) * 100,\n",
     "    \"% accuracy of the fp32 model\",\n",
     ")"
    ]
@@ -383,6 +383,13 @@
     "## Compare performance of the baseline and INT8 models"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We use the OpenVINO `benchmark_app` tool with a static input shape `[1,384]` for performance benchmarking. This should reflect the application performance, as the tokenizer pads or truncates the input sequence to `max_length=384`."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 7,

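For reference, roughly the same static-shape latency measurement can be reproduced through the OpenVINO Python API instead of `benchmark_app`. A sketch under the same assumptions (synchronous API, 200 iterations, `[1,384]` inputs, IR paths from the notebook):

```python
import time

import numpy as np
import openvino as ov

core = ov.Core()
model = core.read_model("all-MiniLM-L6-v2_int8/openvino_model.xml")
# Fix the dynamic [?,?] inputs to the same static [1,384] shape benchmark_app uses
model.reshape({"input_ids": [1, 384], "attention_mask": [1, 384], "token_type_ids": [1, 384]})
compiled = core.compile_model(model, "CPU")

request = compiled.create_infer_request()
feed = {name: np.zeros((1, 384), dtype=np.int64) for name in ("input_ids", "attention_mask", "token_type_ids")}
request.infer(feed)  # warm-up, analogous to benchmark_app's "first inference"

start = time.perf_counter()
for _ in range(200):  # -niter 200, -api sync
    request.infer(feed)
print(f"average latency: {(time.perf_counter() - start) / 200 * 1000:.2f} ms")
```
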
From f6756c94735e81ff12c50a1fc321a4d52a3c4feb Mon Sep 17 00:00:00 2001
From: Alexander 
Date: Wed, 23 Oct 2024 16:40:44 +0400
Subject: [PATCH 11/12] Fixed paths to models

---
 .../sentence_transformer_quantization.ipynb   | 106 +++++++++---------
 1 file changed, 51 insertions(+), 55 deletions(-)

diff --git a/notebooks/openvino/sentence_transformer_quantization.ipynb b/notebooks/openvino/sentence_transformer_quantization.ipynb
index 0be35000a..953b529b6 100644
--- a/notebooks/openvino/sentence_transformer_quantization.ipynb
+++ b/notebooks/openvino/sentence_transformer_quantization.ipynb
@@ -35,7 +35,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
@@ -53,7 +53,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "53d4d1f1703a4e52812ea366c06f2d67",
+       "model_id": "a9bd847756fd467e905a7ad7a243640c",
        "version_major": 2,
        "version_minor": 0
       },
@@ -77,7 +77,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "a3de9a9bbdd942069b85519c83267f83",
+       "model_id": "9d8ad91623d642f48e85b60ac823aca4",
        "version_major": 2,
        "version_minor": 0
       },
@@ -101,7 +101,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "ebc55f3ce3974aaa8861474699d5a15f",
+       "model_id": "a2a7d09a573c4092a830bbaadc39f756",
        "version_major": 2,
        "version_minor": 0
       },
@@ -125,7 +125,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "f206e4e8651f4f449f9dcb1fc11ef266",
+       "model_id": "b67c493aab36426090f8fafd25a17a00",
        "version_major": 2,
        "version_minor": 0
       },
@@ -163,7 +163,7 @@
        " 'all-MiniLM-L6-v2_int8/tokenizer.json')"
       ]
      },
-     "execution_count": 1,
+     "execution_count": 2,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -175,16 +175,17 @@
     "from optimum.intel import OVModelForFeatureExtraction, OVQuantizer, OVQuantizationConfig, OVConfig\n",
     "\n",
     "MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
+    "base_model_path = \"all-MiniLM-L6-v2\"\n",
+    "int8_ptq_model_path = \"all-MiniLM-L6-v2_int8\"\n",
     "\n",
     "model = OVModelForFeatureExtraction.from_pretrained(MODEL_ID)\n",
-    "model.save_pretrained(\"all-MiniLM-L6-v2\")\n",
+    "model.save_pretrained(base_model_path)\n",
     "\n",
     "tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)\n",
-    "tokenizer.save_pretrained(\"all-MiniLM-L6-v2\")\n",
+    "tokenizer.save_pretrained(base_model_path)\n",
+    "\n",
     "DATASET_NAME = \"squad\"\n",
     "dataset = datasets.load_dataset(DATASET_NAME)\n",
-    "int8_ptq_model_path = \"all-MiniLM-L6-v2_int8\"\n",
-    "\n",
     "quantizer = OVQuantizer.from_pretrained(model)\n",
     "\n",
     "\n",
@@ -222,7 +223,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -262,25 +263,26 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "No OpenVINO files were found for sentence-transformers/all-MiniLM-L6-v2, setting `export=True` to convert the model to the OpenVINO IR. Don't forget to save the resulting model with `.save_pretrained()`\n",
-      "Framework not specified. Using pt to export the model.\n",
-      "Using framework PyTorch: 2.4.1+cpu\n",
-      "Overriding 1 configuration item(s)\n",
-      "\t- use_cache -> False\n",
-      "Compiling the model to CPU ...\n",
+      "Compiling the model to CPU ...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
       "Compiling the model to CPU ...\n"
      ]
     }
    ],
    "source": [
-    "model = OVModelForFeatureExtraction.from_pretrained(MODEL_ID)\n",
+    "model = OVModelForFeatureExtraction.from_pretrained(base_model_path)\n",
     "vanilla_emb = SentenceEmbeddingPipeline(model=model, tokenizer=tokenizer)\n",
     "\n",
     "q_model = OVModelForFeatureExtraction.from_pretrained(int8_ptq_model_path)\n",
@@ -289,7 +291,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -302,20 +304,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Parameter 'function'= of the transform datasets.arrow_dataset.Dataset._map_single couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n"
+      "Parameter 'function'= of the transform datasets.arrow_dataset.Dataset._map_single couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n"
      ]
     },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "0f28df147f95484c955c3f20f2f954d2",
+       "model_id": "5cab9e8fc58245a4b395a9575017633b",
        "version_major": 2,
        "version_minor": 0
       },
@@ -350,7 +352,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -358,7 +360,7 @@
      "output_type": "stream",
      "text": [
       "vanilla model: pearson= 0.869619439095004\n",
-      "quantized model: pearson= 0.869326218489249\n",
+      "quantized model: pearson= 0.869415534480936\n",
       "The quantized model achieves  100.0 % accuracy of the fp32 model\n"
      ]
     }
@@ -392,7 +394,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -413,18 +415,18 @@
       "[ INFO ] Parsing input parameters\n",
       "[Step 2/11] Loading OpenVINO Runtime\n",
       "[ INFO ] OpenVINO:\n",
-      "[ INFO ] Build ................................. 2024.5.0-16971-8a02b4c17bb\n",
+      "[ INFO ] Build ................................. 2024.4.1-16618-643f23d1318-releases/2024/4\n",
       "[ INFO ] \n",
       "[ INFO ] Device info:\n",
       "[ INFO ] CPU\n",
-      "[ INFO ] Build ................................. 2024.5.0-16971-8a02b4c17bb\n",
+      "[ INFO ] Build ................................. 2024.4.1-16618-643f23d1318-releases/2024/4\n",
       "[ INFO ] \n",
       "[ INFO ] \n",
       "[Step 3/11] Setting device configuration\n",
       "[ WARNING ] Performance hint was not explicitly specified in command line. Device(CPU) performance hint will be set to PerformanceMode.LATENCY.\n",
       "[Step 4/11] Reading model files\n",
       "[ INFO ] Loading model files\n",
-      "[ INFO ] Read model took 10.87 ms\n",
+      "[ INFO ] Read model took 10.17 ms\n",
       "[ INFO ] Original model I/O parameters:\n",
       "[ INFO ] Model inputs:\n",
       "[ INFO ]     input_ids (node: input_ids) : i64 / [...] / [?,?]\n",
@@ -435,7 +437,7 @@
       "[Step 5/11] Resizing model to match image sizes and given batch\n",
       "[ INFO ] Model batch size: 1\n",
       "[ INFO ] Reshaping model: 'input_ids': [1,384], 'attention_mask': [1,384], 'token_type_ids': [1,384]\n",
-      "[ INFO ] Reshape model took 3.02 ms\n",
+      "[ INFO ] Reshape model took 2.23 ms\n",
       "[Step 6/11] Configuring input of the model\n",
       "[ INFO ] Model inputs:\n",
       "[ INFO ]     input_ids (node: input_ids) : i64 / [...] / [1,384]\n",
@@ -444,7 +446,7 @@
       "[ INFO ] Model outputs:\n",
       "[ INFO ]     last_hidden_state (node: __module.encoder.layer.5.output.LayerNorm/aten::layer_norm/Add) : f32 / [...] / [1,384,384]\n",
       "[Step 7/11] Loading the model to the device\n",
-      "[ INFO ] Compile model took 125.14 ms\n",
+      "[ INFO ] Compile model took 134.63 ms\n",
       "[Step 8/11] Querying optimal runtime parameters\n",
       "[ INFO ] Model:\n",
       "[ INFO ]   NETWORK_NAME: Model0\n",
@@ -476,22 +478,16 @@
       "[ INFO ] Fill input 'token_type_ids' with random values \n",
       "[Step 10/11] Measuring performance (Start inference synchronously, limits: 200 iterations)\n",
       "[ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop).\n",
-      "[ INFO ] First inference took 13.97 ms\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
+      "[ INFO ] First inference took 12.27 ms\n",
       "[Step 11/11] Dumping statistics report\n",
       "[ INFO ] Execution Devices:['CPU']\n",
       "[ INFO ] Count:            200 iterations\n",
-      "[ INFO ] Duration:         1988.82 ms\n",
+      "[ INFO ] Duration:         1988.84 ms\n",
       "[ INFO ] Latency:\n",
-      "[ INFO ]    Median:        9.70 ms\n",
+      "[ INFO ]    Median:        9.74 ms\n",
       "[ INFO ]    Average:       9.77 ms\n",
-      "[ INFO ]    Min:           9.54 ms\n",
-      "[ INFO ]    Max:           11.35 ms\n",
+      "[ INFO ]    Min:           9.59 ms\n",
+      "[ INFO ]    Max:           11.12 ms\n",
       "[ INFO ] Throughput:   100.56 FPS\n"
      ]
     }
@@ -503,7 +499,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
@@ -524,18 +520,18 @@
       "[ INFO ] Parsing input parameters\n",
       "[Step 2/11] Loading OpenVINO Runtime\n",
       "[ INFO ] OpenVINO:\n",
-      "[ INFO ] Build ................................. 2024.5.0-16971-8a02b4c17bb\n",
+      "[ INFO ] Build ................................. 2024.4.1-16618-643f23d1318-releases/2024/4\n",
       "[ INFO ] \n",
       "[ INFO ] Device info:\n",
       "[ INFO ] CPU\n",
-      "[ INFO ] Build ................................. 2024.5.0-16971-8a02b4c17bb\n",
+      "[ INFO ] Build ................................. 2024.4.1-16618-643f23d1318-releases/2024/4\n",
       "[ INFO ] \n",
       "[ INFO ] \n",
       "[Step 3/11] Setting device configuration\n",
       "[ WARNING ] Performance hint was not explicitly specified in command line. Device(CPU) performance hint will be set to PerformanceMode.LATENCY.\n",
       "[Step 4/11] Reading model files\n",
       "[ INFO ] Loading model files\n",
-      "[ INFO ] Read model took 15.46 ms\n",
+      "[ INFO ] Read model took 20.87 ms\n",
       "[ INFO ] Original model I/O parameters:\n",
       "[ INFO ] Model inputs:\n",
       "[ INFO ]     input_ids (node: input_ids) : i64 / [...] / [?,?]\n",
@@ -546,7 +542,7 @@
       "[Step 5/11] Resizing model to match image sizes and given batch\n",
       "[ INFO ] Model batch size: 1\n",
       "[ INFO ] Reshaping model: 'input_ids': [1,384], 'attention_mask': [1,384], 'token_type_ids': [1,384]\n",
-      "[ INFO ] Reshape model took 6.89 ms\n",
+      "[ INFO ] Reshape model took 3.42 ms\n",
       "[Step 6/11] Configuring input of the model\n",
       "[ INFO ] Model inputs:\n",
       "[ INFO ]     input_ids (node: input_ids) : i64 / [...] / [1,384]\n",
@@ -555,7 +551,7 @@
       "[ INFO ] Model outputs:\n",
       "[ INFO ]     last_hidden_state (node: __module.encoder.layer.5.output.LayerNorm/aten::layer_norm/Add) : f32 / [...] / [1,384,384]\n",
       "[Step 7/11] Loading the model to the device\n",
-      "[ INFO ] Compile model took 325.40 ms\n",
+      "[ INFO ] Compile model took 323.91 ms\n",
       "[Step 8/11] Querying optimal runtime parameters\n",
       "[ INFO ] Model:\n",
       "[ INFO ]   NETWORK_NAME: Model0\n",
@@ -587,17 +583,17 @@
       "[ INFO ] Fill input 'token_type_ids' with random values \n",
       "[Step 10/11] Measuring performance (Start inference synchronously, limits: 200 iterations)\n",
       "[ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop).\n",
-      "[ INFO ] First inference took 8.49 ms\n",
+      "[ INFO ] First inference took 6.72 ms\n",
       "[Step 11/11] Dumping statistics report\n",
       "[ INFO ] Execution Devices:['CPU']\n",
       "[ INFO ] Count:            200 iterations\n",
-      "[ INFO ] Duration:         869.96 ms\n",
+      "[ INFO ] Duration:         853.85 ms\n",
       "[ INFO ] Latency:\n",
-      "[ INFO ]    Median:        4.17 ms\n",
-      "[ INFO ]    Average:       4.23 ms\n",
-      "[ INFO ]    Min:           4.08 ms\n",
-      "[ INFO ]    Max:           6.04 ms\n",
-      "[ INFO ] Throughput:   229.89 FPS\n"
+      "[ INFO ]    Median:        4.13 ms\n",
+      "[ INFO ]    Average:       4.15 ms\n",
+      "[ INFO ]    Min:           4.05 ms\n",
+      "[ INFO ]    Max:           5.13 ms\n",
+      "[ INFO ] Throughput:   234.23 FPS\n"
      ]
     }
    ],

From c7ffcf04f55a2d57bc051ceb5321753b0593b474 Mon Sep 17 00:00:00 2001
From: Alexander 
Date: Thu, 24 Oct 2024 15:41:41 +0400
Subject: [PATCH 12/12] Removed unused code

---
 notebooks/openvino/sentence_transformer_quantization.ipynb | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/notebooks/openvino/sentence_transformer_quantization.ipynb b/notebooks/openvino/sentence_transformer_quantization.ipynb
index 953b529b6..714544aa9 100644
--- a/notebooks/openvino/sentence_transformer_quantization.ipynb
+++ b/notebooks/openvino/sentence_transformer_quantization.ipynb
@@ -184,10 +184,8 @@
     "tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)\n",
     "tokenizer.save_pretrained(base_model_path)\n",
     "\n",
-    "DATASET_NAME = \"squad\"\n",
-    "dataset = datasets.load_dataset(DATASET_NAME)\n",
-    "quantizer = OVQuantizer.from_pretrained(model)\n",
     "\n",
+    "quantizer = OVQuantizer.from_pretrained(model)\n",
     "\n",
     "def preprocess_function(examples, tokenizer):\n",
     "    return tokenizer(examples[\"sentence\"], padding=\"max_length\", max_length=384, truncation=True)\n",