diff --git a/notebooks/openvino/sentence_transformer_quantization.ipynb b/notebooks/openvino/sentence_transformer_quantization.ipynb new file mode 100644 index 0000000000..db5d202971 --- /dev/null +++ b/notebooks/openvino/sentence_transformer_quantization.ipynb @@ -0,0 +1,612 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Quantization of Text Embedding model from Sentence Transformers library" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install optimum[openvino]\n", + "%pip install evaluate" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Quantize staticly model to 8-bit with NNCF via Optimum-Intel API" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "No OpenVINO files were found for sentence-transformers/all-MiniLM-L6-v2, setting `export=True` to convert the model to the OpenVINO IR. Don't forget to save the resulting model with `.save_pretrained()`\n", + "Framework not specified. Using pt to export the model.\n", + "Using framework PyTorch: 2.4.1+cpu\n", + "Overriding 1 configuration item(s)\n", + "\t- use_cache -> False\n", + "Compiling the model to CPU ...\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "fd8729d418f3453bb1d97a2b038ff072", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "758502bfa6a142cc9078b8404a4b5d78",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Output()"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e82786c713694f0da616dee6164aa242",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Output()"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "344f6318563c4bed8dbedbdee08c9b59",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Output()"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Configuration saved in all-MiniLM-L6-v2_int8/openvino_config.json\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "('all-MiniLM-L6-v2_int8/tokenizer_config.json',\n",
+       " 'all-MiniLM-L6-v2_int8/special_tokens_map.json',\n",
+       " 'all-MiniLM-L6-v2_int8/vocab.txt',\n",
+       " 'all-MiniLM-L6-v2_int8/added_tokens.json',\n",
+       " 'all-MiniLM-L6-v2_int8/tokenizer.json')"
+      ]
+     },
+     "execution_count": 1,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from functools import partial\n",
+    "import datasets\n",
+    "from transformers import AutoTokenizer\n",
+    "from optimum.intel import OVModelForFeatureExtraction, OVQuantizer, OVQuantizationConfig, OVConfig\n",
+    "\n",
+    "MODEL_ID = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
+    "\n",
+    "model = OVModelForFeatureExtraction.from_pretrained(MODEL_ID)\n",
+    "model.save_pretrained(\"all-MiniLM-L6-v2\")\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)\n",
+    "DATASET_NAME = \"squad\"\n",
+    "dataset = datasets.load_dataset(DATASET_NAME)\n",
+    "int8_ptq_model_path = \"all-MiniLM-L6-v2_int8\"\n",
+    "\n",
+    "quantizer = OVQuantizer.from_pretrained(model)\n",
+    "\n",
+    "def preprocess_function(examples, tokenizer):\n",
+    "    return tokenizer(examples[\"sentence\"], padding=\"max_length\", max_length=384, truncation=True)\n",
+    "\n",
+    "calibration_dataset = quantizer.get_calibration_dataset(\n",
+    "    \"glue\",\n",
+    "    dataset_config_name=\"sst2\",\n",
+    "    preprocess_function=partial(preprocess_function, tokenizer=tokenizer),\n",
+    "    num_samples=300,\n",
+    "    dataset_split=\"train\",\n",
+    ")\n",
+    "\n",
+    "ov_config = OVConfig(quantization_config=OVQuantizationConfig())\n",
+    "\n",
+    "quantizer.quantize(ov_config=ov_config, calibration_dataset=calibration_dataset, save_directory=int8_ptq_model_path)\n",
+    "tokenizer.save_pretrained(int8_ptq_model_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Benchmark model accuracy on GLUE STSB task"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import Pipeline\n",
+    "import torch.nn.functional as F\n",
+    "import torch \n",
+    "\n",
+    "# copied from the model card\n",
+    "def mean_pooling(model_output, attention_mask):\n",
+    "    token_embeddings = model_output[0] #First element of model_output contains all token embeddings\n",
+    "    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()\n",
+    "    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)\n",
+    "\n",
+    "\n",
+    "class SentenceEmbeddingPipeline(Pipeline):\n",
+    "    def _sanitize_parameters(self, **kwargs):\n",
+    "        # we don\"t have any hyperameters to sanitize\n",
+    "        preprocess_kwargs = {}\n",
+    "        return preprocess_kwargs, {}, {}\n",
+    "      \n",
+    "    def preprocess(self, inputs):\n",
+    "        encoded_inputs = self.tokenizer(inputs, padding=True, truncation=True, return_tensors=\"pt\")\n",
+    "        return encoded_inputs\n",
+    "\n",
+    "    def _forward(self, model_inputs):\n",
+    "        outputs = self.model(**model_inputs)\n",
+    "        return {\"outputs\": outputs, \"attention_mask\": model_inputs[\"attention_mask\"]}\n",
+    "\n",
+    "    def postprocess(self, model_outputs):\n",
+    "        # Perform pooling\n",
+    "        sentence_embeddings = mean_pooling(model_outputs[\"outputs\"], model_outputs[\"attention_mask\"])\n",
+    "        # Normalize embeddings\n",
+    "        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)\n",
+    "        return sentence_embeddings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "No OpenVINO files were found for sentence-transformers/all-MiniLM-L6-v2, setting `export=True` to convert the model to the OpenVINO IR. Don't forget to save the resulting model with `.save_pretrained()`\n",
+      "Framework not specified. Using pt to export the model.\n",
+      "Using framework PyTorch: 2.4.1+cpu\n",
+      "Overriding 1 configuration item(s)\n",
+      "\t- use_cache -> False\n",
+      "Compiling the model to CPU ...\n",
+      "Compiling the model to CPU ...\n"
+     ]
+    }
+   ],
+   "source": [
+    "model = OVModelForFeatureExtraction.from_pretrained(MODEL_ID)\n",
+    "vanilla_emb = SentenceEmbeddingPipeline(model=model, tokenizer=tokenizer)\n",
+    "\n",
+    "q_model = OVModelForFeatureExtraction.from_pretrained(int8_ptq_model_path)\n",
+    "q8_emb = SentenceEmbeddingPipeline(model=model, tokenizer=tokenizer)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import load_dataset\n",
+    "from evaluate import load\n",
+    "\n",
+    "eval_dataset = load_dataset(\"glue\",\"stsb\",split=\"validation\")\n",
+    "metric = load(\"glue\", \"stsb\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Parameter 'function'= of the transform datasets.arrow_dataset.Dataset._map_single couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only showed once. Subsequent hashing failures won't be showed.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "6b39c9e0096a423bbcda949bede6a9cb",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Map:   0%|          | 0/1500 [00:00\n",
+      "[ INFO ]   PERFORMANCE_HINT: LATENCY\n",
+      "[ INFO ]   EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE\n",
+      "[ INFO ]   PERFORMANCE_HINT_NUM_REQUESTS: 0\n",
+      "[ INFO ]   ENABLE_CPU_PINNING: True\n",
+      "[ INFO ]   SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE\n",
+      "[ INFO ]   MODEL_DISTRIBUTION_POLICY: set()\n",
+      "[ INFO ]   ENABLE_HYPER_THREADING: False\n",
+      "[ INFO ]   EXECUTION_DEVICES: ['CPU']\n",
+      "[ INFO ]   CPU_DENORMALS_OPTIMIZATION: False\n",
+      "[ INFO ]   LOG_LEVEL: Level.NO\n",
+      "[ INFO ]   CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0\n",
+      "[ INFO ]   DYNAMIC_QUANTIZATION_GROUP_SIZE: 32\n",
+      "[ INFO ]   KV_CACHE_PRECISION: \n",
+      "[ INFO ]   AFFINITY: Affinity.CORE\n",
+      "[Step 9/11] Creating infer requests and preparing input tensors\n",
+      "[ WARNING ] No input files were given for input 'input_ids'!. This input will be filled with random values!\n",
+      "[ WARNING ] No input files were given for input 'attention_mask'!. This input will be filled with random values!\n",
+      "[ WARNING ] No input files were given for input 'token_type_ids'!. This input will be filled with random values!\n",
+      "[ INFO ] Fill input 'input_ids' with random values \n",
+      "[ INFO ] Fill input 'attention_mask' with random values \n",
+      "[ INFO ] Fill input 'token_type_ids' with random values \n",
+      "[Step 10/11] Measuring performance (Start inference synchronously, limits: 200 iterations)\n",
+      "[ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop).\n",
+      "[ INFO ] First inference took 14.89 ms\n",
+      "[Step 11/11] Dumping statistics report\n",
+      "[ INFO ] Execution Devices:['CPU']\n",
+      "[ INFO ] Count:            200 iterations\n",
+      "[ INFO ] Duration:         2067.30 ms\n",
+      "[ INFO ] Latency:\n",
+      "[ INFO ]    Median:        9.88 ms\n",
+      "[ INFO ]    Average:       10.15 ms\n",
+      "[ INFO ]    Min:           9.60 ms\n",
+      "[ INFO ]    Max:           11.37 ms\n",
+      "[ INFO ] Throughput:   96.74 FPS\n"
+     ]
+    }
+   ],
+   "source": [
+    "# FP32 baseline model\n",
+    "!benchmark_app -m all-MiniLM-L6-v2/openvino_model.xml -shape \"input_ids[1,384],attention_mask[1,384],token_type_ids[1,384]\" -api sync -niter 200"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+      "To disable this warning, you can either:\n",
+      "\t- Avoid using `tokenizers` before the fork if possible\n",
+      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[Step 1/11] Parsing and validating input arguments\n",
+      "[ INFO ] Parsing input parameters\n",
+      "[Step 2/11] Loading OpenVINO Runtime\n",
+      "[ INFO ] OpenVINO:\n",
+      "[ INFO ] Build ................................. 2024.5.0-16971-8a02b4c17bb\n",
+      "[ INFO ] \n",
+      "[ INFO ] Device info:\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[ INFO ] CPU\n",
+      "[ INFO ] Build ................................. 2024.5.0-16971-8a02b4c17bb\n",
+      "[ INFO ] \n",
+      "[ INFO ] \n",
+      "[Step 3/11] Setting device configuration\n",
+      "[ WARNING ] Performance hint was not explicitly specified in command line. Device(CPU) performance hint will be set to PerformanceMode.LATENCY.\n",
+      "[Step 4/11] Reading model files\n",
+      "[ INFO ] Loading model files\n",
+      "[ INFO ] Read model took 21.99 ms\n",
+      "[ INFO ] Original model I/O parameters:\n",
+      "[ INFO ] Model inputs:\n",
+      "[ INFO ]     input_ids (node: input_ids) : i64 / [...] / [?,?]\n",
+      "[ INFO ]     attention_mask (node: attention_mask) : i64 / [...] / [?,?]\n",
+      "[ INFO ]     token_type_ids (node: token_type_ids) : i64 / [...] / [?,?]\n",
+      "[ INFO ] Model outputs:\n",
+      "[ INFO ]     last_hidden_state (node: __module.encoder.layer.5.output.LayerNorm/aten::layer_norm/Add) : f32 / [...] / [?,?,384]\n",
+      "[Step 5/11] Resizing model to match image sizes and given batch\n",
+      "[ INFO ] Model batch size: 1\n",
+      "[ INFO ] Reshaping model: 'input_ids': [1,384], 'attention_mask': [1,384], 'token_type_ids': [1,384]\n",
+      "[ INFO ] Reshape model took 3.60 ms\n",
+      "[Step 6/11] Configuring input of the model\n",
+      "[ INFO ] Model inputs:\n",
+      "[ INFO ]     input_ids (node: input_ids) : i64 / [...] / [1,384]\n",
+      "[ INFO ]     attention_mask (node: attention_mask) : i64 / [...] / [1,384]\n",
+      "[ INFO ]     token_type_ids (node: token_type_ids) : i64 / [...] / [1,384]\n",
+      "[ INFO ] Model outputs:\n",
+      "[ INFO ]     last_hidden_state (node: __module.encoder.layer.5.output.LayerNorm/aten::layer_norm/Add) : f32 / [...] / [1,384,384]\n",
+      "[Step 7/11] Loading the model to the device\n",
+      "[ INFO ] Compile model took 324.67 ms\n",
+      "[Step 8/11] Querying optimal runtime parameters\n",
+      "[ INFO ] Model:\n",
+      "[ INFO ]   NETWORK_NAME: Model0\n",
+      "[ INFO ]   OPTIMAL_NUMBER_OF_INFER_REQUESTS: 1\n",
+      "[ INFO ]   NUM_STREAMS: 1\n",
+      "[ INFO ]   INFERENCE_NUM_THREADS: 18\n",
+      "[ INFO ]   PERF_COUNT: NO\n",
+      "[ INFO ]   INFERENCE_PRECISION_HINT: \n",
+      "[ INFO ]   PERFORMANCE_HINT: LATENCY\n",
+      "[ INFO ]   EXECUTION_MODE_HINT: ExecutionMode.PERFORMANCE\n",
+      "[ INFO ]   PERFORMANCE_HINT_NUM_REQUESTS: 0\n",
+      "[ INFO ]   ENABLE_CPU_PINNING: True\n",
+      "[ INFO ]   SCHEDULING_CORE_TYPE: SchedulingCoreType.ANY_CORE\n",
+      "[ INFO ]   MODEL_DISTRIBUTION_POLICY: set()\n",
+      "[ INFO ]   ENABLE_HYPER_THREADING: False\n",
+      "[ INFO ]   EXECUTION_DEVICES: ['CPU']\n",
+      "[ INFO ]   CPU_DENORMALS_OPTIMIZATION: False\n",
+      "[ INFO ]   LOG_LEVEL: Level.NO\n",
+      "[ INFO ]   CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1.0\n",
+      "[ INFO ]   DYNAMIC_QUANTIZATION_GROUP_SIZE: 32\n",
+      "[ INFO ]   KV_CACHE_PRECISION: \n",
+      "[ INFO ]   AFFINITY: Affinity.CORE\n",
+      "[Step 9/11] Creating infer requests and preparing input tensors\n",
+      "[ WARNING ] No input files were given for input 'input_ids'!. This input will be filled with random values!\n",
+      "[ WARNING ] No input files were given for input 'attention_mask'!. This input will be filled with random values!\n",
+      "[ WARNING ] No input files were given for input 'token_type_ids'!. This input will be filled with random values!\n",
+      "[ INFO ] Fill input 'input_ids' with random values \n",
+      "[ INFO ] Fill input 'attention_mask' with random values \n",
+      "[ INFO ] Fill input 'token_type_ids' with random values \n",
+      "[Step 10/11] Measuring performance (Start inference synchronously, limits: 200 iterations)\n",
+      "[ INFO ] Benchmarking in inference only mode (inputs filling are not included in measurement loop).\n",
+      "[ INFO ] First inference took 9.54 ms\n",
+      "[Step 11/11] Dumping statistics report\n",
+      "[ INFO ] Execution Devices:['CPU']\n",
+      "[ INFO ] Count:            200 iterations\n",
+      "[ INFO ] Duration:         906.86 ms\n",
+      "[ INFO ] Latency:\n",
+      "[ INFO ]    Median:        4.19 ms\n",
+      "[ INFO ]    Average:       4.42 ms\n",
+      "[ INFO ]    Min:           4.09 ms\n",
+      "[ INFO ]    Max:           5.56 ms\n",
+      "[ INFO ] Throughput:   220.54 FPS\n"
+     ]
+    }
+   ],
+   "source": [
+    "# INT8 counterpart\n",
+    "!benchmark_app -m all-MiniLM-L6-v2_int8/openvino_model.xml -shape \"input_ids[1,384],attention_mask[1,384],token_type_ids[1,384]\" -api sync -niter 200"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "test3.11_cpu",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py
index 81dc085df9..0367c20e49 100644
--- a/optimum/intel/openvino/modeling_diffusion.py
+++ b/optimum/intel/openvino/modeling_diffusion.py
@@ -765,6 +765,10 @@ def ov_config(self) -> OVConfig:
     def dtype(self) -> torch.dtype:
         return OV_TO_PT_TYPE[self.ov_config.get("dtype", "f32")]
 
+    def modules(self):
+        return {}
+
+
     def _compile(self):
         if self.request is None:
             if (
@@ -842,6 +846,7 @@ def forward(
         return ModelOutput(**model_outputs)
 
 
+
 class OVModelUnet(OVPipelinePart):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)