diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py
index dc919ec5a..f74675ddd 100644
--- a/tests/ipex/test_modeling.py
+++ b/tests/ipex/test_modeling.py
@@ -46,7 +46,7 @@
 )
 from optimum.intel.utils.import_utils import is_ipex_version
 from optimum.utils.testing_utils import grid_parameters
-from utils_tests import MODEL_NAMES
+from utils_tests import MODEL_NAMES, IS_XPU
 
 
 SEED = 42
@@ -80,11 +80,12 @@ def test_compare_to_transformers(self, model_arch):
         model_id = MODEL_NAMES[model_arch]
         set_seed(SEED)
         ipex_model = self.IPEX_MODEL_CLASS.from_pretrained(model_id, export=True)
+        device = ipex_model.device
         self.assertIsInstance(ipex_model.config, PretrainedConfig)
-        transformers_model = self.IPEX_MODEL_CLASS.auto_model_class.from_pretrained(model_id)
+        transformers_model = self.IPEX_MODEL_CLASS.auto_model_class.from_pretrained(model_id).to(device)
         tokenizer = AutoTokenizer.from_pretrained(model_id)
         inputs = "This is a sample input"
-        tokens = tokenizer(inputs, return_tensors="pt")
+        tokens = tokenizer(inputs, return_tensors="pt").to(device)
         with torch.no_grad():
             transformers_outputs = transformers_model(**tokens)
         outputs = ipex_model(**tokens)
@@ -144,11 +145,12 @@ def test_compare_to_transformers(self, model_arch):
         model_id = MODEL_NAMES[model_arch]
         set_seed(SEED)
         ipex_model = IPEXModelForQuestionAnswering.from_pretrained(model_id, export=True)
+        device = ipex_model.device
         self.assertIsInstance(ipex_model.config, PretrainedConfig)
-        transformers_model = AutoModelForQuestionAnswering.from_pretrained(model_id)
+        transformers_model = AutoModelForQuestionAnswering.from_pretrained(model_id).to(device)
         tokenizer = AutoTokenizer.from_pretrained(model_id)
         inputs = "This is a sample input"
-        tokens = tokenizer(inputs, return_tensors="pt")
+        tokens = tokenizer(inputs, return_tensors="pt").to(device)
         with torch.no_grad():
             transformers_outputs = transformers_model(**tokens)
         outputs = ipex_model(**tokens)
@@ -201,14 +203,14 @@ class IPEXModelForCausalLMTest(unittest.TestCase):
         "gpt_neo",
         "gpt_neox",
         "mistral",
-        "llama",
+        # "llama",
         "llama2",
         # "phi",
-        "distilgpt2",
+        # "distilgpt2",
         "mpt",
         "opt",
     )
-    IPEX_PATCHED_SUPPORTED_ARCHITECTURES = ("llama2", "distilgpt2", "falcon")
+    IPEX_PATCHED_SUPPORTED_ARCHITECTURES = ("llama2", "falcon", "gpt2")
     GENERATION_LENGTH = 100
     SPEEDUP_CACHE = 1.0
 
@@ -216,7 +218,11 @@ class IPEXModelForCausalLMTest(unittest.TestCase):
     def test_compare_to_transformers(self, model_arch):
         model_id = MODEL_NAMES[model_arch]
         set_seed(SEED)
-        ipex_model = IPEXModelForCausalLM.from_pretrained(model_id, export=True)
+        dtype = torch.float32
+        if IS_XPU:
+            dtype = torch.float16
+        ipex_model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, torch_dtype=dtype)
+        device = ipex_model.device
         self.assertIsInstance(ipex_model.config, PretrainedConfig)
         self.assertTrue(ipex_model.use_cache)
         tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -224,20 +230,20 @@ def test_compare_to_transformers(self, model_arch):
             "This is a sample",
             return_tensors="pt",
             return_token_type_ids=False if model_arch in ("llama", "llama2") else None,
-        )
+        ).to(device)
         inputs = ipex_model.prepare_inputs_for_generation(**tokens)
         outputs = ipex_model(**inputs)
 
         self.assertIsInstance(outputs.logits, torch.Tensor)
 
-        transformers_model = AutoModelForCausalLM.from_pretrained(model_id)
+        transformers_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype).to(device)
         with torch.no_grad():
             transformers_outputs = transformers_model(**tokens)
 
         # Test re-load model
         with tempfile.TemporaryDirectory() as tmpdirname:
             ipex_model.save_pretrained(tmpdirname)
-            loaded_model = self.IPEX_MODEL_CLASS.from_pretrained(tmpdirname)
+            loaded_model = self.IPEX_MODEL_CLASS.from_pretrained(tmpdirname, torch_dtype=dtype)
             loaded_model_outputs = loaded_model(**inputs)
 
         # Test init method
@@ -252,11 +258,14 @@ def test_compare_to_transformers(self, model_arch):
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     def test_pipeline(self, model_arch):
+        dtype = torch.float32
+        if IS_XPU:
+            dtype = torch.float16
         model_id = MODEL_NAMES[model_arch]
         tokenizer = AutoTokenizer.from_pretrained(model_id)
-        model = IPEXModelForCausalLM.from_pretrained(model_id, export=True)
+        model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, torch_dtype=dtype)
         model.config.encoder_no_repeat_ngram_size = 0
-        model.to("cpu")
+        # model.to("cpu")
         pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
         outputs = pipe("This is a sample", max_new_tokens=10)
         self.assertEqual(pipe.device, model.device)
@@ -264,14 +273,18 @@ def test_pipeline(self, model_arch):
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
     def test_assisted_decoding(self, model_arch):
-        # Patched models are not support assisted decoding if ipex < 2.5.
-        if model_arch in self.IPEX_PATCHED_SUPPORTED_ARCHITECTURES and is_ipex_version("<", "2.4.0"):
+        # assist decoding does not support static cache now
+        if model_arch in self.IPEX_PATCHED_SUPPORTED_ARCHITECTURES:
             return
         model_id = MODEL_NAMES[model_arch]
+        dtype = torch.float32
+        if IS_XPU:
+            dtype = torch.float16
         tokenizer = AutoTokenizer.from_pretrained(model_id)
-        ipex_model = IPEXModelForCausalLM.from_pretrained(model_id, export=True)
-        transformers_model = AutoModelForCausalLM.from_pretrained(model_id)
-        tokens = tokenizer("This is a sample input", return_tensors="pt")
+        ipex_model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, torch_dtype=dtype)
+        device = ipex_model.device
+        transformers_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype).to(device)
+        tokens = tokenizer("This is a sample input", return_tensors="pt").to(device)
         ipex_output = ipex_model.generate(**tokens, do_sample=False, max_new_tokens=4)
         ipex_output_assisted = ipex_model.generate(
             **tokens, do_sample=False, assistant_model=transformers_model, max_new_tokens=4
@@ -299,8 +312,12 @@ def test_assisted_decoding(self, model_arch):
     def test_ipex_patching_beam_search(self, test_name, model_arch, use_cache):
         model_id = MODEL_NAMES[model_arch]
         set_seed(SEED)
-        model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, use_cache=use_cache)
-        transformers_model = AutoModelForCausalLM.from_pretrained(model_id)
+        dtype = torch.float32
+        if IS_XPU:
+            dtype = torch.float16
+        model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, use_cache=use_cache, torch_dtype=dtype)
+        device = model.device
+        transformers_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype).to(device)
         self.assertEqual(model.use_cache, use_cache)
         tokenizer = AutoTokenizer.from_pretrained(model_id)
         tokenizer.pad_token = tokenizer.eos_token
@@ -316,7 +333,7 @@ def test_ipex_patching_beam_search(self, test_name, model_arch, use_cache):
             ),
         )
         for text in texts:
-            tokens = tokenizer(text, padding=True, return_tensors="pt")
+            tokens = tokenizer(text, padding=True, return_tensors="pt").to(device)
             for generation_config in generation_configs:
                 outputs = model.generate(**tokens, generation_config=generation_config)
                 transformers_outputs = transformers_model.generate(**tokens, generation_config=generation_config)
@@ -325,18 +342,21 @@
 
     @unittest.skipIf(is_ipex_version("<", "2.3.0"), reason="Only ipex version > 2.3.0 supports ipex model patching")
     def test_compare_with_and_without_past_key_values(self):
-        model_id = "Jiqing/tiny_random_llama2"
+        model_id = "Intel/tiny_random_llama2"
+        dtype = torch.float32
+        if IS_XPU:
+            dtype = torch.float16
+        model_with_pkv = IPEXModelForCausalLM.from_pretrained(model_id, use_cache=True, torch_dtype=dtype)
+        device = model_with_pkv.device
         tokenizer = AutoTokenizer.from_pretrained(model_id)
-        tokens = tokenizer("This is a sample input", return_tensors="pt")
-
-        model_with_pkv = IPEXModelForCausalLM.from_pretrained(model_id, use_cache=True)
+        tokens = tokenizer("This is a sample input", return_tensors="pt").to(device)
         # Warmup
         model_with_pkv.generate(**tokens)
         with Timer() as with_pkv_timer:
             outputs_model_with_pkv = model_with_pkv.generate(
                 **tokens, min_new_tokens=self.GENERATION_LENGTH, max_new_tokens=self.GENERATION_LENGTH, num_beams=1
             )
-        model_without_pkv = IPEXModelForCausalLM.from_pretrained(model_id, use_cache=False)
+        model_without_pkv = IPEXModelForCausalLM.from_pretrained(model_id, use_cache=False, torch_dtype=dtype)
         # Warmup
         model_without_pkv.generate(**tokens)
         with Timer() as without_pkv_timer:
@@ -366,10 +386,11 @@ def _generate_random_audio_data(self):
     def test_compare_to_transformers(self, model_arch):
         model_id = MODEL_NAMES[model_arch]
         ipex_model = self.IPEX_MODEL_CLASS.from_pretrained(model_id, export=True)
+        device = ipex_model.device
         self.assertIsInstance(ipex_model.config, PretrainedConfig)
-        transformers_model = self.IPEX_MODEL_CLASS.auto_model_class.from_pretrained(model_id)
+        transformers_model = self.IPEX_MODEL_CLASS.auto_model_class.from_pretrained(model_id).to(device)
         preprocessor = AutoFeatureExtractor.from_pretrained(model_id)
-        inputs = preprocessor(self._generate_random_audio_data(), return_tensors="pt")
+        inputs = preprocessor(self._generate_random_audio_data(), return_tensors="pt").to(device)
         with torch.no_grad():
             transformers_outputs = transformers_model(**inputs)
         outputs = ipex_model(**inputs)
@@ -417,12 +438,13 @@ def test_compare_to_transformers(self, model_arch):
         model_id = MODEL_NAMES[model_arch]
         set_seed(SEED)
         ipex_model = self.IPEX_MODEL_CLASS.from_pretrained(model_id, export=True)
+        device = ipex_model.device
         self.assertIsInstance(ipex_model.config, PretrainedConfig)
-        transformers_model = self.IPEX_MODEL_CLASS.auto_model_class.from_pretrained(model_id)
+        transformers_model = self.IPEX_MODEL_CLASS.auto_model_class.from_pretrained(model_id).to(device)
         preprocessor = AutoFeatureExtractor.from_pretrained(model_id)
         url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         image = Image.open(requests.get(url, stream=True).raw)
-        inputs = preprocessor(images=image, return_tensors="pt")
+        inputs = preprocessor(images=image, return_tensors="pt").to(device)
         with torch.no_grad():
             transformers_outputs = transformers_model(**inputs)
         outputs = ipex_model(**inputs)
@@ -440,7 +462,7 @@ def test_compare_to_transformers(self, model_arch):
         self.assertIn("logits", outputs)
         # Compare tensor outputs
         self.assertTrue(torch.allclose(outputs.logits, transformers_outputs.logits, atol=1e-4))
-        self.assertTrue(torch.equal(outputs.logits, loaded_model_outputs.logits))
+        self.assertTrue(torch.allclose(outputs.logits, loaded_model_outputs.logits, atol=1e-4))
         self.assertTrue(torch.allclose(init_model_outputs.logits, transformers_outputs.logits, atol=1e-4))
 
     @parameterized.expand(SUPPORTED_ARCHITECTURES)
diff --git a/tests/ipex/test_pipelines.py b/tests/ipex/test_pipelines.py
index 696f5c9c2..458030346 100644
--- a/tests/ipex/test_pipelines.py
+++ b/tests/ipex/test_pipelines.py
@@ -20,7 +20,7 @@
 from parameterized import parameterized
 from transformers import AutoTokenizer
 from transformers.pipelines import pipeline as transformers_pipeline
-from utils_tests import MODEL_NAMES
+from utils_tests import IS_XPU, MODEL_NAMES
 
 from optimum.intel.ipex.modeling_base import (
     IPEXModelForAudioClassification,
@@ -56,7 +56,6 @@ class PipelinesIntegrationTest(unittest.TestCase):
         "gpt2",
         "gpt_neo",
         "gpt_neox",
-        "llama",
         "llama2",
         "mistral",
         "mpt",
@@ -130,8 +129,11 @@ def test_fill_mask_pipeline_inference(self, model_arch):
     @parameterized.expand(TEXT_GENERATION_SUPPORTED_ARCHITECTURES)
     def test_text_generation_pipeline_inference(self, model_arch):
         model_id = MODEL_NAMES[model_arch]
-        transformers_generator = transformers_pipeline("text-generation", model_id)
-        ipex_generator = ipex_pipeline("text-generation", model_id, accelerator="ipex")
+        dtype = torch.float32
+        if IS_XPU:
+            dtype = torch.float16
+        transformers_generator = transformers_pipeline("text-generation", model_id, torch_dtype=dtype)
+        ipex_generator = ipex_pipeline("text-generation", model_id, accelerator="ipex", torch_dtype=dtype)
         inputs = "Describe a real-world application of AI."
         with torch.inference_mode():
             transformers_output = transformers_generator(inputs, max_new_tokens=10)
diff --git a/tests/ipex/utils_tests.py b/tests/ipex/utils_tests.py
index 78bdcd7ec..a16f91dc0 100644
--- a/tests/ipex/utils_tests.py
+++ b/tests/ipex/utils_tests.py
@@ -11,8 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from transformers import is_torch_xpu_available
 
 
+IS_XPU = is_torch_xpu_available(check_device=True)
+
 MODEL_NAMES = {
     "albert": "hf-internal-testing/tiny-random-albert",
     "beit": "hf-internal-testing/tiny-random-BeitForImageClassification",
@@ -28,15 +31,15 @@
     "distilgpt2": "Jiqing/tiny_random_distilgpt2",
     "electra": "hf-internal-testing/tiny-random-electra",
     "flaubert": "hf-internal-testing/tiny-random-flaubert",
-    "falcon": "Jiqing/tiny_random_falcon",
+    "falcon": "Intel/tiny_random_falcon",
     "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel",
-    "gpt2": "hf-internal-testing/tiny-random-gpt2",
+    "gpt2": "Intel/tiny_random_gpt2",
     "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel",
     "gpt_neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM",
     "gptj": "hf-internal-testing/tiny-random-GPTJModel",
     "levit": "hf-internal-testing/tiny-random-LevitModel",
     "llama": "fxmarty/tiny-llama-fast-tokenizer",
-    "llama2": "Jiqing/tiny_random_llama2",
+    "llama2": "Intel/tiny_random_llama2",
     "marian": "sshleifer/tiny-marian-en-de",
     "mbart": "hf-internal-testing/tiny-random-mbart",
     "mistral": "echarlaix/tiny-random-mistral",
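
Note on the pattern this patch introduces: IS_XPU is computed once in utils_tests.py via is_torch_xpu_available(check_device=True), the tests pick float16 on XPU and float32 otherwise, and every tokenized input is moved to the exported model's device before comparing against the reference transformers model. A minimal standalone sketch of that flow (not part of the patch; it reuses the Intel/tiny_random_llama2 checkpoint and the atol=1e-4 tolerance from the tests, and assumes an environment with torch, transformers, IPEX and optimum-intel installed):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, is_torch_xpu_available

from optimum.intel import IPEXModelForCausalLM

model_id = "Intel/tiny_random_llama2"
# float16 on XPU, float32 on CPU -- mirrors the IS_XPU switch added in utils_tests.py
dtype = torch.float16 if is_torch_xpu_available(check_device=True) else torch.float32

# Export through IPEX, then treat the exported model's device as the single source of truth
ipex_model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, torch_dtype=dtype)
device = ipex_model.device

reference_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokens = tokenizer("This is a sample input", return_tensors="pt").to(device)

with torch.no_grad():
    reference_logits = reference_model(**tokens).logits
    ipex_logits = ipex_model(**tokens).logits

# Tolerance is illustrative; the tests above compare with atol=1e-4
assert torch.allclose(ipex_logits, reference_logits, atol=1e-4)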