Skip to content

Commit

Permalink
skip assited decoding unit test for models using paged attention (#998)
Browse files Browse the repository at this point in the history
* skip assited decoding unit test for models using paged attention

Signed-off-by: Liu, Kaixuan <kaixuan.liu@intel.com>

* XPU CI tests get almost all passed

Signed-off-by: Liu, Kaixuan <kaixuan.liu@intel.com>

---------

Signed-off-by: Liu, Kaixuan <kaixuan.liu@intel.com>
  • Loading branch information
kaixuanliu authored Nov 22, 2024
1 parent 76d32be commit 039c72d
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 39 deletions.
86 changes: 54 additions & 32 deletions tests/ipex/test_modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
)
from optimum.intel.utils.import_utils import is_ipex_version
from optimum.utils.testing_utils import grid_parameters
from utils_tests import MODEL_NAMES
from utils_tests import MODEL_NAMES, IS_XPU


SEED = 42
Expand Down Expand Up @@ -80,11 +80,12 @@ def test_compare_to_transformers(self, model_arch):
model_id = MODEL_NAMES[model_arch]
set_seed(SEED)
ipex_model = self.IPEX_MODEL_CLASS.from_pretrained(model_id, export=True)
device = ipex_model.device
self.assertIsInstance(ipex_model.config, PretrainedConfig)
transformers_model = self.IPEX_MODEL_CLASS.auto_model_class.from_pretrained(model_id)
transformers_model = self.IPEX_MODEL_CLASS.auto_model_class.from_pretrained(model_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inputs = "This is a sample input"
tokens = tokenizer(inputs, return_tensors="pt")
tokens = tokenizer(inputs, return_tensors="pt").to(device)
with torch.no_grad():
transformers_outputs = transformers_model(**tokens)
outputs = ipex_model(**tokens)
Expand Down Expand Up @@ -144,11 +145,12 @@ def test_compare_to_transformers(self, model_arch):
model_id = MODEL_NAMES[model_arch]
set_seed(SEED)
ipex_model = IPEXModelForQuestionAnswering.from_pretrained(model_id, export=True)
device = ipex_model.device
self.assertIsInstance(ipex_model.config, PretrainedConfig)
transformers_model = AutoModelForQuestionAnswering.from_pretrained(model_id)
transformers_model = AutoModelForQuestionAnswering.from_pretrained(model_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)
inputs = "This is a sample input"
tokens = tokenizer(inputs, return_tensors="pt")
tokens = tokenizer(inputs, return_tensors="pt").to(device)
with torch.no_grad():
transformers_outputs = transformers_model(**tokens)
outputs = ipex_model(**tokens)
Expand Down Expand Up @@ -201,43 +203,47 @@ class IPEXModelForCausalLMTest(unittest.TestCase):
"gpt_neo",
"gpt_neox",
"mistral",
"llama",
# "llama",
"llama2",
# "phi",
"distilgpt2",
# "distilgpt2",
"mpt",
"opt",
)
IPEX_PATCHED_SUPPORTED_ARCHITECTURES = ("llama2", "distilgpt2", "falcon")
IPEX_PATCHED_SUPPORTED_ARCHITECTURES = ("llama2", "falcon", "gpt2")
GENERATION_LENGTH = 100
SPEEDUP_CACHE = 1.0

@parameterized.expand(SUPPORTED_ARCHITECTURES)
def test_compare_to_transformers(self, model_arch):
model_id = MODEL_NAMES[model_arch]
set_seed(SEED)
ipex_model = IPEXModelForCausalLM.from_pretrained(model_id, export=True)
dtype = torch.float32
if IS_XPU:
dtype = torch.float16
ipex_model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, torch_dtype=dtype)
device = ipex_model.device
self.assertIsInstance(ipex_model.config, PretrainedConfig)
self.assertTrue(ipex_model.use_cache)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokens = tokenizer(
"This is a sample",
return_tensors="pt",
return_token_type_ids=False if model_arch in ("llama", "llama2") else None,
)
).to(device)
inputs = ipex_model.prepare_inputs_for_generation(**tokens)
outputs = ipex_model(**inputs)

self.assertIsInstance(outputs.logits, torch.Tensor)

transformers_model = AutoModelForCausalLM.from_pretrained(model_id)
transformers_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype).to(device)
with torch.no_grad():
transformers_outputs = transformers_model(**tokens)

# Test re-load model
with tempfile.TemporaryDirectory() as tmpdirname:
ipex_model.save_pretrained(tmpdirname)
loaded_model = self.IPEX_MODEL_CLASS.from_pretrained(tmpdirname)
loaded_model = self.IPEX_MODEL_CLASS.from_pretrained(tmpdirname, torch_dtype=dtype)
loaded_model_outputs = loaded_model(**inputs)

# Test init method
Expand All @@ -252,26 +258,33 @@ def test_compare_to_transformers(self, model_arch):

@parameterized.expand(SUPPORTED_ARCHITECTURES)
def test_pipeline(self, model_arch):
dtype = torch.float32
if IS_XPU:
dtype = torch.float16
model_id = MODEL_NAMES[model_arch]
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = IPEXModelForCausalLM.from_pretrained(model_id, export=True)
model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, torch_dtype=dtype)
model.config.encoder_no_repeat_ngram_size = 0
model.to("cpu")
# model.to("cpu")
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
outputs = pipe("This is a sample", max_new_tokens=10)
self.assertEqual(pipe.device, model.device)
self.assertTrue(all("This is a sample" in item["generated_text"] for item in outputs))

@parameterized.expand(SUPPORTED_ARCHITECTURES)
def test_assisted_decoding(self, model_arch):
# Patched models are not support assisted decoding if ipex < 2.5.
if model_arch in self.IPEX_PATCHED_SUPPORTED_ARCHITECTURES and is_ipex_version("<", "2.4.0"):
# assist decoding does not support static cache now
if model_arch in self.IPEX_PATCHED_SUPPORTED_ARCHITECTURES:
return
model_id = MODEL_NAMES[model_arch]
dtype = torch.float32
if IS_XPU:
dtype = torch.float16
tokenizer = AutoTokenizer.from_pretrained(model_id)
ipex_model = IPEXModelForCausalLM.from_pretrained(model_id, export=True)
transformers_model = AutoModelForCausalLM.from_pretrained(model_id)
tokens = tokenizer("This is a sample input", return_tensors="pt")
ipex_model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, torch_dtype=dtype)
device = ipex_model.device
transformers_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype).to(device)
tokens = tokenizer("This is a sample input", return_tensors="pt").to(device)
ipex_output = ipex_model.generate(**tokens, do_sample=False, max_new_tokens=4)
ipex_output_assisted = ipex_model.generate(
**tokens, do_sample=False, assistant_model=transformers_model, max_new_tokens=4
Expand Down Expand Up @@ -299,8 +312,12 @@ def test_assisted_decoding(self, model_arch):
def test_ipex_patching_beam_search(self, test_name, model_arch, use_cache):
model_id = MODEL_NAMES[model_arch]
set_seed(SEED)
model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, use_cache=use_cache)
transformers_model = AutoModelForCausalLM.from_pretrained(model_id)
dtype = torch.float32
if IS_XPU:
dtype = torch.float16
model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, use_cache=use_cache, torch_dtype=dtype)
device = model.device
transformers_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype).to(device)
self.assertEqual(model.use_cache, use_cache)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
Expand All @@ -316,7 +333,7 @@ def test_ipex_patching_beam_search(self, test_name, model_arch, use_cache):
),
)
for text in texts:
tokens = tokenizer(text, padding=True, return_tensors="pt")
tokens = tokenizer(text, padding=True, return_tensors="pt").to(device)
for generation_config in generation_configs:
outputs = model.generate(**tokens, generation_config=generation_config)
transformers_outputs = transformers_model.generate(**tokens, generation_config=generation_config)
Expand All @@ -325,18 +342,21 @@ def test_ipex_patching_beam_search(self, test_name, model_arch, use_cache):

@unittest.skipIf(is_ipex_version("<", "2.3.0"), reason="Only ipex version > 2.3.0 supports ipex model patching")
def test_compare_with_and_without_past_key_values(self):
model_id = "Jiqing/tiny_random_llama2"
model_id = "Intel/tiny_random_llama2"
dtype = torch.float32
if IS_XPU:
dtype = torch.float16
model_with_pkv = IPEXModelForCausalLM.from_pretrained(model_id, use_cache=True, torch_dtype=dtype)
device = model_with_pkv.device
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokens = tokenizer("This is a sample input", return_tensors="pt")

model_with_pkv = IPEXModelForCausalLM.from_pretrained(model_id, use_cache=True)
tokens = tokenizer("This is a sample input", return_tensors="pt").to(device)
# Warmup
model_with_pkv.generate(**tokens)
with Timer() as with_pkv_timer:
outputs_model_with_pkv = model_with_pkv.generate(
**tokens, min_new_tokens=self.GENERATION_LENGTH, max_new_tokens=self.GENERATION_LENGTH, num_beams=1
)
model_without_pkv = IPEXModelForCausalLM.from_pretrained(model_id, use_cache=False)
model_without_pkv = IPEXModelForCausalLM.from_pretrained(model_id, use_cache=False, torch_dtype=dtype)
# Warmup
model_without_pkv.generate(**tokens)
with Timer() as without_pkv_timer:
Expand Down Expand Up @@ -366,10 +386,11 @@ def _generate_random_audio_data(self):
def test_compare_to_transformers(self, model_arch):
model_id = MODEL_NAMES[model_arch]
ipex_model = self.IPEX_MODEL_CLASS.from_pretrained(model_id, export=True)
device = ipex_model.device
self.assertIsInstance(ipex_model.config, PretrainedConfig)
transformers_model = self.IPEX_MODEL_CLASS.auto_model_class.from_pretrained(model_id)
transformers_model = self.IPEX_MODEL_CLASS.auto_model_class.from_pretrained(model_id).to(device)
preprocessor = AutoFeatureExtractor.from_pretrained(model_id)
inputs = preprocessor(self._generate_random_audio_data(), return_tensors="pt")
inputs = preprocessor(self._generate_random_audio_data(), return_tensors="pt").to(device)
with torch.no_grad():
transformers_outputs = transformers_model(**inputs)
outputs = ipex_model(**inputs)
Expand Down Expand Up @@ -417,12 +438,13 @@ def test_compare_to_transformers(self, model_arch):
model_id = MODEL_NAMES[model_arch]
set_seed(SEED)
ipex_model = self.IPEX_MODEL_CLASS.from_pretrained(model_id, export=True)
device = ipex_model.device
self.assertIsInstance(ipex_model.config, PretrainedConfig)
transformers_model = self.IPEX_MODEL_CLASS.auto_model_class.from_pretrained(model_id)
transformers_model = self.IPEX_MODEL_CLASS.auto_model_class.from_pretrained(model_id).to(device)
preprocessor = AutoFeatureExtractor.from_pretrained(model_id)
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = preprocessor(images=image, return_tensors="pt")
inputs = preprocessor(images=image, return_tensors="pt").to(device)
with torch.no_grad():
transformers_outputs = transformers_model(**inputs)
outputs = ipex_model(**inputs)
Expand All @@ -440,7 +462,7 @@ def test_compare_to_transformers(self, model_arch):
self.assertIn("logits", outputs)
# Compare tensor outputs
self.assertTrue(torch.allclose(outputs.logits, transformers_outputs.logits, atol=1e-4))
self.assertTrue(torch.equal(outputs.logits, loaded_model_outputs.logits))
self.assertTrue(torch.allclose(outputs.logits, loaded_model_outputs.logits, atol=1e-4))
self.assertTrue(torch.allclose(init_model_outputs.logits, transformers_outputs.logits, atol=1e-4))

@parameterized.expand(SUPPORTED_ARCHITECTURES)
Expand Down
10 changes: 6 additions & 4 deletions tests/ipex/test_pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from parameterized import parameterized
from transformers import AutoTokenizer
from transformers.pipelines import pipeline as transformers_pipeline
from utils_tests import MODEL_NAMES
from utils_tests import IS_XPU, MODEL_NAMES

from optimum.intel.ipex.modeling_base import (
IPEXModelForAudioClassification,
Expand Down Expand Up @@ -56,7 +56,6 @@ class PipelinesIntegrationTest(unittest.TestCase):
"gpt2",
"gpt_neo",
"gpt_neox",
"llama",
"llama2",
"mistral",
"mpt",
Expand Down Expand Up @@ -130,8 +129,11 @@ def test_fill_mask_pipeline_inference(self, model_arch):
@parameterized.expand(TEXT_GENERATION_SUPPORTED_ARCHITECTURES)
def test_text_generation_pipeline_inference(self, model_arch):
model_id = MODEL_NAMES[model_arch]
transformers_generator = transformers_pipeline("text-generation", model_id)
ipex_generator = ipex_pipeline("text-generation", model_id, accelerator="ipex")
dtype = torch.float32
if IS_XPU:
dtype = torch.float16
transformers_generator = transformers_pipeline("text-generation", model_id, torch_dtype=dtype)
ipex_generator = ipex_pipeline("text-generation", model_id, accelerator="ipex", torch_dtype=dtype)
inputs = "Describe a real-world application of AI."
with torch.inference_mode():
transformers_output = transformers_generator(inputs, max_new_tokens=10)
Expand Down
9 changes: 6 additions & 3 deletions tests/ipex/utils_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from transformers import is_torch_xpu_available


IS_XPU = is_torch_xpu_available(check_device=True)

MODEL_NAMES = {
"albert": "hf-internal-testing/tiny-random-albert",
"beit": "hf-internal-testing/tiny-random-BeitForImageClassification",
Expand All @@ -28,15 +31,15 @@
"distilgpt2": "Jiqing/tiny_random_distilgpt2",
"electra": "hf-internal-testing/tiny-random-electra",
"flaubert": "hf-internal-testing/tiny-random-flaubert",
"falcon": "Jiqing/tiny_random_falcon",
"falcon": "Intel/tiny_random_falcon",
"gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel",
"gpt2": "hf-internal-testing/tiny-random-gpt2",
"gpt2": "Intel/tiny_random_gpt2",
"gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel",
"gpt_neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM",
"gptj": "hf-internal-testing/tiny-random-GPTJModel",
"levit": "hf-internal-testing/tiny-random-LevitModel",
"llama": "fxmarty/tiny-llama-fast-tokenizer",
"llama2": "Jiqing/tiny_random_llama2",
"llama2": "Intel/tiny_random_llama2",
"marian": "sshleifer/tiny-marian-en-de",
"mbart": "hf-internal-testing/tiny-random-mbart",
"mistral": "echarlaix/tiny-random-mistral",
Expand Down

0 comments on commit 039c72d

Please sign in to comment.