From 845dd548f4f04ae0538fda656438298dc189ece3 Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Wed, 11 Sep 2024 15:32:54 +0200
Subject: [PATCH 1/6] Release v3.1.0

---
 pyproject.toml                    | 2 +-
 sentence_transformers/__init__.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 8862e17f4..9297b1150 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "sentence-transformers"
-version = "3.1.0.dev0"
+version = "3.1.0"
 description = "Multilingual text embeddings"
 license = { file = "LICENSE" }
 readme = "README.md"
diff --git a/sentence_transformers/__init__.py b/sentence_transformers/__init__.py
index 79a390ca4..14f3ec7c4 100644
--- a/sentence_transformers/__init__.py
+++ b/sentence_transformers/__init__.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-__version__ = "3.1.0.dev0"
+__version__ = "3.1.0"
 __MODEL_HUB_ORGANIZATION__ = "sentence-transformers"
 
 import importlib

From 44ed026099d40b5c17867c618dcb965bb85bd525 Mon Sep 17 00:00:00 2001
From: alperctnkaya <33162362+alperctnkaya@users.noreply.github.com>
Date: Sun, 29 Sep 2024 09:53:38 +0300
Subject: [PATCH 2/6] multi-GPU support for mine_hard_negatives

Added support for multi-GPU encoding of sentence embeddings via
model.encode_multi_process.

---
 sentence_transformers/util.py | 44 +++++++++++++++++++++++++----------
 1 file changed, 32 insertions(+), 12 deletions(-)

diff --git a/sentence_transformers/util.py b/sentence_transformers/util.py
index 5b8baa5f7..167e6fbf4 100644
--- a/sentence_transformers/util.py
+++ b/sentence_transformers/util.py
@@ -533,6 +533,7 @@ def mine_hard_negatives(
     faiss_batch_size: int = 16384,
     use_faiss: bool = False,
     verbose: bool = True,
+    use_multiple_gpus = True,
 ) -> Dataset:
     """
     Add hard negatives to a dataset of (anchor, positive) pairs to create (anchor, positive, negative) triplets or
@@ -714,12 +715,21 @@ def mine_hard_negatives(
         except Exception:
             pass
-        corpus_embeddings = model.encode(
-            corpus, batch_size=batch_size, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True
-        )
-        query_embeddings = model.encode(
-            queries, batch_size=batch_size, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True
-        )
+        if use_multiple_gpus:
+            pool = model.start_multi_process_pool()
+
+            corpus_embeddings = model.encode_multi_process(
+                corpus, pool, batch_size=batch_size, normalize_embeddings=True, show_progress_bar=True)
+
+            query_embeddings = model.encode_multi_process(
+                queries, pool, batch_size=batch_size, normalize_embeddings=True, show_progress_bar=True)
+        else
+            corpus_embeddings = model.encode(
+                corpus, batch_size=batch_size, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True
+            )
+            query_embeddings = model.encode(
+                queries, batch_size=batch_size, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True
+            )
 
         index.add(corpus_embeddings)
 
         scores_list = []
@@ -735,12 +745,22 @@ def mine_hard_negatives(
     else:
         # Embed the corpus and the queries
-        corpus_embeddings = model.encode(
-            corpus, batch_size=batch_size, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True
-        )
-        query_embeddings = model.encode(
-            queries, batch_size=batch_size, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True
-        )
+
+        if use_multiple_gpus:
+            pool = model.start_multi_process_pool()
+
+            corpus_embeddings = model.encode_multi_process(
+                corpus, pool, batch_size=batch_size, normalize_embeddings=True, show_progress_bar=True)
+
+            query_embeddings = model.encode_multi_process(
+                queries, pool, batch_size=batch_size, normalize_embeddings=True, show_progress_bar=True)
+        else
+            corpus_embeddings = model.encode(
+                corpus, batch_size=batch_size, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True
+            )
+            query_embeddings = model.encode(
+                queries, batch_size=batch_size, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True
+            )
 
         scores = model.similarity(query_embeddings, corpus_embeddings).to(device)
 
         # Keep only the range_max + max_positives highest scores. We offset by 1 to potentially include the positive pair
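
For context, the multi-process encoding pattern that this commit builds on is the library's existing start_multi_process_pool / encode_multi_process API. The sketch below is an illustrative aside, not part of the patch series; the model name and sentences are placeholders. It also stops the pool, which is exactly the step this commit omits and which PATCH 6/6 later fixes.

    from sentence_transformers import SentenceTransformer

    if __name__ == "__main__":  # the pool spawns worker processes, so guard the entry point
        model = SentenceTransformer("all-MiniLM-L6-v2")  # placeholder model
        sentences = ["The weather is lovely today.", "It's so sunny outside!"] * 1000

        # One worker per CUDA device; falls back to 4 CPU workers without CUDA.
        pool = model.start_multi_process_pool()
        embeddings = model.encode_multi_process(
            sentences, pool, batch_size=32, normalize_embeddings=True
        )
        model.stop_multi_process_pool(pool)  # teardown that the commit above leaves out
        print(embeddings.shape)
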
From ee79a72636c277b37f4943cfb70bed8a121f81e7 Mon Sep 17 00:00:00 2001
From: alperctnkaya <33162362+alperctnkaya@users.noreply.github.com>
Date: Mon, 30 Sep 2024 16:53:09 +0300
Subject: [PATCH 3/6] multi-GPU support for mine_hard_negatives

---
 sentence_transformers/util.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sentence_transformers/util.py b/sentence_transformers/util.py
index 167e6fbf4..ee5f1a00a 100644
--- a/sentence_transformers/util.py
+++ b/sentence_transformers/util.py
@@ -723,7 +723,7 @@ def mine_hard_negatives(
             query_embeddings = model.encode_multi_process(
                 queries, pool, batch_size=batch_size, normalize_embeddings=True, show_progress_bar=True)
-        else
+        else:
             corpus_embeddings = model.encode(
                 corpus, batch_size=batch_size, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True
             )
@@ -754,7 +754,7 @@ def mine_hard_negatives(
             query_embeddings = model.encode_multi_process(
                 queries, pool, batch_size=batch_size, normalize_embeddings=True, show_progress_bar=True)
-        else
+        else:
             corpus_embeddings = model.encode(
                 corpus, batch_size=batch_size, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True
             )

From 25f1c08f581b9ebabd711d14cd192838b0ea3afc Mon Sep 17 00:00:00 2001
From: alperctnkaya <33162362+alperctnkaya@users.noreply.github.com>
Date: Wed, 2 Oct 2024 09:36:00 +0300
Subject: [PATCH 4/6] multi-GPU support for mine_hard_negatives

---
 sentence_transformers/util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sentence_transformers/util.py b/sentence_transformers/util.py
index ee5f1a00a..be523dde1 100644
--- a/sentence_transformers/util.py
+++ b/sentence_transformers/util.py
@@ -533,7 +533,7 @@ def mine_hard_negatives(
     faiss_batch_size: int = 16384,
     use_faiss: bool = False,
     verbose: bool = True,
-    use_multiple_gpus = True,
+    use_multiple_gpus = False,
 ) -> Dataset:
     """
     Add hard negatives to a dataset of (anchor, positive) pairs to create (anchor, positive, negative) triplets or
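
After this commit the feature is opt-in. A hypothetical call to the function as of PATCH 4/6 could look like the sketch below; the model and dataset names are placeholders (any (anchor, positive) pair dataset works), and the use_multiple_gpus flag is renamed in PATCH 6/6.

    from datasets import load_dataset
    from sentence_transformers import SentenceTransformer
    from sentence_transformers.util import mine_hard_negatives

    if __name__ == "__main__":
        model = SentenceTransformer("all-MiniLM-L6-v2")  # placeholder model
        dataset = load_dataset("sentence-transformers/natural-questions", split="train")

        dataset = mine_hard_negatives(
            dataset,
            model,
            num_negatives=5,         # mine 5 negatives per (anchor, positive) pair
            use_faiss=True,          # recommended for large corpora
            use_multiple_gpus=True,  # opt in to multi-GPU encoding, per this series
        )
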
From 6e024bbbc82ef792abf41b592cb8c2e5b6ecd555 Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Thu, 17 Oct 2024 18:06:06 +0200
Subject: [PATCH 5/6] Run 'pre-commit run --all'

---
 sentence_transformers/util.py | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/sentence_transformers/util.py b/sentence_transformers/util.py
index d1c04f67c..0186c53e1 100644
--- a/sentence_transformers/util.py
+++ b/sentence_transformers/util.py
@@ -536,7 +536,7 @@ def mine_hard_negatives(
     faiss_batch_size: int = 16384,
     use_faiss: bool = False,
     verbose: bool = True,
-    use_multiple_gpus = False,
+    use_multiple_gpus=False,
 ) -> Dataset:
     """
     Add hard negatives to a dataset of (anchor, positive) pairs to create (anchor, positive, negative) triplets or
@@ -735,16 +735,22 @@ def mine_hard_negatives(
             pool = model.start_multi_process_pool()
 
             corpus_embeddings = model.encode_multi_process(
-                corpus, pool, batch_size=batch_size, normalize_embeddings=True, show_progress_bar=True)
+                corpus, pool, batch_size=batch_size, normalize_embeddings=True, show_progress_bar=True
+            )
 
             query_embeddings = model.encode_multi_process(
-                queries, pool, batch_size=batch_size, normalize_embeddings=True, show_progress_bar=True)
+                queries, pool, batch_size=batch_size, normalize_embeddings=True, show_progress_bar=True
+            )
         else:
             corpus_embeddings = model.encode(
                 corpus, batch_size=batch_size, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True
             )
             query_embeddings = model.encode(
-                queries, batch_size=batch_size, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True
+                queries,
+                batch_size=batch_size,
+                normalize_embeddings=True,
+                convert_to_numpy=True,
+                show_progress_bar=True,
             )
 
         index.add(corpus_embeddings)
@@ -766,16 +772,22 @@ def mine_hard_negatives(
             pool = model.start_multi_process_pool()
 
             corpus_embeddings = model.encode_multi_process(
-                corpus, pool, batch_size=batch_size, normalize_embeddings=True, show_progress_bar=True)
+                corpus, pool, batch_size=batch_size, normalize_embeddings=True, show_progress_bar=True
+            )
 
             query_embeddings = model.encode_multi_process(
-                queries, pool, batch_size=batch_size, normalize_embeddings=True, show_progress_bar=True)
+                queries, pool, batch_size=batch_size, normalize_embeddings=True, show_progress_bar=True
+            )
         else:
             corpus_embeddings = model.encode(
                 corpus, batch_size=batch_size, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True
             )
             query_embeddings = model.encode(
-                queries, batch_size=batch_size, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True
+                queries,
+                batch_size=batch_size,
+                normalize_embeddings=True,
+                convert_to_numpy=True,
+                show_progress_bar=True,
             )
 
         scores = model.similarity(query_embeddings, corpus_embeddings).to(device)
From b161e5e911e4822149cf5fb1e43609d947f06206 Mon Sep 17 00:00:00 2001
From: Tom Aarsen
Date: Tue, 26 Nov 2024 16:01:06 +0100
Subject: [PATCH 6/6] Rename use_multiple_gpus to use_multi_process

Because this also supports multiple CPUs, and multi_process is what the
underlying code is called. Also stop the pool afterwards again, and
remove the code duplication.

---
 sentence_transformers/util.py | 74 ++++++++++++++---------------------
 1 file changed, 29 insertions(+), 45 deletions(-)

diff --git a/sentence_transformers/util.py b/sentence_transformers/util.py
index 0186c53e1..0cee7f0dc 100644
--- a/sentence_transformers/util.py
+++ b/sentence_transformers/util.py
@@ -535,8 +535,8 @@ def mine_hard_negatives(
     batch_size: int = 32,
     faiss_batch_size: int = 16384,
     use_faiss: bool = False,
+    use_multi_process: list[str] | bool = False,
     verbose: bool = True,
-    use_multiple_gpus=False,
 ) -> Dataset:
     """
     Add hard negatives to a dataset of (anchor, positive) pairs to create (anchor, positive, negative) triplets or
@@ -644,6 +644,9 @@ def mine_hard_negatives(
         batch_size (int): Batch size for encoding the dataset. Defaults to 32.
         faiss_batch_size (int): Batch size for FAISS top-k search. Defaults to 16384.
         use_faiss (bool): Whether to use FAISS for similarity search. May be recommended for large datasets. Defaults to False.
+        use_multi_process (bool | List[str], optional): Whether to use multi-GPU/CPU processing. If True, uses all GPUs if CUDA
+            is available, and 4 CPU processes if it's not available. You can also pass a list of PyTorch devices like
+            ["cuda:0", "cuda:1", ...] or ["cpu", "cpu", "cpu", "cpu"].
         verbose (bool): Whether to print statistics and logging. Defaults to True.
 
     Returns:
@@ -718,6 +721,30 @@ def mine_hard_negatives(
         avg_positives_per_query = np.mean(positives_per_query)
         print(f"Found an average of {avg_positives_per_query:.3f} positives per query.")
 
+    # Embed the corpus and the queries
+    if use_multi_process:
+        pool = model.start_multi_process_pool(
+            target_devices=None if isinstance(use_multi_process, bool) else use_multi_process
+        )
+        corpus_embeddings = model.encode_multi_process(
+            corpus, pool, batch_size=batch_size, normalize_embeddings=True, show_progress_bar=True
+        )
+        query_embeddings = model.encode_multi_process(
+            queries, pool, batch_size=batch_size, normalize_embeddings=True, show_progress_bar=True
+        )
+        model.stop_multi_process_pool(pool)
+    else:
+        corpus_embeddings = model.encode(
+            corpus, batch_size=batch_size, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True
+        )
+        query_embeddings = model.encode(
+            queries,
+            batch_size=batch_size,
+            normalize_embeddings=True,
+            convert_to_numpy=True,
+            show_progress_bar=True,
+        )
+
     if use_faiss:
         import faiss
 
@@ -731,27 +758,6 @@ def mine_hard_negatives(
         except Exception:
             pass
 
-        if use_multiple_gpus:
-            pool = model.start_multi_process_pool()
-
-            corpus_embeddings = model.encode_multi_process(
-                corpus, pool, batch_size=batch_size, normalize_embeddings=True, show_progress_bar=True
-            )
-
-            query_embeddings = model.encode_multi_process(
-                queries, pool, batch_size=batch_size, normalize_embeddings=True, show_progress_bar=True
-            )
-        else:
-            corpus_embeddings = model.encode(
-                corpus, batch_size=batch_size, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True
-            )
-            query_embeddings = model.encode(
-                queries,
-                batch_size=batch_size,
-                normalize_embeddings=True,
-                convert_to_numpy=True,
-                show_progress_bar=True,
-            )
         index.add(corpus_embeddings)
 
         scores_list = []
@@ -766,29 +772,7 @@ def mine_hard_negatives(
         indices = torch.from_numpy(np.concatenate(indices_list, axis=0)).to(device)
 
     else:
-        # Embed the corpus and the queries
-
-        if use_multiple_gpus:
-            pool = model.start_multi_process_pool()
-
-            corpus_embeddings = model.encode_multi_process(
-                corpus, pool, batch_size=batch_size, normalize_embeddings=True, show_progress_bar=True
-            )
-
-            query_embeddings = model.encode_multi_process(
-                queries, pool, batch_size=batch_size, normalize_embeddings=True, show_progress_bar=True
-            )
-        else:
-            corpus_embeddings = model.encode(
-                corpus, batch_size=batch_size, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True
-            )
-            query_embeddings = model.encode(
-                queries,
-                batch_size=batch_size,
-                normalize_embeddings=True,
-                convert_to_numpy=True,
-                show_progress_bar=True,
-            )
+        # Compute the similarity scores between the queries and the corpus
         scores = model.similarity(query_embeddings, corpus_embeddings).to(device)
 
         # Keep only the range_max + max_positives highest scores. We offset by 1 to potentially include the positive pair
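
With the series complete, mine_hard_negatives embeds the queries and corpus once for both the FAISS and non-FAISS branches, and it starts and stops the multi-process pool internally. A final usage sketch against the finished API follows; the model and dataset names are placeholders, and the two-GPU device list assumes such devices exist.

    from datasets import load_dataset
    from sentence_transformers import SentenceTransformer
    from sentence_transformers.util import mine_hard_negatives

    if __name__ == "__main__":
        model = SentenceTransformer("all-MiniLM-L6-v2")  # placeholder model
        dataset = load_dataset("sentence-transformers/natural-questions", split="train")

        # use_multi_process=True would use all GPUs (or 4 CPU workers without CUDA);
        # an explicit PyTorch device list also works:
        dataset = mine_hard_negatives(
            dataset,
            model,
            use_multi_process=["cuda:0", "cuda:1"],  # assumes two CUDA devices
            use_faiss=True,
        )
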