From 427bf944f74a303f829f6bf7015807d9617e0805 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Wed, 20 Nov 2024 12:45:30 -0600 Subject: [PATCH 1/7] fix(api): Added score field to V4 Search API results Fixes: #4312 --- cl/lib/test_helpers.py | 1 + cl/search/api_serializers.py | 54 ++++++++++++++++++++++++++---------- cl/search/api_utils.py | 2 ++ cl/tests/cases.py | 23 +++++++++++---- 4 files changed, 61 insertions(+), 19 deletions(-) diff --git a/cl/lib/test_helpers.py b/cl/lib/test_helpers.py index 6dfd2fccea..c795f99d9e 100644 --- a/cl/lib/test_helpers.py +++ b/cl/lib/test_helpers.py @@ -476,6 +476,7 @@ def midnight_pt_test(d: datetime.date) -> datetime.datetime: "timestamp": lambda x: x["result"] .date_created.isoformat() .replace("+00:00", "Z"), + "score": lambda x: {"bm25": None}, } v4_recap_meta_keys = v4_meta_keys.copy() diff --git a/cl/search/api_serializers.py b/cl/search/api_serializers.py index 1f9cbb7d75..52d0bd6cd4 100644 --- a/cl/search/api_serializers.py +++ b/cl/search/api_serializers.py @@ -435,7 +435,11 @@ class Meta: ) -class MetaDataSerializer(serializers.Serializer): +class ScoreDataSerializer(serializers.Serializer): + bm25 = serializers.FloatField(read_only=True, source="bm25_score") + + +class BaseMetaDataSerializer(serializers.Serializer): """The metadata serializer V4 Search API.""" timestamp = TimeStampField(read_only=True, default_timezone=timezone.utc) @@ -444,7 +448,15 @@ class MetaDataSerializer(serializers.Serializer): ) -class RECAPMetaDataSerializer(MetaDataSerializer): +class MainDocumentMetaDataSerializer(BaseMetaDataSerializer): + """The metadata serializer V4 Search API for main documents. + Includes the score field. + """ + + score = ScoreDataSerializer(source="*", read_only=True) + + +class RECAPMetaDataSerializer(MainDocumentMetaDataSerializer): """The metadata serializer for the RECAP search type includes the additional more_docs field. """ @@ -454,10 +466,10 @@ class RECAPMetaDataSerializer(MetaDataSerializer): ) -class MetaMixin(serializers.Serializer): - """Mixin to add nested metadata serializer.""" +class MainMetaMixin(serializers.Serializer): + """Mixin to add nested metadata serializer for main documents.""" - meta = MetaDataSerializer(source="*", read_only=True) + meta = MainDocumentMetaDataSerializer(source="*", read_only=True) class RECAPMetaMixin(serializers.Serializer): @@ -466,7 +478,13 @@ class RECAPMetaMixin(serializers.Serializer): meta = RECAPMetaDataSerializer(source="*", read_only=True) -class BaseRECAPDocumentESResultSerializer(MetaMixin, DocumentSerializer): +class ChildMetaMixin(serializers.Serializer): + """Mixin to add nested metadata serializer for child documents.""" + + meta = BaseMetaDataSerializer(source="*", read_only=True) + + +class BaseRECAPDocumentESResultSerializer(DocumentSerializer): """The base serializer class for RECAP_DOCUMENT search type results.""" # Fields from the RECAPDocument @@ -505,6 +523,12 @@ class Meta: ) +class NestedRECAPDocumentESResultSerializer( + BaseRECAPDocumentESResultSerializer, ChildMetaMixin +): + """Mixin to add nested metadata serializer for nested Recap documents.""" + + class BaseDocketESResultSerializer(DocumentSerializer): """The serializer class for DOCKETS Search type results.""" @@ -541,25 +565,27 @@ class Meta: ) -class RECAPDocumentESResultSerializer(BaseRECAPDocumentESResultSerializer): +class RECAPDocumentESResultSerializer( + BaseRECAPDocumentESResultSerializer, MainMetaMixin +): """The serializer for RECAP_DOCUMENT search type results.""" docket_id = serializers.IntegerField(read_only=True) -class DocketESResultSerializer(MetaMixin, BaseDocketESResultSerializer): +class DocketESResultSerializer(MainMetaMixin, BaseDocketESResultSerializer): """The serializer class for DOCKETS Search type results.""" class RECAPESResultSerializer(RECAPMetaMixin, BaseDocketESResultSerializer): """The serializer class for RECAP search type results.""" - recap_documents = BaseRECAPDocumentESResultSerializer( + recap_documents = NestedRECAPDocumentESResultSerializer( many=True, read_only=True, source="child_docs" ) -class OpinionDocumentESResultSerializer(MetaMixin, DocumentSerializer): +class OpinionDocumentESResultSerializer(ChildMetaMixin, DocumentSerializer): """The serializer for OpinionDocument results.""" snippet = HighlightedField(read_only=True, source="text") @@ -579,7 +605,7 @@ class Meta: ) -class OpinionClusterESResultSerializer(MetaMixin, DocumentSerializer): +class OpinionClusterESResultSerializer(MainMetaMixin, DocumentSerializer): """The serializer for OpinionCluster Search results.""" opinions = OpinionDocumentESResultSerializer( @@ -609,7 +635,7 @@ class Meta: ) -class PositionESResultSerializer(MetaMixin, DocumentSerializer): +class PositionESResultSerializer(ChildMetaMixin, DocumentSerializer): """The serializer for Positions Search results.""" class Meta: @@ -644,7 +670,7 @@ class Meta: ) -class PersonESResultSerializer(MetaMixin, DocumentSerializer): +class PersonESResultSerializer(MainMetaMixin, DocumentSerializer): """The serializer for Person Search results.""" name = HighlightedField(read_only=True) @@ -674,7 +700,7 @@ class Meta: ) -class OAESResultSerializer(MetaMixin, DocumentSerializer): +class OAESResultSerializer(MainMetaMixin, DocumentSerializer): """The serializer for V4 Oral argument results.""" snippet = HighlightedField(read_only=True, source="text") diff --git a/cl/search/api_utils.py b/cl/search/api_utils.py index 122b06f944..a53487ff08 100644 --- a/cl/search/api_utils.py +++ b/cl/search/api_utils.py @@ -486,6 +486,8 @@ def process_results(self, results: Response) -> None: ) ) result["child_docs"] = child_result_objects + # Include the ES main document score as bm25_score. + result["bm25_score"] = result.meta.score if self.reverse: # If doing backward pagination, reverse the results of the current diff --git a/cl/tests/cases.py b/cl/tests/cases.py index 5b0c03e374..d28ce1c4c0 100644 --- a/cl/tests/cases.py +++ b/cl/tests/cases.py @@ -262,11 +262,20 @@ async def _compare_field( meta_expected_value = await sync_to_async(get_meta_expected_value)( content_to_compare ) - self.assertEqual( - meta_value, - meta_expected_value, - f"The field '{meta_field}' does not match.", - ) + if meta_field == "score": + # Special case for the score field. Only confirm the presence of + # keys and avoid comparing values, as they differ in each response. + self.assertEqual( + set(meta_value.keys()), + set(meta_expected_value.keys()), + f"The keys in field '{meta_field}' do not match.", + ) + else: + self.assertEqual( + meta_value, + meta_expected_value, + f"The field '{meta_field}' does not match.", + ) async def _test_api_fields_content( self, @@ -296,6 +305,10 @@ async def _test_api_fields_content( meta_value, ) in child_value.items(): with self.subTest(meta_field=meta_field): + self.assertFalse( + meta_field == "score", + msg="score key should not be present in nested documents", + ) await self._compare_field( meta_field, meta_value, From 0114e211d45f8ca677cb2ab997de838b6b3b3455 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Wed, 20 Nov 2024 13:57:30 -0600 Subject: [PATCH 2/7] fix(webhook): Removed score field from RECAP Search Webhooks --- cl/api/tasks.py | 4 ++-- cl/search/api_serializers.py | 30 ++++++++++++++++++++++-------- cl/tests/cases.py | 10 ++++++++++ 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/cl/api/tasks.py b/cl/api/tasks.py index 22db8820f7..ec1c5971ac 100644 --- a/cl/api/tasks.py +++ b/cl/api/tasks.py @@ -14,7 +14,7 @@ from cl.corpus_importer.api_serializers import DocketEntrySerializer from cl.lib.elasticsearch_utils import merge_highlights_into_result from cl.search.api_serializers import ( - RECAPESResultSerializer, + RECAPESWebhookResultSerializer, V3OAESResultSerializer, ) from cl.search.api_utils import ResultObject @@ -180,7 +180,7 @@ def send_search_alert_webhook_es( meta_hl, result, ) - serialized_results = RECAPESResultSerializer( + serialized_results = RECAPESWebhookResultSerializer( results, many=True ).data case _: diff --git a/cl/search/api_serializers.py b/cl/search/api_serializers.py index 52d0bd6cd4..f27053e95d 100644 --- a/cl/search/api_serializers.py +++ b/cl/search/api_serializers.py @@ -462,20 +462,24 @@ class RECAPMetaDataSerializer(MainDocumentMetaDataSerializer): """ more_docs = serializers.BooleanField( - read_only=True, source="child_remaining" + read_only=True, source="child_remaining", default=False ) -class MainMetaMixin(serializers.Serializer): - """Mixin to add nested metadata serializer for main documents.""" +class RECAPWebhookMetaDataSerializer(BaseMetaDataSerializer): + """The metadata serializer for the RECAP search Webhook that includes the + additional more_docs field without the score field. + """ - meta = MainDocumentMetaDataSerializer(source="*", read_only=True) + more_docs = serializers.BooleanField( + read_only=True, source="child_remaining", default=False + ) -class RECAPMetaMixin(serializers.Serializer): - """Mixin to add nested metadata serializer for the RECAP search type.""" +class MainMetaMixin(serializers.Serializer): + """Mixin to add nested metadata serializer for main documents.""" - meta = RECAPMetaDataSerializer(source="*", read_only=True) + meta = MainDocumentMetaDataSerializer(source="*", read_only=True) class ChildMetaMixin(serializers.Serializer): @@ -577,12 +581,22 @@ class DocketESResultSerializer(MainMetaMixin, BaseDocketESResultSerializer): """The serializer class for DOCKETS Search type results.""" -class RECAPESResultSerializer(RECAPMetaMixin, BaseDocketESResultSerializer): +class RECAPESResultSerializer(BaseDocketESResultSerializer): """The serializer class for RECAP search type results.""" recap_documents = NestedRECAPDocumentESResultSerializer( many=True, read_only=True, source="child_docs" ) + meta = RECAPMetaDataSerializer(source="*", read_only=True) + + +class RECAPESWebhookResultSerializer(BaseDocketESResultSerializer): + """The serializer class for RECAP search Webhooks results.""" + + recap_documents = NestedRECAPDocumentESResultSerializer( + many=True, read_only=True, source="child_docs" + ) + meta = RECAPWebhookMetaDataSerializer(source="*", read_only=True) class OpinionDocumentESResultSerializer(ChildMetaMixin, DocumentSerializer): diff --git a/cl/tests/cases.py b/cl/tests/cases.py index d28ce1c4c0..2f0db20e88 100644 --- a/cl/tests/cases.py +++ b/cl/tests/cases.py @@ -657,6 +657,11 @@ def _assert_webhook_hit_hl( if webhook["payload"]["alert"]["name"] == alert_title: hit = webhook["payload"]["results"][0] if child_field: + self.assertNotIn( + "score", + hit["recap_documents"][0]["meta"], + msg="score shouldn't be present on webhook nested documents", + ) child_field_content = hit["recap_documents"][0][field_name] self.assertIn( hl_expected, @@ -665,6 +670,11 @@ def _assert_webhook_hit_hl( % field_name, ) else: + self.assertNotIn( + "score", + hit["meta"], + msg="score shouldn't be present on webhook main document", + ) parent_field_content = hit[field_name] self.assertIn( hl_expected, From c37e051b52c4ee1f4fd9159b0bffce5d118e1cc3 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Wed, 20 Nov 2024 14:25:23 -0600 Subject: [PATCH 3/7] fix(api): Updated documentation related to score field --- cl/api/templates/search-api-docs-vlatest.html | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/cl/api/templates/search-api-docs-vlatest.html b/cl/api/templates/search-api-docs-vlatest.html index 3c0ae0b083..d2a65537c1 100644 --- a/cl/api/templates/search-api-docs-vlatest.html +++ b/cl/api/templates/search-api-docs-vlatest.html @@ -93,7 +93,10 @@

Basic Usage

"lexisCite": "", "meta": { "timestamp": "2024-06-22T10:26:35.320787Z", - "date_created": "2022-06-26T23:24:18.926040Z" + "date_created": "2022-06-26T23:24:18.926040Z", + "score": { + "bm25": 2.1369965 + } }, "neutralCite": "", "non_participating_judge_ids": [], @@ -249,6 +252,10 @@

Special Notes

This field only displays Opinion text content.

+
  • +

    The meta field in main documents contains the score field, which is currently a hash that includes the bm25 score used by Elasticsearch to rank results. Additional scores may be introduced in the future. +

    +
  • From 924960ca0ff265c3e06b25322f5e6f35df6f2a03 Mon Sep 17 00:00:00 2001 From: flooie <6464529+flooie@users.noreply.github.com> Date: Thu, 21 Nov 2024 17:17:16 +0000 Subject: [PATCH 4/7] Update freelawproject dependencies --- poetry.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index 47de93c49c..d062c4b304 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2320,13 +2320,13 @@ setuptools = "*" [[package]] name = "juriscraper" -version = "2.6.40" +version = "2.6.42" description = "An API to scrape American court websites for metadata." optional = false python-versions = "*" files = [ - {file = "juriscraper-2.6.40-py27-none-any.whl", hash = "sha256:961987e618293545ea227bdf0b90af90a6fa28cafeab939b2633d253392559d8"}, - {file = "juriscraper-2.6.40.tar.gz", hash = "sha256:63a53d5345e5303ba90bd2c6939a31c1c67eba3a2b64a94a91847be502547aea"}, + {file = "juriscraper-2.6.42-py27-none-any.whl", hash = "sha256:010d6578714f3262f16d15bee709872e0584381b93988d0d504bdb774f43b403"}, + {file = "juriscraper-2.6.42.tar.gz", hash = "sha256:293299112201ed217a1eccb05d8cba01aa208e8ed9686d3a6b90c24e752f51fb"}, ] [package.dependencies] From 6da6a4600d0fe4d400bd41b7be103c43357d9616 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Thu, 21 Nov 2024 11:18:24 -0600 Subject: [PATCH 5/7] fix(docs): Tweaked the language regarding the score field in the Search API documentation --- cl/api/templates/search-api-docs-vlatest.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cl/api/templates/search-api-docs-vlatest.html b/cl/api/templates/search-api-docs-vlatest.html index d2a65537c1..19d3c716be 100644 --- a/cl/api/templates/search-api-docs-vlatest.html +++ b/cl/api/templates/search-api-docs-vlatest.html @@ -253,7 +253,7 @@

    Special Notes

  • -

    The meta field in main documents contains the score field, which is currently a hash that includes the bm25 score used by Elasticsearch to rank results. Additional scores may be introduced in the future. +

    The meta field in main documents contains the score field, which is currently a JSON object that includes the bm25 score used by Elasticsearch to rank results. Additional scores may be introduced in the future.

  • From 0924905cfe5f254b5d11cbcc26c334164282b5bb Mon Sep 17 00:00:00 2001 From: grossir <14970769+grossir@users.noreply.github.com> Date: Thu, 21 Nov 2024 21:04:45 +0000 Subject: [PATCH 6/7] Update freelawproject dependencies --- poetry.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index d062c4b304..68f09e1cfb 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2320,13 +2320,13 @@ setuptools = "*" [[package]] name = "juriscraper" -version = "2.6.42" +version = "2.6.43" description = "An API to scrape American court websites for metadata." optional = false python-versions = "*" files = [ - {file = "juriscraper-2.6.42-py27-none-any.whl", hash = "sha256:010d6578714f3262f16d15bee709872e0584381b93988d0d504bdb774f43b403"}, - {file = "juriscraper-2.6.42.tar.gz", hash = "sha256:293299112201ed217a1eccb05d8cba01aa208e8ed9686d3a6b90c24e752f51fb"}, + {file = "juriscraper-2.6.43-py27-none-any.whl", hash = "sha256:c2765e5f0a6563fe4842bf72b13aec2b6feb873dc2350523ff6b5102bdf1f757"}, + {file = "juriscraper-2.6.43.tar.gz", hash = "sha256:99029ab83cbe99673e4598c8e9b30df9e3d21ef98bd78baef9907ab53ad96e10"}, ] [package.dependencies] From de6ab4b03986eee0edce000a492b361d7523430b Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Thu, 21 Nov 2024 16:11:37 -0600 Subject: [PATCH 7/7] fix(search): Fixes o-es-active flag on court_homepage --- cl/opinion_page/views.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cl/opinion_page/views.py b/cl/opinion_page/views.py index 95cbec270d..0eaf0addb9 100644 --- a/cl/opinion_page/views.py +++ b/cl/opinion_page/views.py @@ -151,11 +151,11 @@ async def court_homepage(request: HttpRequest, pk: str) -> HttpResponse: mutable_GET = request.GET.copy() - es_flag_for_oa = await sync_to_async(waffle.flag_is_active)( - request, "oa-es-active" + es_flag_for_o = await sync_to_async(waffle.flag_is_active)( + request, "o-es-active" ) - if not es_flag_for_oa: + if not es_flag_for_o: # Do solr search response = await sync_to_async(do_search)( mutable_GET,