Do not crash when cache file cannot be read

UUDigitalHumanitieslab · Feb 21, 2024 · 5bb6147 · 5bb6147
1 parent 77c0786
commit 5bb6147
Showing 1 changed file with 32 additions and 10 deletions.
diff --git a/backend/search/models.py b/backend/search/models.py
@@ -59,18 +59,40 @@ def _get_cache_path(self) -> pathlib.Path:
         settings.CACHING_DIR.mkdir(exist_ok=True, parents=True)
         return settings.CACHING_DIR / str(self.id)
 
-    def check_results(self) -> bool:
+    def check_cache(self) -> bool:
+        """Check if the cache file is readable and if its contents corresponds
+        to what the database knows about it. If not, the search will have to 
+        be repeated. If search is not yet completed, the cache size is not
+        tested."""
+        cache_path = self._get_cache_path()
         try:
-            self.get_results()
+            with cache_path.open():
+                pass
+        except OSError:
+            return False
+        # Cache file is readable, now check if size is correct
+        if self.search_completed is None:
             return True
-        except Exception:
-            logger.exception('Failed reading results of ComponentSearchQuery: %d', self.pk)
-
-        return False
+        else:
+            actual_size = cache_path.stat().st_size
+            size_equal = actual_size == self.cache_size
+            if not size_equal:
+                logger.warning(
+                    f"Cache size for {self} is {actual_size} while it is supposed to be {self.cache_size}"
+                )
+            return size_equal
 
     def get_results(self) -> ResultSet:
         """Return results as a dict"""
-        results = self._get_cache_path().read_text()
+        cache_path = self._get_cache_path()
+        try:
+            results = cache_path.read_text()
+        except OSError:
+            # Cache file is not present or not readable. This may happen when
+            # results are fetched but searching has not yet started, or if
+            # the cache file was deleted. Go on and return an empty set -
+            # 
+            results = ''
         self.last_accessed = timezone.now()
         # This method may be called from multiple processes while the query is still
         # running. If we save the entire model, we will overwrite the progress
@@ -80,7 +102,7 @@ def get_results(self) -> ResultSet:
         return parse_search_result(results, self.component.slug)
 
     def get_completed_part(self) -> Optional[int]:
-        if self.check_results():
+        if self.check_cache():
             return self.completed_part
         return None
 
@@ -389,7 +411,7 @@ def perform_search(self) -> None:
 
         result_objs = list(result_objs_query)
         # append results that should be complete but can't be read
-        result_objs += [r for r in self.results.filter(search_completed__isnull=False) if not r.check_results()]
+        result_objs += [r for r in self.results.filter(search_completed__isnull=False) if not r.check_cache()]
 
         # loop through the linked ComponentSearchResults.
         # for each component, we have to either run the query (perform_search)
@@ -400,7 +422,7 @@ def perform_search(self) -> None:
             if result_obj.search_completed and not result_obj.errors:
                 # kinda roundabout way to make sure the results are readable before skipping it
                 # make sure the results are accessible, because reading the cache might fail
-                if result_obj.check_results():
+                if result_obj.check_cache():
                     # results are readable, skip the rest of the loop
                     continue
             try: