Search: Added new deduplication method. Fixes #710

2024-12-01 11:41:20 +01:00 · 2014-10-17 12:43:29 +01:00 · 2014-10-17 12:43:29 +01:00 · 7a4651ad64
commit 7a4651ad64
parent b4fdec74ac
5 changed files with 48 additions and 75 deletions
--- a/wagtail/tests/models.py
+++ b/wagtail/tests/models.py
@ -425,6 +425,17 @@ class SearchTest(models.Model, index.Indexed):
    def callable_indexed_field(self):
        return "Callable"

+    @classmethod
+    def get_indexed_objects(cls):
+        indexed_objects = super(SearchTest, cls).get_indexed_objects()
+
+        # Exclude SearchTests that have a SearchTestChild to prevent duplicates
+        if cls is SearchTest:
+            indexed_objects = indexed_objects.exclude(
+                id__in=SearchTestChild.objects.all().values_list('searchtest_ptr_id', flat=True)
+            )
+
+        return indexed_objects

 class SearchTestChild(SearchTest):
    subtitle = models.CharField(max_length=255, null=True, blank=True)
--- a/wagtail/wagtailsearch/backends/base.py
+++ b/wagtail/wagtailsearch/backends/base.py
@ -29,7 +29,7 @@ class BaseSearch(object):
    def add(self, obj):
        return NotImplemented

-    def add_bulk(self, obj_list):
+    def add_bulk(self, model, obj_list):
        return NotImplemented

    def delete(self, obj):
--- a/wagtail/wagtailsearch/backends/db.py
+++ b/wagtail/wagtailsearch/backends/db.py
@ -19,8 +19,8 @@ class DBSearch(BaseSearch):
    def add(self, obj):
        pass # Not needed

-    def add_bulk(self, obj_list):
-        return [] # Not needed
+    def add_bulk(self, model, obj_list):
+        pass # Not needed

    def delete(self, obj):
        pass # Not needed
--- a/wagtail/wagtailsearch/backends/elasticsearch.py
+++ b/wagtail/wagtailsearch/backends/elasticsearch.py
@ -572,42 +572,29 @@ class ElasticSearch(BaseSearch):
        # Add document to index
        self.es.index(self.es_index, mapping.get_document_type(), mapping.get_document(obj), id=mapping.get_document_id(obj))

-    def add_bulk(self, obj_list):
-        # Group all objects by their type
-        type_set = {}
+    def add_bulk(self, model, obj_list):
+        # Get mapping
+        mapping = ElasticSearchMapping(model)
+        doc_type = mapping.get_document_type()
+
+        # Create list of actions
+        actions = []
        for obj in obj_list:
            # Object must be a descendant of Indexed and be a Django model
            if not self.object_can_be_indexed(obj):
                continue

-            # Get mapping
-            mapping = ElasticSearchMapping(obj.__class__)
+            # Create the action
+            action = {
+                '_index': self.es_index,
+                '_type': doc_type,
+                '_id': mapping.get_document_id(obj),
+            }
+            action.update(mapping.get_document(obj))
+            actions.append(action)

-            # Get document type
-            doc_type = mapping.get_document_type()
-
-            # If type is currently not in set, add it
-            if doc_type not in type_set:
-                type_set[doc_type] = []
-
-            # Add document to set
-            type_set[doc_type].append((mapping.get_document_id(obj), mapping.get_document(obj)))
-
-        # Loop through each type and bulk add them
-        for type_name, type_documents in type_set.items():
-            # Get list of actions
-            actions = []
-            for doc_id, doc in type_documents:
-                action = {
-                    '_index': self.es_index,
-                    '_type': type_name,
-                    '_id': doc_id,
-                }
-                action.update(doc)
-                actions.append(action)
-
-            yield type_name, len(type_documents)
-            bulk(self.es, actions)
+        # Run the actions
+        bulk(self.es, actions)

    def delete(self, obj):
        # Object must be a descendant of Indexed and be a Django model
--- a/wagtail/wagtailsearch/management/commands/update_index.py
+++ b/wagtail/wagtailsearch/management/commands/update_index.py
@ -10,43 +10,16 @@ from wagtail.wagtailsearch.backends import get_search_backend

 class Command(BaseCommand):
    def get_object_list(self):
-        # Print info
-        self.stdout.write("Getting object list")
-
        # Get list of indexed models
        indexed_models = [model for model in models.get_models() if issubclass(model, Indexed)]

-        # Object set
-        object_set = {}
+        # Return list of (model, queryset) tuples
+        return [
+            (model, model.get_indexed_objects())
+            for model in indexed_models
+        ]

-        # Add all objects to object set and detect any duplicates
-        # Duplicates are caused when both a model and a derived model are indexed
-        # Eg, if BlogPost inherits from Page and both of these models are indexed
-        # If we were to add all objects from both models into the index, all the BlogPosts will have two entries
-        for model in indexed_models:
-            # Get toplevel content type
-            toplevel_content_type = model.indexed_get_toplevel_content_type()
-
-            # Loop through objects
-            for obj in model.get_indexed_objects():
-                # Get key for this object
-                key = toplevel_content_type + ':' + str(obj.pk)
-
-                # Check if this key already exists
-                if key in object_set:
-                    # Conflict, work out who should get this space
-                    # The object with the longest content type string gets the space
-                    # Eg, "wagtailcore.Page-myapp.BlogPost" kicks out "wagtailcore.Page"
-                    if len(obj.indexed_get_content_type()) > len(object_set[key].indexed_get_content_type()):
-                        # Take the spot
-                        object_set[key] = obj
-                else:
-                    # Space free, take it
-                    object_set[key] = obj
-
-        return indexed_models, object_set.values()
-
-    def update_backend(self, backend_name, models, object_list):
+    def update_backend(self, backend_name, object_list):
        # Print info
        self.stdout.write("Updating backend: " + backend_name)

@ -57,15 +30,17 @@ class Command(BaseCommand):
        self.stdout.write(backend_name + ": Reseting index")
        backend.reset_index()

-        # Add types
-        self.stdout.write(backend_name + ": Adding types")
-        for model in models:
+        for model, queryset in object_list:
+            self.stdout.write(backend_name + ": Indexing model '%s.%s'" % (
+                model._meta.app_label,
+                model.__name__,
+            ))
+
+            # Add type
            backend.add_type(model)

-        # Add objects to index
-        self.stdout.write(backend_name + ": Adding objects")
-        for result in backend.add_bulk(object_list):
-            self.stdout.write(result[0] + ' ' + str(result[1]))
+            # Add objects
+            backend.add_bulk(model, queryset)

        # Refresh index
        self.stdout.write(backend_name + ": Refreshing index")
@ -82,7 +57,7 @@ class Command(BaseCommand):

    def handle(self, **options):
        # Get object list
-        models, object_list = self.get_object_list()
+        object_list = self.get_object_list()

        # Get list of backends to index
        if options['backend_name']:
@ -97,4 +72,4 @@ class Command(BaseCommand):

        # Update backends
        for backend_name in backend_names:
-            self.update_backend(backend_name, models, object_list)
+            self.update_backend(backend_name, object_list)