mirror of
https://github.com/wagtail/wagtail.git
synced 2024-12-01 11:41:20 +01:00
Search: Added new deduplication method. Fixes #710
This commit is contained in:
parent
b4fdec74ac
commit
7a4651ad64
@ -425,6 +425,17 @@ class SearchTest(models.Model, index.Indexed):
|
||||
def callable_indexed_field(self):
|
||||
return "Callable"
|
||||
|
||||
@classmethod
|
||||
def get_indexed_objects(cls):
|
||||
indexed_objects = super(SearchTest, cls).get_indexed_objects()
|
||||
|
||||
# Exclude SearchTests that have a SearchTestChild to prevent duplicates
|
||||
if cls is SearchTest:
|
||||
indexed_objects = indexed_objects.exclude(
|
||||
id__in=SearchTestChild.objects.all().values_list('searchtest_ptr_id', flat=True)
|
||||
)
|
||||
|
||||
return indexed_objects
|
||||
|
||||
class SearchTestChild(SearchTest):
|
||||
subtitle = models.CharField(max_length=255, null=True, blank=True)
|
||||
|
@ -29,7 +29,7 @@ class BaseSearch(object):
|
||||
def add(self, obj):
|
||||
return NotImplemented
|
||||
|
||||
def add_bulk(self, obj_list):
|
||||
def add_bulk(self, model, obj_list):
|
||||
return NotImplemented
|
||||
|
||||
def delete(self, obj):
|
||||
|
@ -19,8 +19,8 @@ class DBSearch(BaseSearch):
|
||||
def add(self, obj):
|
||||
pass # Not needed
|
||||
|
||||
def add_bulk(self, obj_list):
|
||||
return [] # Not needed
|
||||
def add_bulk(self, model, obj_list):
|
||||
pass # Not needed
|
||||
|
||||
def delete(self, obj):
|
||||
pass # Not needed
|
||||
|
@ -572,42 +572,29 @@ class ElasticSearch(BaseSearch):
|
||||
# Add document to index
|
||||
self.es.index(self.es_index, mapping.get_document_type(), mapping.get_document(obj), id=mapping.get_document_id(obj))
|
||||
|
||||
def add_bulk(self, obj_list):
|
||||
# Group all objects by their type
|
||||
type_set = {}
|
||||
def add_bulk(self, model, obj_list):
|
||||
# Get mapping
|
||||
mapping = ElasticSearchMapping(model)
|
||||
doc_type = mapping.get_document_type()
|
||||
|
||||
# Create list of actions
|
||||
actions = []
|
||||
for obj in obj_list:
|
||||
# Object must be a descendant of Indexed and be a Django model
|
||||
if not self.object_can_be_indexed(obj):
|
||||
continue
|
||||
|
||||
# Get mapping
|
||||
mapping = ElasticSearchMapping(obj.__class__)
|
||||
# Create the action
|
||||
action = {
|
||||
'_index': self.es_index,
|
||||
'_type': doc_type,
|
||||
'_id': mapping.get_document_id(obj),
|
||||
}
|
||||
action.update(mapping.get_document(obj))
|
||||
actions.append(action)
|
||||
|
||||
# Get document type
|
||||
doc_type = mapping.get_document_type()
|
||||
|
||||
# If type is currently not in set, add it
|
||||
if doc_type not in type_set:
|
||||
type_set[doc_type] = []
|
||||
|
||||
# Add document to set
|
||||
type_set[doc_type].append((mapping.get_document_id(obj), mapping.get_document(obj)))
|
||||
|
||||
# Loop through each type and bulk add them
|
||||
for type_name, type_documents in type_set.items():
|
||||
# Get list of actions
|
||||
actions = []
|
||||
for doc_id, doc in type_documents:
|
||||
action = {
|
||||
'_index': self.es_index,
|
||||
'_type': type_name,
|
||||
'_id': doc_id,
|
||||
}
|
||||
action.update(doc)
|
||||
actions.append(action)
|
||||
|
||||
yield type_name, len(type_documents)
|
||||
bulk(self.es, actions)
|
||||
# Run the actions
|
||||
bulk(self.es, actions)
|
||||
|
||||
def delete(self, obj):
|
||||
# Object must be a descendant of Indexed and be a Django model
|
||||
|
@ -10,43 +10,16 @@ from wagtail.wagtailsearch.backends import get_search_backend
|
||||
|
||||
class Command(BaseCommand):
|
||||
def get_object_list(self):
|
||||
# Print info
|
||||
self.stdout.write("Getting object list")
|
||||
|
||||
# Get list of indexed models
|
||||
indexed_models = [model for model in models.get_models() if issubclass(model, Indexed)]
|
||||
|
||||
# Object set
|
||||
object_set = {}
|
||||
# Return list of (model, queryset) tuples, one per indexed model class
|
||||
return [
|
||||
(model, model.get_indexed_objects())
|
||||
for model in indexed_models
|
||||
]
|
||||
|
||||
# Add all objects to object set and detect any duplicates
|
||||
# Duplicates are caused when both a model and a derived model are indexed
|
||||
# Eg, if BlogPost inherits from Page and both of these models are indexed
|
||||
# If we were to add all objects from both models into the index, all the BlogPosts will have two entries
|
||||
for model in indexed_models:
|
||||
# Get toplevel content type
|
||||
toplevel_content_type = model.indexed_get_toplevel_content_type()
|
||||
|
||||
# Loop through objects
|
||||
for obj in model.get_indexed_objects():
|
||||
# Get key for this object
|
||||
key = toplevel_content_type + ':' + str(obj.pk)
|
||||
|
||||
# Check if this key already exists
|
||||
if key in object_set:
|
||||
# Conflict, work out who should get this space
|
||||
# The object with the longest content type string gets the space
|
||||
# Eg, "wagtailcore.Page-myapp.BlogPost" kicks out "wagtailcore.Page"
|
||||
if len(obj.indexed_get_content_type()) > len(object_set[key].indexed_get_content_type()):
|
||||
# Take the spot
|
||||
object_set[key] = obj
|
||||
else:
|
||||
# Space free, take it
|
||||
object_set[key] = obj
|
||||
|
||||
return indexed_models, object_set.values()
|
||||
|
||||
def update_backend(self, backend_name, models, object_list):
|
||||
def update_backend(self, backend_name, object_list):
|
||||
# Print info
|
||||
self.stdout.write("Updating backend: " + backend_name)
|
||||
|
||||
@ -57,15 +30,17 @@ class Command(BaseCommand):
|
||||
self.stdout.write(backend_name + ": Reseting index")
|
||||
backend.reset_index()
|
||||
|
||||
# Add types
|
||||
self.stdout.write(backend_name + ": Adding types")
|
||||
for model in models:
|
||||
for model, queryset in object_list:
|
||||
self.stdout.write(backend_name + ": Indexing model '%s.%s'" % (
|
||||
model._meta.app_label,
|
||||
model.__name__,
|
||||
))
|
||||
|
||||
# Add type
|
||||
backend.add_type(model)
|
||||
|
||||
# Add objects to index
|
||||
self.stdout.write(backend_name + ": Adding objects")
|
||||
for result in backend.add_bulk(object_list):
|
||||
self.stdout.write(result[0] + ' ' + str(result[1]))
|
||||
# Add objects
|
||||
backend.add_bulk(model, queryset)
|
||||
|
||||
# Refresh index
|
||||
self.stdout.write(backend_name + ": Refreshing index")
|
||||
@ -82,7 +57,7 @@ class Command(BaseCommand):
|
||||
|
||||
def handle(self, **options):
|
||||
# Get object list
|
||||
models, object_list = self.get_object_list()
|
||||
object_list = self.get_object_list()
|
||||
|
||||
# Get list of backends to index
|
||||
if options['backend_name']:
|
||||
@ -97,4 +72,4 @@ class Command(BaseCommand):
|
||||
|
||||
# Update backends
|
||||
for backend_name in backend_names:
|
||||
self.update_backend(backend_name, models, object_list)
|
||||
self.update_backend(backend_name, object_list)
|
||||
|
Loading…
Reference in New Issue
Block a user