0
0
mirror of https://github.com/wagtail/wagtail.git synced 2024-12-01 11:41:20 +01:00

Search: Added new deduplication method Fixes #710

This commit is contained in:
Karl Hobley 2014-10-17 12:43:29 +01:00
parent b4fdec74ac
commit 7a4651ad64
5 changed files with 48 additions and 75 deletions

View File

@ -425,6 +425,17 @@ class SearchTest(models.Model, index.Indexed):
def callable_indexed_field(self):
    """Return the constant string ``"Callable"``."""
    return "Callable"
@classmethod
def get_indexed_objects(cls):
    """Return the queryset of objects that should be indexed for ``cls``.

    Starts from the parent implementation's queryset. When called on
    ``SearchTest`` itself (not a subclass), rows that also have a
    ``SearchTestChild`` are excluded to prevent duplicates.
    """
    queryset = super(SearchTest, cls).get_indexed_objects()

    # Subclasses use the inherited queryset untouched.
    if cls is not SearchTest:
        return queryset

    # Primary keys of SearchTest rows that are also SearchTestChild rows.
    child_parent_ids = SearchTestChild.objects.all().values_list(
        'searchtest_ptr_id', flat=True
    )
    return queryset.exclude(id__in=child_parent_ids)
class SearchTestChild(SearchTest):
subtitle = models.CharField(max_length=255, null=True, blank=True)

View File

@ -29,7 +29,7 @@ class BaseSearch(object):
def add(self, obj):
    """Base implementation: indexes nothing and returns ``NotImplemented``."""
    return NotImplemented
def add_bulk(self, obj_list):
def add_bulk(self, model, obj_list):
    """Base implementation: indexes nothing and returns ``NotImplemented``.

    Takes the model together with the list of objects to add.
    """
    return NotImplemented
def delete(self, obj):

View File

@ -19,8 +19,8 @@ class DBSearch(BaseSearch):
def add(self, obj):
    """No-op: adding an object is not needed for this backend."""
    return None  # Not needed
def add_bulk(self, obj_list):
return [] # Not needed
def add_bulk(self, model, obj_list):
    """No-op: bulk-adding objects is not needed for this backend."""
    return None  # Not needed
def delete(self, obj):
    """No-op: deleting an object is not needed for this backend."""
    return None  # Not needed

View File

@ -572,42 +572,29 @@ class ElasticSearch(BaseSearch):
# Add document to index
self.es.index(self.es_index, mapping.get_document_type(), mapping.get_document(obj), id=mapping.get_document_id(obj))
def add_bulk(self, obj_list):
# Group all objects by their type
type_set = {}
def add_bulk(self, model, obj_list):
# Get mapping
mapping = ElasticSearchMapping(model)
doc_type = mapping.get_document_type()
# Create list of actions
actions = []
for obj in obj_list:
# Object must be a descendant of Indexed and be a django model
if not self.object_can_be_indexed(obj):
continue
# Get mapping
mapping = ElasticSearchMapping(obj.__class__)
# Create the action
action = {
'_index': self.es_index,
'_type': doc_type,
'_id': mapping.get_document_id(obj),
}
action.update(mapping.get_document(obj))
actions.append(action)
# Get document type
doc_type = mapping.get_document_type()
# If type is currently not in set, add it
if doc_type not in type_set:
type_set[doc_type] = []
# Add document to set
type_set[doc_type].append((mapping.get_document_id(obj), mapping.get_document(obj)))
# Loop through each type and bulk add them
for type_name, type_documents in type_set.items():
# Get list of actions
actions = []
for doc_id, doc in type_documents:
action = {
'_index': self.es_index,
'_type': type_name,
'_id': doc_id,
}
action.update(doc)
actions.append(action)
yield type_name, len(type_documents)
bulk(self.es, actions)
# Run the actions
bulk(self.es, actions)
def delete(self, obj):
# Object must be a descendant of Indexed and be a django model

View File

@ -10,43 +10,16 @@ from wagtail.wagtailsearch.backends import get_search_backend
class Command(BaseCommand):
def get_object_list(self):
# Print info
self.stdout.write("Getting object list")
# Get list of indexed models
indexed_models = [model for model in models.get_models() if issubclass(model, Indexed)]
# Object set
object_set = {}
# Return list of (model, queryset) tuples
return [
(model, model.get_indexed_objects())
for model in indexed_models
]
# Add all objects to object set and detect any duplicates
# Duplicates are caused when both a model and a derived model are indexed
# Eg, if BlogPost inherits from Page and both of these models are indexed
# If we were to add all objects from both models into the index, all the BlogPosts will have two entries
for model in indexed_models:
# Get toplevel content type
toplevel_content_type = model.indexed_get_toplevel_content_type()
# Loop through objects
for obj in model.get_indexed_objects():
# Get key for this object
key = toplevel_content_type + ':' + str(obj.pk)
# Check if this key already exists
if key in object_set:
# Conflict, work out who should get this space
# The object with the longest content type string gets the space
# Eg, "wagtailcore.Page-myapp.BlogPost" kicks out "wagtailcore.Page"
if len(obj.indexed_get_content_type()) > len(object_set[key].indexed_get_content_type()):
# Take the spot
object_set[key] = obj
else:
# Space free, take it
object_set[key] = obj
return indexed_models, object_set.values()
def update_backend(self, backend_name, models, object_list):
def update_backend(self, backend_name, object_list):
# Print info
self.stdout.write("Updating backend: " + backend_name)
@ -57,15 +30,17 @@ class Command(BaseCommand):
self.stdout.write(backend_name + ": Reseting index")
backend.reset_index()
# Add types
self.stdout.write(backend_name + ": Adding types")
for model in models:
for model, queryset in object_list:
self.stdout.write(backend_name + ": Indexing model '%s.%s'" % (
model._meta.app_label,
model.__name__,
))
# Add type
backend.add_type(model)
# Add objects to index
self.stdout.write(backend_name + ": Adding objects")
for result in backend.add_bulk(object_list):
self.stdout.write(result[0] + ' ' + str(result[1]))
# Add objects
backend.add_bulk(model, queryset)
# Refresh index
self.stdout.write(backend_name + ": Refreshing index")
@ -82,7 +57,7 @@ class Command(BaseCommand):
def handle(self, **options):
# Get object list
models, object_list = self.get_object_list()
object_list = self.get_object_list()
# Get list of backends to index
if options['backend_name']:
@ -97,4 +72,4 @@ class Command(BaseCommand):
# Update backends
for backend_name in backend_names:
self.update_backend(backend_name, models, object_list)
self.update_backend(backend_name, object_list)