0
0
mirror of https://github.com/wagtail/wagtail.git synced 2024-12-01 11:41:20 +01:00

Don't use edgengram as query analyser

When a field uses partial matching, the "edgengram_analyzer" is added
to that field.

This breaks down the field data into "ngrams" like so:
Hello -> "H", "He", "Hel", "Hell", "Hello"

This allows a user's query for "Hel" to match the above text.

The issue that this commit solves is that this was accidentally set as
both the index analyser (as described above) and also the query
analyser.

Setting this as the query analyser instructs Elasticsearch to perform
the above transformation on the user's search input as well. So if, for
example, there was a document with the word "Horse" in it, a user's
query for "Hello" would match it simply because both words start with
the letter "H".

The solution is to simply set the "index_analyzer" mapping key instead of
the "analyzer" key (which also sets the query analyser).
This commit is contained in:
Karl Hobley 2014-10-30 16:45:52 +00:00
parent 869a16f5fa
commit 7af321a2e9
2 changed files with 35 additions and 3 deletions

View File

@ -62,7 +62,7 @@ class ElasticSearchMapping(object):
mapping['boost'] = field.boost mapping['boost'] = field.boost
if field.partial_match: if field.partial_match:
mapping['analyzer'] = 'edgengram_analyzer' mapping['index_analyzer'] = 'edgengram_analyzer'
mapping['include_in_all'] = True mapping['include_in_all'] = True
elif isinstance(field, FilterField): elif isinstance(field, FilterField):
@ -80,7 +80,7 @@ class ElasticSearchMapping(object):
fields = { fields = {
'pk': dict(type='string', index='not_analyzed', store='yes', include_in_all=False), 'pk': dict(type='string', index='not_analyzed', store='yes', include_in_all=False),
'content_type': dict(type='string', index='not_analyzed', include_in_all=False), 'content_type': dict(type='string', index='not_analyzed', include_in_all=False),
'_partials': dict(type='string', analyzer='edgengram_analyzer', include_in_all=False), '_partials': dict(type='string', index_analyzer='edgengram_analyzer', include_in_all=False),
} }
fields.update(dict( fields.update(dict(

View File

@ -89,7 +89,7 @@ class TestElasticSearchBackend(BackendTests, TestCase):
# Add some test data # Add some test data
obj = models.SearchTest() obj = models.SearchTest()
obj.title = "Ĥéø" obj.title = "Ĥéllø"
obj.live = True obj.live = True
obj.save() obj.save()
self.backend.add(obj) self.backend.add(obj)
@ -103,6 +103,38 @@ class TestElasticSearchBackend(BackendTests, TestCase):
self.assertEqual(len(results), 1) self.assertEqual(len(results), 1)
self.assertEqual(results[0].id, obj.id) self.assertEqual(results[0].id, obj.id)
def test_query_analyser(self):
    """
    Check that a field indexed with edgengram_analyzer is not also
    analysed with it at query time.
    """
    backend = self.backend

    # Start from an empty index that knows about both test models
    backend.reset_index()
    for model in (models.SearchTest, models.SearchTestChild):
        backend.add_type(model)

    # Index a single live object titled "Hello"
    hello = models.SearchTest()
    hello.title = "Hello"
    hello.live = True
    hello.save()
    backend.add(hello)

    # Make the newly added document visible to searches
    backend.refresh_index()

    # "Hello" must match the object. "Horse" shares only the leading "H"
    # with it, which must not be enough to count as a match.
    expected_hits = (("Hello", 1), ("Horse", 0))
    for query, hit_count in expected_hits:
        results = backend.search(query, models.SearchTest.objects.all())
        self.assertEqual(len(results), hit_count)
class TestElasticSearchQuery(TestCase): class TestElasticSearchQuery(TestCase):
def assertDictEqual(self, a, b): def assertDictEqual(self, a, b):