Don't use edgengram as query analyser

When a field uses partial matching, the edgengram_analyzer is added to that field. This breaks the field data down into "ngrams" like so: Hello -> "H", "He", "Hel", "Hell", "Hello". This allows a user's query for "Hel" to match the above text. The issue that this commit solves is that this analyser was accidentally set as both the index analyser (as described above) and the query analyser. Setting it as the query analyser instructs Elasticsearch to perform the above transformation on the user's input to the search box as well. So if, for example, there was a document with the word "Horse" in it, a user's query for "Hello" would match it simply because both words start with the letter "H". The solution is to set the "index_analyzer" field instead of the "analyzer" field (which sets the query analyser as well).
2024-12-01 11:41:20 +01:00 · 2014-10-30 16:45:52 +00:00 · 2014-10-30 16:45:52 +00:00 · 7af321a2e9
commit 7af321a2e9
parent 869a16f5fa
2 changed files with 35 additions and 3 deletions
--- a/wagtail/wagtailsearch/backends/elasticsearch.py
+++ b/wagtail/wagtailsearch/backends/elasticsearch.py
@ -62,7 +62,7 @@ class ElasticSearchMapping(object):
                mapping['boost'] = field.boost
            if field.partial_match:
-                mapping['analyzer'] = 'edgengram_analyzer'
+                mapping['index_analyzer'] = 'edgengram_analyzer'
            mapping['include_in_all'] = True
        elif isinstance(field, FilterField):
@ -80,7 +80,7 @@ class ElasticSearchMapping(object):
        fields = {
            'pk': dict(type='string', index='not_analyzed', store='yes', include_in_all=False),
            'content_type': dict(type='string', index='not_analyzed', include_in_all=False),
-            '_partials': dict(type='string', analyzer='edgengram_analyzer', include_in_all=False),
+            '_partials': dict(type='string', index_analyzer='edgengram_analyzer', include_in_all=False),
        }
        fields.update(dict(
--- a/wagtail/wagtailsearch/tests/test_elasticsearch_backend.py
+++ b/wagtail/wagtailsearch/tests/test_elasticsearch_backend.py
@ -89,7 +89,7 @@ class TestElasticSearchBackend(BackendTests, TestCase):
        # Add some test data
        obj = models.SearchTest()
-        obj.title = "Ĥéỻø"
+        obj.title = "Ĥéllø"
        obj.live = True
        obj.save()
        self.backend.add(obj)
@ -103,6 +103,38 @@ class TestElasticSearchBackend(BackendTests, TestCase):
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0].id, obj.id)
    def test_query_analyser(self):
        """
        Check that fields which use edgengram_analyzer as their index analyser
        do not also have it as their query analyser.

        A single SearchTest object titled "Hello" is added to a freshly reset
        index. Searching for "Hello" must return exactly one result, while
        searching for "Horse" (which only shares the first letter) must return
        none.
        """
        # Reset the index and register the model types used by this test
        self.backend.reset_index()
        self.backend.add_type(models.SearchTest)
        self.backend.add_type(models.SearchTestChild)
        # Add some test data: one object whose title is "Hello"
        obj = models.SearchTest()
        obj.title = "Hello"
        obj.live = True
        obj.save()
        self.backend.add(obj)
        # Refresh the index before searching
        self.backend.refresh_index()
        # Test search for "Hello"
        results = self.backend.search("Hello", models.SearchTest.objects.all())
        # Should find the result
        self.assertEqual(len(results), 1)
        # Test search for "Horse"
        results = self.backend.search("Horse", models.SearchTest.objects.all())
        # Even though both words start with the letter "H", this should not be considered a match
        self.assertEqual(len(results), 0)
 class TestElasticSearchQuery(TestCase):
    def assertDictEqual(self, a, b):