From bb7753c0ec215b933414b0981ce9b79775d6b854 Mon Sep 17 00:00:00 2001 From: Karl Hobley Date: Tue, 13 Oct 2015 13:42:21 +0100 Subject: [PATCH 1/2] Remove HTML comments from rich text When text is pasted into a rich text block from MS Word, HTML comments are created that contain lots of data that we don't need. This commit changes the whitelister to remove any comment nodes it comes across. --- wagtail/wagtailcore/tests/test_whitelist.py | 5 +++++ wagtail/wagtailcore/whitelist.py | 9 +++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/wagtail/wagtailcore/tests/test_whitelist.py b/wagtail/wagtailcore/tests/test_whitelist.py index ce2ff44b84..26193a4fe8 100644 --- a/wagtail/wagtailcore/tests/test_whitelist.py +++ b/wagtail/wagtailcore/tests/test_whitelist.py @@ -143,3 +143,8 @@ class TestWhitelister(TestCase): string = 'snowman Yorkshire' cleaned_string = Whitelister.clean(string) self.assertEqual(cleaned_string, 'snowman Yorkshire') + + def test_clean_comments(self): + string = 'snowman Yorkshire' + cleaned_string = Whitelister.clean(string) + self.assertEqual(cleaned_string, 'snowman Yorkshire') diff --git a/wagtail/wagtailcore/whitelist.py b/wagtail/wagtailcore/whitelist.py index 3f521275ac..653e0c1241 100644 --- a/wagtail/wagtailcore/whitelist.py +++ b/wagtail/wagtailcore/whitelist.py @@ -3,7 +3,7 @@ A generic HTML whitelisting engine, designed to accommodate subclassing to overr specific rules. 
""" import re -from bs4 import BeautifulSoup, NavigableString, Tag +from bs4 import BeautifulSoup, NavigableString, Tag, Comment ALLOWED_URL_SCHEMES = ['http', 'https', 'ftp', 'mailto', 'tel'] @@ -111,7 +111,12 @@ class Whitelister(object): cls.clean_unknown_node(doc, node) @classmethod - def clean_string_node(cls, doc, str): + def clean_string_node(cls, doc, node): + # Remove comments + if isinstance(node, Comment): + node.extract() + return + # by default, nothing needs to be done to whitelist string nodes pass From bb8894aa6c80ec306280e6b8c646c48c0fee40b2 Mon Sep 17 00:00:00 2001 From: Matt Westcott Date: Wed, 14 Oct 2015 17:07:26 +0100 Subject: [PATCH 2/2] Release note for #1821 --- CHANGELOG.txt | 1 + docs/releases/1.2.rst | 1 + 2 files changed, 2 insertions(+) diff --git a/CHANGELOG.txt b/CHANGELOG.txt index cfc88d71ab..5c1a4c8269 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -22,6 +22,7 @@ Changelog * StreamField blocks are now added automatically, without showing the block types menu, if only one block type exists (Alex Gleason) * Wagtail admin now standardises on a single thumbnail image size, to reduce the overhead of creating multiple renditions * The `first_published_at` and `latest_revision_created_at` fields on page models are now available as filter fields on search queries + * Rich text fields now strip out HTML comments * Fix: Deleting a page permission from the groups admin UI does not immediately submit the form * Fix: Wagtail userbar is shown on pages that do not pass a `page` variable to the template (e.g. 
because they override the `serve` method) * Fix: request.site now set correctly on page preview when the page is not in the default site diff --git a/docs/releases/1.2.rst b/docs/releases/1.2.rst index 3c291ecd44..64d8bbcea1 100644 --- a/docs/releases/1.2.rst +++ b/docs/releases/1.2.rst @@ -54,6 +54,7 @@ Minor features * StreamField blocks are now added automatically, without showing the block types menu, if only one block type exists (Alex Gleason) * The ``first_published_at`` and ``latest_revision_created_at`` fields on page models are now available as filter fields on search queries * Wagtail admin now standardises on a single thumbnail image size, to reduce the overhead of creating multiple renditions + * Rich text fields now strip out HTML comments Bug fixes ~~~~~~~~~