diff --git a/django/utils/html.py b/django/utils/html.py index 094bc6660d..7fda015840 100644 --- a/django/utils/html.py +++ b/django/utils/html.py @@ -13,7 +13,7 @@ LEADING_PUNCTUATION = ['(', '<', '<'] TRAILING_PUNCTUATION = ['.', ',', ')', '>', '\n', '>'] # List of possible strings used for bullets in bulleted lists. -DOTS = ['·', '*', '\xe2\x80\xa2', '•', '•', '•'] +DOTS = [u'·', u'*', u'\u2022', u'•', u'•', u'•'] unencoded_ampersands_re = re.compile(r'&(?!(\w+|#\d+);)') word_split_re = re.compile(r'(\s+)') @@ -180,13 +180,13 @@ def clean_html(text): text = html_gunk_re.sub('', text) # Convert hard-coded bullets into HTML unordered lists. def replace_p_tags(match): - s = match.group().replace('

', '') + s = match.group().replace(u'

', u'') for d in DOTS: - s = s.replace('

%s' % d, '

  • ') + s = s.replace(u'

    %s' % d, u'

  • ') return u'' % s text = hard_coded_bullets_re.sub(replace_p_tags, text) # Remove stuff like "

      

    ", but only if it's at the bottom # of the text. - text = trailing_empty_content_re.sub('', text) + text = trailing_empty_content_re.sub(u'', text) return text clean_html = allow_lazy(clean_html, unicode) diff --git a/tests/regressiontests/utils/html.py b/tests/regressiontests/utils/html.py index 3acb218cd1..d8b9bde8bf 100644 --- a/tests/regressiontests/utils/html.py +++ b/tests/regressiontests/utils/html.py @@ -121,3 +121,15 @@ class TestUtilsHtml(unittest.TestCase): ) for value, output in items: self.check_output(f, value, output) + + def test_clean_html(self): + f = html.clean_html + items = ( + (u'

    I believe in semantic markup!

    ', u'

    I believe in semantic markup!

    '), + (u'I escape & I don\'t target', u'I escape & I don\'t target'), + (u'

    I kill whitespace


     

    ', u'

    I kill whitespace

    '), + # also a regression test for #7267: this used to raise an UnicodeDecodeError + (u'

    * foo

    * bar

    ', u''), + ) + for value, output in items: + self.check_output(f, value, output)