annotate core/html.py @ 955:71a671dab55d

First commit of whitelisting image hosts. This is behind a feature flag courtesy of waffle.
author Brian Neal <bgneal@gmail.com>
date Wed, 03 Jun 2015 21:13:08 -0500
parents 928b97ec55a7
children
rev   line source
bgneal@849 1 """Common HTML related functions"""
bgneal@955 2 from urlparse import urlparse
bgneal@955 3
bgneal@849 4 import bleach
bgneal@955 5 from lxml import etree
bgneal@955 6
bgneal@955 7 from django.conf import settings
bgneal@9 8
bgneal@9 9
# Sanitizing profiles used by clean_html(). Each key is a profile name and
# each value is a 3-tuple of
#     (allowed_tags_list, allowed_attributes_dict, allowed_styles_list)
# whose members are handed to bleach.clean() as its tags, attributes and
# styles arguments respectively.
#
_CLEAN_PROFILES = {
    # Minimal markup for user comments.
    'comments': (
        [
            'a', 'b', 'blockquote', 'br', 'code', 'del', 'em',
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr',
            'i', 'img', 'li', 'ol', 'p', 'pre', 'strong', 'ul',
        ],
        {
            'a': ['href'],
            'img': ['src', 'alt', 'title'],
        },
        [],
    ),
    # Richer markup (tables, definition lists, sized images) for news.
    'news': (
        [
            'a', 'b', 'blockquote', 'br', 'caption', 'center', 'code', 'col',
            'colgroup', 'dd', 'del', 'div', 'dl', 'dt', 'em',
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr',
            'i', 'img', 'ins', 'li', 'ol', 'p', 'pre', 'small', 'strike',
            'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'tfoot', 'th',
            'thead', 'tr', 'tt', 'u', 'ul',
        ],
        {
            'a': ['href'],
            'img': ['src', 'alt', 'title', 'width', 'height'],
        },
        [],
    ),
}
bgneal@849 43
bgneal@849 44
def clean_html(text, profile='comments'):
    """Strip disallowed tags, attributes and comments from an HTML string.

    text is trimmed of surrounding whitespace first; if nothing remains the
    empty result is returned without further processing.

    profile names an entry in _CLEAN_PROFILES that supplies the allowed
    tags, attributes and styles. An unknown profile name raises KeyError.
    """
    stripped = text.strip()
    if stripped:
        allowed_tags, allowed_attrs, allowed_styles = _CLEAN_PROFILES[profile]
        # strip=True removes disallowed markup instead of escaping it.
        return bleach.clean(
            stripped,
            tags=allowed_tags,
            attributes=allowed_attrs,
            styles=allowed_styles,
            strip=True,
            strip_comments=True,
        )
    return stripped
bgneal@955 55
bgneal@955 56
class ImageCheckError(Exception):
    """Raised by image_check() when an image tag fails validation."""
bgneal@955 59
bgneal@955 60
# Default whitelist of image hosts for image_check(); built once at import
# time from settings.USER_IMAGES_SOURCES so membership tests are cheap.
ALLOWED_HOSTS = set(settings.USER_IMAGES_SOURCES)
bgneal@955 62
bgneal@955 63
def image_check(html, allowed_hosts=None):
    """Returns True if all image tags in the given html come from hosts
    specified in the allowed_hosts container using https.

    An image whose src is a relative URL (neither scheme nor hostname) is
    always accepted. Empty or blank html contains no images and so also
    returns True.

    An ImageCheckError is raised if the following problems are detected:
        * the image src is missing altogether (or blank)
        * the image src cannot be parsed as a URL
        * the scheme is missing or not https
        * the hostname is missing
        * the hostname is not in allowed_hosts

    If allowed_hosts is not None, it will be used as the whitelist of allowed
    hosts, even if it is empty (in which case only relative URLs pass). If
    None, the module-level ALLOWED_HOSTS (built from USER_IMAGES_SOURCES in
    settings) will be used as the whitelist.
    """
    # Compare against None explicitly: an empty whitelist supplied by the
    # caller means "allow no remote hosts", not "fall back to the default".
    if allowed_hosts is None:
        allowed_hosts = ALLOWED_HOSTS

    # No markup means no images; lxml cannot build a usable tree from an
    # empty document, so don't ask it to.
    if not html or not html.strip():
        return True

    root = etree.HTML(html)
    if root is None:
        return True

    for img in root.iter('img'):
        # Browsers ignore whitespace surrounding the src value, so it must be
        # removed before parsing; otherwise " http://host/x.png" parses with
        # no scheme or hostname and would be mistaken for a relative URL.
        src = (img.get('src') or '').strip()
        if not src:
            raise ImageCheckError("missing image source")

        try:
            parts = urlparse(src)
        except ValueError:
            # e.g. malformed IPv6 literal in the netloc
            raise ImageCheckError("invalid image source")

        scheme, hostname = parts.scheme, parts.hostname

        if not scheme and not hostname:
            # relative URL is ok
            continue
        if scheme != 'https':
            raise ImageCheckError("image must be accessed via https")
        if not hostname:
            raise ImageCheckError("missing image hostname")
        if hostname not in allowed_hosts:
            raise ImageCheckError("invalid image source")

    return True