annotate core/html.py @ 989:2908859c2fe4

Smilies now use relative links. This is for upcoming switch to SSL. Currently we do not need absolute URLs for smilies. If this changes we can add it later.
author Brian Neal <bgneal@gmail.com>
date Thu, 29 Oct 2015 20:54:34 -0500
parents 9b197dbba34b
children
rev   line source
bgneal@849 1 """Common HTML related functions"""
bgneal@963 2 from urlparse import urlparse
bgneal@963 3
bgneal@849 4 import bleach
bgneal@963 5 from lxml import etree
bgneal@963 6
bgneal@963 7 from django.conf import settings
bgneal@9 8
bgneal@9 9
bgneal@849 10 # Each entry in the _CLEAN_PROFILES dict is a profile name -> 3-tuple pair. The
bgneal@849 11 # tuple consists of (allowed_tags_list, allowed_attributes_dict,
bgneal@849 12 # allowed_styles_list)
bgneal@849 13 #
bgneal@849 14 _CLEAN_PROFILES = {
bgneal@849 15 'comments': (
bgneal@849 16 [
bgneal@849 17 'a', 'b', 'blockquote', 'br', 'code', 'del', 'em',
bgneal@864 18 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr',
bgneal@849 19 'i', 'img', 'li', 'ol', 'p', 'pre', 'strong', 'ul',
bgneal@849 20 ],
bgneal@849 21 {
bgneal@849 22 'a': ['href'],
bgneal@849 23 'img': ['src', 'alt', 'title'],
bgneal@849 24 },
bgneal@849 25 [],
bgneal@849 26 ),
bgneal@849 27 'news': (
bgneal@849 28 [
bgneal@849 29 'a', 'b', 'blockquote', 'br', 'caption', 'center', 'code', 'col',
bgneal@849 30 'colgroup', 'dd', 'del', 'div', 'dl', 'dt', 'em',
bgneal@864 31 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr',
bgneal@849 32 'i', 'img', 'ins', 'li', 'ol', 'p', 'pre', 'small', 'strike',
bgneal@849 33 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'tfoot', 'th',
bgneal@849 34 'thead', 'tr', 'tt', 'u', 'ul',
bgneal@849 35 ],
bgneal@849 36 {
bgneal@849 37 'a': ['href'],
bgneal@849 38 'img': ['src', 'alt', 'title', 'width', 'height'],
bgneal@849 39 },
bgneal@849 40 [],
bgneal@849 41 ),
bgneal@849 42 }
bgneal@849 43
bgneal@849 44
def clean_html(text, profile='comments'):
    """Sanitize `text` using the whitelists of the named cleaning profile.

    Surrounding whitespace is stripped first. If nothing is left, the
    stripped text is returned immediately and the profile is never looked up.
    Otherwise the profile's allowed tags, attributes and styles are passed to
    bleach.clean() with strip=True and strip_comments=True.

    A profile name that is not a key of _CLEAN_PROFILES raises KeyError (only
    when there is non-empty text to clean).
    """
    stripped = text.strip()
    if stripped:
        allowed_tags, allowed_attrs, allowed_styles = _CLEAN_PROFILES[profile]
        return bleach.clean(
            stripped,
            tags=allowed_tags,
            attributes=allowed_attrs,
            styles=allowed_styles,
            strip=True,
            strip_comments=True,
        )

    # Nothing but whitespace was supplied; hand back the empty result.
    return stripped
bgneal@963 55
bgneal@963 56
class ImageCheckError(Exception):
    """Raised by image_check() when an image tag fails validation."""
bgneal@963 59
bgneal@963 60
# Default whitelist of image hosts, built once at import time from the
# USER_IMAGES_SOURCES setting. image_check() falls back to this set when the
# caller does not supply its own allowed_hosts.
ALLOWED_HOSTS = set(settings.USER_IMAGES_SOURCES)
bgneal@963 62
bgneal@963 63
def image_check(html, allowed_hosts=None):
    """Returns True if all image tags in the given html come from hosts
    specified in the allowed_hosts container using https.

    Images whose src has neither a scheme nor a hostname (relative URLs) are
    accepted without consulting the whitelist. Empty or whitespace-only html
    trivially passes.

    An ImageCheckError is raised if the following problems are detected:
      * the image src is missing altogether (or empty)
      * the scheme is missing or not https
      * the hostname is missing
      * the hostname is not in allowed_hosts

    If allowed_hosts is not None, it will be used as the whitelist of allowed
    hosts; an explicitly empty container therefore rejects every absolute
    image URL. If None, the module-level ALLOWED_HOSTS (built from
    USER_IMAGES_SOURCES in settings) will be used as the whitelist.
    """
    html = html.strip()
    if not html:
        return True

    # Compare against None rather than testing truthiness: an empty whitelist
    # supplied by the caller means "no external hosts are allowed" and must
    # not be silently widened to the site-wide default.
    if allowed_hosts is None:
        allowed_hosts = ALLOWED_HOSTS

    root = etree.HTML(html)
    if root is None:
        # NOTE(review): lxml appears to return None when the parser finds no
        # elements in the input (believed to happen for e.g. comment-only
        # markup; not confirmed against the lxml docs). No tree means no
        # <img> tags, so there is nothing to reject.
        return True

    for img in root.iter('img'):
        src = img.get('src')
        if not src:
            raise ImageCheckError("Missing image source")
        r = urlparse(src)

        if not r.scheme and not r.hostname:
            # relative URL is ok
            continue
        # Protocol-relative URLs (//host/path) have a hostname but no scheme
        # and are rejected here along with http and any other scheme.
        if r.scheme != 'https':
            raise ImageCheckError("Image must be accessed via https")
        if not r.hostname:
            raise ImageCheckError("Missing image hostname")
        if r.hostname not in allowed_hosts:
            raise ImageCheckError("Invalid image source")

    return True