bgneal@849: """Common HTML related functions"""
bgneal@963: from urlparse import urlparse
bgneal@963: 
bgneal@849: import bleach
bgneal@963: from lxml import etree
bgneal@963: 
bgneal@963: from django.conf import settings
bgneal@9: 
bgneal@9: 
# Sanitizing profiles used by clean_html().
#
# _CLEAN_PROFILES maps a profile name to a 3-tuple of
# (allowed_tags_list, allowed_attributes_dict, allowed_styles_list).
# clean_html() unpacks the tuple and hands the three items to bleach.clean()
# as its tags, attributes and styles arguments.
#
_COMMENTS_TAGS = [
    'a', 'b', 'blockquote', 'br', 'code', 'del', 'em',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr',
    'i', 'img', 'li', 'ol', 'p', 'pre', 'strong', 'ul',
]

_COMMENTS_ATTRS = {
    'a': ['href'],
    'img': ['src', 'alt', 'title'],
}

# The 'news' profile additionally allows tables, definition lists and a few
# presentational tags, plus explicit image dimensions.
_NEWS_TAGS = [
    'a', 'b', 'blockquote', 'br', 'caption', 'center', 'code', 'col',
    'colgroup', 'dd', 'del', 'div', 'dl', 'dt', 'em',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr',
    'i', 'img', 'ins', 'li', 'ol', 'p', 'pre', 'small', 'strike',
    'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'tfoot', 'th',
    'thead', 'tr', 'tt', 'u', 'ul',
]

_NEWS_ATTRS = {
    'a': ['href'],
    'img': ['src', 'alt', 'title', 'width', 'height'],
}

_CLEAN_PROFILES = {
    'comments': (_COMMENTS_TAGS, _COMMENTS_ATTRS, []),
    'news': (_NEWS_TAGS, _NEWS_ATTRS, []),
}
bgneal@849: 
bgneal@849: 
def clean_html(text, profile='comments'):
    """Sanitize user-supplied HTML with bleach according to a named profile.

    The text is stripped of leading/trailing whitespace first; if nothing is
    left, the (empty) stripped string is returned without invoking bleach.

    Otherwise the tag, attribute and style whitelists registered under
    `profile` in _CLEAN_PROFILES are passed to bleach.clean() together with
    strip=True and strip_comments=True, and bleach's result is returned.

    A KeyError is raised if `profile` is not a key of _CLEAN_PROFILES.
    """
    stripped = text.strip()
    if stripped:
        allowed_tags, allowed_attrs, allowed_styles = _CLEAN_PROFILES[profile]
        return bleach.clean(
            stripped,
            tags=allowed_tags,
            attributes=allowed_attrs,
            styles=allowed_styles,
            strip=True,
            strip_comments=True
        )

    # Nothing but whitespace was supplied; hand back the empty string.
    return stripped
bgneal@963: 
bgneal@963: 
class ImageCheckError(Exception):
    """Raised by image_check() when an image tag fails validation.

    The exception message describes which check the image failed.
    """
    pass
bgneal@963: 
bgneal@963: 
# Default whitelist of image hosts, built once at import time from the
# USER_IMAGES_SOURCES setting. image_check() falls back to this set when the
# caller does not supply its own allowed_hosts.
ALLOWED_HOSTS = set(settings.USER_IMAGES_SOURCES)
bgneal@963: 
bgneal@963: 
def image_check(html, allowed_hosts=None):
    """Returns True if every image tag in the given html is acceptable.

    An image is acceptable when its src is either a relative URL (no scheme
    and no hostname) or an https URL whose hostname is in the allowed_hosts
    container.

    An ImageCheckError is raised if the following problems are detected:
        * the image src is missing altogether
        * the scheme is missing or not https (and the URL is not relative)
        * the hostname is missing
        * the hostname is not in allowed_hosts

    If allowed_hosts is not None, it will be used as the whitelist of allowed
    hosts; an empty container therefore rejects every non-relative image.
    If None, USER_IMAGES_SOURCES from settings will be used as the whitelist.

    Empty or whitespace-only html contains no images and returns True.
    """
    html = html.strip()
    if not html:
        return True

    # Compare against None rather than truthiness: an explicitly empty
    # whitelist must not silently widen to the site-wide default.
    if allowed_hosts is None:
        allowed_hosts = ALLOWED_HOSTS

    root = etree.HTML(html)
    if root is None:
        # NOTE(review): lxml appears to return None when the markup contains
        # no elements at all (e.g. only a comment). There is then nothing to
        # check, so treat it like empty input instead of failing on None.
        return True

    for img in root.iter('img'):
        src = img.get('src')
        if not src:
            raise ImageCheckError("Missing image source")

        # Browsers ignore whitespace around a URL, but urlparse may not:
        # " https://host/x.png" can parse with no scheme and no hostname and
        # would be mistaken for a relative URL. Trim before parsing.
        r = urlparse(src.strip())

        if not r.scheme and not r.hostname:
            # relative URL is ok
            continue
        if r.scheme != 'https':
            raise ImageCheckError("Image must be accessed via https")
        if not r.hostname:
            raise ImageCheckError("Missing image hostname")
        if r.hostname not in allowed_hosts:
            raise ImageCheckError("Invalid image source")

    return True