view core/html.py @ 989:2908859c2fe4

Smilies now use relative links. This is for the upcoming switch to SSL. Currently we do not need absolute URLs for smilies. If this changes we can add it later.
author Brian Neal <bgneal@gmail.com>
date Thu, 29 Oct 2015 20:54:34 -0500
parents 9b197dbba34b
children
line wrap: on
line source
"""Common HTML related functions"""
from urlparse import urlparse

import bleach
from lxml import etree

from django.conf import settings


# Sanitizing profiles used by clean_html(). _CLEAN_PROFILES maps a profile
# name to a 3-tuple of
#   (allowed_tags_list, allowed_attributes_dict, allowed_styles_list)
# which clean_html() unpacks and hands to bleach.clean() as its tags,
# attributes and styles arguments. 'comments' is the default profile of
# clean_html(); 'news' permits a wider set of tags (tables, definition lists,
# div, ...) plus width/height on images. Neither profile lists any allowed
# styles.
#
_CLEAN_PROFILES = {
    'comments': (
        [
            'a', 'b', 'blockquote', 'br', 'code', 'del', 'em',
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr',
            'i', 'img', 'li', 'ol', 'p', 'pre', 'strong', 'ul',
        ],
        {
            'a': ['href'],
            'img': ['src', 'alt', 'title'],
        },
        [],
    ),
    'news': (
        [
            'a', 'b', 'blockquote', 'br', 'caption', 'center', 'code', 'col',
            'colgroup', 'dd', 'del', 'div', 'dl', 'dt', 'em',
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr',
            'i', 'img', 'ins', 'li', 'ol', 'p', 'pre', 'small', 'strike',
            'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'tfoot', 'th',
            'thead', 'tr', 'tt', 'u', 'ul',
        ],
        {
            'a': ['href'],
            'img': ['src', 'alt', 'title', 'width', 'height'],
        },
        [],
    ),
}


def clean_html(text, profile='comments'):
    """Strip disallowed tags, attributes and comments from an HTML fragment.

    text is stripped of surrounding whitespace first; if nothing is left, the
    (empty) stripped string is returned without further processing.

    profile names an entry in _CLEAN_PROFILES that supplies the allowed tags,
    attributes and styles. An unknown profile name raises KeyError.

    Returns the sanitized markup produced by bleach.clean().
    """
    stripped = text.strip()
    if stripped:
        allowed_tags, allowed_attrs, allowed_styles = _CLEAN_PROFILES[profile]
        # strip=True removes disallowed markup rather than escaping it.
        return bleach.clean(
            stripped,
            tags=allowed_tags,
            attributes=allowed_attrs,
            styles=allowed_styles,
            strip=True,
            strip_comments=True,
        )

    return stripped


class ImageCheckError(Exception):
    """Raised by image_check() when an image tag fails validation.

    The exception message describes which check the image failed.
    """
    pass


# Default whitelist of image hostnames, built once at import time from
# settings.USER_IMAGES_SOURCES. image_check() falls back to this set when the
# caller does not supply its own allowed_hosts.
ALLOWED_HOSTS = set(settings.USER_IMAGES_SOURCES)


def image_check(html, allowed_hosts=None):
    """Returns True if every image tag in the given html is acceptable.

    An image is acceptable if its src is a relative URL (no scheme and no
    hostname), or an https URL whose hostname is in the allowed_hosts
    container.

    An ImageCheckError is raised if the following problems are detected:
        * the image src is missing altogether (or blank)
        * the scheme is missing or not https
        * the hostname is missing
        * the hostname is not in allowed_hosts

    If allowed_hosts is not None, it will be used as the whitelist of allowed
    hosts; an empty container therefore rejects every non-relative image. If
    None, USER_IMAGES_SOURCES from settings will be used as the whitelist.
    """
    html = html.strip()
    if not html:
        return True

    # Compare against None (not truthiness) so that an explicitly empty
    # whitelist is honoured instead of silently widening to the site default.
    if allowed_hosts is None:
        allowed_hosts = ALLOWED_HOSTS

    root = etree.HTML(html)
    if root is None:
        # The HTML parser can yield no tree at all when the markup contains no
        # elements (e.g. only a comment); there are then no images to check.
        return True

    for img in root.iter('img'):
        src = img.get('src')
        if src:
            # Normalise src the way browsers do before resolving it, so the
            # checks below see the URL the browser will actually request:
            # surrounding whitespace is ignored, tabs and newlines are
            # dropped, and a backslash is treated as a forward slash.
            # Otherwise values such as " http://host/x", "\\host/x" or
            # "/\t/host/x" would be mistaken for harmless relative URLs.
            src = src.strip()
            for junk in ('\t', '\r', '\n'):
                src = src.replace(junk, '')
            src = src.replace('\\', '/')
        if not src:
            raise ImageCheckError("Missing image source")
        r = urlparse(src)

        if not r.scheme and not r.hostname:
            # relative URL is ok
            continue
        if r.scheme != 'https':
            raise ImageCheckError("Image must be accessed via https")
        if not r.hostname:
            raise ImageCheckError("Missing image hostname")
        if r.hostname not in allowed_hosts:
            raise ImageCheckError("Invalid image source")

    return True