"""Common HTML related functions"""
try:
    from urlparse import urlparse
except ImportError:  # Python 3: urlparse lives in urllib.parse
    from urllib.parse import urlparse

import bleach
from lxml import etree

from django.conf import settings


# Each entry in the _CLEAN_PROFILES dict maps a profile name to a 3-tuple of
# (allowed_tags_list, allowed_attributes_dict, allowed_styles_list).
#
_CLEAN_PROFILES = {
    'comments': (
        [
            'a', 'b', 'blockquote', 'br', 'code', 'del', 'em',
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr',
            'i', 'img', 'li', 'ol', 'p', 'pre', 'strong', 'ul',
        ],
        {
            'a': ['href'],
            'img': ['src', 'alt', 'title'],
        },
        [],
    ),
    'news': (
        [
            'a', 'b', 'blockquote', 'br', 'caption', 'center', 'code', 'col',
            'colgroup', 'dd', 'del', 'div', 'dl', 'dt', 'em',
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr',
            'i', 'img', 'ins', 'li', 'ol', 'p', 'pre', 'small', 'strike',
            'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'tfoot', 'th',
            'thead', 'tr', 'tt', 'u', 'ul',
        ],
        {
            'a': ['href'],
            'img': ['src', 'alt', 'title', 'width', 'height'],
        },
        [],
    ),
}


def clean_html(text, profile='comments'):
    """Cleans HTML of dangerous tags and content.

    The text is stripped of surrounding whitespace first; if nothing is left
    it is returned as-is without invoking bleach.

    profile selects an entry of _CLEAN_PROFILES that supplies the allowed
    tags, attributes and styles handed to bleach.clean(). An unknown profile
    name raises KeyError.
    """
    text = text.strip()
    if not text:
        return text

    tags, attrs, styles = _CLEAN_PROFILES[profile]

    return bleach.clean(text, tags=tags, attributes=attrs, styles=styles,
                        strip=True, strip_comments=True)


class ImageCheckError(Exception):
    """Exception for the image_check() function"""


ALLOWED_HOSTS = set(settings.USER_IMAGES_SOURCES)

# Characters a browser discards from both ends of a URL before parsing it
# (C0 control characters and space).
_URL_STRIP_CHARS = ''.join(chr(c) for c in range(0x21))


def _normalize_src(src):
    """Returns src rewritten the way a browser reads it before URL parsing.

    Browsers drop leading/trailing control characters and spaces, delete
    embedded tabs and newlines, and treat a backslash like a forward slash in
    http(s) URLs. urlparse() does none of this, so without this step a value
    such as ' http://host/x' or '\\\\host/x' would look like a harmless
    relative path while the browser fetches it from a remote host.
    """
    src = src.strip(_URL_STRIP_CHARS)
    for ch in '\t\n\r':
        src = src.replace(ch, '')
    return src.replace('\\', '/')


def image_check(html, allowed_hosts=None):
    """Returns true if all image tags in the given html come from hosts
    specified in the allowed_hosts container using https.

    An ImageCheckError is raised if the following problems are detected:
        * the image src is missing altogether (or blank)
        * the scheme is missing or not https
        * the hostname is missing
        * the hostname is not in allowed_hosts

    Relative image URLs (no scheme and no hostname) are accepted. Html that
    is empty, or that yields no parse tree, contains no images and so also
    returns True.

    If allowed_hosts is not None, it will be used as the whitelist of allowed
    hosts; an empty container therefore allows no remote host at all. If
    None, USER_IMAGES_SOURCES from settings will be used as the whitelist.

    Hostnames are compared as returned by urlparse(), i.e. lowercased, so
    whitelist entries are expected to be lowercase.
    """
    # Only None selects the default; an explicitly empty whitelist must not
    # silently widen to the site-wide one.
    if allowed_hosts is None:
        allowed_hosts = ALLOWED_HOSTS

    # Nothing to parse means no images to reject.
    if not html or not html.strip():
        return True

    root = etree.HTML(html)
    if root is None:
        # The parser produced no tree (e.g. the markup held no elements).
        return True

    for img in root.iter('img'):
        src = img.get('src')
        if src:
            src = _normalize_src(src)
        if not src:
            raise ImageCheckError("missing image source")
        r = urlparse(src)

        if not r.scheme and not r.hostname:
            # relative URL is ok
            continue
        if r.scheme != 'https':
            raise ImageCheckError("image must be accessed via https")
        if not r.hostname:
            raise ImageCheckError("missing image hostname")
        if r.hostname not in allowed_hosts:
            raise ImageCheckError("invalid image source")

    return True