"""
ssl_images is a custom manage.py command to convert forum post and comment
images to https. It does this by rewriting the markup:

    - Images with src = http://surfguitar101.com/something are rewritten to be
      /something.
    - Non SG101 images that use http: are downloaded, resized, and uploaded to
      an S3 bucket. The src attribute is replaced with the new S3 URL.

Results (new URLs and hosts that failed DNS lookup) are cached in a JSON file
so that repeated runs do not download/upload the same image twice.
"""
import base64
import datetime
import json
import logging
from optparse import make_option
import os
import re
import signal
import socket
import urllib
import urlparse
import uuid

from django.core.management.base import NoArgsCommand, CommandError
from django.conf import settings
from lxml import etree
import markdown.inlinepatterns
from PIL import Image

from comments.models import Comment
from forums.models import Post
from core.s3 import S3Bucket


LOGFILE = os.path.join(settings.PROJECT_PATH, 'logs', 'ssl_images.log')
logger = logging.getLogger(__name__)

IMAGE_LINK_RE = re.compile(markdown.inlinepatterns.IMAGE_LINK_RE,
                           re.DOTALL | re.UNICODE)
IMAGE_REF_RE = re.compile(markdown.inlinepatterns.IMAGE_REFERENCE_RE,
                          re.DOTALL | re.UNICODE)

SG101_HOSTS = set(['www.surfguitar101.com', 'surfguitar101.com'])
WHITELIST_HOSTS = set(settings.USER_IMAGES_SOURCES)
MODEL_CHOICES = ['comments', 'posts']

# Maximum (width, height) of an uploaded photo, in pixels.
PHOTO_MAX_SIZE = (660, 720)
PHOTO_BASE_URL = 'https://s3.amazonaws.com/'
PHOTO_BUCKET_NAME = 'sg101.forum.photos'

CACHE_FILENAME = 'ssl_images_cache.json'

# Socket timeout (seconds) used when the --timeout option is not given.
DEFAULT_SOCKET_TIMEOUT = 30

# Module-level state shared by the helper functions below. opener and bucket
# are created in Command.handle_noargs(); url_cache and bad_hosts may be
# replaced by load_cache().
quit_flag = False       # set to True by the SIGINT handler
opener = None           # ImageURLopener used to download images
bucket = None           # S3Bucket that images are uploaded to
url_cache = {}          # original src URL => new URL (or None if it failed)
bad_hosts = set()       # hostnames that failed with socket.gaierror


def signal_handler(signum, frame):
    """SIGINT signal handler; requests a clean stop of the main loop."""
    global quit_flag
    quit_flag = True


def _setup_logging():
    """Send this module's log records (DEBUG and up) to LOGFILE only."""
    logger.setLevel(logging.DEBUG)
    logger.propagate = False
    handler = logging.FileHandler(filename=LOGFILE, encoding='utf-8')
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)


class ImageURLopener(urllib.FancyURLopener):
    """Our URL opener. Handles redirects as per FancyURLopener. But all other
    errors and authentication requests will raise an IOError.
    """
    HANDLED_ERRORS = set([302, 301, 303, 307])

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Defer to URLopener's default handler, skipping FancyURLopener's."""
        return urllib.URLopener.http_error_default(self, url, fp, errcode,
                                                   errmsg, headers)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.

        We let FancyURLopener handle the redirects, but any other error we want
        to let fail.
        """
        if errcode in self.HANDLED_ERRORS:
            name = 'http_error_%d' % errcode
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            if result:
                return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)


def download_image(parsed_url):
    """Downloads the image file from the given source URL.

    parsed_url is the result of urlparse.urlparse() on the image URL.

    If successful returns the path to the downloaded file. Otherwise None is
    returned. A host that fails with socket.gaierror is added to bad_hosts.
    """
    src = parsed_url.geturl()
    logger.info("Retrieving %s", src)
    try:
        fn, hdrs = opener.retrieve(src)
    except IOError as ex:
        args = ex.args if ex.args else []
        if len(args) == 4 and args[0] == 'http error':
            logger.error("http error: %d - %s", args[1], args[2])
        elif len(args) == 2 and isinstance(args[1], socket.gaierror):
            logger.error("gaierror, ignoring host %s", parsed_url.hostname)
            bad_hosts.add(parsed_url.hostname)
        else:
            logger.error("%s", ex)
        return None

    # Does it look like an image?
    content_type = hdrs.get('content-type')
    if not content_type:
        logger.error("No content-type header found")
        return None

    file_size = os.stat(fn).st_size
    logger.info("Retrieved: %s bytes; content-type: %s", file_size, content_type)

    parts = content_type.split('/')
    if len(parts) < 2 or parts[0] != 'image':
        logger.error("Unknown content-type: %s", content_type)
        return None

    return fn


def resize_image(img_path):
    """Resizes the image found at img_path if necessary.

    The image is shrunk in place (the file is overwritten) when either its
    width or its height exceeds the corresponding PHOTO_MAX_SIZE limit.
    Errors from PIL (e.g. IOError for an unreadable image) propagate to the
    caller.
    """
    image = Image.open(img_path)
    width, height = image.size
    max_width, max_height = PHOTO_MAX_SIZE
    # Compare each dimension on its own; comparing the tuples directly is a
    # lexicographic test that ignores the height unless the widths are equal.
    if width > max_width or height > max_height:
        logger.info('Resizing from %s to %s', image.size, PHOTO_MAX_SIZE)
        image.thumbnail(PHOTO_MAX_SIZE, Image.ANTIALIAS)
        image.save(img_path)


def gen_key():
    """Return a random key (URL-safe base64 of a UUID4, padding stripped)."""
    return base64.b64encode(uuid.uuid4().bytes, '-_').rstrip('=')


def upload_image(img_path):
    """Upload image file located at img_path to our S3 bucket.

    Returns the URL of the image in the bucket or None if an error occurs.
    """
    logger.info("upload_image starting")
    # Make a unique name for the image in the bucket
    ext = os.path.splitext(img_path)[1]
    file_key = gen_key() + ext
    try:
        return bucket.upload_from_filename(file_key, img_path, public=True)
    except IOError as ex:
        logger.error("Error uploading file: %s", ex)

    return None


def convert_to_ssl(parsed_url):
    """Top-level function for moving an image to SSL.

    Returns the new URL for the image, or None if the image could not be
    moved. Both outcomes are remembered in url_cache, keyed by the original
    URL, so each URL is only attempted once.
    """
    src = parsed_url.geturl()

    if parsed_url.hostname in bad_hosts:
        logger.info("Host known to be bad, skipping: %s", src)
        return None

    # Check the cache
    try:
        new_url = url_cache[src]
    except KeyError:
        # cache miss, try to get the file
        new_url = save_image_to_cloud(parsed_url)
        url_cache[src] = new_url
    else:
        if new_url:
            logger.info("Found URL in cache: %s => %s", src, new_url)
        else:
            logger.info("URL known to be bad, skipping: %s", src)

    return new_url


def save_image_to_cloud(parsed_url):
    """Downloads an image at a given source URL. Uploads it to cloud storage.

    Returns the new URL or None if unsuccessful.
    """
    fn = download_image(parsed_url)
    try:
        if not fn:
            return None
        try:
            resize_image(fn)
        except IOError as ex:
            # The server claimed an image content-type but PIL could not
            # read or re-save the file; treat it like any other failure.
            logger.error("Error resizing image: %s", ex)
            return None
        return upload_image(fn)
    finally:
        # retrieve() stores downloads in temporary files that the opener
        # keeps track of; remove them now rather than letting them pile up
        # for the whole run.
        opener.cleanup()


def replace_image_markup(match):
    """Regex substitution callback for Markdown image links.

    match is a match object produced by IMAGE_LINK_RE. Group 1 is used as the
    alt text and group 8 as the "src [title]" part of the link (group numbers
    come from markdown.inlinepatterns.IMAGE_LINK_RE).

    Returns replacement markup: an image using the new source if one could be
    determined, otherwise a plain link to the original source.
    """
    src_parts = match.group(8).split()
    if src_parts:
        src = src_parts[0]
        if src[0] == "<" and src[-1] == ">":
            src = src[1:-1]
    else:
        src = ''

    title = ''
    if len(src_parts) > 1:
        title = " ".join(src_parts[1:])
    alt = match.group(1)

    new_src = None
    if src:
        r = urlparse.urlparse(src)
        if r.hostname in SG101_HOSTS:
            new_src = r.path    # convert to relative path
        elif r.scheme == 'http':
            # Try a few things to get this on ssl:
            new_src = convert_to_ssl(r)
        elif r.scheme == 'https':
            if r.hostname in WHITELIST_HOSTS:
                new_src = src   # already in whitelist
            else:
                new_src = convert_to_ssl(r)

    if new_src:
        if title:
            s = u'![{alt}]({src} {title})'.format(alt=alt, src=new_src, title=title)
        else:
            s = u'![{alt}]({src})'.format(alt=alt, src=new_src)
    else:
        # something's messed up, convert to a link using original src
        s = u'[{alt}]({src})'.format(alt=alt, src=src)

    return s


def warn_if_image_refs(text, model_name, pk):
    """Search text for Markdown image reference markup.

    We aren't expecting these, but we will log something if we see any.
    """
    if IMAGE_REF_RE.search(text):
        logger.warning("Image reference found in %s pk = #%d", model_name, pk)


def process_post(text):
    """Process the post object:

    A regex substitution is run on the post's text field. This fixes up image
    links, getting rid of plain old http sources; either converting to https
    or relative style links (if the link is to SG101).

    Returns the (possibly unchanged) text.
    """
    return IMAGE_LINK_RE.sub(replace_image_markup, text)


def html_check(html):
    """Return True if the given HTML fragment has <img> tags with src
    attributes that use http, and False otherwise.
    """
    if not html:
        return False

    root = etree.HTML(html)
    if root is None:
        # lxml returns None when the fragment has no parseable content
        return False

    for img in root.iter('img'):
        src = img.get('src')
        if src and src.lower().startswith('http:'):
            return True
    return False


class Command(NoArgsCommand):
    """manage.py command that rewrites image markup in posts or comments."""
    help = "Rewrite forum posts and comments to not use http for images"
    option_list = NoArgsCommand.option_list + (
        make_option('-m', '--model',
            choices=MODEL_CHOICES,
            help="which model to update; must be one of {{{}}}".format(
                ', '.join(MODEL_CHOICES))),
        make_option('-i', '--i',
            type='int',
            help="optional first slice index; the i in [i:j]"),
        make_option('-j', '--j',
            type='int',
            help="optional second slice index; the j in [i:j]"),
        make_option('-t', '--timeout',
            type='int',
            help="optional socket timeout (secs)"),
    )

    def handle_noargs(self, **options):
        """Run the conversion over the selected model's objects.

        Raises CommandError if --model is missing or the slice indices are
        invalid.
        """
        time_started = datetime.datetime.now()
        _setup_logging()
        logger.info("Starting; arguments received: %s", options)

        if options['model'] not in MODEL_CHOICES:
            raise CommandError('Please choose a --model option')

        if options['model'] == 'comments':
            qs = Comment.objects.all()
            text_attr = 'comment'
            model_name = 'Comment'
        else:
            qs = Post.objects.all()
            text_attr = 'body'
            model_name = 'Post'

        i, j = options['i'], options['j']

        if i is not None and i < 0:
            raise CommandError("-i must be >= 0")
        if j is not None and j < 0:
            raise CommandError("-j must be >= 0")
        if j is not None and i is not None and j <= i:
            raise CommandError("-j must be > -i")

        if i is not None and j is not None:
            qs = qs[i:j]
        elif i is not None and j is None:
            qs = qs[i:]
        elif i is None and j is not None:
            qs = qs[:j]

        # Set global socket timeout. optparse always supplies the 'timeout'
        # key (as None when -t is not given), so a dict.get() default would
        # never be used; test for None explicitly.
        timeout = options.get('timeout')
        if timeout is None:
            timeout = DEFAULT_SOCKET_TIMEOUT
        logger.info("Setting socket timeout to %d", timeout)
        socket.setdefaulttimeout(timeout)

        # Install signal handler for ctrl-c
        signal.signal(signal.SIGINT, signal_handler)

        # Create URL opener to download photos
        global opener
        opener = ImageURLopener()

        # Create bucket to upload photos
        global bucket
        bucket = S3Bucket(access_key=settings.USER_PHOTOS_ACCESS_KEY,
                          secret_key=settings.USER_PHOTOS_SECRET_KEY,
                          base_url=PHOTO_BASE_URL,
                          bucket_name=PHOTO_BUCKET_NAME)

        # Load cached info from previous runs
        load_cache()

        # i is only used from here on to label log messages
        if i is None:
            i = 0

        count = 0
        for n, model in enumerate(qs.iterator()):
            if quit_flag:
                logger.warning("SIGINT received, exiting")
                break
            logger.info("Processing %s #%d (pk = %d)", model_name, n + i, model.pk)
            txt = getattr(model, text_attr)
            warn_if_image_refs(txt, model_name, model.pk)
            new_txt = process_post(txt)
            if txt != new_txt:
                logger.info("Content changed on %s #%d (pk = %d)",
                            model_name, n + i, model.pk)
                logger.debug("original: %s", txt)
                logger.debug("changed:  %s", new_txt)
                setattr(model, text_attr, new_txt)
                model.save()
            elif html_check(model.html):
                # Check for content generated with older smiley code that used
                # absolute URLs for the smiley images. If True, then just save
                # the model again to force updated HTML to be created.
                logger.info("Older Smiley HTML detected, forcing a save")
                model.save()
            count += 1

        time_finished = datetime.datetime.now()
        elapsed = time_finished - time_started
        logger.info("ssl_images exiting; number of objects: %d; elapsed: %s",
                    count, elapsed)

        # Summary statistics; note url_cache also holds entries loaded from
        # previous runs.
        http_images = len(url_cache)
        https_images = sum(1 for v in url_cache.itervalues() if v)
        bad_images = http_images - https_images
        if http_images > 0:
            pct_saved = float(https_images) / http_images * 100.0
        else:
            pct_saved = 0.0

        logger.info("Summary: http: %d; https: %d; lost: %d; saved: %3.1f %%",
                    http_images, https_images, bad_images, pct_saved)

        save_cache()
        logger.info("ssl_images done")


def load_cache():
    """Load cache from previous runs.

    On any problem with the cache file an error is logged and the current
    bad_hosts / url_cache values are left untouched.
    """
    logger.info("Loading cached information")
    try:
        with open(CACHE_FILENAME, 'r') as fp:
            d = json.load(fp)
    except IOError as ex:
        logger.error("Cache file (%s) IOError: %s", CACHE_FILENAME, ex)
        return
    except ValueError:
        logger.error("Mangled cache file: %s", CACHE_FILENAME)
        return

    global bad_hosts, url_cache
    try:
        # Read both entries before assigning either global so a bad file
        # cannot leave the cache half loaded.
        hosts = set(d['bad_hosts'])
        cache = d['url_cache']
    except (KeyError, TypeError):
        logger.error("Malformed cache file: %s", CACHE_FILENAME)
        return

    bad_hosts = hosts
    url_cache = cache


def save_cache():
    """Save our cache to a file for future runs."""
    logger.info("Saving cached information")
    d = {'bad_hosts': list(bad_hosts), 'url_cache': url_cache}
    with open(CACHE_FILENAME, 'w') as fp:
        json.dump(d, fp, indent=4)