"""
ssl_images is a custom manage.py command to convert forum post and comment
images to https. It does this by rewriting the markup:
    - Images with src = http://surfguitar101.com/something are rewritten to be
      /something.
    - Non SG101 images that use http: are first probed over https at the same
      location; if an image is served there the src is simply switched to
      https.
    - Otherwise the image is downloaded, resized, and uploaded to an S3
      bucket. The src attribute is replaced with the new S3 URL.
"""
import base64
import datetime
import httplib
import logging
from optparse import make_option
import os
import re
import signal
import socket
import urllib
import urlparse
import uuid

from django.core.management.base import NoArgsCommand, CommandError
from django.conf import settings
from lxml import etree
import markdown.inlinepatterns
from PIL import Image

from comments.models import Comment
from forums.models import Post
from core.s3 import S3Bucket


LOGFILE = os.path.join(settings.PROJECT_PATH, 'logs', 'ssl_images.log')
logger = logging.getLogger(__name__)

IMAGE_LINK_RE = re.compile(markdown.inlinepatterns.IMAGE_LINK_RE,
                           re.DOTALL | re.UNICODE)
IMAGE_REF_RE = re.compile(markdown.inlinepatterns.IMAGE_REFERENCE_RE,
                          re.DOTALL | re.UNICODE)

SG101_HOSTS = set(['www.surfguitar101.com', 'surfguitar101.com'])
MODEL_CHOICES = ['comments', 'posts']

# Maximum (width, height) of an image stored in the bucket.
PHOTO_MAX_SIZE = (660, 720)
PHOTO_BASE_URL = 'https://s3.amazonaws.com/'
PHOTO_BUCKET_NAME = 'sg101.forum.photos'

# Module-level state shared by the helper functions below. `opener` and
# `bucket` are created by Command.handle_noargs(); `url_cache` maps an original
# image URL to the https URL it was converted to.
quit_flag = False
opener = None
bucket = None
url_cache = {}


def signal_handler(signum, frame):
    """SIGINT signal handler; asks the main loop to stop after the current
    object by setting the module-level quit_flag.
    """
    global quit_flag
    quit_flag = True


def _setup_logging():
    """Send this module's log output (DEBUG and up) to LOGFILE only."""
    logger.setLevel(logging.DEBUG)
    logger.propagate = False
    handler = logging.FileHandler(filename=LOGFILE, encoding='utf-8')
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)


class ImageURLopener(urllib.FancyURLopener):
    """Our URL opener. Handles redirects as per FancyURLopener. But all other
    errors and authentication requests will raise an IOError.
    """
    HANDLED_ERRORS = set([302, 301, 303, 307])

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        # Bypass FancyURLopener's lenient default and use the base class
        # behavior instead.
        return urllib.URLopener.http_error_default(self, url, fp, errcode,
                                                   errmsg, headers)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.
        We let FancyURLopener handle the redirects, but any other error we want
        to let fail.
        """
        if errcode in self.HANDLED_ERRORS:
            name = 'http_error_%d' % errcode
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            if result:
                return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)


def _discard_downloads():
    """Delete the temporary files created so far by opener.retrieve()."""
    if opener is not None:
        opener.cleanup()


def download_image(src):
    """Downloads the image file from the given source URL.

    If successful returns the path to the downloaded file. Otherwise None is
    returned, and any temporary file that was created is removed.
    """
    logger.info("Retrieving %s", src)
    try:
        fn, hdrs = opener.retrieve(src)
    except IOError as ex:
        args = ex.args
        if len(args) == 4 and args[0] == 'http error':
            logger.error("http error: %d - %s", args[1], args[2])
        else:
            logger.error("%s", ex)
        # A failed transfer can still leave a partial temp file behind.
        _discard_downloads()
        return None

    # Does it look like an image?
    content_type = hdrs.get('content-type')
    if not content_type:
        logger.error("No content-type header found")
        _discard_downloads()
        return None

    file_size = os.stat(fn).st_size
    logger.info("Retrieved: %s bytes; content-type: %s", file_size, content_type)

    parts = content_type.split('/')
    if len(parts) < 2 or parts[0] != 'image':
        logger.error("Unknown content-type: %s", content_type)
        _discard_downloads()
        return None

    return fn


def resize_image(img_path):
    """Resizes the image found at img_path, in place, if either of its
    dimensions exceeds PHOTO_MAX_SIZE.
    """
    image = Image.open(img_path)
    width, height = image.size
    max_width, max_height = PHOTO_MAX_SIZE
    # Compare each dimension on its own: comparing the two tuples directly is
    # lexicographic and would skip a narrow but overly tall image.
    if width > max_width or height > max_height:
        logger.info('Resizing from %s to %s', image.size, PHOTO_MAX_SIZE)
        image.thumbnail(PHOTO_MAX_SIZE, Image.ANTIALIAS)
        image.save(img_path)


def gen_key():
    """Return a random key: a UUID4 encoded as URL-safe base64 with the
    padding removed.
    """
    return base64.b64encode(uuid.uuid4().bytes, '-_').rstrip('=')


def upload_image(img_path):
    """Upload image file located at img_path to our S3 bucket.

    Returns the URL of the image in the bucket or None if an error occurs.
    """
    logger.info("upload_image starting")
    # Make a unique name for the image in the bucket
    ext = os.path.splitext(img_path)[1]
    file_key = gen_key() + ext
    try:
        return bucket.upload_from_filename(file_key, img_path, public=True)
    except IOError as ex:
        logger.error("Error uploading file: %s", ex)

    return None


def convert_to_ssl(parsed_url):
    """Top-level function for moving an image to SSL.

    parsed_url is a urlparse.urlparse() result for an http image URL. Returns
    an https URL for the image, or None if the image could not be converted.
    Successful conversions are remembered in url_cache.
    """
    src = parsed_url.geturl()

    # Check the cache first
    new_url = url_cache.get(src)
    if new_url:
        logger.info("Found URL in cache: %s => %s", src, new_url)
        return new_url

    # It has been observed that at least 2 different services
    # serve up the same image on https: with the URL otherwise the same.
    # Check to see if the image is available via https first.
    new_url = check_https_availability(parsed_url)
    if new_url:
        url_cache[src] = new_url
        return new_url

    # If none of the above worked, try to download and upload to our S3 bucket
    new_url = save_image_to_cloud(src)
    if new_url:
        url_cache[src] = new_url
        return new_url

    return None


def check_https_availability(parsed_url):
    """Given a urlparse.urlparse() result, perform a HEAD request over https
    using the same net location, path, params and query. If we get a response
    that indicates an image is available, return the url of the image over
    https. Otherwise return None.
    """
    logger.info("Checking https availability for %s", parsed_url.geturl())

    # Probe exactly the resource the returned URL will point at; the URL built
    # below keeps the params and query string, so the request must too.
    request_path = parsed_url.path or '/'
    if parsed_url.params:
        request_path += ';' + parsed_url.params
    if parsed_url.query:
        request_path += '?' + parsed_url.query

    con = None
    try:
        con = httplib.HTTPSConnection(parsed_url.netloc)
        con.request('HEAD', request_path)
        response = con.getresponse()
    except (httplib.HTTPException, socket.error) as ex:
        # socket.error also covers timeouts, refused connections and DNS
        # failures.
        logger.info("https HEAD request failed: %s", ex)
        return None
    finally:
        if con is not None:
            con.close()

    content_type = None
    if response.status == 200:
        content_type = response.getheader('content-type')
        if content_type:
            parts = content_type.split('/')
            if len(parts) >= 2 and parts[0] == 'image':
                url = urlparse.urlunparse(('https', ) + parsed_url[1:])
                logger.info("Image is available at %s", url)
                return url

    logger.info('https HEAD request failed; status = %d, content-type = %s',
                response.status, content_type)
    return None


def save_image_to_cloud(src):
    """Downloads an image at a given source URL. Uploads it to cloud storage.

    Returns the new URL or None if unsuccessful. The downloaded temporary file
    is removed in either case.
    """
    fn = download_image(src)
    if not fn:
        return None

    try:
        try:
            resize_image(fn)
        except IOError as ex:
            # The payload claimed to be an image but could not be read or
            # re-saved as one; skip it rather than abort the whole run.
            logger.error("Error resizing image: %s", ex)
            return None
        return upload_image(fn)
    finally:
        _discard_downloads()


def replace_image_markup(match):
    """Regex substitution callback for IMAGE_LINK_RE matches.

    Returns replacement Markdown: image markup with an https or site-relative
    src when one could be determined, otherwise an ordinary link to the
    original src.
    """
    # NOTE(review): assumes group 8 is the text between the parentheses
    # (src plus optional title) and group 1 is the alt text.
    src_parts = match.group(8).split()
    if src_parts:
        src = src_parts[0]
        if src[0] == "<" and src[-1] == ">":
            src = src[1:-1]
    else:
        src = ''

    title = ''
    if len(src_parts) > 1:
        title = " ".join(src_parts[1:])
    alt = match.group(1)

    r = None
    if src:
        try:
            r = urlparse.urlparse(src)
        except ValueError as ex:
            # Malformed URL; fall through and demote it to a link.
            logger.warning("Unable to parse image URL %s: %s", src, ex)

    new_src = None
    if r is not None:
        if r.hostname in SG101_HOSTS:
            new_src = r.path        # convert to relative path
        elif r.scheme == 'http':
            # Try a few things to get this on ssl:
            new_src = convert_to_ssl(r)
        elif r.scheme == 'https':
            new_src = src           # already https, accept it as-is
        elif not r.scheme and not r.netloc and r.path.startswith('/'):
            # Already a site-relative path, e.g. one produced by an earlier
            # run of this command. Keep it so that re-running is harmless.
            new_src = src

    if new_src:
        if title:
            s = u'![{alt}]({src} {title})'.format(alt=alt, src=new_src, title=title)
        else:
            s = u'![{alt}]({src})'.format(alt=alt, src=new_src)
    else:
        # something's messed up, convert to a link using original src
        s = u'[{alt}]({src})'.format(alt=alt, src=src)

    return s


def warn_if_image_refs(text, model_name, pk):
    """Search text for Markdown image reference markup.

    We aren't expecting these, but we will log something if we see any.
    """
    if IMAGE_REF_RE.search(text):
        logger.warning("Image reference found in %s pk = #%d", model_name, pk)


def process_post(text):
    """Process the post object:

    A regex substitution is run on the post's text field. This fixes up image
    links, getting rid of plain old http sources; either converting to https
    or relative style links (if the link is to SG101).

    Returns the rewritten text.
    """
    return IMAGE_LINK_RE.sub(replace_image_markup, text)


def html_check(html):
    """Return True if the given HTML fragment has img tags with src attributes
    that use http, and False otherwise.
    """
    if not html:
        return False

    try:
        root = etree.HTML(html)
    except etree.XMLSyntaxError:
        # Nothing parseable, so there are no img tags to worry about.
        return False
    if root is None:
        return False

    for img in root.iter('img'):
        src = img.get('src')
        if src and src.lower().startswith('http:'):
            return True
    return False


class Command(NoArgsCommand):
    help = "Rewrite forum posts and comments to not use http for images"
    option_list = NoArgsCommand.option_list + (
        make_option('-m', '--model',
                    choices=MODEL_CHOICES,
                    help="which model to update; must be one of {{{}}}".format(
                        ', '.join(MODEL_CHOICES))),
        make_option('-i', '--i',
                    type='int',
                    help="optional first slice index; the i in [i:j]"),
        make_option('-j', '--j',
                    type='int',
                    help="optional second slice index; the j in [i:j]"),
    )

    def handle_noargs(self, **options):
        """Rewrite the image markup of every selected comment or post.

        Objects of the chosen --model, optionally sliced by -i/-j, are
        processed in turn and saved when their text changes or when their
        rendered HTML still contains http images. Raises CommandError for a
        missing model choice or invalid slice indices.
        """
        time_started = datetime.datetime.now()
        _setup_logging()
        logger.info("Starting; arguments received: %s", options)

        if options['model'] not in MODEL_CHOICES:
            raise CommandError('Please choose a --model option')

        if options['model'] == 'comments':
            qs = Comment.objects.all()
            text_attr = 'comment'
            model_name = 'Comment'
        else:
            qs = Post.objects.all()
            text_attr = 'body'
            model_name = 'Post'

        i, j = options['i'], options['j']

        if i is not None and i < 0:
            raise CommandError("-i must be >= 0")
        if j is not None and j < 0:
            raise CommandError("-j must be >= 0")
        if j is not None and i is not None and j <= i:
            raise CommandError("-j must be > -i")

        if i is not None and j is not None:
            qs = qs[i:j]
        elif i is not None and j is None:
            qs = qs[i:]
        elif i is None and j is not None:
            qs = qs[:j]

        # Set global socket timeout
        socket.setdefaulttimeout(30)

        # Install signal handler for ctrl-c
        signal.signal(signal.SIGINT, signal_handler)

        # Create URL opener to download photos
        global opener
        opener = ImageURLopener()

        # Create bucket to upload photos
        global bucket
        bucket = S3Bucket(access_key=settings.USER_PHOTOS_ACCESS_KEY,
                          secret_key=settings.USER_PHOTOS_SECRET_KEY,
                          base_url=PHOTO_BASE_URL,
                          bucket_name=PHOTO_BUCKET_NAME)

        # From here on i is only the offset used when logging positions.
        if i is None:
            i = 0

        count = 0
        for n, model in enumerate(qs.iterator()):
            if quit_flag:
                logger.warning("SIGINT received, exiting")
                break
            logger.info("Processing %s #%d (pk = %d)", model_name, n + i, model.pk)
            txt = getattr(model, text_attr)
            warn_if_image_refs(txt, model_name, model.pk)
            new_txt = process_post(txt)
            if txt != new_txt:
                logger.info("Content changed on %s #%d (pk = %d)",
                            model_name, n + i, model.pk)
                logger.debug("original: %s", txt)
                logger.debug("changed: %s", new_txt)
                setattr(model, text_attr, new_txt)
                model.save()
            elif html_check(model.html):
                # Check for content generated with older smiley code that used
                # absolute URLs for the smiley images. If True, then just save
                # the model again to force updated HTML to be created.
                logger.info("Older Smiley HTML detected, forcing a save")
                model.save()
            count += 1

        time_finished = datetime.datetime.now()
        elapsed = time_finished - time_started
        logger.info("ssl_images exiting; number of objects: %d; elapsed: %s",
                    count, elapsed)