# HG changeset patch # User Brian Neal # Date 1423535113 21600 # Node ID deef1536a54a42687cc98b242e2fef770743f884 # Parent 9a15f7c27526f4a92d2d619c7d2ea456a6d657b2 See if file available at https: by doing HEAD request. diff -r 9a15f7c27526 -r deef1536a54a core/management/commands/ssl_images.py --- a/core/management/commands/ssl_images.py Tue Feb 03 21:09:44 2015 -0600 +++ b/core/management/commands/ssl_images.py Mon Feb 09 20:25:13 2015 -0600 @@ -7,9 +7,10 @@ an S3 bucket. The src attribute is replaced with the new S3 URL. """ import base64 +import httplib import logging from optparse import make_option -import os.path +import os import re import signal import socket @@ -107,25 +108,14 @@ logger.error("%s", ex) return None - # TODO: This code below is not right. content-length is optional and will - # not appear when using chunked encoding, for example. Remove this check. If - # we want to log the size of the file, use stat() on it or something. - # - # If there is an error or timeout, sometimes there is no content-length - # header. - content_length = hdrs.get('content-length') - if not content_length: - logger.error("Bad content-length: %s", content_length) - return None - # Does it look like an image? content_type = hdrs.get('content-type') if not content_type: logger.error("No content-type header found") return None - logger.info("Retrieved: %s bytes; content-type: %s", content_length, - content_type) + file_size = os.stat(fn).st_size + logger.info("Retrieved: %s bytes; content-type: %s", file_size, content_type) parts = content_type.split('/') if len(parts) < 2 or parts[0] != 'image': @@ -165,16 +155,63 @@ return None +def convert_to_ssl(parsed_url): + """Top-level function for moving an image to SSL.""" + + src = parsed_url.geturl() + + # Check the cache first + new_url = url_cache.get(src) + if new_url: + logger.info("Found URL in cache: %s => %s", src, new_url) + return new_url + + # It has been observed that at least 2 different services + # serve up the same image on https: with the URL otherwise the same. + # Check to see if the image is available via https first. + new_url = check_https_availability(parsed_url) + if new_url: + return new_url + + # If none of the above worked, try to download and upload to our S3 bucket + return save_image_to_cloud(src) + + +def check_https_availability(parsed_url): + """Given a urlparse.urlparse() result, perform a HEAD request over https + using the same net location and path. If we get a response that indicates an + image is available, return the url of the image over https. Otherwise return + None. + """ + logger.info("Checking https availability for %s", parsed_url.geturl()) + con = httplib.HTTPSConnection(parsed_url.netloc) + try: + con.request('HEAD', parsed_url.path) + except (httplib.HTTPException, socket.timeout) as ex: + logger.info("https HEAD request failed: %s", ex) + return None + + content_type = None + response = con.getresponse() + if response.status == 200: + content_type = response.getheader('content-type') + if content_type: + parts = content_type.split('/') + if len(parts) >= 2 and parts[0] == 'image': + url = urlparse.urlunparse(('https', ) + parsed_url[1:]) + logger.info("Image is available at %s", url) + return url + + logger.info('https HEAD request failed; status = %d, content-type = %s', + response.status, content_type) + return None + + def save_image_to_cloud(src): """Downloads an image at a given source URL. Uploads it to cloud storage. Returns the new URL or None if unsuccessful. """ - # Check the cache first - new_url = url_cache.get(src) - if new_url: - return new_url - fn = download_image(src) if fn: resize_image(fn) @@ -205,12 +242,8 @@ if r.hostname in SG101_HOSTS: new_src = r.path # convert to relative path elif r.scheme == 'http': - # TODO: it has been observed that at least 2 different services - # serve up the same image on https: with the URL otherwise the same. - # Add code to see if the image is available at https (maybe do - # a HEAD request?) and if so just change the protocol to https in - # the original URL. - new_src = save_image_to_cloud(src) + # Try a few things to get this on ssl: + new_src = convert_to_ssl(r) elif r.scheme == 'https': new_src = src # already https, accept it as-is