diff core/management/commands/ssl_images.py @ 888:deef1536a54a

See if the file is available at https: by doing a HEAD request.
author Brian Neal <bgneal@gmail.com>
date Mon, 09 Feb 2015 20:25:13 -0600
parents 9a15f7c27526
children ae146e30d588
line wrap: on
line diff
--- a/core/management/commands/ssl_images.py	Tue Feb 03 21:09:44 2015 -0600
+++ b/core/management/commands/ssl_images.py	Mon Feb 09 20:25:13 2015 -0600
@@ -7,9 +7,10 @@
       an S3 bucket. The src attribute is replaced with the new S3 URL.
 """
 import base64
+import httplib
 import logging
 from optparse import make_option
-import os.path
+import os
 import re
 import signal
 import socket
@@ -107,25 +108,14 @@
             logger.error("%s", ex)
         return None
 
-    # TODO: This code below is not right. content-length is optional and will
-    # not appear when using chunked encoding, for example. Remove this check. If
-    # we want to log the size of the file, use stat() on it or something.
-    #
-    # If there is an error or timeout, sometimes there is no content-length
-    # header.
-    content_length = hdrs.get('content-length')
-    if not content_length:
-        logger.error("Bad content-length: %s", content_length)
-        return None
-
     # Does it look like an image?
     content_type = hdrs.get('content-type')
     if not content_type:
         logger.error("No content-type header found")
         return None
 
-    logger.info("Retrieved: %s bytes; content-type: %s", content_length,
-                 content_type)
+    file_size = os.stat(fn).st_size
+    logger.info("Retrieved: %s bytes; content-type: %s", file_size, content_type)
 
     parts = content_type.split('/')
     if len(parts) < 2 or parts[0] != 'image':
@@ -165,16 +155,63 @@
     return None
 
 
+def convert_to_ssl(parsed_url):
+    """Return a URL that serves the given http: image securely, or None.
+
+    parsed_url is a urlparse.urlparse() result. Strategies are tried from
+    cheapest to most expensive; the first one that yields a URL wins.
+    """
+    original = parsed_url.geturl()
+
+    # 1. Reuse a URL that is already recorded in the cache.
+    cached = url_cache.get(original)
+    if cached:
+        logger.info("Found URL in cache: %s => %s", original, cached)
+        return cached
+
+    # 2. At least 2 services have been seen serving the same image on https:
+    #    with an otherwise identical URL, so probe for that first.
+    https_url = check_https_availability(parsed_url)
+
+    # 3. Otherwise fall back to downloading it and uploading to our S3 bucket.
+    return https_url or save_image_to_cloud(original)
+
+
+def check_https_availability(parsed_url):
+    """Probe for an image over https using a HEAD request.
+
+    parsed_url is a urlparse.urlparse() result. The request reuses its net
+    location, path and query string. Returns the https: form of the URL when
+    the server answers 200 with an image/* content-type; otherwise returns
+    None. Connection-level failures are logged and reported as None.
+    """
+    logger.info("Checking https availability for %s", parsed_url.geturl())
+
+    # An empty path is not a valid request target, and the query string must
+    # be sent so we probe the same resource that the returned URL names.
+    target = parsed_url.path or '/'
+    if parsed_url.query:
+        target = '%s?%s' % (target, parsed_url.query)
+
+    con = httplib.HTTPSConnection(parsed_url.netloc)
+    try:
+        con.request('HEAD', target)
+        response = con.getresponse()
+        status = response.status
+        content_type = response.getheader('content-type')
+    except (httplib.HTTPException, socket.error) as ex:
+        # socket.error also covers timeouts, refused connections, DNS and
+        # SSL failures, all of which are routine for hosts lacking https.
+        logger.info("https HEAD request failed: %s", ex)
+        return None
+    finally:
+        con.close()
+
+    parts = (content_type or '').split('/')
+    if status == 200 and len(parts) >= 2 and parts[0] == 'image':
+        url = urlparse.urlunparse(('https', ) + parsed_url[1:])
+        logger.info("Image is available at %s", url)
+        return url
+
+    logger.info('https HEAD request failed; status = %d, content-type = %s',
+                status, content_type)
+    return None
+
+
 def save_image_to_cloud(src):
     """Downloads an image at a given source URL. Uploads it to cloud storage.
 
     Returns the new URL or None if unsuccessful.
     """
-    # Check the cache first
-    new_url = url_cache.get(src)
-    if new_url:
-        return new_url
-
     fn = download_image(src)
     if fn:
         resize_image(fn)
@@ -205,12 +242,8 @@
         if r.hostname in SG101_HOSTS:
             new_src = r.path        # convert to relative path
         elif r.scheme == 'http':
-            # TODO: it has been observed that at least 2 different services
-            # serve up the same image on https: with the URL otherwise the same.
-            # Add code to see if the image is available at https (maybe do
-            # a HEAD request?) and if so just change the protocol to https in
-            # the original URL.
-            new_src = save_image_to_cloud(src)
+            # Try a few things to get this on ssl:
+            new_src = convert_to_ssl(r)
         elif r.scheme == 'https':
             new_src = src       # already https, accept it as-is