comparison core/management/commands/ssl_images.py @ 888:deef1536a54a

See if file available at https: by doing HEAD request.
author Brian Neal <bgneal@gmail.com>
date Mon, 09 Feb 2015 20:25:13 -0600
parents 9a15f7c27526
children ae146e30d588
comparison
equal deleted inserted replaced
887:9a15f7c27526 888:deef1536a54a
5 /something. 5 /something.
6 - Non SG101 images that use http: are downloaded, resized, and uploaded to 6 - Non SG101 images that use http: are downloaded, resized, and uploaded to
7 an S3 bucket. The src attribute is replaced with the new S3 URL. 7 an S3 bucket. The src attribute is replaced with the new S3 URL.
8 """ 8 """
9 import base64 9 import base64
10 import httplib
10 import logging 11 import logging
11 from optparse import make_option 12 from optparse import make_option
12 import os.path 13 import os
13 import re 14 import re
14 import signal 15 import signal
15 import socket 16 import socket
16 import urllib 17 import urllib
17 import urlparse 18 import urlparse
105 logger.error("http error: %d - %s", args[1], args[2]) 106 logger.error("http error: %d - %s", args[1], args[2])
106 else: 107 else:
107 logger.error("%s", ex) 108 logger.error("%s", ex)
108 return None 109 return None
109 110
110 # TODO: This code below is not right. content-length is optional and will
111 # not appear when using chunked encoding, for example. Remove this check. If
112 # we want to log the size of the file, use stat() on it or something.
113 #
114 # If there is an error or timeout, sometimes there is no content-length
115 # header.
116 content_length = hdrs.get('content-length')
117 if not content_length:
118 logger.error("Bad content-length: %s", content_length)
119 return None
120
121 # Does it look like an image? 111 # Does it look like an image?
122 content_type = hdrs.get('content-type') 112 content_type = hdrs.get('content-type')
123 if not content_type: 113 if not content_type:
124 logger.error("No content-type header found") 114 logger.error("No content-type header found")
125 return None 115 return None
126 116
127 logger.info("Retrieved: %s bytes; content-type: %s", content_length, 117 file_size = os.stat(fn).st_size
128 content_type) 118 logger.info("Retrieved: %s bytes; content-type: %s", file_size, content_type)
129 119
130 parts = content_type.split('/') 120 parts = content_type.split('/')
131 if len(parts) < 2 or parts[0] != 'image': 121 if len(parts) < 2 or parts[0] != 'image':
132 logger.error("Unknown content-type: %s", content_type) 122 logger.error("Unknown content-type: %s", content_type)
133 return None 123 return None
163 except IOError as ex: 153 except IOError as ex:
164 logger.error("Error uploading file: %s", ex) 154 logger.error("Error uploading file: %s", ex)
165 return None 155 return None
166 156
167 157
168 def save_image_to_cloud(src): 158 def convert_to_ssl(parsed_url):
169 """Downloads an image at a given source URL. Uploads it to cloud storage. 159 """Top-level function for moving an image to SSL."""
170 160
171 Returns the new URL or None if unsuccessful. 161 src = parsed_url.geturl()
172 """ 162
173 # Check the cache first 163 # Check the cache first
174 new_url = url_cache.get(src) 164 new_url = url_cache.get(src)
175 if new_url: 165 if new_url:
166 logger.info("Found URL in cache: %s => %s", src, new_url)
176 return new_url 167 return new_url
177 168
169 # It has been observed that at least 2 different services
170 # serve up the same image on https: with the URL otherwise the same.
171 # Check to see if the image is available via https first.
172 new_url = check_https_availability(parsed_url)
173 if new_url:
174 return new_url
175
176 # If none of the above worked, try to download and upload to our S3 bucket
177 return save_image_to_cloud(src)
178
179
def check_https_availability(parsed_url):
    """Check whether an image is also served over https at the same location.

    Given a urlparse.urlparse() result, perform a HEAD request over https
    using the same net location, path, params and query string. If we get a
    200 response whose content-type looks like an image, return the url of the
    image over https. Otherwise (including any connection or protocol error)
    return None.
    """
    logger.info("Checking https availability for %s", parsed_url.geturl())

    # Ask for the same resource that the returned URL will point at: keep the
    # params and query string, and fall back to '/' when the path is empty so
    # that the request line is valid.
    target = urlparse.urlunparse(
        ('', '', parsed_url.path or '/', parsed_url.params,
         parsed_url.query, ''))

    con = httplib.HTTPSConnection(parsed_url.netloc)
    try:
        # Both sending the request and reading the response can fail, so both
        # belong inside the try. socket.error also covers socket.timeout,
        # refused connections, DNS failures and SSL handshake errors.
        con.request('HEAD', target)
        response = con.getresponse()
        status = response.status
        content_type = None
        if status == 200:
            content_type = response.getheader('content-type')
    except (httplib.HTTPException, socket.error) as ex:
        logger.info("https HEAD request failed: %s", ex)
        return None
    finally:
        # Always release the socket, whether or not the request succeeded.
        con.close()

    if content_type:
        parts = content_type.split('/')
        if len(parts) >= 2 and parts[0] == 'image':
            # Same URL as the original, with only the scheme swapped.
            url = urlparse.urlunparse(('https', ) + parsed_url[1:])
            logger.info("Image is available at %s", url)
            return url

    logger.info('https HEAD request failed; status = %d, content-type = %s',
                status, content_type)
    return None
208
209
210 def save_image_to_cloud(src):
211 """Downloads an image at a given source URL. Uploads it to cloud storage.
212
213 Returns the new URL or None if unsuccessful.
214 """
178 fn = download_image(src) 215 fn = download_image(src)
179 if fn: 216 if fn:
180 resize_image(fn) 217 resize_image(fn)
181 new_url = upload_image(fn) 218 new_url = upload_image(fn)
182 if new_url: 219 if new_url:
203 if src: 240 if src:
204 r = urlparse.urlparse(src) 241 r = urlparse.urlparse(src)
205 if r.hostname in SG101_HOSTS: 242 if r.hostname in SG101_HOSTS:
206 new_src = r.path # convert to relative path 243 new_src = r.path # convert to relative path
207 elif r.scheme == 'http': 244 elif r.scheme == 'http':
208 # TODO: it has been observed that at least 2 different services 245 # Try a few things to get this on ssl:
209 # serve up the same image on https: with the URL otherwise the same. 246 new_src = convert_to_ssl(r)
210 # Add code to see if the image is available at https (maybe do
211 # a HEAD request?) and if so just change the protocol to https in
212 # the original URL.
213 new_src = save_image_to_cloud(src)
214 elif r.scheme == 'https': 247 elif r.scheme == 'https':
215 new_src = src # already https, accept it as-is 248 new_src = src # already https, accept it as-is
216 249
217 if new_src: 250 if new_src:
218 if title: 251 if title: