comparison core/management/commands/ssl_images.py @ 899:62cd07bb891c

Detect bad hosts. Save cache information.
author Brian Neal <bgneal@gmail.com>
date Sat, 28 Feb 2015 15:43:54 -0600
parents 8fcd278d8987
children 4619290d171d
comparison
equal deleted inserted replaced
898:8fcd278d8987 899:62cd07bb891c
6 - Non SG101 images that use http: are downloaded, resized, and uploaded to 6 - Non SG101 images that use http: are downloaded, resized, and uploaded to
7 an S3 bucket. The src attribute is replaced with the new S3 URL. 7 an S3 bucket. The src attribute is replaced with the new S3 URL.
8 """ 8 """
9 import base64 9 import base64
10 import datetime 10 import datetime
11 import json
11 import logging 12 import logging
12 from optparse import make_option 13 from optparse import make_option
13 import os 14 import os
14 import re 15 import re
15 import signal 16 import signal
42 43
43 PHOTO_MAX_SIZE = (660, 720) 44 PHOTO_MAX_SIZE = (660, 720)
44 PHOTO_BASE_URL = 'https://s3.amazonaws.com/' 45 PHOTO_BASE_URL = 'https://s3.amazonaws.com/'
45 PHOTO_BUCKET_NAME = 'sg101.forum.photos' 46 PHOTO_BUCKET_NAME = 'sg101.forum.photos'
46 47
48 CACHE_FILENAME = 'ssl_images_cache.json'
49
47 quit_flag = False 50 quit_flag = False
48 opener = None 51 opener = None
49 bucket = None 52 bucket = None
50 url_cache = {} 53 url_cache = {}
54 bad_hosts = set()
51 55
52 56
53 def signal_handler(signum, frame): 57 def signal_handler(signum, frame):
54 """SIGINT signal handler""" 58 """SIGINT signal handler"""
55 global quit_flag 59 global quit_flag
90 if result: 94 if result:
91 return result 95 return result
92 return self.http_error_default(url, fp, errcode, errmsg, headers) 96 return self.http_error_default(url, fp, errcode, errmsg, headers)
93 97
94 98
95 def download_image(src): 99 def download_image(parsed_url):
96 """Downloads the image file from the given source URL. 100 """Downloads the image file from the given source URL.
97 101
98 If successful returns the path to the downloaded file. Otherwise None is 102 If successful returns the path to the downloaded file. Otherwise None is
99 returned. 103 returned.
100 """ 104 """
105 src = parsed_url.geturl()
101 logger.info("Retrieving %s", src) 106 logger.info("Retrieving %s", src)
102 try: 107 try:
103 fn, hdrs = opener.retrieve(src) 108 fn, hdrs = opener.retrieve(src)
104 except IOError as ex: 109 except IOError as ex:
105 args = ex.args 110 args = ex.args if ex.args else []
106 if len(args) == 4 and args[0] == 'http error': 111 if len(args) == 4 and args[0] == 'http error':
107 logger.error("http error: %d - %s", args[1], args[2]) 112 logger.error("http error: %d - %s", args[1], args[2])
113 elif len(args) == 2 and isinstance(args[1], socket.gaierror):
114 logger.error("gaierror, ignoring host %s", parsed_url.hostname)
115 bad_hosts.add(parsed_url.hostname)
108 else: 116 else:
109 logger.error("%s", ex) 117 logger.error("%s", ex)
110 return None 118 return None
111 119
112 # Does it look like an image? 120 # Does it look like an image?
def convert_to_ssl(parsed_url):
    """Top-level function for moving an image to SSL.

    Skips hosts already known to be unreachable, consults the URL cache,
    and otherwise downloads/uploads the image. Returns the new URL, or
    None when the image could not be migrated.
    """
    src = parsed_url.geturl()

    if parsed_url.hostname in bad_hosts:
        logger.info("Host known to be bad, skipping: %s", src)
        return None

    # Consult the cache before doing any network work
    if src not in url_cache:
        # Cache miss: attempt the download/resize/upload cycle and
        # remember the outcome (None marks a known-bad URL).
        result = save_image_to_cloud(parsed_url)
        url_cache[src] = result
        return result

    new_url = url_cache[src]
    if new_url:
        logger.info("Found URL in cache: %s => %s", src, new_url)
    else:
        logger.info("URL known to be bad, skipping: %s", src)
    return new_url
178 190
179 191
def save_image_to_cloud(parsed_url):
    """Download the image at parsed_url and upload it to cloud storage.

    Returns the new URL, or None if the download was unsuccessful.
    """
    local_path = download_image(parsed_url)
    if not local_path:
        return None
    resize_image(local_path)
    return upload_image(local_path)
190 202
328 global bucket 340 global bucket
329 bucket = S3Bucket(access_key=settings.USER_PHOTOS_ACCESS_KEY, 341 bucket = S3Bucket(access_key=settings.USER_PHOTOS_ACCESS_KEY,
330 secret_key=settings.USER_PHOTOS_SECRET_KEY, 342 secret_key=settings.USER_PHOTOS_SECRET_KEY,
331 base_url=PHOTO_BASE_URL, 343 base_url=PHOTO_BASE_URL,
332 bucket_name=PHOTO_BUCKET_NAME) 344 bucket_name=PHOTO_BUCKET_NAME)
345
346 # Load cached info from previous runs
347 load_cache()
333 348
334 if i is None: 349 if i is None:
335 i = 0 350 i = 0
336 351
337 count = 0 352 count = 0
371 else: 386 else:
372 pct_saved = 0.0 387 pct_saved = 0.0
373 388
374 logger.info("Summary: http: %d; https: %d; lost: %d; saved: %3.1f %%", 389 logger.info("Summary: http: %d; https: %d; lost: %d; saved: %3.1f %%",
375 http_images, https_images, bad_images, pct_saved) 390 http_images, https_images, bad_images, pct_saved)
391
392 save_cache()
393 logger.info("ssl_images done")
394
395
def load_cache():
    """Load the bad-host set and URL cache saved by previous runs.

    Reads CACHE_FILENAME (JSON). On any error (missing/unreadable file,
    invalid JSON, missing keys) the in-memory caches are left untouched.
    """
    logger.info("Loading cached information")
    try:
        with open(CACHE_FILENAME, 'r') as fp:
            d = json.load(fp)
    except IOError as ex:
        # Missing cache file is normal on the first run; just log it.
        logger.error("Cache file (%s) IOError: %s", CACHE_FILENAME, ex)
        return
    except ValueError:
        logger.error("Mangled cache file: %s", CACHE_FILENAME)
        return

    # Validate both keys BEFORE touching module state, so a malformed
    # file cannot leave bad_hosts updated while url_cache is not.
    try:
        loaded_bad_hosts = set(d['bad_hosts'])
        loaded_url_cache = d['url_cache']
    except KeyError:
        logger.error("Malformed cache file: %s", CACHE_FILENAME)
        return

    global bad_hosts, url_cache
    bad_hosts = loaded_bad_hosts
    url_cache = loaded_url_cache
416
def save_cache():
    """Write bad_hosts and url_cache to CACHE_FILENAME as JSON."""
    logger.info("Saving cached information")
    snapshot = {
        'bad_hosts': list(bad_hosts),
        'url_cache': url_cache,
    }
    with open(CACHE_FILENAME, 'w') as fp:
        json.dump(snapshot, fp, indent=4)