Mercurial > public > sg101
comparison core/management/commands/ssl_images.py @ 899:62cd07bb891c
Detect bad hosts. Save cache information.
author | Brian Neal <bgneal@gmail.com> |
---|---|
date | Sat, 28 Feb 2015 15:43:54 -0600 |
parents | 8fcd278d8987 |
children | 4619290d171d |
comparison
equal
deleted
inserted
replaced
898:8fcd278d8987 | 899:62cd07bb891c |
---|---|
6 - Non SG101 images that use http: are downloaded, resized, and uploaded to | 6 - Non SG101 images that use http: are downloaded, resized, and uploaded to |
7 an S3 bucket. The src attribute is replaced with the new S3 URL. | 7 an S3 bucket. The src attribute is replaced with the new S3 URL. |
8 """ | 8 """ |
9 import base64 | 9 import base64 |
10 import datetime | 10 import datetime |
11 import json | |
11 import logging | 12 import logging |
12 from optparse import make_option | 13 from optparse import make_option |
13 import os | 14 import os |
14 import re | 15 import re |
15 import signal | 16 import signal |
42 | 43 |
# Maximum (width, height) in pixels that a re-hosted photo is resized to
PHOTO_MAX_SIZE = (660, 720)
# S3 endpoint and bucket that receive the downloaded/resized images
PHOTO_BASE_URL = 'https://s3.amazonaws.com/'
PHOTO_BUCKET_NAME = 'sg101.forum.photos'

# JSON file used to persist url_cache / bad_hosts between runs
# (see load_cache() / save_cache())
CACHE_FILENAME = 'ssl_images_cache.json'

quit_flag = False   # set by the SIGINT handler to request an orderly stop
opener = None       # URL opener used for downloads; initialized elsewhere
bucket = None       # S3Bucket instance; initialized during command setup
url_cache = {}      # source URL -> new https URL, or None if known bad
bad_hosts = set()   # hostnames that failed DNS lookup; skipped on sight
51 | 55 |
52 | 56 |
53 def signal_handler(signum, frame): | 57 def signal_handler(signum, frame): |
54 """SIGINT signal handler""" | 58 """SIGINT signal handler""" |
55 global quit_flag | 59 global quit_flag |
90 if result: | 94 if result: |
91 return result | 95 return result |
92 return self.http_error_default(url, fp, errcode, errmsg, headers) | 96 return self.http_error_default(url, fp, errcode, errmsg, headers) |
93 | 97 |
94 | 98 |
95 def download_image(src): | 99 def download_image(parsed_url): |
96 """Downloads the image file from the given source URL. | 100 """Downloads the image file from the given source URL. |
97 | 101 |
98 If successful returns the path to the downloaded file. Otherwise None is | 102 If successful returns the path to the downloaded file. Otherwise None is |
99 returned. | 103 returned. |
100 """ | 104 """ |
105 src = parsed_url.geturl() | |
101 logger.info("Retrieving %s", src) | 106 logger.info("Retrieving %s", src) |
102 try: | 107 try: |
103 fn, hdrs = opener.retrieve(src) | 108 fn, hdrs = opener.retrieve(src) |
104 except IOError as ex: | 109 except IOError as ex: |
105 args = ex.args | 110 args = ex.args if ex.args else [] |
106 if len(args) == 4 and args[0] == 'http error': | 111 if len(args) == 4 and args[0] == 'http error': |
107 logger.error("http error: %d - %s", args[1], args[2]) | 112 logger.error("http error: %d - %s", args[1], args[2]) |
113 elif len(args) == 2 and isinstance(args[1], socket.gaierror): | |
114 logger.error("gaierror, ignoring host %s", parsed_url.hostname) | |
115 bad_hosts.add(parsed_url.hostname) | |
108 else: | 116 else: |
109 logger.error("%s", ex) | 117 logger.error("%s", ex) |
110 return None | 118 return None |
111 | 119 |
112 # Does it look like an image? | 120 # Does it look like an image? |
def convert_to_ssl(parsed_url):
    """Top-level function for moving an image to SSL."""

    src = parsed_url.geturl()

    # Hosts that previously failed name resolution are skipped outright.
    if parsed_url.hostname in bad_hosts:
        logger.info("Host known to be bad, skipping: %s", src)
        return None

    # Cache lookup. A cached value of None means "known bad URL", so a
    # sentinel distinguishes a miss from a cached failure.
    _missing = object()
    cached = url_cache.get(src, _missing)
    if cached is _missing:
        # Cache miss: try to download/upload and remember the outcome
        # (success or failure) for subsequent runs.
        new_url = save_image_to_cloud(parsed_url)
        url_cache[src] = new_url
        return new_url

    if cached:
        logger.info("Found URL in cache: %s => %s", src, cached)
    else:
        logger.info("URL known to be bad, skipping: %s", src)
    return cached
178 | 190 |
179 | 191 |
def save_image_to_cloud(parsed_url):
    """Downloads an image at a given source URL. Uploads it to cloud storage.

    Returns the new URL or None if unsuccessful.
    """
    local_path = download_image(parsed_url)
    if not local_path:
        # Download failed; the caller records the failure in its cache.
        return None
    resize_image(local_path)
    return upload_image(local_path)
190 | 202 |
328 global bucket | 340 global bucket |
329 bucket = S3Bucket(access_key=settings.USER_PHOTOS_ACCESS_KEY, | 341 bucket = S3Bucket(access_key=settings.USER_PHOTOS_ACCESS_KEY, |
330 secret_key=settings.USER_PHOTOS_SECRET_KEY, | 342 secret_key=settings.USER_PHOTOS_SECRET_KEY, |
331 base_url=PHOTO_BASE_URL, | 343 base_url=PHOTO_BASE_URL, |
332 bucket_name=PHOTO_BUCKET_NAME) | 344 bucket_name=PHOTO_BUCKET_NAME) |
345 | |
346 # Load cached info from previous runs | |
347 load_cache() | |
333 | 348 |
334 if i is None: | 349 if i is None: |
335 i = 0 | 350 i = 0 |
336 | 351 |
337 count = 0 | 352 count = 0 |
371 else: | 386 else: |
372 pct_saved = 0.0 | 387 pct_saved = 0.0 |
373 | 388 |
374 logger.info("Summary: http: %d; https: %d; lost: %d; saved: %3.1f %%", | 389 logger.info("Summary: http: %d; https: %d; lost: %d; saved: %3.1f %%", |
375 http_images, https_images, bad_images, pct_saved) | 390 http_images, https_images, bad_images, pct_saved) |
391 | |
392 save_cache() | |
393 logger.info("ssl_images done") | |
394 | |
395 | |
def load_cache():
    """Load cache from previous runs.

    Reads CACHE_FILENAME (JSON) and replaces the module-level bad_hosts
    set and url_cache dict. On any error (missing/unreadable file,
    invalid JSON, malformed contents) the existing in-memory cache is
    left completely untouched.
    """
    logger.info("Loading cached information")
    try:
        with open(CACHE_FILENAME, 'r') as fp:
            d = json.load(fp)
    except IOError as ex:
        # NOTE(review): a missing cache file is expected on a first run;
        # it is logged at error level like any other I/O failure.
        logger.error("Cache file (%s) IOError: %s", CACHE_FILENAME, ex)
        return
    except ValueError:
        logger.error("Mangled cache file: %s", CACHE_FILENAME)
        return

    global bad_hosts, url_cache
    try:
        # Validate both entries before mutating the globals so a
        # malformed file cannot leave the cache half-updated.
        # TypeError covers a 'bad_hosts' value that is not iterable.
        new_bad_hosts = set(d['bad_hosts'])
        new_url_cache = d['url_cache']
    except (KeyError, TypeError):
        logger.error("Malformed cache file: %s", CACHE_FILENAME)
        return
    bad_hosts = new_bad_hosts
    url_cache = new_url_cache
416 | |
def save_cache():
    """Save our cache to a file for future runs."""
    logger.info("Saving cached information")
    # Sets are not JSON-serializable, so bad_hosts is stored as a list.
    payload = {
        'bad_hosts': list(bad_hosts),
        'url_cache': url_cache,
    }
    with open(CACHE_FILENAME, 'w') as fp:
        json.dump(payload, fp, indent=4)