core/management/commands/ssl_images.py @ 902:4dee923a2f6d

Merge with upstream.

author:   Brian Neal <bgneal@gmail.com>
date:     Sat, 07 Mar 2015 14:56:41 -0600
parents:  62cd07bb891c
children: 4619290d171d
1 """ | |
2 ssl_images is a custom manage.py command to convert forum post and comment | |
3 images to https. It does this by rewriting the markup: | |
4 - Images with src = http://surfguitar101.com/something are rewritten to be | |
5 /something. | |
6 - Non SG101 images that use http: are downloaded, resized, and uploaded to | |
7 an S3 bucket. The src attribute is replaced with the new S3 URL. | |
8 """ | |
import base64
import datetime
import json
import logging
from optparse import make_option
import os
import re
import signal
import socket
import urllib
import urlparse
import uuid

from django.core.management.base import NoArgsCommand, CommandError
from django.conf import settings
from lxml import etree
import markdown.inlinepatterns
from PIL import Image

from comments.models import Comment
from forums.models import Post
from core.s3 import S3Bucket


LOGFILE = os.path.join(settings.PROJECT_PATH, 'logs', 'ssl_images.log')
logger = logging.getLogger(__name__)

IMAGE_LINK_RE = re.compile(markdown.inlinepatterns.IMAGE_LINK_RE,
                           re.DOTALL | re.UNICODE)
IMAGE_REF_RE = re.compile(markdown.inlinepatterns.IMAGE_REFERENCE_RE,
                          re.DOTALL | re.UNICODE)
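# These reuse python-markdown's inline patterns, e.g. (illustrative markup):
#   IMAGE_LINK_RE matches:  ![alt text](http://example.com/pic.png "title")
#   IMAGE_REF_RE matches:   ![alt text][ref-id]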

SG101_HOSTS = set(['www.surfguitar101.com', 'surfguitar101.com'])
MODEL_CHOICES = ['comments', 'posts']

PHOTO_MAX_SIZE = (660, 720)
PHOTO_BASE_URL = 'https://s3.amazonaws.com/'
PHOTO_BUCKET_NAME = 'sg101.forum.photos'

CACHE_FILENAME = 'ssl_images_cache.json'

quit_flag = False
opener = None
bucket = None
url_cache = {}
bad_hosts = set()


def signal_handler(signum, frame):
    """SIGINT signal handler"""
    global quit_flag
    quit_flag = True


def _setup_logging():
    logger.setLevel(logging.DEBUG)
    logger.propagate = False
    handler = logging.FileHandler(filename=LOGFILE, encoding='utf-8')
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)


class ImageURLopener(urllib.FancyURLopener):
    """Our URL opener. Handles redirects as FancyURLopener does, but all
    other errors and authentication requests raise an IOError.
    """
    HANDLED_ERRORS = set([302, 301, 303, 307])

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        return urllib.URLopener.http_error_default(self, url, fp, errcode,
                                                   errmsg, headers)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.

        We let FancyURLopener handle the redirects, but we want any other
        error to fail.
        """
        if errcode in self.HANDLED_ERRORS:
            name = 'http_error_%d' % errcode
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            if result:
                return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)


def download_image(parsed_url):
    """Downloads the image file from the given source URL.

    If successful, returns the path to the downloaded file. Otherwise None
    is returned.
    """
    src = parsed_url.geturl()
    logger.info("Retrieving %s", src)
    try:
        fn, hdrs = opener.retrieve(src)
    except IOError as ex:
        args = ex.args if ex.args else []
        if len(args) == 4 and args[0] == 'http error':
            logger.error("http error: %d - %s", args[1], args[2])
        elif len(args) == 2 and isinstance(args[1], socket.gaierror):
            logger.error("gaierror, ignoring host %s", parsed_url.hostname)
            bad_hosts.add(parsed_url.hostname)
        else:
            logger.error("%s", ex)
        return None

    # Does it look like an image?
    content_type = hdrs.get('content-type')
    if not content_type:
        logger.error("No content-type header found")
        return None

    file_size = os.stat(fn).st_size
    logger.info("Retrieved: %s bytes; content-type: %s", file_size, content_type)

    parts = content_type.split('/')
    if len(parts) < 2 or parts[0] != 'image':
        logger.error("Unknown content-type: %s", content_type)
        return None

    return fn


def resize_image(img_path):
    """Resizes the image found at img_path if necessary."""
    image = Image.open(img_path)
    # Compare width and height separately; a plain tuple comparison would be
    # lexicographic and could miss narrow-but-tall images.
    if any(s > m for s, m in zip(image.size, PHOTO_MAX_SIZE)):
        logger.info('Resizing from %s to %s', image.size, PHOTO_MAX_SIZE)
        image.thumbnail(PHOTO_MAX_SIZE, Image.ANTIALIAS)
        image.save(img_path)


def gen_key():
    """Return a random key."""
    return base64.b64encode(uuid.uuid4().bytes, '-_').rstrip('=')
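# gen_key() yields 22 URL-safe characters (a 16-byte UUID is 24 base64 chars
# with '==' padding, which is stripped), e.g. 'x3k9v2LqR4eT7wYzB1mN0A'
# (illustrative value).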


def upload_image(img_path):
    """Upload image file located at img_path to our S3 bucket.

    Returns the URL of the image in the bucket or None if an error occurs.
    """
    logger.info("upload_image starting")
    # Make a unique name for the image in the bucket
    ext = os.path.splitext(img_path)[1]
    file_key = gen_key() + ext
    try:
        return bucket.upload_from_filename(file_key, img_path, public=True)
    except IOError as ex:
        logger.error("Error uploading file: %s", ex)
        return None


def convert_to_ssl(parsed_url):
    """Top-level function for moving an image to SSL."""

    src = parsed_url.geturl()

    if parsed_url.hostname in bad_hosts:
        logger.info("Host known to be bad, skipping: %s", src)
        return None

    # Check the cache
    try:
        new_url = url_cache[src]
    except KeyError:
        # cache miss, try to get the file
        new_url = save_image_to_cloud(parsed_url)
        url_cache[src] = new_url
    else:
        if new_url:
            logger.info("Found URL in cache: %s => %s", src, new_url)
        else:
            logger.info("URL known to be bad, skipping: %s", src)

    return new_url


def save_image_to_cloud(parsed_url):
    """Downloads an image at a given source URL. Uploads it to cloud storage.

    Returns the new URL or None if unsuccessful.
    """
    fn = download_image(parsed_url)
    if fn:
        resize_image(fn)
        return upload_image(fn)
    return None


def replace_image_markup(match):
    src_parts = match.group(8).split()
    if src_parts:
        src = src_parts[0]
        if src[0] == "<" and src[-1] == ">":
            src = src[1:-1]
    else:
        src = ''

    title = ''
    if len(src_parts) > 1:
        title = " ".join(src_parts[1:])
    alt = match.group(1)

    new_src = None
    if src:
        r = urlparse.urlparse(src)
        if r.hostname in SG101_HOSTS:
            new_src = r.path  # convert to relative path
        elif r.scheme == 'http':
            # Try a few things to get this on ssl:
            new_src = convert_to_ssl(r)
        elif r.scheme == 'https':
            new_src = src  # already https, accept it as-is

    if new_src:
        if title:
            s = u'![{alt}]({src} {title})'.format(alt=alt, src=new_src, title=title)
        else:
            s = u'![{alt}]({src})'.format(alt=alt, src=new_src)
    else:
        # something's messed up, convert to a link using original src
        s = u'[{alt}]({src})'.format(alt=alt, src=src)

    return s
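# Illustrative rewrites performed above (hypothetical URLs):
#   ![pic](http://www.surfguitar101.com/media/p.jpg) -> ![pic](/media/p.jpg)
#   ![pic](http://example.com/p.jpg)
#       -> ![pic](https://s3.amazonaws.com/sg101.forum.photos/<key>.jpg)
#   On failure, the image is demoted to a plain link:
#       -> [pic](http://example.com/p.jpg)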


def warn_if_image_refs(text, model_name, pk):
    """Search text for Markdown image reference markup.

    We aren't expecting these, but we will log something if we see any.
    """
    if IMAGE_REF_RE.search(text):
        logger.warning("Image reference found in %s pk = #%d", model_name, pk)


def process_post(text):
    """Process the text of a post or comment.

    A regex substitution is run on the text. This fixes up image links,
    getting rid of plain http sources by either converting them to https
    or to relative-style links (when the link points to SG101).
    """
    return IMAGE_LINK_RE.sub(replace_image_markup, text)


def html_check(html):
    """Return True if the given HTML fragment has <img> tags with src
    attributes that use http, and False otherwise.
    """
    if not html:
        return False

    root = etree.HTML(html)
    for img in root.iter('img'):
        src = img.get('src')
        if src and src.lower().startswith('http:'):
            return True
    return False
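# e.g. (hypothetical fragments):
#   html_check(u'<p><img src="HTTP://example.com/x.png"></p>')  -> True
#   html_check(u'<p><img src="https://example.com/x.png"></p>') -> False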


class Command(NoArgsCommand):
    help = "Rewrite forum posts and comments to not use http for images"
    option_list = NoArgsCommand.option_list + (
        make_option('-m', '--model',
                    choices=MODEL_CHOICES,
                    help="which model to update; must be one of {{{}}}".format(
                        ', '.join(MODEL_CHOICES))),
        make_option('-i', '--i',
                    type='int',
                    help="optional first slice index; the i in [i:j]"),
        make_option('-j', '--j',
                    type='int',
                    help="optional second slice index; the j in [i:j]"),
        make_option('-t', '--timeout',
                    type='int',
                    help="optional socket timeout (secs)"),
    )
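    # Example invocation (illustrative):
    #   ./manage.py ssl_images --model=posts -i 0 -j 1000 --timeout=20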

    def handle_noargs(self, **options):
        time_started = datetime.datetime.now()
        _setup_logging()
        logger.info("Starting; arguments received: %s", options)

        if options['model'] not in MODEL_CHOICES:
            raise CommandError('Please choose a --model option')

        if options['model'] == 'comments':
            qs = Comment.objects.all()
            text_attr = 'comment'
            model_name = 'Comment'
        else:
            qs = Post.objects.all()
            text_attr = 'body'
            model_name = 'Post'

        i, j = options['i'], options['j']

        if i is not None and i < 0:
            raise CommandError("-i must be >= 0")
        if j is not None and j < 0:
            raise CommandError("-j must be >= 0")
        if j is not None and i is not None and j <= i:
            raise CommandError("-j must be > -i")

        if i is not None and j is not None:
            qs = qs[i:j]
        elif i is not None and j is None:
            qs = qs[i:]
        elif i is None and j is not None:
            qs = qs[:j]

        # Set global socket timeout; optparse stores None when --timeout is
        # omitted, so a plain dict.get() default would never kick in here.
        timeout = options.get('timeout') or 30
        logger.info("Setting socket timeout to %d", timeout)
        socket.setdefaulttimeout(timeout)

        # Install signal handler for ctrl-c
        signal.signal(signal.SIGINT, signal_handler)

        # Create URL opener to download photos
        global opener
        opener = ImageURLopener()

        # Create bucket to upload photos
        global bucket
        bucket = S3Bucket(access_key=settings.USER_PHOTOS_ACCESS_KEY,
                          secret_key=settings.USER_PHOTOS_SECRET_KEY,
                          base_url=PHOTO_BASE_URL,
                          bucket_name=PHOTO_BUCKET_NAME)

        # Load cached info from previous runs
        load_cache()

        if i is None:
            i = 0

        count = 0
        for n, model in enumerate(qs.iterator()):
            if quit_flag:
                logger.warning("SIGINT received, exiting")
                break
            logger.info("Processing %s #%d (pk = %d)", model_name, n + i, model.pk)
            txt = getattr(model, text_attr)
            warn_if_image_refs(txt, model_name, model.pk)
            new_txt = process_post(txt)
            if txt != new_txt:
                logger.info("Content changed on %s #%d (pk = %d)",
                            model_name, n + i, model.pk)
                logger.debug("original: %s", txt)
                logger.debug("changed: %s", new_txt)
                setattr(model, text_attr, new_txt)
                model.save()
            elif html_check(model.html):
                # Check for content generated with older smiley code that used
                # absolute URLs for the smiley images. If True, then just save
                # the model again to force updated HTML to be created.
                logger.info("Older Smiley HTML detected, forcing a save")
                model.save()
            count += 1

        time_finished = datetime.datetime.now()
        elapsed = time_finished - time_started
        logger.info("ssl_images exiting; number of objects: %d; elapsed: %s",
                    count, elapsed)

        http_images = len(url_cache)
        https_images = sum(1 for v in url_cache.itervalues() if v)
        bad_images = http_images - https_images
        if http_images > 0:
            pct_saved = float(https_images) / http_images * 100.0
        else:
            pct_saved = 0.0

        logger.info("Summary: http: %d; https: %d; lost: %d; saved: %3.1f %%",
                    http_images, https_images, bad_images, pct_saved)

        save_cache()
        logger.info("ssl_images done")


def load_cache():
    """Load cache from previous runs."""
    logger.info("Loading cached information")
    try:
        with open(CACHE_FILENAME, 'r') as fp:
            d = json.load(fp)
    except IOError as ex:
        logger.error("Cache file (%s) IOError: %s", CACHE_FILENAME, ex)
        return
    except ValueError:
        logger.error("Mangled cache file: %s", CACHE_FILENAME)
        return

    global bad_hosts, url_cache
    try:
        bad_hosts = set(d['bad_hosts'])
        url_cache = d['url_cache']
    except KeyError:
        logger.error("Malformed cache file: %s", CACHE_FILENAME)


def save_cache():
    """Save our cache to a file for future runs."""
    logger.info("Saving cached information")
    d = {'bad_hosts': list(bad_hosts), 'url_cache': url_cache}
    with open(CACHE_FILENAME, 'w') as fp:
        json.dump(d, fp, indent=4)
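# Illustrative cache file contents (hypothetical URLs):
# {
#     "bad_hosts": ["dead-host.example.com"],
#     "url_cache": {
#         "http://example.com/a.jpg": "https://s3.amazonaws.com/sg101.forum.photos/Ab12Cd.jpg",
#         "http://example.com/b.jpg": null
#     }
# }
# A null entry records a failed conversion so the URL is not retried.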