core/management/commands/ssl_images.py @ 902:4dee923a2f6d

Merge with upstream.

author   Brian Neal <bgneal@gmail.com>
date     Sat, 07 Mar 2015 14:56:41 -0600
parents  62cd07bb891c
children 4619290d171d
1 """
2 ssl_images is a custom manage.py command to convert forum post and comment
3 images to https. It does this by rewriting the markup:
4 - Images with src = http://surfguitar101.com/something are rewritten to be
5 /something.
6 - Non SG101 images that use http: are downloaded, resized, and uploaded to
7 an S3 bucket. The src attribute is replaced with the new S3 URL.
8 """
import base64
import datetime
import json
import logging
from optparse import make_option
import os
import re
import signal
import socket
import urllib
import urlparse
import uuid

from django.core.management.base import NoArgsCommand, CommandError
from django.conf import settings
from lxml import etree
import markdown.inlinepatterns
from PIL import Image

from comments.models import Comment
from forums.models import Post
from core.s3 import S3Bucket


LOGFILE = os.path.join(settings.PROJECT_PATH, 'logs', 'ssl_images.log')
logger = logging.getLogger(__name__)

IMAGE_LINK_RE = re.compile(markdown.inlinepatterns.IMAGE_LINK_RE,
                           re.DOTALL | re.UNICODE)
IMAGE_REF_RE = re.compile(markdown.inlinepatterns.IMAGE_REFERENCE_RE,
                          re.DOTALL | re.UNICODE)

SG101_HOSTS = set(['www.surfguitar101.com', 'surfguitar101.com'])
MODEL_CHOICES = ['comments', 'posts']

PHOTO_MAX_SIZE = (660, 720)
PHOTO_BASE_URL = 'https://s3.amazonaws.com/'
PHOTO_BUCKET_NAME = 'sg101.forum.photos'

CACHE_FILENAME = 'ssl_images_cache.json'

quit_flag = False
opener = None
bucket = None
url_cache = {}
bad_hosts = set()

def signal_handler(signum, frame):
    """SIGINT signal handler"""
    global quit_flag
    quit_flag = True


def _setup_logging():
    logger.setLevel(logging.DEBUG)
    logger.propagate = False
    handler = logging.FileHandler(filename=LOGFILE, encoding='utf-8')
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)

class ImageURLopener(urllib.FancyURLopener):
    """Our URL opener. Handles redirects as per FancyURLopener. All other
    errors and authentication requests will raise an IOError.
    """
    HANDLED_ERRORS = set([302, 301, 303, 307])

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        return urllib.URLopener.http_error_default(self, url, fp, errcode,
                                                   errmsg, headers)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.

        We let FancyURLopener handle the redirects, but we want any other
        error to fail via the default handler.
        """
        if errcode in self.HANDLED_ERRORS:
            name = 'http_error_%d' % errcode
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            if result:
                return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)

def download_image(parsed_url):
    """Downloads the image file from the given source URL.

    If successful, returns the path to the downloaded file. Otherwise None
    is returned.
    """
    src = parsed_url.geturl()
    logger.info("Retrieving %s", src)
    try:
        fn, hdrs = opener.retrieve(src)
    except IOError as ex:
        args = ex.args if ex.args else []
        if len(args) == 4 and args[0] == 'http error':
            logger.error("http error: %d - %s", args[1], args[2])
        elif len(args) == 2 and isinstance(args[1], socket.gaierror):
            logger.error("gaierror, ignoring host %s", parsed_url.hostname)
            bad_hosts.add(parsed_url.hostname)
        else:
            logger.error("%s", ex)
        return None

    # Does it look like an image?
    content_type = hdrs.get('content-type')
    if not content_type:
        logger.error("No content-type header found")
        return None

    file_size = os.stat(fn).st_size
    logger.info("Retrieved: %s bytes; content-type: %s", file_size,
                content_type)

    parts = content_type.split('/')
    if len(parts) < 2 or parts[0] != 'image':
        logger.error("Unknown content-type: %s", content_type)
        return None

    return fn

def resize_image(img_path):
    """Resizes the image found at img_path if necessary."""
    image = Image.open(img_path)
    # Compare width and height individually; a plain tuple comparison would
    # only consider the height when the widths happen to be equal.
    if image.size[0] > PHOTO_MAX_SIZE[0] or image.size[1] > PHOTO_MAX_SIZE[1]:
        logger.info('Resizing from %s to %s', image.size, PHOTO_MAX_SIZE)
        image.thumbnail(PHOTO_MAX_SIZE, Image.ANTIALIAS)
        image.save(img_path)

def gen_key():
    """Return a random, URL-safe key of 22 characters."""
    # base64 of a UUID's 16 bytes is 24 characters; '-' and '_' replace the
    # usual '+' and '/', and the trailing '==' padding is stripped.
    return base64.b64encode(uuid.uuid4().bytes, '-_').rstrip('=')

def upload_image(img_path):
    """Upload image file located at img_path to our S3 bucket.

    Returns the URL of the image in the bucket or None if an error occurs.
    """
    logger.info("upload_image starting")
    # Make a unique name for the image in the bucket
    ext = os.path.splitext(img_path)[1]
    file_key = gen_key() + ext
    try:
        return bucket.upload_from_filename(file_key, img_path, public=True)
    except IOError as ex:
        logger.error("Error uploading file: %s", ex)
    return None

def convert_to_ssl(parsed_url):
    """Top-level function for moving an image to SSL."""

    src = parsed_url.geturl()

    if parsed_url.hostname in bad_hosts:
        logger.info("Host known to be bad, skipping: %s", src)
        return None

    # Check the cache
    try:
        new_url = url_cache[src]
    except KeyError:
        # cache miss, try to get the file
        new_url = save_image_to_cloud(parsed_url)
        url_cache[src] = new_url
    else:
        if new_url:
            logger.info("Found URL in cache: %s => %s", src, new_url)
        else:
            logger.info("URL known to be bad, skipping: %s", src)

    return new_url

def save_image_to_cloud(parsed_url):
    """Downloads an image at a given source URL. Uploads it to cloud storage.

    Returns the new URL or None if unsuccessful.
    """
    fn = download_image(parsed_url)
    if fn:
        resize_image(fn)
        return upload_image(fn)
    return None

def replace_image_markup(match):
    # For this version of Markdown's IMAGE_LINK_RE, group(1) is the alt
    # text and group(8) holds the src URL plus an optional title.
    src_parts = match.group(8).split()
    if src_parts:
        src = src_parts[0]
        if src[0] == "<" and src[-1] == ">":
            src = src[1:-1]
    else:
        src = ''

    title = ''
    if len(src_parts) > 1:
        title = " ".join(src_parts[1:])
    alt = match.group(1)

    new_src = None
    if src:
        r = urlparse.urlparse(src)
        if r.hostname in SG101_HOSTS:
            new_src = r.path        # convert to relative path
        elif r.scheme == 'http':
            # Try a few things to get this on ssl:
            new_src = convert_to_ssl(r)
        elif r.scheme == 'https':
            new_src = src           # already https, accept it as-is

    if new_src:
        if title:
            s = u'![{alt}]({src} {title})'.format(alt=alt, src=new_src,
                                                  title=title)
        else:
            s = u'![{alt}]({src})'.format(alt=alt, src=new_src)
    else:
        # something's messed up; convert to a plain link using original src
        s = u'[{alt}]({src})'.format(alt=alt, src=src)

    return s

def warn_if_image_refs(text, model_name, pk):
    """Search text for Markdown image reference markup.

    We aren't expecting these, but we will log something if we see any.
    """
    if IMAGE_REF_RE.search(text):
        logger.warning("Image reference found in %s pk = #%d", model_name, pk)

def process_post(text):
    """Process the post text.

    A regex substitution is run on the text field. This fixes up image links,
    getting rid of plain old http sources, either converting them to https or
    to relative-style links (if the link points to SG101).
    """
    return IMAGE_LINK_RE.sub(replace_image_markup, text)

def html_check(html):
    """Return True if the given HTML fragment has <img> tags with src
    attributes that use http, and False otherwise.

    For example, '<p><img src="http://example.com/a.png"></p>' yields True.
    """
    if not html:
        return False

    root = etree.HTML(html)
    for img in root.iter('img'):
        src = img.get('src')
        if src and src.lower().startswith('http:'):
            return True
    return False

class Command(NoArgsCommand):
    help = "Rewrite forum posts and comments to not use http for images"
    option_list = NoArgsCommand.option_list + (
        make_option('-m', '--model',
                    choices=MODEL_CHOICES,
                    help="which model to update; must be one of {{{}}}".format(
                        ', '.join(MODEL_CHOICES))),
        make_option('-i', '--i',
                    type='int',
                    help="optional first slice index; the i in [i:j]"),
        make_option('-j', '--j',
                    type='int',
                    help="optional second slice index; the j in [i:j]"),
        make_option('-t', '--timeout',
                    type='int',
                    help="optional socket timeout (secs)"),
    )
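
    # Typical invocations (illustrative):
    #   manage.py ssl_images --model=posts
    #   manage.py ssl_images --model=comments -i 100 -j 200 --timeout=10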
    def handle_noargs(self, **options):
        time_started = datetime.datetime.now()
        _setup_logging()
        logger.info("Starting; arguments received: %s", options)

        if options['model'] not in MODEL_CHOICES:
            raise CommandError('Please choose a --model option')

        if options['model'] == 'comments':
            qs = Comment.objects.all()
            text_attr = 'comment'
            model_name = 'Comment'
        else:
            qs = Post.objects.all()
            text_attr = 'body'
            model_name = 'Post'

        i, j = options['i'], options['j']

        if i is not None and i < 0:
            raise CommandError("-i must be >= 0")
        if j is not None and j < 0:
            raise CommandError("-j must be >= 0")
        if j is not None and i is not None and j <= i:
            raise CommandError("-j must be > -i")

        if i is not None and j is not None:
            qs = qs[i:j]
        elif i is not None and j is None:
            qs = qs[i:]
        elif i is None and j is not None:
            qs = qs[:j]

        # Set global socket timeout; optparse stores None when --timeout is
        # omitted, so the 30 second default must be applied explicitly.
        timeout = options.get('timeout')
        if timeout is None:
            timeout = 30
        logger.info("Setting socket timeout to %d", timeout)
        socket.setdefaulttimeout(timeout)

        # Install signal handler for ctrl-c
        signal.signal(signal.SIGINT, signal_handler)

        # Create URL opener to download photos
        global opener
        opener = ImageURLopener()

        # Create bucket to upload photos
        global bucket
        bucket = S3Bucket(access_key=settings.USER_PHOTOS_ACCESS_KEY,
                          secret_key=settings.USER_PHOTOS_SECRET_KEY,
                          base_url=PHOTO_BASE_URL,
                          bucket_name=PHOTO_BUCKET_NAME)

        # Load cached info from previous runs
        load_cache()

        if i is None:
            i = 0

        count = 0
        for n, model in enumerate(qs.iterator()):
            if quit_flag:
                logger.warning("SIGINT received, exiting")
                break
            logger.info("Processing %s #%d (pk = %d)", model_name, n + i,
                        model.pk)
            txt = getattr(model, text_attr)
            warn_if_image_refs(txt, model_name, model.pk)
            new_txt = process_post(txt)
            if txt != new_txt:
                logger.info("Content changed on %s #%d (pk = %d)",
                            model_name, n + i, model.pk)
                logger.debug("original: %s", txt)
                logger.debug("changed: %s", new_txt)
                setattr(model, text_attr, new_txt)
                model.save()
            elif html_check(model.html):
                # Check for content generated with older smiley code that used
                # absolute URLs for the smiley images. If True, then just save
                # the model again to force updated HTML to be created.
                logger.info("Older Smiley HTML detected, forcing a save")
                model.save()
            count += 1

        time_finished = datetime.datetime.now()
        elapsed = time_finished - time_started
        logger.info("ssl_images exiting; number of objects: %d; elapsed: %s",
                    count, elapsed)

        # Summary stats; e.g. 10 http images seen with 7 now on https gives
        # lost: 3 and saved: 70.0 %.
        http_images = len(url_cache)
        https_images = sum(1 for v in url_cache.itervalues() if v)
        bad_images = http_images - https_images
        if http_images > 0:
            pct_saved = float(https_images) / http_images * 100.0
        else:
            pct_saved = 0.0

        logger.info("Summary: http: %d; https: %d; lost: %d; saved: %3.1f %%",
                    http_images, https_images, bad_images, pct_saved)

        save_cache()
        logger.info("ssl_images done")

def load_cache():
    """Load cache from previous runs."""
    logger.info("Loading cached information")
    try:
        with open(CACHE_FILENAME, 'r') as fp:
            d = json.load(fp)
    except IOError as ex:
        logger.error("Cache file (%s) IOError: %s", CACHE_FILENAME, ex)
        return
    except ValueError:
        logger.error("Mangled cache file: %s", CACHE_FILENAME)
        return

    global bad_hosts, url_cache
    try:
        bad_hosts = set(d['bad_hosts'])
        url_cache = d['url_cache']
    except KeyError:
        logger.error("Malformed cache file: %s", CACHE_FILENAME)

def save_cache():
    """Save our cache to a file for future runs."""
    logger.info("Saving cached information")
    d = {'bad_hosts': list(bad_hosts), 'url_cache': url_cache}
    with open(CACHE_FILENAME, 'w') as fp:
        json.dump(d, fp, indent=4)
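
# The cache file written by save_cache() looks roughly like this
# (illustrative values; url_cache maps a source URL to its new https URL,
# or to null when the image could not be saved):
#
# {
#     "bad_hosts": ["img.dead-host.example"],
#     "url_cache": {
#         "http://example.com/a.jpg":
#             "https://s3.amazonaws.com/sg101.forum.photos/AbC123xYz.jpg",
#         "http://example.com/missing.jpg": null
#     }
# }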