annotate core/management/commands/ssl_images.py @ 989:2908859c2fe4

Smilies now use relative links. This is for upcoming switch to SSL. Currently we do not need absolute URLs for smilies. If this changes we can add it later.
author Brian Neal <bgneal@gmail.com>
date Thu, 29 Oct 2015 20:54:34 -0500
parents 65b2bc9cb3cc
children fc528d4509b0
rev   line source
"""
ssl_images is a custom manage.py command to convert the images used in forum
posts, comments, and news stories to https. It does this by rewriting the
markup:
    - Images with src = http://surfguitar101.com/something are rewritten to be
      /something.
    - Non SG101 images that use http: (or https: from a host that is not in
      the whitelist) are downloaded, resized, and uploaded to an S3 bucket.
      The src attribute is replaced with the new S3 URL.
"""
bgneal@882 9 import base64
bgneal@895 10 import datetime
bgneal@899 11 import json
bgneal@859 12 import logging
bgneal@859 13 from optparse import make_option
bgneal@888 14 import os
bgneal@863 15 import re
bgneal@863 16 import signal
bgneal@868 17 import urlparse
bgneal@881 18 import uuid
bgneal@859 19
bgneal@859 20 from django.core.management.base import NoArgsCommand, CommandError
bgneal@859 21 from django.conf import settings
bgneal@894 22 from lxml import etree
bgneal@987 23 import lxml.html
bgneal@863 24 import markdown.inlinepatterns
bgneal@881 25 from PIL import Image
bgneal@979 26 import requests
bgneal@859 27
bgneal@860 28 from comments.models import Comment
bgneal@860 29 from forums.models import Post
bgneal@979 30 from core.download import download_file
bgneal@979 31 from core.functions import remove_file
bgneal@881 32 from core.s3 import S3Bucket
bgneal@987 33 from news.models import Story
bgneal@860 34
bgneal@860 35
# Everything this command logs goes to a dedicated file under the project's
# logs directory.
LOGFILE = os.path.join(settings.PROJECT_PATH, 'logs', 'ssl_images.log')
logger = logging.getLogger(__name__)

# Markdown's own image patterns, compiled here so they can be applied directly
# to raw post text.
IMAGE_LINK_RE = re.compile(
    markdown.inlinepatterns.IMAGE_LINK_RE, re.DOTALL | re.UNICODE)
IMAGE_REF_RE = re.compile(
    markdown.inlinepatterns.IMAGE_REFERENCE_RE, re.DOTALL | re.UNICODE)

# Hosts that are "us": image links to these become relative paths.
SG101_HOSTS = {'www.surfguitar101.com', 'surfguitar101.com'}
# Hosts whose https images are left alone.
WHITELIST_HOSTS = set(settings.USER_IMAGES_SOURCES)
# Valid values for the --model option.
MODEL_CHOICES = ['comments', 'posts', 'news']

# Images larger than this (width, height) are shrunk before upload.
PHOTO_MAX_SIZE = (660, 720)
PHOTO_BASE_URL = settings.HOT_LINK_PHOTOS_BASE_URL
PHOTO_BUCKET_NAME = settings.HOT_LINK_PHOTOS_BUCKET

# JSON file used to remember results between runs.
CACHE_FILENAME = 'ssl_images_cache.json'

# Mutable module state, filled in while the command runs.
quit_flag = False       # set by the SIGINT handler
bucket = None           # S3Bucket used for uploads
url_cache = {}          # original URL -> new URL, or None if it failed
bad_hosts = set()       # hosts we could not connect to
request_timeout = None  # download timeout in seconds
bgneal@863 59
bgneal@863 60
def signal_handler(signum, frame):
    """Handle SIGINT by requesting a clean shutdown.

    Nothing is interrupted here: the handler only raises the module-level
    quit_flag, which the processing loop checks before each object. Both
    arguments are ignored.
    """
    global quit_flag
    quit_flag = True
bgneal@863 65
bgneal@859 66
def _setup_logging():
    """Send this command's log output, and that of its helpers, to LOGFILE.

    The module logger records everything from DEBUG up and does not
    propagate. The urllib3 logger bundled with requests and the core.download
    logger are attached to the same file handler at INFO level.
    """
    logger.setLevel(logging.DEBUG)
    logger.propagate = False

    file_handler = logging.FileHandler(filename=LOGFILE, encoding='utf-8')
    file_handler.setFormatter(
        logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
    logger.addHandler(file_handler)

    # The libraries doing the network work share our log file.
    for library_name in ("requests.packages.urllib3", "core.download"):
        library_log = logging.getLogger(library_name)
        library_log.setLevel(logging.INFO)
        library_log.propagate = True
        library_log.addHandler(file_handler)
bgneal@982 84
bgneal@859 85
def resize_image(img_path):
    """Shrink the image at img_path, in place, so it fits PHOTO_MAX_SIZE.

    The image is only touched when its width or its height exceeds the
    corresponding PHOTO_MAX_SIZE limit; the resized image overwrites the
    original file.

    Returns True if the image was resized or resizing wasn't necessary.
    Returns False if the image could not be read or processed.
    """
    try:
        image = Image.open(img_path)
    except IOError as ex:
        logger.error("Error opening %s: %s", img_path, ex)
        return False

    # Compare each dimension on its own. Comparing the (width, height) tuples
    # directly is a lexicographic comparison, which lets a narrow but very
    # tall image (e.g. 600 x 2000) slip through without being resized.
    width, height = image.size
    max_width, max_height = PHOTO_MAX_SIZE
    if width > max_width or height > max_height:
        logger.info('Resizing from %s to %s', image.size, PHOTO_MAX_SIZE)
        try:
            image.thumbnail(PHOTO_MAX_SIZE, Image.ANTIALIAS)
            image.save(img_path)
        except IOError as ex:
            logger.error("Error resizing image from %s: %s", img_path, ex)
            return False

    return True
bgneal@979 108
bgneal@881 109
def gen_key():
    """Return a random, URL-safe key as a native string.

    The key is the URL-safe base64 encoding ('-' and '_' in place of '+' and
    '/') of a random UUID's 16 bytes, with the trailing '=' padding removed,
    which leaves 22 characters.
    """
    encoded = base64.urlsafe_b64encode(uuid.uuid4().bytes).rstrip(b'=')
    # On Python 2 the result is already a str. On Python 3 it is bytes and
    # has to be decoded before it can be joined with a file extension.
    if not isinstance(encoded, str):
        encoded = encoded.decode('ascii')
    return encoded
bgneal@882 113
bgneal@882 114
def upload_image(img_path):
    """Push the image file at img_path to our S3 bucket.

    The file is stored publicly under a freshly generated random key that
    keeps the original file extension.

    Returns the URL of the image in the bucket or None if an error occurs.
    """
    logger.info("upload_image starting")
    # A random key plus the original extension gives a unique bucket name.
    _, extension = os.path.splitext(img_path)
    key_name = gen_key() + extension
    try:
        uploaded_url = bucket.upload_from_filename(
            key_name, img_path, public=True)
    except IOError as ex:
        logger.error("Error uploading file: %s", ex)
        uploaded_url = None
    return uploaded_url
bgneal@881 129
bgneal@881 130
def convert_to_ssl(parsed_url):
    """Return an https-safe replacement URL for the image at parsed_url.

    parsed_url is the result of urlparse.urlparse(). Hosts already recorded
    in bad_hosts are skipped outright. Otherwise the answer comes from
    url_cache when present; on a cache miss the image is copied to cloud
    storage and the outcome, including a failure (None), is cached.

    Returns the new URL, or None if the image could not be converted.
    """
    src = parsed_url.geturl()

    if parsed_url.hostname in bad_hosts:
        logger.info("Host known to be bad, skipping: %s", src)
        return None

    if src not in url_cache:
        # Cache miss: fetch the file and remember how it went.
        fresh_url = save_image_to_cloud(parsed_url)
        url_cache[src] = fresh_url
        return fresh_url

    cached_url = url_cache[src]
    if cached_url:
        logger.info("Found URL in cache: %s => %s", src, cached_url)
    else:
        logger.info("URL known to be bad, skipping: %s", src)
    return cached_url
bgneal@888 154
bgneal@888 155
def save_image_to_cloud(parsed_url):
    """Download the image at parsed_url and upload it to cloud storage.

    A connection error marks the whole host as bad so later URLs on it are
    skipped. Any other download failure is only logged.

    Returns the new URL or None if unsuccessful.
    """
    url = parsed_url.geturl()
    try:
        local_path = download_file(url, timeout=request_timeout)
    except requests.ConnectionError:
        logger.error("ConnectionError, ignoring host %s", parsed_url.hostname)
        bad_hosts.add(parsed_url.hostname)
        return None
    except requests.RequestException as ex:
        logger.error("%s", ex)
        return None
    except Exception as ex:
        logger.exception("%s", ex)
        return None

    if not local_path:
        return None

    # remove_file presumably deletes the downloaded file when the block
    # exits -- confirm against core.functions.
    with remove_file(local_path):
        if resize_image(local_path):
            return upload_image(local_path)
    return None
bgneal@868 178
bgneal@868 179
def replace_image_markup(match):
    """Return replacement Markdown for one IMAGE_LINK_RE match.

    Group 1 of the match is used as the alt text. Group 8 holds the
    parenthesised part: the src (optionally wrapped in <>) followed by an
    optional title.

    The src is rewritten as follows:
    - SG101 hosts: scheme and host are dropped, leaving a relative link.
    - http: the image is moved to https via convert_to_ssl().
    - https: kept if the host is whitelisted, otherwise moved via
      convert_to_ssl().
    - already-relative links (no scheme, no host): kept unchanged.

    If no usable src results, the image is turned into a plain link to the
    original src. A src that cannot be parsed yields u'{bad image}'.
    """
    src_parts = match.group(8).split()
    if src_parts:
        src = src_parts[0]
        if src[0] == "<" and src[-1] == ">":
            src = src[1:-1]
    else:
        src = ''

    # Anything after the src is the title; an empty slice joins to ''.
    title = " ".join(src_parts[1:])
    alt = match.group(1)

    new_src = None
    if src:
        try:
            r = urlparse.urlparse(src)
        except ValueError:
            return u'{bad image}'

        if r.hostname in SG101_HOSTS:
            if r.path:
                # Convert to a relative link. Only the scheme and host are
                # dropped; keeping just r.path would lose any params, query
                # string or fragment and break script-served images.
                new_src = urlparse.urlunparse(
                    ('', '', r.path, r.params, r.query, r.fragment))
        elif r.scheme == 'http':
            # Try a few things to get this on ssl:
            new_src = convert_to_ssl(r)
        elif r.scheme == 'https':
            if r.hostname in WHITELIST_HOSTS:
                new_src = src   # already in whitelist
            else:
                new_src = convert_to_ssl(r)
        elif not r.scheme and not r.netloc:
            # Already a relative link, e.g. one produced by an earlier run.
            # Leave it as an image so that re-running the command does not
            # demote it to a plain link.
            new_src = src

    if new_src:
        if title:
            s = u'![{alt}]({src} {title})'.format(alt=alt, src=new_src, title=title)
        else:
            s = u'![{alt}]({src})'.format(alt=alt, src=new_src)
    else:
        # something's messed up, convert to a link using original src
        s = u'[{alt}]({src})'.format(alt=alt, src=src)

    return s
bgneal@860 222
bgneal@860 223
def warn_if_image_refs(text, model_name, pk):
    """Log a warning when text contains Markdown image reference markup.

    Reference-style images are not expected and are not rewritten by this
    command, so a warning naming the model and primary key is the only
    action taken.
    """
    has_reference = IMAGE_REF_RE.search(text) is not None
    if has_reference:
        logger.warning("Image reference found in %s pk = #%d", model_name, pk)
bgneal@887 231
bgneal@887 232
def process_post(text):
    """Return the Markdown text with its inline image links fixed up.

    Every match of IMAGE_LINK_RE is handed to replace_image_markup(), which
    gets rid of plain old http sources by converting them to https or, for
    links to SG101, to relative style links.

    """
    rewritten = IMAGE_LINK_RE.sub(replace_image_markup, text)
    return rewritten
bgneal@863 242
bgneal@863 243
def process_html(html):
    """Process the html fragment, converting <img> sources to https.

    For every <img> with a non-empty src:
    - SG101 hosts: scheme and host are dropped, leaving a relative link.
    - http, or https from a host that is not whitelisted: the image is moved
      via convert_to_ssl(). If that fails, the <img> is replaced by an
      <a href=src>Image</a> link.
    - anything else is left untouched.

    Returns the re-serialised fragment if anything changed, the original
    html if nothing did, and an empty string for blank input.
    """
    s = html.strip()
    if not s:
        return s

    changed = False
    # Wrap the fragment in a parent element so multiple top-level nodes parse.
    root = lxml.html.fragment_fromstring(s, create_parent=True)
    for img in root.iter('img'):
        src = img.get('src')
        src = src.strip() if src else ''
        if src:
            try:
                r = urlparse.urlparse(src)
            except ValueError:
                logger.warning("Bad url? Should not happen; skipping...")
                continue

            new_src = None
            if r.hostname in SG101_HOSTS:
                if r.path:
                    # Convert to a relative link. Only the scheme and host
                    # are dropped; keeping just r.path would lose any
                    # params, query string or fragment.
                    new_src = urlparse.urlunparse(
                        ('', '', r.path, r.params, r.query, r.fragment))
            elif ((r.scheme == 'http') or
                    (r.scheme == 'https' and r.hostname not in WHITELIST_HOSTS)):
                new_src = convert_to_ssl(r)
                if not new_src:
                    # failed to convert to https; convert to a link
                    # clear() also wipes the tail text, so carry it over.
                    tail = img.tail
                    img.clear()
                    img.tag = 'a'
                    img.set('href', src)
                    img.text = 'Image'
                    img.tail = tail
                    changed = True

            if new_src:
                img.set('src', new_src)
                changed = True

    if changed:
        result = lxml.html.tostring(root, encoding='utf-8')
        # strip off parent div we added: len('<div>') == 5, len('</div>') == 6
        result = result[5:-6]
        return result.decode('utf-8')
    return html
bgneal@987 287
bgneal@987 288
def html_check(html):
    """Return True if the given HTML fragment has <img> tags with src attributes
    that use http, and False otherwise.

    Missing, empty and whitespace-only input returns False without being
    parsed.
    """
    if not html or not html.strip():
        return False

    root = etree.HTML(html)
    if root is None:
        # The parser can hand back no tree at all when the input has no
        # parseable content; there are no images to find in that case.
        return False

    for img in root.iter('img'):
        src = img.get('src')
        if src and src.lower().startswith('http:'):
            return True
    return False
bgneal@894 302
bgneal@894 303
class Command(NoArgsCommand):
    """Management command that rewrites image markup to avoid plain http.

    Options:
    -m/--model    which content to process: comments, posts or news
    -i, -j        optional slice [i:j] of the queryset to process
    -t/--timeout  download timeout in seconds (default 30.0)

    Comments and posts are processed as Markdown; news stories as HTML.
    Results are cached in CACHE_FILENAME between runs.
    """
    help = "Rewrite forum posts and comments to not use http for images"
    option_list = NoArgsCommand.option_list + (
        make_option('-m', '--model',
            choices=MODEL_CHOICES,
            help="which model to update; must be one of {{{}}}".format(
                ', '.join(MODEL_CHOICES))),
        make_option('-i', '--i',
            type='int',
            help="optional first slice index; the i in [i:j]"),
        make_option('-j', '--j',
            type='int',
            help="optional second slice index; the j in [i:j]"),
        make_option('-t', '--timeout',
            type='float',
            help="optional socket timeout (secs)",
            default=30.0),
    )

    def handle_noargs(self, **options):
        """Run the conversion over the selected model.

        Raises CommandError if --model is missing or the slice indices are
        invalid. Objects are saved only when their text changed or, for
        Markdown-based models, when their stored HTML still contains http
        image sources. The cache is written back even if processing aborts
        with an unexpected error.
        """
        global bucket, request_timeout

        time_started = datetime.datetime.now()
        _setup_logging()
        logger.info("Starting; arguments received: %s", options)

        if options['model'] not in MODEL_CHOICES:
            raise CommandError('Please choose a --model option')

        if options['model'] == 'comments':
            qs = Comment.objects.all()
            text_attrs = ['comment']
            model_name = 'Comment'
        elif options['model'] == 'posts':
            qs = Post.objects.all()
            text_attrs = ['body']
            model_name = 'Post'
        else:
            qs = Story.objects.all()
            text_attrs = ['short_text', 'long_text']
            model_name = 'Story'

        # News stories hold HTML; comments and posts hold Markdown.
        html_based = options['model'] == 'news'

        i, j = options['i'], options['j']

        if i is not None and i < 0:
            raise CommandError("-i must be >= 0")
        if j is not None and j < 0:
            raise CommandError("-j must be >= 0")
        if j is not None and i is not None and j <= i:
            raise CommandError("-j must be > -i")

        if i is not None and j is not None:
            qs = qs[i:j]
        elif i is not None and j is None:
            qs = qs[i:]
        elif i is None and j is not None:
            qs = qs[:j]

        # Set global socket timeout
        request_timeout = options.get('timeout')
        logger.info("Using socket timeout of %4.2f", request_timeout)

        # Install signal handler for ctrl-c
        signal.signal(signal.SIGINT, signal_handler)

        # Create bucket to upload photos
        bucket = S3Bucket(access_key=settings.USER_PHOTOS_ACCESS_KEY,
                          secret_key=settings.USER_PHOTOS_SECRET_KEY,
                          base_url=PHOTO_BASE_URL,
                          bucket_name=PHOTO_BUCKET_NAME)

        # Load cached info from previous runs
        load_cache()

        # i doubles as the offset used when logging object numbers.
        if i is None:
            i = 0

        count = 0
        try:
            for n, model in enumerate(qs.iterator()):
                if quit_flag:
                    logger.warning("SIGINT received, exiting")
                    break
                logger.info("Processing %s #%d (pk = %d)",
                            model_name, n + i, model.pk)
                save_flag = False
                for text_attr in text_attrs:
                    txt = getattr(model, text_attr)

                    if html_based:
                        new_txt = process_html(txt)
                    else:
                        new_txt = process_post(txt)
                        warn_if_image_refs(txt, model_name, model.pk)

                    if txt != new_txt:
                        logger.info("Content changed on %s #%d (pk = %d)",
                                    model_name, n + i, model.pk)
                        logger.debug(u"original: %s", txt)
                        logger.debug(u"changed:  %s", new_txt)
                        setattr(model, text_attr, new_txt)
                        save_flag = True
                    elif not html_based and html_check(model.html):
                        # Check for content generated with older smiley code
                        # that used absolute URLs for the smiley images. If
                        # True, then just save the model again to force
                        # updated HTML to be created.
                        logger.info("Older Smiley HTML detected, forcing a save")
                        save_flag = True

                if save_flag:
                    model.save()
                    count += 1

            time_finished = datetime.datetime.now()
            elapsed = time_finished - time_started
            logger.info("ssl_images exiting; number of objects: %d; elapsed: %s",
                        count, elapsed)

            # Every cache entry is an image we tried to convert; entries with
            # a value are the ones that succeeded.
            http_images = len(url_cache)
            https_images = sum(1 for v in url_cache.itervalues() if v)
            bad_images = http_images - https_images
            if http_images > 0:
                pct_saved = float(https_images) / http_images * 100.0
            else:
                pct_saved = 0.0

            logger.info("Summary: http: %d; https: %d; lost: %d; saved: %3.1f %%",
                        http_images, https_images, bad_images, pct_saved)
        except Exception:
            logger.exception("ssl_images aborted by an unexpected error")
            raise
        finally:
            # Persist the cache no matter how we got here. Losing it would
            # forget every image uploaded during this run and cause duplicate
            # downloads and uploads on the next one.
            save_cache()

        logger.info("ssl_images done")
bgneal@899 435
bgneal@899 436
def load_cache():
    """Load cache from previous runs.

    Reads CACHE_FILENAME and, if it holds a JSON object with a 'bad_hosts'
    list and a 'url_cache' object, replaces the module-level bad_hosts and
    url_cache with its contents. A missing, unreadable, mangled or malformed
    file is logged and leaves both globals untouched.
    """
    global bad_hosts, url_cache

    logger.info("Loading cached information")
    try:
        with open(CACHE_FILENAME, 'r') as fp:
            d = json.load(fp)
    except IOError as ex:
        logger.error("Cache file (%s) IOError: %s", CACHE_FILENAME, ex)
        return
    except ValueError:
        logger.error("Mangled cache file: %s", CACHE_FILENAME)
        return

    # Validate both values before touching the globals so a half-valid file
    # cannot leave us with one of them replaced and the other not. TypeError
    # covers valid JSON of the wrong shape (e.g. a list at the top level).
    try:
        cached_hosts = set(d['bad_hosts'])
        cached_urls = d['url_cache']
    except (KeyError, TypeError):
        logger.error("Malformed cache file: %s", CACHE_FILENAME)
        return
    if not isinstance(cached_urls, dict):
        logger.error("Malformed cache file: %s", CACHE_FILENAME)
        return

    bad_hosts = cached_hosts
    url_cache = cached_urls
bgneal@899 456
bgneal@899 457
def save_cache():
    """Write bad_hosts and url_cache to CACHE_FILENAME as JSON.

    The file is overwritten; load_cache() reads it back on the next run.
    """
    logger.info("Saving cached information")
    payload = {
        'bad_hosts': list(bad_hosts),  # sets are not JSON serialisable
        'url_cache': url_cache,
    }
    with open(CACHE_FILENAME, 'w') as cache_file:
        json.dump(payload, cache_file, indent=4)