annotate core/management/commands/ssl_images.py @ 887:9a15f7c27526

Actually save model object upon change. This commit was tested on the comments model. Additional logging added. Added check for Markdown image references. Added TODOs after observing behavior on comments.
author Brian Neal <bgneal@gmail.com>
date Tue, 03 Feb 2015 21:09:44 -0600
parents 9a3019f2c7dc
children deef1536a54a
rev   line source
bgneal@859 1 """
bgneal@859 2 ssl_images is a custom manage.py command to convert forum post and comment
bgneal@859 3 images to https. It does this by rewriting the markup:
bgneal@859 4 - Images with src = http://surfguitar101.com/something are rewritten to be
bgneal@859 5 /something.
bgneal@859 6 - Non SG101 images that use http: are downloaded, resized, and uploaded to
bgneal@859 7 an S3 bucket. The src attribute is replaced with the new S3 URL.
bgneal@859 8 """
bgneal@882 9 import base64
bgneal@859 10 import logging
bgneal@859 11 from optparse import make_option
bgneal@859 12 import os.path
bgneal@863 13 import re
bgneal@863 14 import signal
bgneal@881 15 import socket
bgneal@881 16 import urllib
bgneal@868 17 import urlparse
bgneal@881 18 import uuid
bgneal@859 19
bgneal@859 20 from django.core.management.base import NoArgsCommand, CommandError
bgneal@859 21 from django.conf import settings
bgneal@863 22 import markdown.inlinepatterns
bgneal@881 23 from PIL import Image
bgneal@859 24
bgneal@860 25 from comments.models import Comment
bgneal@860 26 from forums.models import Post
bgneal@881 27 from core.s3 import S3Bucket
bgneal@860 28
bgneal@860 29
# File that _setup_logging() attaches to this module's logger.
LOGFILE = os.path.join(settings.PROJECT_PATH, 'logs', 'ssl_images.log')
logger = logging.getLogger(__name__)

# The raw pattern strings come from the Markdown library; they are compiled
# here directly, so group numbers are those of the bare patterns.
IMAGE_LINK_RE = re.compile(
    markdown.inlinepatterns.IMAGE_LINK_RE, re.DOTALL | re.UNICODE)
IMAGE_REF_RE = re.compile(
    markdown.inlinepatterns.IMAGE_REFERENCE_RE, re.DOTALL | re.UNICODE)

# Hostnames whose image URLs are rewritten to relative paths.
SG101_HOSTS = {'www.surfguitar101.com', 'surfguitar101.com'}
# Accepted values for the --model command line option.
MODEL_CHOICES = ['comments', 'posts']

# Bounding box handed to Image.thumbnail() when shrinking downloaded images.
PHOTO_MAX_SIZE = (660, 720)
# Settings for the S3 bucket that receives the re-hosted images.
PHOTO_BASE_URL = 'https://s3.amazonaws.com/'
PHOTO_BUCKET_NAME = 'sg101.forum.photos'

# Set to True by signal_handler(); polled by the main processing loop.
quit_flag = False
# Both are created in Command.handle_noargs() before any image is processed.
opener = None
bucket = None
# Maps an original image URL to the URL it was uploaded to, so each source
# image is downloaded and uploaded at most once per run.
url_cache = {}
bgneal@863 49
bgneal@863 50
def signal_handler(signum, frame):
    """Record that an interrupt signal arrived.

    Nothing is stopped here; the handler only raises the module-level
    quit_flag so the processing loop can finish its current item and exit.
    Both arguments are required by the signal handler calling convention
    and are ignored.
    """
    global quit_flag
    quit_flag = True
bgneal@863 55
bgneal@859 56
def _setup_logging():
    """Route this module's log records to LOGFILE.

    The logger is set to DEBUG and detached from its ancestors
    (propagate = False), then given a UTF-8 file handler with a
    timestamp / level / message line format.
    """
    logger.setLevel(logging.DEBUG)
    logger.propagate = False

    file_handler = logging.FileHandler(filename=LOGFILE, encoding='utf-8')
    file_handler.setFormatter(
        logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
    logger.addHandler(file_handler)
bgneal@859 64
bgneal@859 65
class ImageURLopener(urllib.FancyURLopener):
    """URL opener used to download remote images.

    Redirect responses are delegated to the http_error_NNN handlers
    inherited from FancyURLopener. Every other HTTP error, including
    authentication challenges, is sent to the plain URLopener default
    handler so that the request fails (with an IOError) instead of being
    handled.
    """
    # Status codes that are delegated to the inherited redirect handlers.
    HANDLED_ERRORS = {301, 302, 303, 307}

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Use URLopener's default error handling, not FancyURLopener's."""
        return urllib.URLopener.http_error_default(
            self, url, fp, errcode, errmsg, headers)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Dispatch an HTTP error response.

        Redirect codes listed in HANDLED_ERRORS go to the matching
        http_error_<code> method; if that method returns something truthy
        it becomes the result. Anything else falls through to
        http_error_default.
        """
        if errcode not in self.HANDLED_ERRORS:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

        handler = getattr(self, 'http_error_%d' % errcode)
        # Only forward `data` when the caller actually supplied it.
        extra = () if data is None else (data,)
        outcome = handler(url, fp, errcode, errmsg, headers, *extra)
        if outcome:
            return outcome
        return self.http_error_default(url, fp, errcode, errmsg, headers)
bgneal@881 91
bgneal@881 92
def download_image(src):
    """Download the image file at the given source URL.

    Returns the local path reported by opener.retrieve() when the download
    succeeded, produced a non-empty file, and was served with an image/*
    content type. Returns None (after logging the reason) otherwise.
    """
    logger.info("Retrieving %s", src)
    try:
        fn, hdrs = opener.retrieve(src)
    except IOError as ex:
        args = ex.args
        # HTTP failures arrive as ('http error', code, message, headers).
        if len(args) == 4 and args[0] == 'http error':
            logger.error("http error: %d - %s", args[1], args[2])
        else:
            logger.error("%s", ex)
        return None

    # The Content-Length header is optional (it is absent with chunked
    # transfer encoding, for example), so it cannot be used to decide
    # whether the download worked. Measure the retrieved file instead.
    try:
        size = os.path.getsize(fn)
    except OSError as ex:
        logger.error("Unable to stat downloaded file %s: %s", fn, ex)
        return None
    if not size:
        logger.error("Downloaded file is empty: %s", src)
        return None

    # Does it look like an image?
    content_type = hdrs.get('content-type')
    if not content_type:
        logger.error("No content-type header found")
        return None

    logger.info("Retrieved: %s bytes; content-type: %s", size, content_type)

    # Media types are case-insensitive, so normalize before comparing.
    parts = content_type.split('/')
    if len(parts) < 2 or parts[0].strip().lower() != 'image':
        logger.error("Unknown content-type: %s", content_type)
        return None

    return fn
bgneal@881 136
bgneal@881 137
def resize_image(img_path):
    """Shrink the image at img_path in place if it exceeds PHOTO_MAX_SIZE.

    The file is only rewritten when the image is wider or taller than the
    allowed bounding box; smaller images are left untouched. Errors raised
    by PIL while opening or saving the file are not caught here.
    """
    image = Image.open(img_path)
    width, height = image.size
    max_width, max_height = PHOTO_MAX_SIZE
    # Check each dimension on its own. Comparing the tuples directly is
    # lexicographic, which would let a narrow but very tall image through.
    if width > max_width or height > max_height:
        logger.info('Resizing from %s to %s', image.size, PHOTO_MAX_SIZE)
        image.thumbnail(PHOTO_MAX_SIZE, Image.ANTIALIAS)
        image.save(img_path)
bgneal@881 145
bgneal@881 146
def gen_key():
    """Return a random 22 character key.

    The key is the 16 bytes of a fresh UUID4 encoded with the URL-safe
    base64 alphabet ('-' and '_' instead of '+' and '/'), with the trailing
    '=' padding removed.
    """
    raw = uuid.uuid4().bytes
    encoded = base64.urlsafe_b64encode(raw)
    return encoded.rstrip(b'=')
bgneal@882 150
bgneal@882 151
def upload_image(img_path):
    """Upload the image file at img_path to the S3 bucket.

    The object is stored under a freshly generated random key that keeps
    the file's extension. Returns whatever bucket.upload_from_filename()
    returns (expected to be the image's URL in the bucket), or None when
    the upload raises an IOError.
    """
    logger.info("upload_image starting")
    # A random key avoids name clashes between images in the bucket.
    _, extension = os.path.splitext(img_path)
    object_key = gen_key() + extension
    try:
        url = bucket.upload_from_filename(object_key, img_path, public=True)
    except IOError as ex:
        logger.error("Error uploading file: %s", ex)
        url = None
    return url
bgneal@881 166
bgneal@881 167
def save_image_to_cloud(src):
    """Download the image at the source URL and upload it to cloud storage.

    Results are memoized in url_cache, so a source URL that was already
    uploaded during this run is not fetched again.

    Returns the new URL, or None if the image could not be downloaded,
    processed, or uploaded.
    """
    # Check the cache first
    cached_url = url_cache.get(src)
    if cached_url:
        return cached_url

    fn = download_image(src)
    if not fn:
        return None

    try:
        resize_image(fn)
    except IOError as ex:
        # PIL raises IOError when the file is not a readable image. Treat
        # that as one more way of being unsuccessful instead of letting it
        # abort the whole run.
        logger.error("Error resizing image from %s: %s", src, ex)
        return None

    new_url = upload_image(fn)
    if not new_url:
        return None

    url_cache[src] = new_url
    return new_url
bgneal@868 186
bgneal@868 187
def replace_image_markup(match):
    """Build the replacement text for one IMAGE_LINK_RE match.

    Group 1 of the match is used as the alt text and group 8 as the
    "src [title]" portion of the image markup. The outcome depends on the
    source URL:

    - host is one of SG101_HOSTS: the image markup is kept with the src
      reduced to the URL's path;
    - scheme is http: the image is copied to cloud storage and the markup
      points at the new URL;
    - scheme is https: the markup is kept with the src unchanged;
    - anything else (including a failed copy or an empty src): the image
      is demoted to an ordinary link to the original src.
    """
    pieces = match.group(8).split()
    if pieces:
        src = pieces[0]
        # Strip the optional angle brackets around the URL.
        if src[0] == "<" and src[-1] == ">":
            src = src[1:-1]
    else:
        src = ''

    # Everything after the URL is the title; empty when there is none.
    title = " ".join(pieces[1:])
    alt = match.group(1)

    new_src = None
    if src:
        parsed = urlparse.urlparse(src)
        if parsed.hostname in SG101_HOSTS:
            new_src = parsed.path  # convert to relative path
        elif parsed.scheme == 'http':
            # TODO: it has been observed that at least 2 different services
            # serve up the same image on https: with the URL otherwise the
            # same. Add code to see if the image is available at https
            # (maybe do a HEAD request?) and if so just change the protocol
            # to https in the original URL.
            new_src = save_image_to_cloud(src)
        elif parsed.scheme == 'https':
            new_src = src  # already https, accept it as-is

    if not new_src:
        # something's messed up, convert to a link using original src
        return u'[{alt}]({src})'.format(alt=alt, src=src)

    if title:
        return u'![{alt}]({src} {title})'.format(
            alt=alt, src=new_src, title=title)
    return u'![{alt}]({src})'.format(alt=alt, src=new_src)
bgneal@860 227
bgneal@860 228
def warn_if_image_refs(text, model_name, pk):
    """Log a warning if text contains Markdown image reference markup.

    Reference-style images are not rewritten by this command and are not
    expected in the data; the warning makes them visible in the log.
    model_name and pk only identify the object in the log message.
    """
    found = IMAGE_REF_RE.search(text) is not None
    if found:
        logger.warning("Image reference found in %s pk = #%d", model_name, pk)
bgneal@887 236
bgneal@887 237
def process_post(text):
    """Return text with its inline image markup rewritten.

    Each match of IMAGE_LINK_RE is replaced by the result of
    replace_image_markup(), which removes plain http image sources by
    switching them to https URLs or, for links to SG101, relative paths.
    Text without matching image markup is returned unchanged.
    """
    rewritten = IMAGE_LINK_RE.sub(replace_image_markup, text)
    return rewritten
bgneal@863 247
bgneal@863 248
class Command(NoArgsCommand):
    """manage.py command that rewrites image markup in comments or posts."""
    help = "Rewrite forum posts and comments to not use http for images"
    option_list = NoArgsCommand.option_list + (
        make_option('-m', '--model',
                    choices=MODEL_CHOICES,
                    help="which model to update; must be one of {{{}}}".format(
                        ', '.join(MODEL_CHOICES))),
        make_option('-i', '--i',
                    type='int',
                    help="optional first slice index; the i in [i:j]"),
        make_option('-j', '--j',
                    type='int',
                    help="optional second slice index; the j in [i:j]"),
    )

    def handle_noargs(self, **options):
        """Run the conversion over the selected model.

        Validates the options, optionally narrows the queryset to the
        [i:j] slice, sets up the shared downloader and bucket, then
        processes objects one at a time, saving each object whose text
        changed. A SIGINT stops the loop before the next object.

        Raises CommandError for a missing --model or bad slice indices.
        """
        global opener, bucket

        _setup_logging()
        logger.info("Starting; arguments received: %s", options)

        chosen = options['model']
        if chosen not in MODEL_CHOICES:
            raise CommandError('Please choose a --model option')

        if chosen == 'comments':
            qs = Comment.objects.all()
            text_attr, model_name = 'comment', 'Comment'
        else:
            qs = Post.objects.all()
            text_attr, model_name = 'body', 'Post'

        i, j = options['i'], options['j']

        if i is not None and i < 0:
            raise CommandError("-i must be >= 0")
        if j is not None and j < 0:
            raise CommandError("-j must be >= 0")
        if j is not None and i is not None and j <= i:
            raise CommandError("-j must be > -i")

        # A missing bound is None, which slices exactly like an omitted one.
        if i is not None or j is not None:
            qs = qs[i:j]

        # Set global socket timeout
        socket.setdefaulttimeout(30)

        # Install signal handler for ctrl-c
        signal.signal(signal.SIGINT, signal_handler)

        # Create URL opener to download photos
        opener = ImageURLopener()

        # Create bucket to upload photos
        bucket = S3Bucket(access_key=settings.USER_PHOTOS_ACCESS_KEY,
                          secret_key=settings.USER_PHOTOS_SECRET_KEY,
                          base_url=PHOTO_BASE_URL,
                          bucket_name=PHOTO_BUCKET_NAME)

        # Number objects by their position in the unsliced queryset.
        first = 0 if i is None else i

        for position, model in enumerate(qs.iterator(), first):
            if quit_flag:
                logger.warning("SIGINT received, exiting")
                break
            logger.info("Processing %s #%d (pk = %d)",
                        model_name, position, model.pk)
            txt = getattr(model, text_attr)
            warn_if_image_refs(txt, model_name, model.pk)
            new_txt = process_post(txt)
            if txt == new_txt:
                continue

            logger.info("Content changed on %s #%d (pk= %d)",
                        model_name, position, model.pk)
            logger.debug("original: %s", txt)
            logger.debug("changed: %s", new_txt)
            setattr(model, text_attr, new_txt)
            model.save()

        logger.info("ssl_images exiting")