annotate core/management/commands/ssl_images.py @ 883:f12751259f66

Add a Markdown extension to only allow https based <img> tags. This is not yet "turned on" in the site's markup system.
author Brian Neal <bgneal@gmail.com>
date Tue, 03 Feb 2015 19:51:12 -0600
parents 9a3019f2c7dc
children 9a15f7c27526
rev   line source
bgneal@859 1 """
bgneal@859 2 ssl_images is a custom manage.py command to convert forum post and comment
bgneal@859 3 images to https. It does this by rewriting the markup:
bgneal@859 4 - Images with src = http://surfguitar101.com/something are rewritten to be
bgneal@859 5 /something.
bgneal@859 6 - Non SG101 images that use http: are downloaded, resized, and uploaded to
bgneal@859 7 an S3 bucket. The src attribute is replaced with the new S3 URL.
bgneal@859 8 """
bgneal@882 9 import base64
bgneal@859 10 import logging
bgneal@859 11 from optparse import make_option
bgneal@859 12 import os.path
bgneal@863 13 import re
bgneal@863 14 import signal
bgneal@881 15 import socket
bgneal@881 16 import urllib
bgneal@868 17 import urlparse
bgneal@881 18 import uuid
bgneal@859 19
bgneal@859 20 from django.core.management.base import NoArgsCommand, CommandError
bgneal@859 21 from django.conf import settings
bgneal@863 22 import markdown.inlinepatterns
bgneal@881 23 from PIL import Image
bgneal@859 24
bgneal@860 25 from comments.models import Comment
bgneal@860 26 from forums.models import Post
bgneal@881 27 from core.s3 import S3Bucket
bgneal@860 28
bgneal@860 29
# Where this command writes its own log, and the logger that writes to it.
LOGFILE = os.path.join(settings.PROJECT_PATH, 'logs', 'ssl_images.log')
logger = logging.getLogger(__name__)

# Markdown's image patterns, compiled once here for reuse.
IMAGE_LINK_RE = re.compile(
    markdown.inlinepatterns.IMAGE_LINK_RE, re.DOTALL | re.UNICODE)
IMAGE_REF_RE = re.compile(
    markdown.inlinepatterns.IMAGE_REFERENCE_RE, re.DOTALL | re.UNICODE)

# Hostnames treated as "our own site"; images on them become relative links.
SG101_HOSTS = {'www.surfguitar101.com', 'surfguitar101.com'}

# Accepted values for the --model command line option.
MODEL_CHOICES = ['comments', 'posts']

# Size limit handed to PIL when shrinking downloaded photos, plus the S3
# location the photos are uploaded to.
PHOTO_MAX_SIZE = (660, 720)
PHOTO_BASE_URL = 'https://s3.amazonaws.com/'
PHOTO_BUCKET_NAME = 'sg101.forum.photos'

# Module-level state shared by the helpers below:
#   quit_flag - set to True by signal_handler() to request a clean stop
#   opener    - URL opener used by download_image(); created by the command
#   bucket    - S3 bucket used by upload_image(); created by the command
#   url_cache - maps an original image URL to the URL it was uploaded to
quit_flag = False
opener = None
bucket = None
url_cache = {}
bgneal@863 49
bgneal@863 50
def signal_handler(signum, frame):
    """Handle SIGINT by flagging that the command should stop.

    Nothing is interrupted here; the handler only raises the module-level
    quit_flag, which the processing loop checks between items. Both
    arguments are the standard signal handler parameters and are ignored.
    """
    global quit_flag
    quit_flag = True
bgneal@863 55
bgneal@859 56
def _setup_logging():
    """Send this module's log output to LOGFILE, and only there.

    The logger is opened up to DEBUG level and detached from its parent
    loggers (propagate = False), then given a UTF-8 file handler with a
    timestamp / level / message format.
    """
    logger.setLevel(logging.DEBUG)
    logger.propagate = False

    file_handler = logging.FileHandler(filename=LOGFILE, encoding='utf-8')
    file_handler.setFormatter(
        logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
    logger.addHandler(file_handler)
bgneal@859 64
bgneal@859 65
class ImageURLopener(urllib.FancyURLopener):
    """URL opener used to fetch images.

    Redirects are followed using the handlers inherited from
    FancyURLopener. Every other HTTP error, including authentication
    requests, is passed to URLopener's default error handler, which is
    expected to raise an IOError.
    """
    # HTTP status codes (redirects) that are dispatched to the inherited
    # http_error_<code> methods.
    HANDLED_ERRORS = {301, 302, 303, 307}

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Bypass FancyURLopener and use URLopener's default error handling."""
        return urllib.URLopener.http_error_default(
            self, url, fp, errcode, errmsg, headers)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Dispatch an HTTP error response.

        Redirect codes are handed to the matching http_error_<code> method;
        if that method produces a result it is returned. Anything else, or
        a redirect handler that yields nothing, ends up in
        http_error_default().
        """
        if errcode not in self.HANDLED_ERRORS:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

        redirect_handler = getattr(self, 'http_error_%d' % errcode)
        # Only forward `data` when the caller actually supplied it.
        extra_args = () if data is None else (data,)
        response = redirect_handler(url, fp, errcode, errmsg, headers,
                                    *extra_args)
        if response:
            return response
        return self.http_error_default(url, fp, errcode, errmsg, headers)
bgneal@881 91
bgneal@881 92
def download_image(src):
    """Fetch the image at URL src using the module-level opener.

    Returns the local path of the downloaded file when the response carries
    a content-length header and an image/* content-type. In every other
    case (download error, missing headers, non-image content) the problem
    is logged and None is returned.
    """
    logger.info("Retrieving %s", src)
    try:
        local_path, headers = opener.retrieve(src)
    except IOError as ex:
        details = ex.args
        # HTTP failures are reported as a 4-tuple starting with 'http error'.
        is_http_error = len(details) == 4 and details[0] == 'http error'
        if is_http_error:
            logger.error("http error: %d - %s", details[1], details[2])
        else:
            logger.error("%s", ex)
        return None

    # Errors and timeouts sometimes produce a response with no
    # content-length header; treat that as a failed download.
    content_length = headers.get('content-length')
    if not content_length:
        logger.error("Bad content-length: %s", content_length)
        return None

    # Only accept responses that declare themselves to be an image.
    content_type = headers.get('content-type')
    if not content_type:
        logger.error("No content-type header found")
        return None

    logger.info("Retrieved: %s bytes; content-type: %s", content_length,
                content_type)

    major_type, slash, _ = content_type.partition('/')
    if not slash or major_type != 'image':
        logger.error("Unknown content-type: %s", content_type)
        return None

    return local_path
bgneal@881 132
bgneal@881 133
def resize_image(img_path):
    """Shrink the image file at img_path in place if it is too large.

    The image is resized, and the file overwritten, when its width or its
    height exceeds the corresponding PHOTO_MAX_SIZE limit. Images already
    within both limits are left untouched.

    Any exception raised by PIL while opening, resizing or saving the file
    propagates to the caller.
    """
    image = Image.open(img_path)
    width, height = image.size
    max_width, max_height = PHOTO_MAX_SIZE

    # Check each dimension separately. Comparing the (width, height) tuples
    # directly is a lexicographic comparison, which would let a narrow but
    # very tall image (e.g. 600 x 2000) through without resizing.
    if width > max_width or height > max_height:
        logger.info('Resizing from %s to %s', image.size, PHOTO_MAX_SIZE)
        image.thumbnail(PHOTO_MAX_SIZE, Image.ANTIALIAS)
        image.save(img_path)
bgneal@881 141
bgneal@881 142
def gen_key():
    """Return a random 22 character key that is safe to use in a URL.

    The key is the URL-safe base64 encoding ('-' and '_' instead of '+'
    and '/') of a random UUID's 16 bytes, with the '=' padding removed.
    The result is always a native str.
    """
    encoded = base64.urlsafe_b64encode(uuid.uuid4().bytes)
    # On Python 3 the encoder yields bytes; on Python 2 it is already a str.
    if not isinstance(encoded, str):
        encoded = encoded.decode('ascii')
    return encoded.rstrip('=')
bgneal@882 146
bgneal@882 147
def upload_image(img_path):
    """Store the image file at img_path in the module-level S3 bucket.

    The object is stored under a freshly generated random key that keeps
    the file's extension, and is uploaded as public.

    Returns whatever URL the bucket reports for the uploaded image, or None
    if the upload fails with an IOError (which is logged).
    """
    logger.info("upload_image starting")

    # A random key plus the original extension gives a unique bucket name.
    _, extension = os.path.splitext(img_path)
    file_key = gen_key() + extension

    try:
        url = bucket.upload_from_filename(file_key, img_path, public=True)
    except IOError as ex:
        logger.error("Error uploading file: %s", ex)
        url = None
    return url
bgneal@881 162
bgneal@881 163
def save_image_to_cloud(src):
    """Download the image at URL src, resize it, and upload it to the cloud.

    Successful uploads are remembered in url_cache, so a source URL that
    has already been handled is not downloaded or uploaded again.

    Returns the new URL, or None if the download, the resize or the upload
    was unsuccessful.
    """
    # Check the cache first
    cached_url = url_cache.get(src)
    if cached_url:
        return cached_url

    fn = download_image(src)
    if not fn:
        return None

    # A file that PIL cannot read must not abort the whole run; report it
    # and signal failure so the caller can fall back gracefully.
    try:
        resize_image(fn)
    except IOError as ex:
        logger.error("Error resizing image: %s", ex)
        return None

    new_url = upload_image(fn)
    if new_url:
        url_cache[src] = new_url
    return new_url
bgneal@868 182
bgneal@868 183
def replace_image_markup(match):
    """Return replacement markup for one inline image match.

    match is a regex match whose group 1 holds the image's alt text and
    whose group 8 holds the source URL, optionally followed by a title.

    The source is rewritten as follows:
    - a URL on one of the SG101_HOSTS becomes a relative URL (path plus any
      params, query string and fragment);
    - any other http URL is copied to cloud storage and replaced by the
      new URL;
    - an https URL is kept as-is.

    If no usable source results (empty source, unsupported scheme, failed
    copy), the image is downgraded to a plain link to the original source.
    """
    src_parts = match.group(8).split()
    if src_parts:
        src = src_parts[0]
        # Strip the optional <...> wrapper around the URL.
        if src[0] == "<" and src[-1] == ">":
            src = src[1:-1]
    else:
        src = ''

    # Everything after the URL is the (still quoted) title; may be empty.
    title = " ".join(src_parts[1:])
    alt = match.group(1)

    new_src = None
    if src:
        r = urlparse.urlparse(src)
        if r.hostname in SG101_HOSTS:
            # Convert to a relative URL. Drop only the scheme and host so
            # that params, query string and fragment are not lost.
            if r.path:
                new_src = urlparse.urlunparse(
                    ('', '', r.path, r.params, r.query, r.fragment))
        elif r.scheme == 'http':
            new_src = save_image_to_cloud(src)
        elif r.scheme == 'https':
            new_src = src       # already https, accept it as-is

    if new_src:
        if title:
            s = u'![{alt}]({src} {title})'.format(alt=alt, src=new_src,
                                                  title=title)
        else:
            s = u'![{alt}]({src})'.format(alt=alt, src=new_src)
    else:
        # something's messed up, convert to a link using original src
        s = u'[{alt}]({src})'.format(alt=alt, src=src)

    return s
bgneal@860 218
bgneal@860 219
def process_post(text):
    """Return text with its image markup rewritten.

    Every match of IMAGE_LINK_RE in text is replaced by the result of
    replace_image_markup(), which gets rid of plain old http image sources:
    they become https links, relative links (for images on SG101), or
    ordinary links when no safe source is available.

    Only IMAGE_LINK_RE is applied here; IMAGE_REF_RE is not used.
    """
    rewritten = IMAGE_LINK_RE.sub(replace_image_markup, text)
    return rewritten
bgneal@863 232
bgneal@863 233
class Command(NoArgsCommand):
    """Rewrite image markup in forum posts or comments.

    Options:
      -m / --model  which model to process; one of MODEL_CHOICES (required)
      -i / --i      optional start of the slice [i:j] of objects to process
      -j / --j      optional end of the slice [i:j] of objects to process

    As written, the rewritten text is only logged, collected and
    pretty-printed; nothing is saved back to the database here.
    """
    help = "Rewrite forum posts and comments to not use http for images"
    option_list = NoArgsCommand.option_list + (
        make_option('-m', '--model',
            choices=MODEL_CHOICES,
            help="which model to update; must be one of {{{}}}".format(
                ', '.join(MODEL_CHOICES))),
        make_option('-i', '--i',
            type='int',
            help="optional first slice index; the i in [i:j]"),
        make_option('-j', '--j',
            type='int',
            help="optional second slice index; the j in [i:j]"),
    )

    def handle_noargs(self, **options):
        """Run the command.

        Raises CommandError if --model is missing or invalid, or if the
        -i / -j slice indices are negative or out of order.
        """
        global opener, bucket

        _setup_logging()
        logger.info("Starting; arguments received: %s", options)

        if options['model'] not in MODEL_CHOICES:
            raise CommandError('Please choose a --model option')

        if options['model'] == 'comments':
            qs = Comment.objects.all()
            text_attr = 'comment'
            model_name = 'Comment'
        else:
            qs = Post.objects.all()
            text_attr = 'body'
            model_name = 'Post'

        i, j = options['i'], options['j']

        if i is not None and i < 0:
            raise CommandError("-i must be >= 0")
        if j is not None and j < 0:
            raise CommandError("-j must be >= 0")
        if j is not None and i is not None and j <= i:
            raise CommandError("-j must be > -i")

        # A missing bound is None, which a slice treats as "open ended".
        if i is not None or j is not None:
            qs = qs[i:j]

        # Position of the first processed object, used for log messages.
        # -i is optional, so fall back to 0 rather than adding None below.
        first_index = i or 0

        # Set global socket timeout
        socket.setdefaulttimeout(30)

        # Install signal handler for ctrl-c
        signal.signal(signal.SIGINT, signal_handler)

        # Create URL opener to download photos
        opener = ImageURLopener()

        # Create bucket to upload photos
        bucket = S3Bucket(access_key=settings.USER_PHOTOS_ACCESS_KEY,
                          secret_key=settings.USER_PHOTOS_SECRET_KEY,
                          base_url=PHOTO_BASE_URL,
                          bucket_name=PHOTO_BUCKET_NAME)
        s = []
        for n, model in enumerate(qs.iterator()):
            # Stop cleanly between objects once ctrl-c has been pressed.
            if quit_flag:
                logger.warning("SIGINT received, exiting")
                break
            logger.info("Processing %s #%d (pk = %d)", model_name,
                        n + first_index, model.pk)
            txt = getattr(model, text_attr)
            new_txt = process_post(txt)
            if txt != new_txt:
                logger.debug("content changed")
                logger.debug("original: %s", txt)
                logger.debug("changed:  %s", new_txt)
            s.append(new_txt)

        import pprint
        pprint.pprint(s)