"""
ssl_images is a custom manage.py command to convert forum post, comment,
news story and user profile images to https. It does this by rewriting the
markup:
    - Images with src = http://surfguitar101.com/something are rewritten to be
      /something.
    - Non SG101 images that use http: are downloaded, resized, and uploaded to
      an S3 bucket. The src attribute is replaced with the new S3 URL.
"""
import base64
import datetime
import json
import logging
from optparse import make_option
import os
import re
import signal
import urlparse
import uuid

from django.core.management.base import NoArgsCommand, CommandError
from django.conf import settings
from lxml import etree
import lxml.html
import markdown.inlinepatterns
from PIL import Image
import requests

from bio.models import UserProfile
from comments.models import Comment
from forums.models import Post
from core.download import download_file
from core.functions import remove_file
from core.s3 import S3Bucket
from news.models import Story


LOGFILE = os.path.join(settings.PROJECT_PATH, 'logs', 'ssl_images.log')
logger = logging.getLogger(__name__)

IMAGE_LINK_RE = re.compile(markdown.inlinepatterns.IMAGE_LINK_RE,
                           re.DOTALL | re.UNICODE)
IMAGE_REF_RE = re.compile(markdown.inlinepatterns.IMAGE_REFERENCE_RE,
                          re.DOTALL | re.UNICODE)

SG101_HOSTS = set(['www.surfguitar101.com', 'surfguitar101.com'])
WHITELIST_HOSTS = set(settings.USER_IMAGES_SOURCES)
MODEL_CHOICES = ['comments', 'posts', 'news', 'profiles']

# Maximum (width, height) for images stored in the bucket.
PHOTO_MAX_SIZE = (660, 720)
PHOTO_BASE_URL = settings.HOT_LINK_PHOTOS_BASE_URL
PHOTO_BUCKET_NAME = settings.HOT_LINK_PHOTOS_BUCKET

# Relative path: the cache is read from / written to the current directory.
CACHE_FILENAME = 'ssl_images_cache.json'

# Module-level state shared between the command and the helper functions.
quit_flag = False           # set by signal_handler() on SIGINT
bucket = None               # S3Bucket instance, created in handle_noargs()
url_cache = {}              # original URL => new URL, or None if it failed
bad_hosts = set()           # hostnames that produced a ConnectionError
request_timeout = None      # download timeout (secs), set in handle_noargs()


def signal_handler(signum, frame):
    """SIGINT signal handler.

    Only records the request in quit_flag; the main loop checks the flag
    before starting each object and exits cleanly.
    """
    global quit_flag
    quit_flag = True


def _setup_logging():
    """Configure file logging for this command.

    This module's logger writes DEBUG and above to LOGFILE and does not
    propagate to its parents. The same file handler is attached to the
    "requests.packages.urllib3" and "core.download" loggers at INFO level.
    """
    logger.setLevel(logging.DEBUG)
    logger.propagate = False
    handler = logging.FileHandler(filename=LOGFILE, encoding='utf-8')
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    requests_log = logging.getLogger("requests.packages.urllib3")
    requests_log.setLevel(logging.INFO)
    requests_log.propagate = True
    requests_log.addHandler(handler)

    dl_log = logging.getLogger("core.download")
    dl_log.setLevel(logging.INFO)
    dl_log.propagate = True
    dl_log.addHandler(handler)


def resize_image(img_path):
    """Resizes the image found at img_path if necessary.

    The image is shrunk in place (overwriting img_path) when either its width
    or its height exceeds the corresponding PHOTO_MAX_SIZE dimension.

    Returns True if the image was resized or resizing wasn't necessary.
    Returns False if the image could not be read or processed.
    """
    try:
        image = Image.open(img_path)
    except IOError as ex:
        logger.error("Error opening %s: %s", img_path, ex)
        return False

    # Compare each dimension on its own. Comparing the tuples directly is
    # lexicographic, which would let a narrow but very tall image (e.g.
    # 600 x 1000) through without being resized.
    width, height = image.size
    max_width, max_height = PHOTO_MAX_SIZE
    if width > max_width or height > max_height:
        logger.info('Resizing from %s to %s', image.size, PHOTO_MAX_SIZE)
        try:
            image.thumbnail(PHOTO_MAX_SIZE, Image.ANTIALIAS)
            image.save(img_path)
        except IOError as ex:
            logger.error("Error resizing image from %s: %s", img_path, ex)
            return False

    return True


def gen_key():
    """Return a random key.

    The key is the base64 encoding of a random UUID's bytes, using '-' and
    '_' as the alternate characters and with the '=' padding stripped.
    """
    return base64.b64encode(uuid.uuid4().bytes, '-_').rstrip('=')


def upload_image(img_path):
    """Upload image file located at img_path to our S3 bucket.

    The object is stored under a freshly generated random key that keeps the
    file's original extension.

    Returns the URL of the image in the bucket or None if an error occurs.
    """
    logger.info("upload_image starting")
    # Make a unique name for the image in the bucket
    ext = os.path.splitext(img_path)[1]
    file_key = gen_key() + ext
    try:
        return bucket.upload_from_filename(file_key, img_path, public=True)
    except IOError as ex:
        logger.error("Error uploading file: %s", ex)

    return None


def convert_to_ssl(parsed_url):
    """Top-level function for moving an image to SSL.

    parsed_url is the result of urlparse.urlparse() on the image source.

    Returns the new URL for the image, or None if the host is known to be
    bad or the image could not be moved. Results, including failures, are
    remembered in url_cache so each source URL is only attempted once.
    """
    src = parsed_url.geturl()

    if parsed_url.hostname in bad_hosts:
        logger.info("Host known to be bad, skipping: %s", src)
        return None

    # Check the cache
    try:
        new_url = url_cache[src]
    except KeyError:
        # cache miss, try to get the file
        new_url = save_image_to_cloud(parsed_url)
        url_cache[src] = new_url
    else:
        if new_url:
            logger.info("Found URL in cache: %s => %s", src, new_url)
        else:
            logger.info("URL known to be bad, skipping: %s", src)

    return new_url


def save_image_to_cloud(parsed_url):
    """Downloads an image at a given source URL. Uploads it to cloud storage.

    A requests.ConnectionError adds the URL's hostname to bad_hosts. Other
    download errors are logged and treated as a failure. The downloaded file
    is handled inside the remove_file() context manager.

    Returns the new URL or None if unsuccessful.
    """
    url = parsed_url.geturl()
    fn = None
    try:
        fn = download_file(url, timeout=request_timeout)
    except requests.ConnectionError:
        logger.error("ConnectionError, ignoring host %s", parsed_url.hostname)
        bad_hosts.add(parsed_url.hostname)
    except requests.RequestException as ex:
        logger.error("%s", ex)
    except Exception as ex:
        # Best effort: one bad image must not abort the whole run.
        logger.exception("%s", ex)

    if fn:
        with remove_file(fn):
            if resize_image(fn):
                return upload_image(fn)
    return None


def replace_image_markup(match):
    """Return replacement text for one Markdown image link match.

    This is the substitution callback used with IMAGE_LINK_RE. As used here,
    group 1 of the match is the alt text and group 8 holds the image source
    optionally followed by a title.

    - SG101 images are rewritten to a relative path.
    - http images, and https images from hosts not in WHITELIST_HOSTS, are
      moved to cloud storage via convert_to_ssl().
    - https images from whitelisted hosts are left alone.

    If no usable source results, the image is turned into a plain link to the
    original source. A source that cannot be parsed as a URL is replaced with
    the literal text '{bad image}'.
    """
    src_parts = match.group(8).split()
    if src_parts:
        src = src_parts[0]
        # Strip the optional <...> wrapper around the URL.
        if src[0] == "<" and src[-1] == ">":
            src = src[1:-1]
    else:
        src = ''

    title = ''
    if len(src_parts) > 1:
        title = " ".join(src_parts[1:])
    alt = match.group(1)

    new_src = None
    if src:
        try:
            r = urlparse.urlparse(src)
        except ValueError:
            return u'{bad image}'

        if r.hostname in SG101_HOSTS:
            new_src = r.path     # convert to relative path
        elif r.scheme == 'http':
            # Try a few things to get this on ssl:
            new_src = convert_to_ssl(r)
        elif r.scheme == 'https':
            if r.hostname in WHITELIST_HOSTS:
                new_src = src   # already in whitelist
            else:
                new_src = convert_to_ssl(r)

    if new_src:
        if title:
            s = u'![{alt}]({src} {title})'.format(alt=alt, src=new_src, title=title)
        else:
            s = u'![{alt}]({src})'.format(alt=alt, src=new_src)
    else:
        # something's messed up, convert to a link using original src
        s = u'[{alt}]({src})'.format(alt=alt, src=src)

    return s


def warn_if_image_refs(text, model_name, pk):
    """Search text for Markdown image reference markup.

    We aren't expecting these, but we will log something if we see any.
    model_name and pk are only used to identify the object in the log.
    """
    if IMAGE_REF_RE.search(text):
        logger.warning("Image reference found in %s pk = #%d", model_name, pk)


def process_post(text):
    """Process the Markdown text of a post (or other Markdown-based object).

    A regex substitution is run on the text. This fixes up image links,
    getting rid of plain old http sources; either converting to https
    or relative style links (if the link is to SG101).

    Returns the rewritten text.
    """
    return IMAGE_LINK_RE.sub(replace_image_markup, text)


def process_html(html):
    """Process the html fragment, converting to https where needed.

    Each <img> with a non-empty src is examined:
    - SG101 images get a relative path.
    - http images, and https images from hosts not in WHITELIST_HOSTS, are
      moved to cloud storage; if that fails the <img> is replaced by an
      <a href=...>Image</a> link to the original source.

    Returns the re-serialized fragment if anything changed; otherwise the
    original html is returned untouched. Blank input returns the stripped
    (empty) string.
    """
    s = html.strip()
    if not s:
        return s

    changed = False
    root = lxml.html.fragment_fromstring(s, create_parent=True)
    for img in root.iter('img'):
        src = img.get('src')
        src = src.strip() if src else ''
        if src:
            try:
                r = urlparse.urlparse(src)
            except ValueError:
                logger.warning("Bad url? Should not happen; skipping...")
                continue

            new_src = None
            if r.hostname in SG101_HOSTS:
                new_src = r.path     # convert to relative path
            elif ((r.scheme == 'http') or
                    (r.scheme == 'https' and r.hostname not in WHITELIST_HOSTS)):
                new_src = convert_to_ssl(r)
                if not new_src:
                    # failed to convert to https; convert to a link
                    # clear() also wipes the tail text, so preserve it.
                    tail = img.tail
                    img.clear()
                    img.tag = 'a'
                    img.set('href', src)
                    img.text = 'Image'
                    img.tail = tail
                    changed = True

            if new_src:
                img.set('src', new_src)
                changed = True

    if changed:
        result = lxml.html.tostring(root, encoding='utf-8')
        result = result[5:-6]  # strip off parent div we added
        return result.decode('utf-8')
    return html


def html_check(html):
    """Return True if the given HTML fragment has img tags with src attributes
    that use http, and False otherwise.
    """
    if not html:
        return False

    root = etree.HTML(html)
    # NOTE(review): lxml can apparently hand back None when the input has no
    # parsable elements (e.g. whitespace only); treat that as "no images"
    # rather than failing on root.iter().
    if root is None:
        return False

    for img in root.iter('img'):
        src = img.get('src')
        if src and src.lower().startswith('http:'):
            return True
    return False


class Command(NoArgsCommand):
    """manage.py command that rewrites image markup to avoid plain http."""

    help = "Rewrite forum posts and comments to not use http for images"
    option_list = NoArgsCommand.option_list + (
        make_option('-m', '--model',
                    choices=MODEL_CHOICES,
                    help="which model to update; must be one of {{{}}}".format(
                        ', '.join(MODEL_CHOICES))),
        make_option('-i', '--i',
                    type='int',
                    help="optional first slice index; the i in [i:j]"),
        make_option('-j', '--j',
                    type='int',
                    help="optional second slice index; the j in [i:j]"),
        make_option('-t', '--timeout',
                    type='float',
                    help="optional socket timeout (secs)",
                    default=30.0),
    )

    def handle_noargs(self, **options):
        """Run the conversion over the selected model.

        Options used: 'model' (required, one of MODEL_CHOICES), 'i' and 'j'
        (optional queryset slice bounds) and 'timeout' (download timeout in
        seconds).

        Raises CommandError if no valid model was chosen or the slice
        indices are invalid.
        """
        global request_timeout, bucket

        time_started = datetime.datetime.now()
        _setup_logging()
        logger.info("Starting; arguments received: %s", options)

        if options['model'] not in MODEL_CHOICES:
            raise CommandError('Please choose a --model option')

        save_kwargs = {}
        if options['model'] == 'comments':
            qs = Comment.objects.all()
            text_attrs = ['comment']
            model_name = 'Comment'
        elif options['model'] == 'posts':
            qs = Post.objects.all()
            text_attrs = ['body']
            model_name = 'Post'
        elif options['model'] == 'profiles':
            qs = UserProfile.objects.all()
            text_attrs = ['profile_text', 'signature']
            model_name = 'UserProfile'
            save_kwargs = {'content_update': True}
        else:
            qs = Story.objects.all()
            text_attrs = ['short_text', 'long_text']
            model_name = 'Story'

        # News stories hold HTML; everything else is processed as Markdown.
        html_based = options['model'] == 'news'

        i, j = options['i'], options['j']

        if i is not None and i < 0:
            raise CommandError("-i must be >= 0")
        if j is not None and j < 0:
            raise CommandError("-j must be >= 0")
        if j is not None and i is not None and j <= i:
            raise CommandError("-j must be > -i")

        if i is not None and j is not None:
            qs = qs[i:j]
        elif i is not None and j is None:
            qs = qs[i:]
        elif i is None and j is not None:
            qs = qs[:j]

        # Set global socket timeout
        request_timeout = options.get('timeout')
        logger.info("Using socket timeout of %4.2f", request_timeout)

        # Install signal handler for ctrl-c
        signal.signal(signal.SIGINT, signal_handler)

        # Create bucket to upload photos
        bucket = S3Bucket(access_key=settings.USER_PHOTOS_ACCESS_KEY,
                          secret_key=settings.USER_PHOTOS_SECRET_KEY,
                          base_url=PHOTO_BASE_URL,
                          bucket_name=PHOTO_BUCKET_NAME)

        # Load cached info from previous runs
        load_cache()

        # i is only used below to report each object's position in the log.
        if i is None:
            i = 0

        count = 0
        try:
            for n, model in enumerate(qs.iterator()):
                if quit_flag:
                    logger.warning("SIGINT received, exiting")
                    break
                logger.info("Processing %s #%d (pk = %d)", model_name, n + i, model.pk)
                save_flag = False
                for text_attr in text_attrs:
                    txt = getattr(model, text_attr)

                    if html_based:
                        new_txt = process_html(txt)
                    else:
                        new_txt = process_post(txt)
                        warn_if_image_refs(txt, model_name, model.pk)

                    if txt != new_txt:
                        logger.info("Content changed on %s #%d (pk = %d)",
                                    model_name, n + i, model.pk)
                        logger.debug(u"original: %s", txt)
                        logger.debug(u"changed:  %s", new_txt)
                        setattr(model, text_attr, new_txt)
                        save_flag = True
                    elif not html_based and hasattr(model, 'html') and html_check(model.html):
                        # Check for content generated with older smiley code that used
                        # absolute URLs for the smiley images. If True, then just save
                        # the model again to force updated HTML to be created.
                        logger.info("Older Smiley HTML detected, forcing a save")
                        save_flag = True

                if save_flag:
                    model.save(**save_kwargs)
                count += 1

            time_finished = datetime.datetime.now()
            elapsed = time_finished - time_started
            logger.info("ssl_images exiting; number of objects: %d; elapsed: %s",
                        count, elapsed)

            http_images = len(url_cache)
            https_images = sum(1 for v in url_cache.itervalues() if v)
            bad_images = http_images - https_images
            if http_images > 0:
                pct_saved = float(https_images) / http_images * 100.0
            else:
                pct_saved = 0.0

            logger.info("Summary: http: %d; https: %d; lost: %d; saved: %3.1f %%",
                        http_images, https_images, bad_images, pct_saved)
        finally:
            # Persist the cache even if processing failed part way through,
            # so images already uploaded are not fetched and uploaded again
            # on the next run.
            save_cache()

        logger.info("ssl_images done")


def load_cache():
    """Load cache from previous runs.

    Reads CACHE_FILENAME and, if it is well formed, replaces the module-level
    bad_hosts and url_cache with its contents. A missing, unreadable or
    malformed file is logged and leaves both globals untouched.
    """
    global bad_hosts, url_cache

    logger.info("Loading cached information")
    try:
        with open(CACHE_FILENAME, 'r') as fp:
            d = json.load(fp)
    except IOError as ex:
        logger.error("Cache file (%s) IOError: %s", CACHE_FILENAME, ex)
        return
    except ValueError:
        logger.error("Mangled cache file: %s", CACHE_FILENAME)
        return

    # Read both entries before touching the globals so a file that is missing
    # one of them cannot leave us with a half-loaded cache.
    try:
        cached_hosts = set(d['bad_hosts'])
        cached_urls = d['url_cache']
    except (KeyError, TypeError):
        logger.error("Malformed cache file: %s", CACHE_FILENAME)
        return

    bad_hosts = cached_hosts
    url_cache = cached_urls


def save_cache():
    """Save our cache to a file for future runs.

    Writes bad_hosts (as a list) and url_cache to CACHE_FILENAME as JSON.
    """
    logger.info("Saving cached information")
    d = {'bad_hosts': list(bad_hosts), 'url_cache': url_cache}
    with open(CACHE_FILENAME, 'w') as fp:
        json.dump(d, fp, indent=4)