diff core/management/commands/ssl_images.py @ 987:76525f5ac2b1

Modify ssl_images to update news models.
author Brian Neal <bgneal@gmail.com>
date Wed, 28 Oct 2015 21:06:13 -0500
parents 26de15fb5a80
children 65b2bc9cb3cc
line wrap: on
line diff
--- a/core/management/commands/ssl_images.py	Sun Oct 25 14:47:29 2015 -0500
+++ b/core/management/commands/ssl_images.py	Wed Oct 28 21:06:13 2015 -0500
@@ -20,6 +20,7 @@
 from django.core.management.base import NoArgsCommand, CommandError
 from django.conf import settings
 from lxml import etree
+import lxml.html
 import markdown.inlinepatterns
 from PIL import Image
 import requests
@@ -29,6 +30,7 @@
 from core.download import download_file
 from core.functions import remove_file
 from core.s3 import S3Bucket
+from news.models import Story
 
 
 LOGFILE = os.path.join(settings.PROJECT_PATH, 'logs', 'ssl_images.log')
@@ -41,7 +43,7 @@
 
 SG101_HOSTS = set(['www.surfguitar101.com', 'surfguitar101.com'])
 WHITELIST_HOSTS = set(settings.USER_IMAGES_SOURCES)
-MODEL_CHOICES = ['comments', 'posts']
+MODEL_CHOICES = ['comments', 'posts', 'news']
 
 PHOTO_MAX_SIZE = (660, 720)
 PHOTO_BASE_URL = settings.HOT_LINK_PHOTOS_BASE_URL
@@ -239,6 +241,50 @@
     return IMAGE_LINK_RE.sub(replace_image_markup, text)
 
 
+def process_html(html):
+    """Process the html fragment, converting <img> sources to https.
+
+    Unconvertible images become links; modified output is returned as text.
+    """
+    s = html.strip()
+    if not s:
+        return s
+
+    changed = False
+    root = lxml.html.fragment_fromstring(s, create_parent=True)
+    for img in root.iter('img'):
+        src = (img.get('src') or '').strip()
+        if src:
+            try:
+                r = urlparse.urlparse(src)
+            except ValueError:
+                logger.warning("Bad url? Should not happen; skipping...")
+                continue
+            new_src = None
+            if r.hostname in SG101_HOSTS:
+                new_src = r.path        # convert to relative path
+            elif ((r.scheme == 'http') or
+                  (r.scheme == 'https' and r.hostname not in WHITELIST_HOSTS)):
+                new_src = convert_to_ssl(r)
+                if not new_src:
+                    # failed to convert to https; convert to a link
+                    tail = img.tail     # clear() would discard the tail text
+                    img.clear()
+                    img.tag = 'a'
+                    img.set('href', src)
+                    img.text = 'Image'
+                    img.tail = tail
+                    changed = True
+            if new_src:
+                img.set('src', new_src)
+                changed = True
+
+    if changed:
+        result = lxml.html.tostring(root, encoding='utf-8')
+        return result[5:-6].decode('utf-8')  # strip our <div>; bytes -> text
+    return html
+
+
 def html_check(html):
     """Return True if the given HTML fragment has <img> tags with src attributes
     that use http, and False otherwise.
@@ -283,12 +329,18 @@
 
         if options['model'] == 'comments':
             qs = Comment.objects.all()
-            text_attr = 'comment'
+            text_attrs = ['comment']
             model_name = 'Comment'
+        elif options['model'] == 'posts':
+            qs = Post.objects.all()
+            text_attrs = ['body']
+            model_name = 'Post'
         else:
-            qs = Post.objects.all()
-            text_attr = 'body'
-            model_name = 'Post'
+            qs = Story.objects.all()
+            text_attrs = ['short_text', 'long_text']
+            model_name = 'Story'
+
+        html_based = options['model'] == 'news'
 
         i, j = options['i'], options['j']
 
@@ -333,21 +385,31 @@
                 logger.warning("SIGINT received, exiting")
                 break
             logger.info("Processing %s #%d (pk = %d)", model_name, n + i, model.pk)
-            txt = getattr(model, text_attr)
-            warn_if_image_refs(txt, model_name, model.pk)
-            new_txt = process_post(txt)
-            if txt != new_txt:
-                logger.info("Content changed on %s #%d (pk = %d)",
-                            model_name, n + i, model.pk)
-                logger.debug("original: %s", txt)
-                logger.debug("changed:  %s", new_txt)
-                setattr(model, text_attr, new_txt)
-                model.save()
-            elif html_check(model.html):
-                # Check for content generated with older smiley code that used
-                # absolute URLs for the smiley images. If True, then just save
-                # the model again to force updated HTML to be created.
-                logger.info("Older Smiley HTML detected, forcing a save")
+            save_flag = False
+            for text_attr in text_attrs:
+                txt = getattr(model, text_attr)
+
+                if html_based:
+                    new_txt = process_html(txt)
+                else:
+                    new_txt = process_post(txt)
+                    warn_if_image_refs(txt, model_name, model.pk)
+
+                if txt != new_txt:
+                    logger.info("Content changed on %s #%d (pk = %d)",
+                                model_name, n + i, model.pk)
+                    logger.debug(u"original: %s", txt)
+                    logger.debug(u"changed:  %s", new_txt)
+                    setattr(model, text_attr, new_txt)
+                    save_flag = True
+                elif not html_based and html_check(model.html):
+                    # Check for content generated with older smiley code that used
+                    # absolute URLs for the smiley images. If True, then just save
+                    # the model again to force updated HTML to be created.
+                    logger.info("Older Smiley HTML detected, forcing a save")
+                    save_flag = True
+
+            if save_flag:
                 model.save()
             count += 1