changeset 987:76525f5ac2b1

Modify ssl_images to update news models.
author Brian Neal <bgneal@gmail.com>
date Wed, 28 Oct 2015 21:06:13 -0500
parents 26de15fb5a80
children 65b2bc9cb3cc
files core/management/commands/ssl_images.py core/tests/test_ssl_images.py news/models.py
diffstat 3 files changed, 260 insertions(+), 22 deletions(-) [+]
line wrap: on
line diff
--- a/core/management/commands/ssl_images.py	Sun Oct 25 14:47:29 2015 -0500
+++ b/core/management/commands/ssl_images.py	Wed Oct 28 21:06:13 2015 -0500
@@ -20,6 +20,7 @@
 from django.core.management.base import NoArgsCommand, CommandError
 from django.conf import settings
 from lxml import etree
+import lxml.html
 import markdown.inlinepatterns
 from PIL import Image
 import requests
@@ -29,6 +30,7 @@
 from core.download import download_file
 from core.functions import remove_file
 from core.s3 import S3Bucket
+from news.models import Story
 
 
 LOGFILE = os.path.join(settings.PROJECT_PATH, 'logs', 'ssl_images.log')
@@ -41,7 +43,7 @@
 
 SG101_HOSTS = set(['www.surfguitar101.com', 'surfguitar101.com'])
 WHITELIST_HOSTS = set(settings.USER_IMAGES_SOURCES)
-MODEL_CHOICES = ['comments', 'posts']
+MODEL_CHOICES = ['comments', 'posts', 'news']
 
 PHOTO_MAX_SIZE = (660, 720)
 PHOTO_BASE_URL = settings.HOT_LINK_PHOTOS_BASE_URL
@@ -239,6 +241,50 @@
     return IMAGE_LINK_RE.sub(replace_image_markup, text)
 
 
+def process_html(html):
+    """Convert <img> URLs in the html fragment to https or relative paths."""
+    s = html.strip()
+    if not s:
+        return s
+
+    changed = False
+    root = lxml.html.fragment_fromstring(s, create_parent=True)
+    for img in root.iter('img'):
+        src = img.get('src')
+        src = src.strip() if src else ''
+        if src:
+            try:
+                r = urlparse.urlparse(src)
+            except ValueError:
+                logger.warning("Bad url? Should not happen; skipping...")
+                continue
+
+            new_src = None
+            if r.hostname in SG101_HOSTS:
+                new_src = r.path        # convert to relative path
+            elif ((r.scheme == 'http') or
+                  (r.scheme == 'https' and r.hostname not in WHITELIST_HOSTS)):
+                new_src = convert_to_ssl(r)
+                if not new_src:
+                    # failed to convert to https; convert to a link
+                    tail = img.tail
+                    img.clear()
+                    img.tag = 'a'
+                    img.set('href', src)
+                    img.text = 'Image'
+                    img.tail = tail
+                    changed = True
+
+            if new_src:
+                img.set('src', new_src)
+                changed = True
+
+    if changed:
+        result = lxml.html.tostring(root, encoding='utf-8')
+        return result[5:-6]     # strip off parent div we added
+    return html
+
+
 def html_check(html):
     """Return True if the given HTML fragment has <img> tags with src attributes
     that use http, and False otherwise.
@@ -283,12 +329,18 @@
 
         if options['model'] == 'comments':
             qs = Comment.objects.all()
-            text_attr = 'comment'
+            text_attrs = ['comment']
             model_name = 'Comment'
+        elif options['model'] == 'posts':
+            qs = Post.objects.all()
+            text_attrs = ['body']
+            model_name = 'Post'
         else:
-            qs = Post.objects.all()
-            text_attr = 'body'
-            model_name = 'Post'
+            qs = Story.objects.all()
+            text_attrs = ['short_text', 'long_text']
+            model_name = 'Story'
+
+        html_based = options['model'] == 'news'
 
         i, j = options['i'], options['j']
 
@@ -333,21 +385,31 @@
                 logger.warning("SIGINT received, exiting")
                 break
             logger.info("Processing %s #%d (pk = %d)", model_name, n + i, model.pk)
-            txt = getattr(model, text_attr)
-            warn_if_image_refs(txt, model_name, model.pk)
-            new_txt = process_post(txt)
-            if txt != new_txt:
-                logger.info("Content changed on %s #%d (pk = %d)",
-                            model_name, n + i, model.pk)
-                logger.debug("original: %s", txt)
-                logger.debug("changed:  %s", new_txt)
-                setattr(model, text_attr, new_txt)
-                model.save()
-            elif html_check(model.html):
-                # Check for content generated with older smiley code that used
-                # absolute URLs for the smiley images. If True, then just save
-                # the model again to force updated HTML to be created.
-                logger.info("Older Smiley HTML detected, forcing a save")
+            save_flag = False
+            for text_attr in text_attrs:
+                txt = getattr(model, text_attr)
+
+                if html_based:
+                    new_txt = process_html(txt)
+                else:
+                    new_txt = process_post(txt)
+                    warn_if_image_refs(txt, model_name, model.pk)
+
+                if txt != new_txt:
+                    logger.info("Content changed on %s #%d (pk = %d)",
+                                model_name, n + i, model.pk)
+                    logger.debug(u"original: %s", txt)
+                    logger.debug(u"changed:  %s", new_txt)
+                    setattr(model, text_attr, new_txt)
+                    save_flag = True
+                elif not html_based and html_check(model.html):
+                    # Check for content generated with older smiley code that used
+                    # absolute URLs for the smiley images. If True, then just save
+                    # the model again to force updated HTML to be created.
+                    logger.info("Older Smiley HTML detected, forcing a save")
+                    save_flag = True
+
+            if save_flag:
                 model.save()
             count += 1
 
--- a/core/tests/test_ssl_images.py	Sun Oct 25 14:47:29 2015 -0500
+++ b/core/tests/test_ssl_images.py	Wed Oct 28 21:06:13 2015 -0500
@@ -7,7 +7,7 @@
 from django.conf import settings
 
 from core.management.commands.ssl_images import html_check
-from core.management.commands.ssl_images import process_post
+from core.management.commands.ssl_images import process_post, process_html
 import core.management.commands.ssl_images
 
 
@@ -290,3 +290,179 @@
             <p>Look again: <img src="https://b.jpg" alt="b" /></p>
             </div>
             """))
+
+
+class ProcessHtmlTestCase(unittest.TestCase):
+
+    SG101_RE = re.compile(r'http://(?:www\.)?surfguitar101.com/', re.I)
+
+    def setUp(self):
+        self.assertTrue(len(settings.USER_IMAGES_SOURCES) > 0)
+        self.safe_host = settings.USER_IMAGES_SOURCES[0]
+
+    def tearDown(self):
+        core.management.commands.ssl_images.url_cache = {}
+
+    def test_empty_string(self):
+        s = process_html('')
+        self.assertEqual(s, '')
+
+    def test_whitespace_string(self):
+        s = process_html('\r\n\r\n')
+        self.assertEqual(s, '')
+
+    def test_no_matches(self):
+        test_str = """<p>Here is a post that doesn't contain any image links at
+        all. It also spans lines.</p>
+        """
+        result = process_html(test_str)
+        self.assertEqual(test_str, result)
+
+    def test_multiple_paragraphs(self):
+        test_str = """<p>Here is a post that doesn't contain any image links at
+        all. It also spans lines.</p>
+        """
+        test_str += test_str
+        result = process_html(test_str)
+        self.assertEqual(test_str, result)
+
+    def test_sg101_images(self):
+        test_str = """<p>An image:
+        <img src="http://www.surfguitar101.com/img.jpg" alt="image">
+        And another: <img src="HTTP://SURFGUITAR101.COM/foo/bar/img.png" alt="pic">
+        More stuff here.</p>"""
+        expected = self.SG101_RE.sub('/', test_str)
+        result = process_html(test_str)
+        self.assertNotEqual(test_str, expected)
+        self.assertEqual(expected, result)
+
+    def test_https_already(self):
+        test_str = """<p>An image that is already using https:
+            <img src="https://{}/zzz.png" alt="pic">
+            It's cool.</p>""".format(self.safe_host)
+        result = process_html(test_str)
+        self.assertEqual(test_str, result)
+
+    def test_https_sg101(self):
+        test_str = """<p>An image that is already using https:
+            <img src="https://www.SURFGUITAR101.com/zzz.png" alt="pic">
+            It's cool.</p>
+            """
+        expected = """<p>An image that is already using https:
+            <img src="/zzz.png" alt="pic">
+            It's cool.</p>"""
+        result = process_html(test_str)
+        self.assertEqual(expected, result)
+
+    def test_multiple_non_http(self):
+        test_str = """<p>An image:
+        <img src="http://www.surfguitar101.com/img.jpg" alt="pic">
+        And another:
+        <img src="HTTPS://{}/foo/bar/img.png" alt="stuff">
+        More stuff here.</p>
+        """.format(self.safe_host)
+        expected = """<p>An image:
+        <img src="/img.jpg" alt="pic">
+        And another:
+        <img src="HTTPS://{}/foo/bar/img.png" alt="stuff">
+        More stuff here.</p>""".format(self.safe_host)
+        result = process_html(test_str)
+        self.assertEqual(expected, result)
+
+    def test_https_already_with_title(self):
+        test_str = """<p>An image that is already using https:
+            <img src="https://{}/zzz.png" alt="1" title="the title">
+            It's cool.</p>
+            """.format(self.safe_host)
+        result = process_html(test_str)
+        self.assertEqual(test_str, result)
+
+    @mock.patch('core.management.commands.ssl_images.save_image_to_cloud')
+    def test_simple_replacement(self, upload_mock):
+        old_src = 'http://example.com/images/my_image.jpg'
+        new_src = 'https://cloud.com/ABCDEF.jpg'
+        test_str = """<p>Here is a really cool http: based image:
+            <img src="{}" alt="a">
+            Cool, right?</p>""".format(old_src)
+        expected = """<p>Here is a really cool http: based image:
+            <img src="{}" alt="a">
+            Cool, right?</p>""".format(new_src)
+
+        upload_mock.return_value = new_src
+        result = process_html(test_str)
+        self.assertEqual(expected, result)
+        upload_mock.assert_called_once_with(urlparse(old_src))
+
+    @mock.patch('core.management.commands.ssl_images.save_image_to_cloud')
+    def test_multiple_replacement(self, upload_mock):
+        old_src = [
+            'http://example.com/images/my_image.jpg',
+            'http://example.com/static/wow.gif',
+            'http://example.com/media/a/b/c/pic.png',
+        ]
+        new_src = [
+            'https://cloud.com/some/path/012345.jpg',
+            'https://cloud.com/some/path/6789AB.gif',
+            'https://cloud.com/some/path/CDEF01.png',
+        ]
+
+        template = """<p>Here is a really cool http: based image:
+            <img src="{}" alt="a">
+            Cool, right?
+            Another one: <img src="{}" alt="b">
+            And finally
+            <img src="{}" alt="c">
+            </p>"""
+
+        test_str = template.format(*old_src)
+        expected = template.format(*new_src)
+
+        upload_mock.side_effect = new_src
+        result = process_html(test_str)
+        self.assertEqual(expected, result)
+        expected_args = [mock.call(urlparse(c)) for c in old_src]
+        self.assertEqual(upload_mock.call_args_list, expected_args)
+
+    @mock.patch('core.management.commands.ssl_images.save_image_to_cloud')
+    def test_multiple_replacement_2(self, upload_mock):
+        old_src = [
+            'http://example.com/images/my_image.jpg',
+            'https://{}/static/wow.gif'.format(self.safe_host),
+            'http://www.surfguitar101.com/media/a/b/c/pic.png',
+            'http://surfguitar101.com/media/a/b/c/pic2.png',
+        ]
+        new_src = [
+            'https://cloud.com/some/path/012345.jpg',
+            'https://{}/static/wow.gif'.format(self.safe_host),
+            '/media/a/b/c/pic.png',
+            '/media/a/b/c/pic2.png',
+        ]
+
+        template = """<p>Here is a really cool http: based image:
+            <img src="{}" alt="a">
+            Cool, right?
+            Another two: <img src="{}" alt="b"><img src="{}" alt="c">
+            And finally
+            <img src="{}" alt="d"></p>"""
+
+        test_str = template.format(*old_src)
+        expected = template.format(*new_src)
+
+        upload_mock.side_effect = new_src
+        result = process_html(test_str)
+        self.assertEqual(expected, result)
+        upload_mock.assert_called_once_with(urlparse(old_src[0]))
+
+    @mock.patch('core.management.commands.ssl_images.convert_to_ssl')
+    def test_change_img_to_a(self, convert_mock):
+        convert_mock.return_value = None
+        test_str = """<p>A bad image:
+            <img src="http://example.com/zzz.png" alt="1" title="the title">
+            It's cool.</p>"""
+
+        result = process_html(test_str)
+
+        expected = """<p>A bad image:
+            <a href="http://example.com/zzz.png">Image</a>
+            It's cool.</p>"""
+        self.assertEqual(result, expected)
--- a/news/models.py	Sun Oct 25 14:47:29 2015 -0500
+++ b/news/models.py	Wed Oct 28 21:06:13 2015 -0500
@@ -18,7 +18,7 @@
         return self.title
 
     def num_stories(self):
-        return News.objects.filter(category = self.pk).count()
+        return Story.objects.filter(category=self.pk).count()
 
     class Meta:
         verbose_name_plural = 'Categories'