Mercurial > public > sg101
changeset 987:76525f5ac2b1
Modify ssl_images to update news models.
author | Brian Neal <bgneal@gmail.com> |
---|---|
date | Wed, 28 Oct 2015 21:06:13 -0500 |
parents | 26de15fb5a80 |
children | 65b2bc9cb3cc |
files | core/management/commands/ssl_images.py core/tests/test_ssl_images.py news/models.py |
diffstat | 3 files changed, 260 insertions(+), 22 deletions(-) [+] |
line wrap: on
line diff
--- a/core/management/commands/ssl_images.py Sun Oct 25 14:47:29 2015 -0500 +++ b/core/management/commands/ssl_images.py Wed Oct 28 21:06:13 2015 -0500 @@ -20,6 +20,7 @@ from django.core.management.base import NoArgsCommand, CommandError from django.conf import settings from lxml import etree +import lxml.html import markdown.inlinepatterns from PIL import Image import requests @@ -29,6 +30,7 @@ from core.download import download_file from core.functions import remove_file from core.s3 import S3Bucket +from news.models import Story LOGFILE = os.path.join(settings.PROJECT_PATH, 'logs', 'ssl_images.log') @@ -41,7 +43,7 @@ SG101_HOSTS = set(['www.surfguitar101.com', 'surfguitar101.com']) WHITELIST_HOSTS = set(settings.USER_IMAGES_SOURCES) -MODEL_CHOICES = ['comments', 'posts'] +MODEL_CHOICES = ['comments', 'posts', 'news'] PHOTO_MAX_SIZE = (660, 720) PHOTO_BASE_URL = settings.HOT_LINK_PHOTOS_BASE_URL @@ -239,6 +241,50 @@ return IMAGE_LINK_RE.sub(replace_image_markup, text) +def process_html(html): + """Process the html fragment, converting to https where needed.""" + s = html.strip() + if not s: + return s + + changed = False + root = lxml.html.fragment_fromstring(s, create_parent=True) + for img in root.iter('img'): + src = img.get('src') + src = src.strip() if src else '' + if src: + try: + r = urlparse.urlparse(src) + except ValueError: + logger.warning("Bad url? Should not happen; skipping...") + continue + + new_src = None + if r.hostname in SG101_HOSTS: + new_src = r.path # convert to relative path + elif ((r.scheme == 'http') or + (r.scheme == 'https' and r.hostname not in WHITELIST_HOSTS)): + new_src = convert_to_ssl(r) + if not new_src: + # failed to convert to https; convert to a link + tail = img.tail + img.clear() + img.tag = 'a' + img.set('href', src) + img.text = 'Image' + img.tail = tail + changed = True + + if new_src: + img.set('src', new_src) + changed = True + + if changed: + result = lxml.html.tostring(root, encoding='utf-8') + return result[5:-6] # strip off parent div we added + return html + + def html_check(html): """Return True if the given HTML fragment has <img> tags with src attributes that use http, and False otherwise. @@ -283,12 +329,18 @@ if options['model'] == 'comments': qs = Comment.objects.all() - text_attr = 'comment' + text_attrs = ['comment'] model_name = 'Comment' + elif options['model'] == 'posts': + qs = Post.objects.all() + text_attrs = ['body'] + model_name = 'Post' else: - qs = Post.objects.all() - text_attr = 'body' - model_name = 'Post' + qs = Story.objects.all() + text_attrs = ['short_text', 'long_text'] + model_name = 'Story' + + html_based = options['model'] == 'news' i, j = options['i'], options['j'] @@ -333,21 +385,31 @@ logger.warning("SIGINT received, exiting") break logger.info("Processing %s #%d (pk = %d)", model_name, n + i, model.pk) - txt = getattr(model, text_attr) - warn_if_image_refs(txt, model_name, model.pk) - new_txt = process_post(txt) - if txt != new_txt: - logger.info("Content changed on %s #%d (pk = %d)", - model_name, n + i, model.pk) - logger.debug("original: %s", txt) - logger.debug("changed: %s", new_txt) - setattr(model, text_attr, new_txt) - model.save() - elif html_check(model.html): - # Check for content generated with older smiley code that used - # absolute URLs for the smiley images. If True, then just save - # the model again to force updated HTML to be created. - logger.info("Older Smiley HTML detected, forcing a save") + save_flag = False + for text_attr in text_attrs: + txt = getattr(model, text_attr) + + if html_based: + new_txt = process_html(txt) + else: + new_txt = process_post(txt) + warn_if_image_refs(txt, model_name, model.pk) + + if txt != new_txt: + logger.info("Content changed on %s #%d (pk = %d)", + model_name, n + i, model.pk) + logger.debug(u"original: %s", txt) + logger.debug(u"changed: %s", new_txt) + setattr(model, text_attr, new_txt) + save_flag = True + elif not html_based and html_check(model.html): + # Check for content generated with older smiley code that used + # absolute URLs for the smiley images. If True, then just save + # the model again to force updated HTML to be created. + logger.info("Older Smiley HTML detected, forcing a save") + save_flag = True + + if save_flag: model.save() count += 1
--- a/core/tests/test_ssl_images.py Sun Oct 25 14:47:29 2015 -0500 +++ b/core/tests/test_ssl_images.py Wed Oct 28 21:06:13 2015 -0500 @@ -7,7 +7,7 @@ from django.conf import settings from core.management.commands.ssl_images import html_check -from core.management.commands.ssl_images import process_post +from core.management.commands.ssl_images import process_post, process_html import core.management.commands.ssl_images @@ -290,3 +290,179 @@ <p>Look again: <img src="https://b.jpg" alt="b" /></p> </div> """)) + + +class ProcessHtmlTestCase(unittest.TestCase): + + SG101_RE = re.compile(r'http://(?:www\.)?surfguitar101.com/', re.I) + + def setUp(self): + self.assertTrue(len(settings.USER_IMAGES_SOURCES) > 0) + self.safe_host = settings.USER_IMAGES_SOURCES[0] + + def tearDown(self): + core.management.commands.ssl_images.url_cache = {} + + def test_empty_string(self): + s = process_html('') + self.assertEqual(s, '') + + def test_whitespace_string(self): + s = process_html('\r\n\r\n') + self.assertEqual(s, '') + + def test_no_matches(self): + test_str = """<p>Here is a post that doesn't contain any image links at + all. It also spans lines.</p> + """ + result = process_html(test_str) + self.assertEqual(test_str, result) + + def test_multiple_paragraphs(self): + test_str = """<p>Here is a post that doesn't contain any image links at + all. It also spans lines.</p> + """ + test_str += test_str + result = process_html(test_str) + self.assertEqual(test_str, result) + + def test_sg101_images(self): + test_str = """<p>An image: + <img src="http://www.surfguitar101.com/img.jpg" alt="image"> + And another: <img src="HTTP://SURFGUITAR101.COM/foo/bar/img.png" alt="pic"> + More stuff here.</p>""" + expected = self.SG101_RE.sub('/', test_str) + result = process_html(test_str) + self.assertNotEqual(test_str, expected) + self.assertEqual(expected, result) + + def test_https_already(self): + test_str = """<p>An image that is already using https: + <img src="https://{}/zzz.png" alt="pic"> + It's cool.</p>""".format(self.safe_host) + result = process_html(test_str) + self.assertEqual(test_str, result) + + def test_https_sg101(self): + test_str = """<p>An image that is already using https: + <img src="https://www.SURFGUITAR101.com/zzz.png" alt="pic"> + It's cool.</p> + """ + expected = """<p>An image that is already using https: + <img src="/zzz.png" alt="pic"> + It's cool.</p>""" + result = process_html(test_str) + self.assertEqual(expected, result) + + def test_multiple_non_http(self): + test_str = """<p>An image: + <img src="http://www.surfguitar101.com/img.jpg" alt="pic"> + And another: + <img src="HTTPS://{}/foo/bar/img.png" alt="stuff"> + More stuff here.</p> + """.format(self.safe_host) + expected = """<p>An image: + <img src="/img.jpg" alt="pic"> + And another: + <img src="HTTPS://{}/foo/bar/img.png" alt="stuff"> + More stuff here.</p>""".format(self.safe_host) + result = process_html(test_str) + self.assertEqual(expected, result) + + def test_https_already_with_title(self): + test_str = """<p>An image that is already using https: + <img src="https://{}/zzz.png" alt="1" title="the title"> + It's cool.</p> + """.format(self.safe_host) + result = process_html(test_str) + self.assertEqual(test_str, result) + + @mock.patch('core.management.commands.ssl_images.save_image_to_cloud') + def test_simple_replacement(self, upload_mock): + old_src = 'http://example.com/images/my_image.jpg' + new_src = 'https://cloud.com/ABCDEF.jpg' + test_str = """<p>Here is a really cool http: based image: + <img src="{}" alt="a"> + Cool, right?</p>""".format(old_src) + expected = """<p>Here is a really cool http: based image: + <img src="{}" alt="a"> + Cool, right?</p>""".format(new_src) + + upload_mock.return_value = new_src + result = process_html(test_str) + self.assertEqual(expected, result) + upload_mock.assert_called_once_with(urlparse(old_src)) + + @mock.patch('core.management.commands.ssl_images.save_image_to_cloud') + def test_multiple_replacement(self, upload_mock): + old_src = [ + 'http://example.com/images/my_image.jpg', + 'http://example.com/static/wow.gif', + 'http://example.com/media/a/b/c/pic.png', + ] + new_src = [ + 'https://cloud.com/some/path/012345.jpg', + 'https://cloud.com/some/path/6789AB.gif', + 'https://cloud.com/some/path/CDEF01.png', + ] + + template = """<p>Here is a really cool http: based image: + <img src="{}" alt="a"> + Cool, right? + Another one: <img src="{}" alt="b"> + And finally + <img src="{}" alt="c"> + </p>""" + + test_str = template.format(*old_src) + expected = template.format(*new_src) + + upload_mock.side_effect = new_src + result = process_html(test_str) + self.assertEqual(expected, result) + expected_args = [mock.call(urlparse(c)) for c in old_src] + self.assertEqual(upload_mock.call_args_list, expected_args) + + @mock.patch('core.management.commands.ssl_images.save_image_to_cloud') + def test_multiple_replacement_2(self, upload_mock): + old_src = [ + 'http://example.com/images/my_image.jpg', + 'https://{}/static/wow.gif'.format(self.safe_host), + 'http://www.surfguitar101.com/media/a/b/c/pic.png', + 'http://surfguitar101.com/media/a/b/c/pic2.png', + ] + new_src = [ + 'https://cloud.com/some/path/012345.jpg', + 'https://{}/static/wow.gif'.format(self.safe_host), + '/media/a/b/c/pic.png', + '/media/a/b/c/pic2.png', + ] + + template = """<p>Here is a really cool http: based image: + <img src="{}" alt="a"> + Cool, right? + Another two: <img src="{}" alt="b"><img src="{}" alt="c"> + And finally + <img src="{}" alt="d"></p>""" + + test_str = template.format(*old_src) + expected = template.format(*new_src) + + upload_mock.side_effect = new_src + result = process_html(test_str) + self.assertEqual(expected, result) + upload_mock.assert_called_once_with(urlparse(old_src[0])) + + @mock.patch('core.management.commands.ssl_images.convert_to_ssl') + def test_change_img_to_a(self, convert_mock): + convert_mock.return_value = None + test_str = """<p>A bad image: + <img src="http://example.com/zzz.png" alt="1" title="the title"> + It's cool.</p>""" + + result = process_html(test_str) + + expected = """<p>A bad image: + <a href="http://example.com/zzz.png">Image</a> + It's cool.</p>""" + self.assertEqual(result, expected)
--- a/news/models.py Sun Oct 25 14:47:29 2015 -0500 +++ b/news/models.py Wed Oct 28 21:06:13 2015 -0500 @@ -18,7 +18,7 @@ return self.title def num_stories(self): - return News.objects.filter(category = self.pk).count() + return Story.objects.filter(category=self.pk).count() class Meta: verbose_name_plural = 'Categories'