annotate news/management/commands/fix_news_embeds.py @ 1076:b24708086bfc

Check for news stories with http iframes.
author Brian Neal <bgneal@gmail.com>
date Sat, 16 Apr 2016 12:16:19 -0500
parents 5bba39fafad8
children
rev   line source
bgneal@1075 1 """
bgneal@1075 2 Custom management command to find and fix the old YouTube <object> embeds in the
bgneal@1075 3 news stories.
bgneal@1075 4 """
bgneal@1075 5 from optparse import make_option
bgneal@1075 6 import urlparse
bgneal@1075 7 from urllib2 import HTTPError
bgneal@1075 8
bgneal@1075 9 from django.core.management.base import NoArgsCommand
bgneal@1075 10 import lxml.html
bgneal@1075 11
bgneal@1075 12 from news.models import Story
bgneal@1075 13 from oembed.core import get_oembed
bgneal@1075 14
bgneal@1075 15
class Command(NoArgsCommand):
    """Find old flash YouTube ``<object>`` embeds in news stories and replace
    them with the HTML returned by YouTube's oEmbed endpoint.

    Also reports (without modifying) any ``<iframe>`` whose ``src`` uses plain
    ``http:``.

    With ``--preview`` the rewritten HTML is printed and nothing is saved.
    """
    help = "Rewrite news stories that have old flash YouTube embeds"
    option_list = NoArgsCommand.option_list + (
        make_option('-p', '--preview',
            action='store_true',
            default=False,
            help="find and print old embeds but don't update"),
    )

    # Story attributes that are scanned for embeds.
    _HTML_FIELDS = ('short_text', 'long_text', 'admin_content')

    # Hostnames accepted as the source of an old-style embed.
    _YOUTUBE_HOSTS = ('www.youtube.com', 'www.youtube-nocookie.com')

    # NOTE: every print below passes exactly one pre-formatted string inside
    # parentheses, so the emitted text matches what the equivalent
    # comma-separated Python 2 print statement would produce.

    def handle_noargs(self, **options):
        """Entry point: process every Story in the database."""
        self.preview = options['preview']

        for story in Story.objects.all().iterator():
            self._process_story(story)

    def _process_story(self, story):
        """Process each HTML field of *story*; save it if any field changed.

        Nothing is saved in preview mode.
        """
        # Build the full list first so every field is processed; a
        # short-circuiting any() over a generator would skip later fields.
        results = [self._process_html(story, field)
                   for field in self._HTML_FIELDS]
        if not self.preview and any(results):
            print("Updating %s" % story.title)
            story.save()

    def _process_html(self, story, field):
        """Replace old YouTube ``<object>`` embeds in ``story.<field>``.

        Returns True only when the attribute was actually rewritten (at least
        one embed replaced and not in preview mode). Returns False when the
        field is empty, contains nothing to replace, or in preview mode (where
        the would-be HTML is printed instead).
        """
        html = getattr(story, field)
        s = html.strip() if html else ''
        if not s:
            return False

        # create_parent=True wraps the fragment in a <div> so that several
        # sibling top-level elements can be parsed as one tree.
        root = lxml.html.fragment_fromstring(s, create_parent=True)

        for iframe in root.iter('iframe'):
            src = iframe.get('src')
            if src and src.startswith('http:'):
                print("%s iframe with http src - %s" % ("*" * 5, story.title))

        changed = False
        # Snapshot the matches: the loop body replaces nodes, and mutating the
        # tree while a live iter() is being consumed is not safe.
        for obj in list(root.iter('object')):
            if story.version != 0:
                print("%s %s bad version!" % ("*" * 5, story.title))
                continue

            new_embed = self._find_replacement(story, obj)
            if not new_embed:
                continue

            new_child = lxml.html.fragment_fromstring(new_embed)
            # lxml keeps an element's tail text with the element, so text that
            # followed </object> would vanish with it; carry it over.
            new_child.tail = obj.tail
            obj.getparent().replace(obj, new_child)
            changed = True

        if not changed:
            # Nothing was replaced: leave the stored HTML untouched so the
            # story is neither re-serialized nor reported as updated.
            return False

        # Strip the wrapping "<div>" (5 chars) and "</div>" (6 chars).
        new_html = lxml.html.tostring(root)[5:-6]
        if self.preview:
            print(story.title)
            print("%s %s" % (new_html, "\n" * 3))
            return False

        setattr(story, field, new_html)
        return True

    def _find_replacement(self, story, obj):
        """Return replacement embed HTML for the ``<object>`` element *obj*.

        Scans the element's ``<param>`` values for a YouTube URL and returns
        the first replacement obtained, or None if no param yields one.
        Stopping at the first hit matters: the caller detaches *obj* from the
        tree, so it must not be processed a second time.
        """
        for param in obj.iter('param'):
            value = param.get('value')
            if not (value and value.startswith('http')):
                continue

            r = urlparse.urlparse(value)
            if r.hostname not in self._YOUTUBE_HOSTS:
                print("Unknown source hostname: %s ; %s" %
                      (r.hostname, story.title))
                continue

            try:
                new_embed = self._process_path(story, r.path)
            except HTTPError as ex:
                print("%s %s %s" % ("*" * 5, story.title, ex))
                continue

            if new_embed:
                return new_embed
        return None

    def _process_path(self, story, path):
        """Map an old embed URL *path* to new embed HTML via oEmbed.

        ``/v/<id>`` paths become a watch URL and ``/p/<id>`` paths become a
        playlist URL (with ``PL`` prefixed to the id). Any other path is
        reported and None is returned. Otherwise returns the ``'html'`` entry
        of the ``get_oembed`` response. The caller handles ``HTTPError``,
        presumably raised by ``get_oembed`` -- TODO confirm.
        """
        if path.startswith('/v/'):
            video_id = strip_query(path[3:])
            url = "https://www.youtube.com/watch?v={}".format(video_id)
        elif path.startswith('/p/'):
            playlist_id = strip_query(path[3:])
            url = "https://www.youtube.com/playlist?list=PL{}".format(playlist_id)
        else:
            print("Unknown YouTube path: %s ; %s" % (path, story.title))
            return None

        oembed = get_oembed("http://www.youtube.com/oembed", url, scheme='https')
        return oembed['html']
bgneal@1075 98
bgneal@1075 99
def strip_query(path):
    """Return *path* truncated just before its first ``&``.

    A path containing no ``&`` is returned unchanged.
    """
    head, _sep, _rest = path.partition('&')
    return head