annotate forums/latest.py @ 887:9a15f7c27526

Actually save model object upon change. This commit was tested on the comments model. Additional logging added. Added check for Markdown image references. Added TODOs after observing behavior on comments.
author Brian Neal <bgneal@gmail.com>
date Tue, 03 Feb 2015 21:09:44 -0600
parents 7429c98c8ece
children 90e8cc6eff77
rev   line source
bgneal@509 1 """
bgneal@509 2 This module maintains the latest posts datastore. The latest posts are often
bgneal@509 3 needed by RSS feeds, "latest posts" template tags, etc. This module listens for
bgneal@509 4 the post_content_update signal, then bundles the post up and stores it by forum
bgneal@509 5 ID in Redis. We also maintain a combined forums list. This allows quick
bgneal@509 6 retrieval of the latest posts and avoids some slow SQL queries.
bgneal@509 7
bgneal@522 8 We also do things like send topic notification emails, auto-favorite, and
bgneal@522 9 auto-subscribe functions here rather than bog the user down in the request /
bgneal@522 10 response cycle.
bgneal@522 11
bgneal@509 12 """
bgneal@595 13 # Maintenance notes:
bgneal@595 14 # How we use Redis in this module:
bgneal@595 15 #
bgneal@595 16 # Forum post processing:
bgneal@595 17 #
bgneal@595 18 # * Forum posts are turned into Python dictionaries, then converted to JSON and
bgneal@595 19 # stored under keys: forums:post:id
bgneal@595 20 # * Each forum has a list in Redis stored under the key: forums:rss:id. This
bgneal@595 21 # is a list of post IDs.
bgneal@595 22 # * There is also a key called forums:rss:* which is the combined latest
bgneal@595 23 # feed. It is also a list of post IDs.
bgneal@595 24 # * A sorted set is maintained that keeps track of the reference count for each
bgneal@595 25 # post. When a new post is created, this reference count is 2 because it is
bgneal@595 26 # stored in both the combined list and the parent forum list.
bgneal@595 27 # This sorted set is stored under the key: forums:post_ref_cnt.
bgneal@595 28 # * When a post falls off a list due to aging, the reference count in the
bgneal@595 29 # ordered set is decremented. If it falls to zero, the post's key is deleted
bgneal@595 30 # from Redis.
bgneal@595 31 # * When a post is edited, and it is in Redis, we simply update the JSON
bgneal@595 32 # content.
bgneal@595 33 # * When a post is deleted, and it is in Redis, it is removed from the 2 lists,
bgneal@595 34 # the ordered set, and deleted from Redis.
bgneal@595 35 # * When the RSS feed wants to update, it simply pulls down the entire list of
bgneal@595 36 # post IDs for the feed of interest, then does a get on all the posts.
bgneal@595 37 #
bgneal@595 38 # Topics with recent posts processing:
bgneal@595 39 #
bgneal@595 40 # * A key is created for each topic that is updated.
bgneal@595 41 # * An ordered set of topics is maintained with the current time as the score.
bgneal@595 42 # * An updated topic gets its score bumped.
bgneal@595 43 # * We only allow MAX_UPDATED_TOPICS number of topics in the set. We sort the
bgneal@595 44 # set by score, and the expired topics are removed from the set and their keys
bgneal@595 45 # are deleted from Redis.
bgneal@595 46 # * The template tag (or anyone) who wants the list of topics with new posts
bgneal@595 47 # gets the list of IDs sorted by score from newest to oldest. An mget is then
bgneal@595 48 # performed to get all the topic data and it is deserialized from JSON.
bgneal@595 49 #
bgneal@595 50 # We also maintain topic and post counts in Redis since SELECT COUNT(*) can
bgneal@595 51 # take a while with MySQL InnoDB.
bgneal@595 52 #
bgneal@509 53 import datetime
bgneal@679 54 import json
bgneal@522 55 import logging
bgneal@509 56 import time
bgneal@509 57
bgneal@509 58 from django.dispatch import receiver
bgneal@594 59 from django.template.loader import render_to_string
bgneal@523 60 import redis
bgneal@509 61
bgneal@522 62 from forums.signals import post_content_update, topic_content_update
bgneal@594 63 from forums.models import Forum, Topic, Post, Attachment
bgneal@522 64 from forums.views.subscriptions import notify_topic_subscribers
bgneal@522 65 from forums.tools import auto_favorite, auto_subscribe
bgneal@509 66 from core.services import get_redis_connection
bgneal@792 67 from core.markup import site_markup
bgneal@509 68
# This constant controls how many latest posts per forum we store
MAX_POSTS = 50

# This controls how many updated topics we track
MAX_UPDATED_TOPICS = 50

# Redis key names:
POST_COUNT_KEY = "forums:public_post_count"            # cached count of posts in public forums
TOPIC_COUNT_KEY = "forums:public_topic_count"          # cached count of topics in public forums
UPDATED_TOPICS_SET_KEY = "forums:updated_topics:set"   # sorted set of topic ids scored by post time
UPDATED_TOPIC_KEY = "forums:updated_topics:%s"         # JSON blob for one updated topic (%s = topic id)
POST_KEY = "forums:post:%s"                            # JSON blob for one post (%s = post id)
FORUM_RSS_KEY = "forums:rss:%s"                        # list of post ids for one forum's feed (%s = forum id)
ALL_FORUMS_RSS_KEY = "forums:rss:*"                    # list of post ids for the combined feed
POST_SET_KEY = "forums:post_ref_cnt"                   # sorted set: post id -> feed reference count

# Module-level logger, named after the module per logging convention.
logger = logging.getLogger(__name__)
bgneal@522 86
bgneal@509 87
@receiver(post_content_update, dispatch_uid='forums.latest_posts')
def on_post_update(sender, **kwargs):
    """
    Signal handler invoked when a post has been created or updated.

    All real work is handed off to a Celery task so that nothing slows
    down the request/response cycle.

    """
    task = (forums.tasks.new_post_task if kwargs['created']
            else forums.tasks.updated_post_task)
    task.delay(sender.id)
bgneal@522 102
bgneal@522 103
def process_new_post(post_id):
    """
    Perform all new-post processing; runs inside a Celery task.

    Updates the Redis feed caches and statistics for posts in public
    forums, then sends notifications and applies auto-favorite /
    auto-subscribe actions for the post.

    """
    try:
        post = Post.objects.select_related().get(pk=post_id)
    except Post.DoesNotExist:
        logger.warning("process_new_post: post %d does not exist", post_id)
        return

    # Only posts in public forums affect the cached feeds and stats
    public_ids = Forum.objects.public_forum_ids()

    if post.topic.forum.id in public_ids:
        conn = get_redis_connection()
        _update_post_feeds(conn, post)
        _update_post_count(conn, public_ids)
        _update_latest_topics(conn, post)

    # Email notifications go out immediately; we're already off the
    # request/response cycle here.
    notify_topic_subscribers(post, defer=False)

    # Auto-favorite and auto-subscribe bookkeeping for the new post
    auto_favorite(post)
    auto_subscribe(post)
bgneal@522 130
bgneal@522 131
def process_updated_post(post_id):
    """
    Perform all updated-post processing; runs inside a Celery task.

    Only posts currently cached in Redis (i.e. present in an RSS feed)
    need refreshing; otherwise this is a no-op.

    """
    conn = get_redis_connection()
    key = POST_KEY % post_id

    if conn.get(key) is None:
        return      # not in any feed; nothing cached to refresh

    try:
        post = Post.objects.select_related().get(pk=post_id)
    except Post.DoesNotExist:
        logger.warning("process_updated_post: post %d does not exist", post_id)
        return

    # Re-serialize the post and overwrite the cached copy
    conn.set(key, _serialize_post(post))
bgneal@595 151
bgneal@595 152
def _update_post_feeds(conn, post):
    """
    Updates the forum feeds we keep in Redis so that our RSS feeds are quick.

    The serialized post JSON is stored once under its own key and referenced
    by post id from two lists: the post's forum feed and the combined feed.
    A sorted set tracks how many lists reference each post; when a post ages
    off both lists, its JSON key is deleted.

    """
    post_key = POST_KEY % post.id
    post_value = _serialize_post(post)

    pipeline = conn.pipeline()

    # Store serialized post content under its own key
    pipeline.set(post_key, post_value)

    # Store in the RSS feed for the post's forum
    forum_key = FORUM_RSS_KEY % post.topic.forum.id
    pipeline.lpush(forum_key, post.id)

    # Store in the RSS feed for combined forums
    pipeline.lpush(ALL_FORUMS_RSS_KEY, post.id)

    # Store reference count for the post; it starts at 2 because the post is
    # on both the forum list and the combined list.
    # NOTE(review): the (key, score, member) argument order matches the
    # pre-3.0 redis-py zadd signature -- confirm against the client version.
    pipeline.zadd(POST_SET_KEY, 2, post.id)

    # results align with the commands above:
    # [0] SET reply, [1] forum-list length, [2] combined-list length, [3] ZADD
    results = pipeline.execute()

    # Make sure our forums RSS lists lengths are not exceeded

    if results[1] > MAX_POSTS or results[2] > MAX_POSTS:
        pipeline = conn.pipeline()

        # Truncate lists of posts:
        if results[1] > MAX_POSTS:
            pipeline.rpop(forum_key)
        if results[2] > MAX_POSTS:
            pipeline.rpop(ALL_FORUMS_RSS_KEY)
        # RPOP returns the removed element, so post_ids holds the id(s) that
        # just fell off the list(s); it has one entry per truncated list.
        post_ids = pipeline.execute()

        # Decrement reference count(s).
        # NOTE(review): (key, member, amount) is the pre-3.0 redis-py zincrby
        # signature; redis-py >= 3.0 expects (key, amount, member).
        pipeline = conn.pipeline()
        for post_id in post_ids:
            pipeline.zincrby(POST_SET_KEY, post_id, -1)
        scores = pipeline.execute()

        # If any reference counts have fallen to 0, clean up:
        if not all(scores):
            pipeline = conn.pipeline()

            # remove from post set; scores[] aligns index-for-index with
            # post_ids[], so select the ids whose count reached zero
            ids = [post_ids[n] for n, s in enumerate(scores) if s <= 0.0]
            pipeline.zrem(POST_SET_KEY, *ids)

            # remove serialized post data
            keys = [POST_KEY % n for n in ids]
            pipeline.delete(*keys)

            pipeline.execute()
bgneal@509 209
bgneal@509 210
def _update_post_count(conn, public_forums):
    """
    Maintain the cached public post count in Redis; a COUNT(*) on the post
    table can be expensive in MySQL InnoDB, so we keep a running counter.

    """
    # INCR returns 1 when the key did not previously exist, which means
    # Redis lost our counter; rebuild it from the database.
    if conn.incr(POST_COUNT_KEY) == 1:
        total = Post.objects.filter(topic__forum__in=public_forums).count()
        conn.set(POST_COUNT_KEY, total)
bgneal@522 223
bgneal@522 224
def _update_latest_topics(conn, post):
    """
    Refresh the "latest topics with new posts" cache we keep in Redis for
    speed; a template tag and a forum view read this information.

    """
    # Serialize the topic attributes; the score is the post time as a
    # Unix timestamp so the sorted set orders topics chronologically.
    topic = post.topic
    score = int(time.mktime(post.creation_date.timetuple()))

    payload = json.dumps({
        'title': topic.name,
        'author': post.user.username,
        'date': score,
        'url': topic.get_latest_post_url()
    })
    key = UPDATED_TOPIC_KEY % topic.id

    pipeline = conn.pipeline()
    pipeline.set(key, payload)
    pipeline.zadd(UPDATED_TOPICS_SET_KEY, score, topic.id)
    pipeline.zcard(UPDATED_TOPICS_SET_KEY)
    results = pipeline.execute()

    # Evict topics beyond our maximum count; the oldest entries occupy
    # the low end of the sorted set.
    excess = results[-1] - MAX_UPDATED_TOPICS
    if excess > 0:
        stop = excess - 1   # Redis range indices are inclusive
        stale_ids = conn.zrange(UPDATED_TOPICS_SET_KEY, 0, stop)

        # Delete the per-topic JSON blobs, then the set entries themselves
        conn.delete(*[UPDATED_TOPIC_KEY % n for n in stale_ids])
        conn.zremrangebyrank(UPDATED_TOPICS_SET_KEY, 0, stop)
bgneal@522 264
bgneal@522 265
def get_latest_posts(num_posts=MAX_POSTS, forum_id=None):
    """
    Return a list of dictionaries describing the latest posts for the forum
    with the given forum_id, or from the combined-forums feed when forum_id
    is None. Each dictionary holds one post's data; at most num_posts
    (capped at MAX_POSTS) are returned.

    """
    num_posts = max(0, min(MAX_POSTS, num_posts))
    if num_posts == 0:
        return []

    key = FORUM_RSS_KEY % forum_id if forum_id else ALL_FORUMS_RSS_KEY

    conn = get_redis_connection()
    ids = conn.lrange(key, 0, num_posts - 1)
    if not ids:
        return []

    # Fetch all the serialized posts at once; drop any that expired
    blobs = conn.mget([POST_KEY % n for n in ids])

    posts = []
    for blob in blobs:
        if blob is None:
            continue
        post = json.loads(blob)
        # pubdate was stored as a Unix timestamp; restore the datetime
        post['pubdate'] = datetime.datetime.fromtimestamp(post['pubdate'])
        posts.append(post)

    return posts
bgneal@522 300
bgneal@522 301
@receiver(topic_content_update, dispatch_uid='forums.latest_posts')
def on_topic_update(sender, **kwargs):
    """
    Signal handler invoked when a topic has been created or updated.

    All real work is handed off to a Celery task so that nothing slows
    down the request/response cycle.

    """
    task = (forums.tasks.new_topic_task if kwargs['created']
            else forums.tasks.updated_topic_task)
    task.delay(sender.id)
bgneal@522 316
bgneal@522 317
def process_new_topic(topic_id):
    """
    New-topic processing; currently this only maintains the cached public
    topic count statistic.

    """
    try:
        topic = Topic.objects.select_related().get(pk=topic_id)
    except Topic.DoesNotExist:
        logger.warning("process_new_topic: topic %d does not exist", topic_id)
        return

    # Topics in non-public forums don't count toward the public statistic
    public_ids = Forum.objects.public_forum_ids()
    if topic.forum.id not in public_ids:
        return

    conn = get_redis_connection()

    # INCR returning 1 means the key did not previously exist, i.e. Redis
    # lost our counter; rebuild it from the database.
    if conn.incr(TOPIC_COUNT_KEY) == 1:
        count = Topic.objects.filter(forum__in=public_ids).count()
        conn.set(TOPIC_COUNT_KEY, count)
bgneal@522 345
bgneal@522 346
def process_updated_topic(topic_id):
    """
    Updated-topic processing: refresh the cached title if it has changed.

    Only topics present in the updated-topics cache are touched.

    """
    conn = get_redis_connection()
    key = UPDATED_TOPIC_KEY % topic_id
    cached = conn.get(key)
    if cached is None:
        return      # topic isn't cached; nothing to update

    try:
        topic = Topic.objects.get(pk=topic_id)
    except Topic.DoesNotExist:
        logger.warning("topic %d does not exist", topic_id)
        return

    data = json.loads(cached)
    if data['title'] != topic.name:
        data['title'] = topic.name
        conn.set(key, json.dumps(data))
bgneal@595 368
bgneal@595 369
def get_stats():
    """
    Return the topic and post count statistics as a (topic_count,
    post_count) tuple. A position holds None when that statistic is
    unavailable.

    """
    try:
        conn = get_redis_connection()
        values = conn.mget(TOPIC_COUNT_KEY, POST_COUNT_KEY)
    except redis.RedisError as e:
        logger.error(e)
        return (None, None)

    topic_count, post_count = [int(v) if v else None for v in values]
    return (topic_count, post_count)
bgneal@522 388
bgneal@522 389
def get_latest_topic_ids(num):
    """
    Return a list of topic ids from the latest topics that have posts,
    sorted from newest to oldest.

    """
    try:
        conn = get_redis_connection()
        raw_ids = conn.zrevrange(UPDATED_TOPICS_SET_KEY, 0, num - 1)
    except redis.RedisError as e:
        logger.error(e)
        return []

    return [int(raw_id) for raw_id in raw_ids]
bgneal@522 404
bgneal@522 405
def get_latest_topics(num):
    """
    Return a list of dictionaries with information about the latest topics
    that have updated posts, sorted from newest to oldest.

    """
    try:
        conn = get_redis_connection()
        topic_ids = conn.zrevrange(UPDATED_TOPICS_SET_KEY, 0, num - 1)

        keys = [UPDATED_TOPIC_KEY % n for n in topic_ids]
        blobs = conn.mget(keys) if keys else []

    except redis.RedisError as e:
        logger.error(e)
        return []

    topics = []
    for blob in blobs:
        topic = json.loads(blob)
        # 'date' was stored as a Unix timestamp; restore the datetime
        topic['date'] = datetime.datetime.fromtimestamp(topic['date'])
        topics.append(topic)

    return topics
bgneal@522 430
bgneal@522 431
def notify_topic_delete(topic):
    """
    Call this when a topic is deleted: removes the topic from the updated
    topics set, if present, and deletes any cached info about it.

    Note we don't do anything like this for posts. Since they just populate
    RSS feeds we'll let them 404. The updated topic list is seen in a
    prominent template tag however, so it is a bit more important to get
    that cleaned up.

    """
    try:
        pipeline = get_redis_connection().pipeline()
        pipeline.zrem(UPDATED_TOPICS_SET_KEY, topic.id)
        pipeline.delete(UPDATED_TOPIC_KEY % topic.id)
        pipeline.execute()
    except redis.RedisError as e:
        logger.error(e)
bgneal@522 451
bgneal@522 452
def _serialize_post(post):
    """Serialize a post to JSON and return it.

    The post body is re-converted from Markdown with absolute URLs so that
    smiley/image references resolve correctly in RSS readers.

    """
    content = site_markup(post.body, relative_urls=False)

    # Get any attachments for the post and wrap the content with their
    # embeds using the RSS template.
    attachments = Attachment.objects.filter(post=post).select_related(
        'embed').order_by('order')
    embeds = [item.embed for item in attachments]
    if embeds:  # idiomatic truthiness test instead of len()
        content = render_to_string('forums/post_rss.html', {
            'content': content,
            'embeds': embeds,
        })

    # Serialize post attributes; pubdate is stored as a Unix timestamp and
    # restored to a datetime by readers (see get_latest_posts).
    post_content = {
        'id': post.id,
        'title': post.topic.name,
        'content': content,
        'author': post.user.username,
        'pubdate': int(time.mktime(post.creation_date.timetuple())),
        'forum_name': post.topic.forum.name,
        'url': post.get_absolute_url()
    }

    return json.dumps(post_content)
bgneal@595 483
bgneal@595 484
bgneal@522 485 # Down here to avoid a circular import
bgneal@522 486 import forums.tasks