# HG changeset patch # User Brian Neal # Date 1298161914 0 # Node ID 0c18dfb1da1c51de16f8e7ce173c724a8cc1b639 # Parent c3d3d7114749872e1acd689f5156c910668d4648 Fixing #149; adding the ygroup application: an archive of the old Yahoo Group messages. diff -r c3d3d7114749 -r 0c18dfb1da1c gpp/settings.py --- a/gpp/settings.py Sat Feb 12 21:37:17 2011 +0000 +++ b/gpp/settings.py Sun Feb 20 00:31:54 2011 +0000 @@ -83,7 +83,7 @@ 'django.middleware.csrf.CsrfViewMiddleware', 'django.contrib.sessions.middleware.SessionMiddleware', 'django.contrib.messages.middleware.MessageMiddleware', - 'debug_toolbar.middleware.DebugToolbarMiddleware', + #'debug_toolbar.middleware.DebugToolbarMiddleware', 'django.contrib.auth.middleware.AuthenticationMiddleware', 'gpp.core.middleware.InactiveUserMiddleware', 'gpp.core.middleware.WhosOnline', @@ -158,9 +158,10 @@ 'shoutbox', 'smiley', 'weblinks', + 'ygroup', ] -if DEBUG: - INSTALLED_APPS.append('debug_toolbar') +#if DEBUG: +# INSTALLED_APPS.append('debug_toolbar') LOGIN_URL = '/accounts/login/' LOGIN_REDIRECT_URL = '/profile/me/' diff -r c3d3d7114749 -r 0c18dfb1da1c gpp/templates/base.html --- a/gpp/templates/base.html Sat Feb 12 21:37:17 2011 +0000 +++ b/gpp/templates/base.html Sun Feb 20 00:31:54 2011 +0000 @@ -1,8 +1,8 @@ -{% load url from future %} +{% load url from future %} {% load shoutbox_tags %} {% load irc_tags %} {% load potd_tags %} @@ -69,6 +69,7 @@
  • Photo of the Day
  • Links
  • Search
  • +
  • Yahoo Group
  • {% cache 300 potd_block %} {% photo_of_the_day %} diff -r c3d3d7114749 -r 0c18dfb1da1c gpp/templates/search/indexes/ygroup/post_text.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gpp/templates/search/indexes/ygroup/post_text.txt Sun Feb 20 00:31:54 2011 +0000 @@ -0,0 +1,2 @@ +{{ object.title }} +{{ object.msg }} diff -r c3d3d7114749 -r 0c18dfb1da1c gpp/templates/search/search.html --- a/gpp/templates/search/search.html Sat Feb 12 21:37:17 2011 +0000 +++ b/gpp/templates/search/search.html Sun Feb 20 00:31:54 2011 +0000 @@ -1,9 +1,30 @@ {% extends 'base.html' %} {% load highlight %} {% block title %}Search{% endblock %} +{% block custom_js %} + +{% endblock %} {% block content %}

    Search Search

    -
    + @@ -14,17 +35,29 @@ Search in:
    - - - + + + - - - + + + + + +
    Forum PostsNews StoriesUser Profiles + + +
    LinksDownloadsPodcasts + + +
    +
    +

    Check all | Check none

    +
    {% if query %}

    Results for "{{ query }}" page {{ page.number }} of {{ page.paginator.num_pages }}

    @@ -60,5 +93,4 @@ {% else %} {# Show some example queries to run, maybe query syntax, something else? #} {% endif %} - {% endblock %} diff -r c3d3d7114749 -r 0c18dfb1da1c gpp/templates/ygroup/pagination.html --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gpp/templates/ygroup/pagination.html Sun Feb 20 00:31:54 2011 +0000 @@ -0,0 +1,26 @@ + +
    diff -r c3d3d7114749 -r 0c18dfb1da1c gpp/templates/ygroup/post_detail.html --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gpp/templates/ygroup/post_detail.html Sun Feb 20 00:31:54 2011 +0000 @@ -0,0 +1,15 @@ +{% extends 'base.html' %} +{% load url from future %} +{% block title %}Yahoo Group Archives: {{ post.title }}{% endblock %} +{% block content %} +

    Yahoo Group Archives »

    +

    {{ post.title }} + + permalink +

    +
    +
    {{ post.poster }} - {{ post.creation_date|date:"d M Y H:i:s" }}
    +
    {{ post.msg|linebreaks }}
    +
    +

    See this post in context.

    +{% endblock %} diff -r c3d3d7114749 -r 0c18dfb1da1c gpp/templates/ygroup/thread.html --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gpp/templates/ygroup/thread.html Sun Feb 20 00:31:54 2011 +0000 @@ -0,0 +1,28 @@ +{% extends 'base.html' %} +{% load url from future %} +{% block title %}Yahoo Group Archives: {{ thread.title }}{% endblock %} +{% block custom_css %} + +{% endblock %} +{% block content %} +{% if thread.page == 1 %} +

    Yahoo Group Archives »

    +{% else %} +

    Yahoo Group Archives » + Page {{ thread.page }} »

    +{% endif %} +

    {{ thread.title }} + + permalink +

    +{% include "ygroup/pagination.html" %} +
    + {% for post in page_obj.object_list %} +
    {{ post.poster }} - {{ post.creation_date|date:"d M Y H:i:s" }} + + permalink
    +
    {{ post.msg|linebreaks }}
    + {% endfor %} +
    +{% include "ygroup/pagination.html" %} +{% endblock %} diff -r c3d3d7114749 -r 0c18dfb1da1c gpp/templates/ygroup/thread_list.html --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gpp/templates/ygroup/thread_list.html Sun Feb 20 00:31:54 2011 +0000 @@ -0,0 +1,25 @@ +{% extends 'base.html' %} +{% load url from future %} +{% block title %}Yahoo Group Archives{% endblock %} +{% block custom_css %} + +{% endblock %} +{% block content %} +

    Yahoo Group Archives » Page {{ page_obj.number }}

    +

    +SurfGuitar101.com began as a Yahoo Group on October 31, 2001. It ran until August, 2007 when this site officially replaced it. On these pages you'll find the archived messages of our original group. You can also search through these messages via our search page. +

    +{% include "ygroup/pagination.html" %} + + + {% for thread in page_obj.object_list %} + + + + + + + {% endfor %} +
    TitleAuthorPostsDate
    {{ thread.title }}{{ thread.poster }}{{ thread.post_count }}{{ thread.creation_date|date:"d M Y" }}
    +{% include "ygroup/pagination.html" %} +{% endblock %} diff -r c3d3d7114749 -r 0c18dfb1da1c gpp/urls.py --- a/gpp/urls.py Sat Feb 12 21:37:17 2011 +0000 +++ b/gpp/urls.py Sun Feb 20 00:31:54 2011 +0000 @@ -46,6 +46,7 @@ (r'^profile/', include('bio.urls')), (r'^shout/', include('shoutbox.urls')), (r'^smiley/', include('smiley.urls')), + (r'^ygroup/', include('ygroup.urls')), ) # Haystack search views diff -r c3d3d7114749 -r 0c18dfb1da1c gpp/ygroup/management/commands/sync_ygroup_posts.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gpp/ygroup/management/commands/sync_ygroup_posts.py Sun Feb 20 00:31:54 2011 +0000 @@ -0,0 +1,53 @@ +""" +sync_ygroup_posts.py - A management command to synchronize the yahoo group +archives by recomputing the de-normalized fields in the post objects. + +""" +import optparse + +from django.core.management.base import NoArgsCommand, CommandError +from django.core.urlresolvers import reverse + +from ygroup.models import Thread, Post +import ygroup.views + + +class Command(NoArgsCommand): + help = """\ +This command synchronizes the ygroup application's post objects +by updating their de-normalized fields. +""" + option_list = NoArgsCommand.option_list + ( + optparse.make_option("-p", "--progress", action="store_true", + help="Output a . after every 100 posts to show progress"), + ) + + def handle_noargs(self, **opts): + + show_progress = opts.get('progress', False) or False + + threads = {} + self.stdout.write("Processing threads...\n") + for thread in Thread.objects.iterator(): + threads[thread.id] = [reverse('ygroup-thread_view', args=[thread.id]), + list(Post.objects.filter(thread=thread).values_list('id', flat=True))] + + self.stdout.write("Processing posts...\n") + n = 0 + for post in Post.objects.iterator(): + thread = threads[post.thread.id] + pos = thread[1].index(post.id) + page = pos / ygroup.views.POSTS_PER_PAGE + 1 + if page == 1: + post.thread_url = thread[0] + '#p%d' % (post.id, ) + else: + post.thread_url = thread[0] + '?page=%d#p%d' % (page, post.id) + post.save() + + n += 1 + if show_progress and n % 100 == 0: + self.stdout.write('.') + self.stdout.flush() + + self.stdout.write('\n') + diff -r c3d3d7114749 -r 0c18dfb1da1c gpp/ygroup/management/commands/sync_ygroup_threads.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gpp/ygroup/management/commands/sync_ygroup_threads.py Sun Feb 20 00:31:54 2011 +0000 @@ -0,0 +1,39 @@ +""" +sync_ygroup_threads.py - A management command to synchronize the yahoo group +archives by recomputing the de-normalized fields in the thread objects. + +""" +import optparse + +from django.core.management.base import NoArgsCommand, CommandError + +from ygroup.models import Thread, Post +import ygroup.views + + +class Command(NoArgsCommand): + help = """\ +This command synchronizes the ygroup application's thread objects +by updating their de-normalized fields. +""" + option_list = NoArgsCommand.option_list + ( + optparse.make_option("-p", "--progress", action="store_true", + help="Output a . after every 50 threads to show progress"), + ) + + def handle_noargs(self, **opts): + + show_progress = opts.get('progress', False) or False + + n = 0 + for thread in Thread.objects.iterator(): + thread.post_count = Post.objects.filter(thread=thread).count() + thread.page = n / ygroup.views.THREADS_PER_PAGE + 1 + thread.save() + n += 1 + if n % 50 == 0: + self.stdout.write('.') + self.stdout.flush() + + self.stdout.write('\n') + diff -r c3d3d7114749 -r 0c18dfb1da1c gpp/ygroup/models.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gpp/ygroup/models.py Sun Feb 20 00:31:54 2011 +0000 @@ -0,0 +1,55 @@ +""" +Models for the ygroup application, which is a read-only archive of messages +from the old Yahoo Group. +""" +from django.db import models + + +class Thread(models.Model): + title = models.CharField(max_length=255) + creation_date = models.DateTimeField() + + # denormalized fields to reduce database hits + poster = models.CharField(max_length=128) + post_count = models.IntegerField(blank=True, default=0) + page = models.IntegerField(blank=True, default=1) + + class Meta: + ordering = ('creation_date', ) + + def __unicode__(self): + return u'Thread %d, %s' % (self.pk, self.title) + + @models.permalink + def get_absolute_url(self): + return ('ygroup-thread_view', [self.id]) + + +class Post(models.Model): + thread = models.ForeignKey(Thread, null=True, blank=True, + on_delete=models.SET_NULL, related_name='posts') + title = models.CharField(max_length=255) + creation_date = models.DateTimeField() + poster = models.CharField(max_length=128) + msg = models.TextField() + + # precomputed URL to this post in the parent thread for efficiency + thread_url = models.URLField(verify_exists=False, blank=True) + + class Meta: + ordering = ('creation_date', ) + verbose_name = 'yahoo group post' + verbose_name_plural = 'yahoo group posts' + + def __unicode__(self): + return u'Post %d, %s' % (self.pk, self.title) + + @models.permalink + def get_absolute_url(self): + return ('ygroup-post_view', [], {'pk': self.id}) + + def search_title(self): + return self.title + + def search_summary(self): + return self.msg diff -r c3d3d7114749 -r 0c18dfb1da1c gpp/ygroup/search_indexes.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gpp/ygroup/search_indexes.py Sun Feb 20 00:31:54 2011 +0000 @@ -0,0 +1,19 @@ +""" +Haystack search index for the Yahoo Group archives application. + +""" +from haystack.indexes import * +from haystack import site + +from ygroup.models import Post + + +class PostIndex(SearchIndex): + text = CharField(document=True, use_template=True) + pub_date = DateTimeField(model_attr='creation_date') + + def get_updated_field(self): + return 'creation_date' + + +site.register(Post, PostIndex) diff -r c3d3d7114749 -r 0c18dfb1da1c gpp/ygroup/tests.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gpp/ygroup/tests.py Sun Feb 20 00:31:54 2011 +0000 @@ -0,0 +1,16 @@ +""" +This file demonstrates writing tests using the unittest module. These will pass +when you run "manage.py test". + +Replace this with more appropriate tests for your application. +""" + +from django.test import TestCase + + +class SimpleTest(TestCase): + def test_basic_addition(self): + """ + Tests that 1 + 1 always equals 2. + """ + self.assertEqual(1 + 1, 2) diff -r c3d3d7114749 -r 0c18dfb1da1c gpp/ygroup/urls.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gpp/ygroup/urls.py Sun Feb 20 00:31:54 2011 +0000 @@ -0,0 +1,23 @@ +""" +urls.py - URLs for the ygroup application. + +""" +from django.conf.urls.defaults import * +from django.views.generic import ListView, DetailView + +from ygroup.models import Thread, Post +from ygroup.views import ThreadIndexView, ThreadView + + +urlpatterns = patterns('', + url(r'^threads/$', + ThreadIndexView.as_view(), + name='ygroup-thread_index'), + url(r'^thread/(\d+)/$', + ThreadView.as_view(), + name='ygroup-thread_view'), + url(r'^post/(?P\d+)/$', + DetailView.as_view(model=Post, context_object_name='post'), + name='ygroup-post_view'), +) + diff -r c3d3d7114749 -r 0c18dfb1da1c gpp/ygroup/views.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gpp/ygroup/views.py Sun Feb 20 00:31:54 2011 +0000 @@ -0,0 +1,55 @@ +""" +Views for the ygroup (Yahoo Group Archive) application. + +""" +from django.shortcuts import get_object_or_404 +from django.views.generic import ListView + +from ygroup.models import Thread, Post +from core.paginator import DiggPaginator + + +THREADS_PER_PAGE = 40 +POSTS_PER_PAGE = 20 + + +class ThreadIndexView(ListView): + """ + This generic view displays the list of threads available. + + """ + model = Thread + paginate_by = THREADS_PER_PAGE + + def get_paginator(self, queryset, per_page, **kwargs): + """ + Return an instance of the paginator for this view. + """ + return DiggPaginator(queryset, per_page, body=5, tail=2, + margin=3, padding=2, **kwargs) + + +class ThreadView(ListView): + """ + This generic view displays the posts in a thread. + + """ + context_object_name = "post_list" + template_name = "ygroup/thread.html" + paginate_by = POSTS_PER_PAGE + + def get_queryset(self): + self.thread = get_object_or_404(Thread, pk=self.args[0]) + return Post.objects.filter(thread=self.thread) + + def get_context_data(self, **kwargs): + context = super(ThreadView, self).get_context_data(**kwargs) + context['thread'] = self.thread + return context + + def get_paginator(self, queryset, per_page, **kwargs): + """ + Return an instance of the paginator for this view. + """ + return DiggPaginator(queryset, per_page, body=5, tail=2, + margin=3, padding=2, **kwargs) diff -r c3d3d7114749 -r 0c18dfb1da1c static/css/base.css --- a/static/css/base.css Sat Feb 12 21:37:17 2011 +0000 +++ b/static/css/base.css Sun Feb 20 00:31:54 2011 +0000 @@ -392,3 +392,6 @@ div.forum-attachment { margin: 1.0em 1.5em; } +.pointer { + cursor: pointer; +} diff -r c3d3d7114749 -r 0c18dfb1da1c tools/load_ygroup.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/load_ygroup.py Sun Feb 20 00:31:54 2011 +0000 @@ -0,0 +1,230 @@ +""" +load_ygroup.py + +This application reads the Yahoo Group posts database and creates .csv files +for populating the ygroup application tables. The .csv files can be used +with the mysqlimport command to load the ygroup_thread and ygroup_post tables. + +E.g.: +mysqlimport --fields-optionally-enclosed-by=\" --fields-terminated-by=, --fields-escaped-by="" --lines-terminated-by="\r\n" --user=root --password --local --default-character-set=utf8 gremmies_portal /path/to/ygroup_thread.csv + +""" +import csv +import datetime +import optparse +import re +import sys +from email.utils import parseaddr + +import MySQLdb + + +USAGE = "usage: %prog [options]" +DESCRIPTION = """\ +This program reads the Yahoo Groups database and outputs 2 .csv files suitable +for import with mysqlimport for loading the ygroup application tables. Threads +and posts are created from the initial data. +""" +############################################################################### + +class ConvertPosts(object): + REPLY_RE = re.compile(r"^Re:", re.IGNORECASE) + SG101_REPLY_RE = re.compile(r"^Re:\s*\[SurfGuitar101\]", re.IGNORECASE) + + def __init__(self, db, show_progress=False): + self.db = db + self.show_progress = show_progress + self.thread_writer = csv.writer(open('ygroup_thread.csv', 'wb')) + self.post_writer = csv.writer(open('ygroup_post.csv', 'wb')) + self.thread_cache = {} + self.last_date = None + + def process(self): + """ + Main processing function. Processes a row at a time from the legacy + database, creating csv records in the thread and posts files as + appropriate. + + """ + c = self.db.cursor(MySQLdb.cursors.DictCursor) + + # query the legacy database + sql = "SELECT * FROM post ORDER BY id" + c.execute(sql) + + # convert the old data and write the output to the file + i = 0 + while True: + row = c.fetchone() + if row is None: + break + i += 1 + if i % 100 == 0: + sys.stdout.write('.') + sys.stdout.flush() + + self._process_row(row) + + print + c.close() + + def _process_row(self, row): + """ + Process one row from the legacy database, creating a csv record + in the thread or post files as appropriate. + + """ + # Create a unified author name from the Yahoo ID and email address + # fields in the original post: + row['author'] = self.get_author(row['name'], row['email']) + + # Some posts (mainly from 1 user...) have no date; we'll just + # make one up by using the last date we saw + 1 second + + if row['date'] is None: + assert self.last_date is not None + row['date'] = self.last_date + datetime.timedelta(seconds=1) + + self.last_date = row['date'] + + # determine if this is a new thread or a reply + + if self.REPLY_RE.match(row['title']): + # This appears to be a reply. + # Remove all the leading Re: etc., cruft + stripped_title = self._strip_title(row['title']) + thread_id = self.thread_cache.get(stripped_title) + if thread_id: + self._create_post(thread_id, row) + else: + # Huh, no parent thread..?; create a new one + # and cache under stripped name so replies will find it + self._create_thread(row) + self.thread_cache[stripped_title] = row['id'] + else: + # At first glance, not a reply; see if another thread + # already exists with the exact same title: + thread_id = self.thread_cache.get(row['title']) + if thread_id: + # Duplicate; Yahoo or someone lopped off the Re: + # or it is just a coincidence. Either way, make it + # a post of an existing thread. + self._create_post(thread_id, row) + else: + self._create_thread(row) + self.thread_cache[row['title']] = row['id'] + + + def _create_thread(self, row): + """ + Create a new thread from the post data by writing a record in the + thread .csv file and a record in the post file. + + """ + self.thread_writer.writerow((row['id'], + row['title'].encode('utf-8'), + row['date'], + row['author'].encode('utf-8'), + 0)) + self._create_post(row['id'], row) + + def _create_post(self, thread_id, row): + """ + Create a new post from the post data by writing a record in the + post .csv file. + + """ + self.post_writer.writerow((row['id'], + thread_id, + row['title'].encode('utf-8'), + row['date'], + row['author'].encode('utf-8'), + row['msg'].encode('utf-8'), + '')) + + def _strip_title(self, title): + """ + Strip out all the Re: and [SurfGuitar101] stuff to get a bare + title. + + """ + s = title + while self.REPLY_RE.match(s): + s = self.SG101_REPLY_RE.sub('', s).strip() + s = self.REPLY_RE.sub('', s).strip() + + return s + + @staticmethod + def get_author(yahoo_id, email): + + def anti_spam(s): + return s.replace('.', ' dot ').replace('@', ' at ') + + name, addr = parseaddr(email) + + if name == addr: + name = anti_spam(name) + else: + # For some weird reason, sometimes Yahoo (?) put the email address + # in the name field: "John Doe " + name2, addr = parseaddr(name) + if name2: + name = name2 + + if name and yahoo_id and name != yahoo_id: + author = "%s (%s)" % (name, yahoo_id) + elif name: + author = name + elif yahoo_id: + author = yahoo_id + else: + author = anti_spam(email) + return author + +############################################################################### + +def main(argv=None): + parser = optparse.OptionParser(usage=USAGE, description=DESCRIPTION) + parser.set_defaults( + progress=False, + host='localhost', + user='root', + password='', + database='sg101_yahoo_group', + ) + parser.add_option("-s", "--progress", action="store_true", + help="Output a . after every 100 posts to show progress [default: %default]") + parser.add_option("-a", "--host", + help="set MySQL host name [default: %default]"), + parser.add_option("-u", "--user", + help="set MySQL user name [default: %default]") + parser.add_option("-p", "--password", + help="set MySQL user password [default: %default]"), + parser.add_option("-d", "--database", + help="set MySQL database name [default: %default]") + opts, args = parser.parse_args(args=argv) + + # connect to the legacy database + try: + db = MySQLdb.connect(host=opts.host, + user=opts.user, + passwd=opts.password, + db=opts.database, + use_unicode=True) + except MySQLdb.DatabaseError, e: + sys.exit("Can't connect to database: %s" % e) + + converter = ConvertPosts(db, opts.progress) + converter.process() + db.close() + +############################################################################### + +if __name__ == '__main__': + try: + main() + except IOError, ex: + sys.exit("IO Error: %s" % ex) + except KeyboardInterrupt: + sys.exit("Control-C interrupt")