Mercurial > public > sg101
changeset 323:0c18dfb1da1c
Fixing #149; adding the ygroup application: an archive of the old Yahoo Group messages.
author | Brian Neal <bgneal@gmail.com> |
---|---|
date | Sun, 20 Feb 2011 00:31:54 +0000 |
parents | c3d3d7114749 |
children | 33f3d5987a96 |
files | gpp/settings.py gpp/templates/base.html gpp/templates/search/indexes/ygroup/post_text.txt gpp/templates/search/search.html gpp/templates/ygroup/pagination.html gpp/templates/ygroup/post_detail.html gpp/templates/ygroup/thread.html gpp/templates/ygroup/thread_list.html gpp/urls.py gpp/ygroup/__init__.py gpp/ygroup/management/__init__.py gpp/ygroup/management/commands/__init__.py gpp/ygroup/management/commands/sync_ygroup_posts.py gpp/ygroup/management/commands/sync_ygroup_threads.py gpp/ygroup/models.py gpp/ygroup/search_indexes.py gpp/ygroup/tests.py gpp/ygroup/urls.py gpp/ygroup/views.py static/css/base.css tools/load_ygroup.py |
diffstat | 18 files changed, 636 insertions(+), 12 deletions(-) [+] |
line wrap: on
line diff
--- a/gpp/settings.py Sat Feb 12 21:37:17 2011 +0000 +++ b/gpp/settings.py Sun Feb 20 00:31:54 2011 +0000 @@ -83,7 +83,7 @@ 'django.middleware.csrf.CsrfViewMiddleware', 'django.contrib.sessions.middleware.SessionMiddleware', 'django.contrib.messages.middleware.MessageMiddleware', - 'debug_toolbar.middleware.DebugToolbarMiddleware', + #'debug_toolbar.middleware.DebugToolbarMiddleware', 'django.contrib.auth.middleware.AuthenticationMiddleware', 'gpp.core.middleware.InactiveUserMiddleware', 'gpp.core.middleware.WhosOnline', @@ -158,9 +158,10 @@ 'shoutbox', 'smiley', 'weblinks', + 'ygroup', ] -if DEBUG: - INSTALLED_APPS.append('debug_toolbar') +#if DEBUG: +# INSTALLED_APPS.append('debug_toolbar') LOGIN_URL = '/accounts/login/' LOGIN_REDIRECT_URL = '/profile/me/'
--- a/gpp/templates/base.html Sat Feb 12 21:37:17 2011 +0000 +++ b/gpp/templates/base.html Sun Feb 20 00:31:54 2011 +0000 @@ -1,8 +1,8 @@ -{% load url from future %} <?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> +{% load url from future %} {% load shoutbox_tags %} {% load irc_tags %} {% load potd_tags %} @@ -69,6 +69,7 @@ <li><a href="{% url 'potd-view' %}">Photo of the Day</a></li> <li><a href="{% url 'weblinks-main' %}">Links</a></li> <li><a href="{% url 'haystack_search' %}">Search</a></li> + <li><a href="{% url 'ygroup-thread_index' %}">Yahoo Group</a></li> </ul> {% cache 300 potd_block %} {% photo_of_the_day %}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gpp/templates/search/indexes/ygroup/post_text.txt Sun Feb 20 00:31:54 2011 +0000 @@ -0,0 +1,2 @@ +{{ object.title }} +{{ object.msg }}
--- a/gpp/templates/search/search.html Sat Feb 12 21:37:17 2011 +0000 +++ b/gpp/templates/search/search.html Sun Feb 20 00:31:54 2011 +0000 @@ -1,9 +1,30 @@ {% extends 'base.html' %} {% load highlight %} {% block title %}Search{% endblock %} +{% block custom_js %} +<script type="text/javascript"> +//<![CDATA[ +$(document).ready(function() { + chkboxes = $('#search-form input[type="checkbox"]'); + $('#chk_all').click(function() { + chkboxes.each(function(index) { + $(this).attr('checked', true); + }); + return false; + }); + $('#chk_none').click(function() { + chkboxes.each(function(index) { + $(this).attr('checked', false); + }); + return false; + }); +}); +//]]> +</script> +{% endblock %} {% block content %} <h2>Search <img src="{{ STATIC_URL }}icons/magnifier.png" alt="Search" /></h2> -<form method="get" action="."> +<form id="search-form" method="get" action="."> <table> <tr> <td><input type="text" name="q" id="id_q" size="48" class="text" /></td> @@ -14,17 +35,29 @@ <legend>Search in:</legend> <table> <tr> - <td><input type="checkbox" name="models" value="forums.post" checked="checked" />Forum Posts</td> - <td><input type="checkbox" name="models" value="news.story" checked="checked" />News Stories</td> - <td><input type="checkbox" name="models" value="bio.userprofile" checked="checked" />User Profiles</td> + <td><input id="chk-forums" type="checkbox" name="models" value="forums.post" checked="checked" /> + <label for="chk-forums" class="pointer">Forum Posts</label></td> + <td><input id="chk-news" type="checkbox" name="models" value="news.story" checked="checked" /> + <label for="chk-news" class="pointer">News Stories</label></td> + <td><input id="chk-profiles" type="checkbox" name="models" value="bio.userprofile" checked="checked" /> + <label for="chk-profiles" class="pointer">User Profiles</label></td> </tr> <tr> - <td><input type="checkbox" name="models" value="weblinks.link" checked="checked" />Links</td> - <td><input type="checkbox" name="models" 
value="downloads.download" checked="checked" />Downloads</td> - <td><input type="checkbox" name="models" value="podcast.item" checked="checked" />Podcasts</td> + <td><input id="chk-links" type="checkbox" name="models" value="weblinks.link" checked="checked" /> + <label for="chk-links" class="pointer">Links</label></td> + <td><input id="chk-dls" type="checkbox" name="models" value="downloads.download" checked="checked" /> + <label for="chk-dls" class="pointer">Downloads</label></td> + <td><input id="chk-podcasts" type="checkbox" name="models" value="podcast.item" checked="checked" /> + <label for="chk-podcasts" class="pointer">Podcasts</label></td> + </tr> + <tr> + <td colspan="2"><input id="chk-ygroup" type="checkbox" name="models" value="ygroup.post" checked="checked" /> + <label for="chk-ygroup" class="pointer">Yahoo Group Archives</label></td> </tr> </table> +<p><a href="#" id="chk_all">Check all</a> | <a href="#" id="chk_none">Check none</a></p> </fieldset> +</form> {% if query %} <h3>Results for "{{ query }}" page {{ page.number }} of {{ page.paginator.num_pages }}</h3> @@ -60,5 +93,4 @@ {% else %} {# Show some example queries to run, maybe query syntax, something else? #} {% endif %} -</form> {% endblock %}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gpp/templates/ygroup/pagination.html Sun Feb 20 00:31:54 2011 +0000 @@ -0,0 +1,26 @@ +<div class="pagination"> +{% if page_obj.paginator.num_pages == 1 %} +Page 1 of 1 +{% else %} +<ul> +{% if page_obj.has_previous %} +<li class="prev"><a href="./?page={{ page_obj.previous_page_number }}" title="Go to page {{ page_obj.previous_page_number }}">« Previous</a></li> +{% endif %} +{% for num in page_obj.page_range %} +{% if num %} +{% ifequal num page_obj.number %} +<li class="current">{{ num }}</li> +{% else %} +<li class="page"><a href="./?page={{ num }}" title="Go to page {{ num }}">{{ num }}</a></li> +{% endifequal %} +{% else %} +<li>…</li> +{% endif %} +{% endfor %} +{% if page_obj.has_next %} +<li class="next"><a href="./?page={{ page_obj.next_page_number }}" title="Go to page {{ page_obj.next_page_number }}">Next »</a></li> +{% endif %} +</ul> +{% endif %} +</div> +<br clear="left" />
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gpp/templates/ygroup/post_detail.html Sun Feb 20 00:31:54 2011 +0000 @@ -0,0 +1,15 @@ +{% extends 'base.html' %} +{% load url from future %} +{% block title %}Yahoo Group Archives: {{ post.title }}{% endblock %} +{% block content %} +<h2 class="forum-nav"><a href="{% url 'ygroup-thread_index' %}">Yahoo Group Archives</a> » </h2> +<h3 class="forum-nav">{{ post.title }} + <a href="{{ post.get_absolute_url }}" rel="nofollow"> + <img src="{{ STATIC_URL }}icons/link.png" alt="permalink" title="permalink" /></a> +</h3> +<dl> + <dt>{{ post.poster }} - {{ post.creation_date|date:"d M Y H:i:s" }}</dt> + <dd>{{ post.msg|linebreaks }}</dd> +</dl> +<p><a href="{{ post.thread_url }}" rel="nofollow">See this post in context</a>.</p> +{% endblock %}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gpp/templates/ygroup/thread.html Sun Feb 20 00:31:54 2011 +0000 @@ -0,0 +1,28 @@ +{% extends 'base.html' %} +{% load url from future %} +{% block title %}Yahoo Group Archives: {{ thread.title }}{% endblock %} +{% block custom_css %} +<link rel="stylesheet" type="text/css" href="{{ STATIC_URL }}css/pagination.css" /> +{% endblock %} +{% block content %} +{% if thread.page == 1 %} +<h2 class="forum-nav"><a href="{% url 'ygroup-thread_index' %}">Yahoo Group Archives</a> » </h2> +{% else %} +<h2 class="forum-nav"><a href="{% url 'ygroup-thread_index' %}">Yahoo Group Archives</a> » + <a href="{% url 'ygroup-thread_index' %}?page={{ thread.page }}">Page {{ thread.page }}</a> »</h2> +{% endif %} +<h3 class="forum-nav">{{ thread.title }} + <a href="{{ thread.get_absolute_url }}" rel="nofollow"> + <img src="{{ STATIC_URL }}icons/link.png" alt="permalink" title="permalink" /></a> +</h3> +{% include "ygroup/pagination.html" %} +<dl> + {% for post in page_obj.object_list %} + <dt><a name="p{{ post.id }}"></a>{{ post.poster }} - {{ post.creation_date|date:"d M Y H:i:s" }} + <a href="{{ post.get_absolute_url }}" rel="nofollow"> + <img src="{{ STATIC_URL }}icons/link.png" alt="permalink" title="permalink" /></a></dt> + <dd>{{ post.msg|linebreaks }}</dd> + {% endfor %} +</dl> +{% include "ygroup/pagination.html" %} +{% endblock %}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gpp/templates/ygroup/thread_list.html Sun Feb 20 00:31:54 2011 +0000 @@ -0,0 +1,25 @@ +{% extends 'base.html' %} +{% load url from future %} +{% block title %}Yahoo Group Archives{% endblock %} +{% block custom_css %} +<link rel="stylesheet" type="text/css" href="{{ STATIC_URL }}css/pagination.css" /> +{% endblock %} +{% block content %} +<h2>Yahoo Group Archives » Page {{ page_obj.number }}</h2> +<p> +SurfGuitar101.com began as a Yahoo Group on October 31, 2001. It ran until August, 2007 when this site officially replaced it. On these pages you'll find the archived messages of our original group. You can also search through these messages via our <a href="{% url 'haystack_search' %}">search page</a>. +</p> +{% include "ygroup/pagination.html" %} +<table> + <tr><th>Title</th><th>Author</th><th>Posts</th><th>Date</th></tr> + {% for thread in page_obj.object_list %} + <tr> + <td><a href="{{ thread.get_absolute_url }}">{{ thread.title }}</a></td> + <td>{{ thread.poster }}</td> + <td>{{ thread.post_count }}</td> + <td>{{ thread.creation_date|date:"d M Y" }}</td> + </tr> + {% endfor %} +</table> +{% include "ygroup/pagination.html" %} +{% endblock %}
--- a/gpp/urls.py Sat Feb 12 21:37:17 2011 +0000 +++ b/gpp/urls.py Sun Feb 20 00:31:54 2011 +0000 @@ -46,6 +46,7 @@ (r'^profile/', include('bio.urls')), (r'^shout/', include('shoutbox.urls')), (r'^smiley/', include('smiley.urls')), + (r'^ygroup/', include('ygroup.urls')), ) # Haystack search views
# File: gpp/ygroup/management/commands/sync_ygroup_posts.py
"""
sync_ygroup_posts.py - A management command to synchronize the yahoo group
archives by recomputing the de-normalized fields in the post objects.

"""
import optparse

from django.core.management.base import NoArgsCommand, CommandError
from django.core.urlresolvers import reverse

from ygroup.models import Thread, Post
import ygroup.views


class Command(NoArgsCommand):
    """Recompute the de-normalized ``thread_url`` field of every Post."""

    help = """\
This command synchronizes the ygroup application's post objects
by updating their de-normalized fields.
"""
    option_list = NoArgsCommand.option_list + (
        optparse.make_option("-p", "--progress", action="store_true",
            help="Output a . after every 100 posts to show progress"),
    )

    def handle_noargs(self, **opts):
        """
        Rebuild ``Post.thread_url`` for all posts.

        For each post the URL is the parent thread's URL plus a ``#p<id>``
        anchor; posts that fall beyond the first page of the thread also get
        a ``?page=N`` query string. Posts without a parent thread (the
        foreign key is nullable) get an empty ``thread_url``.

        Recognized options:
            progress -- if true, write a '.' to stdout after every 100 posts.

        """
        show_progress = opts.get('progress', False) or False
        per_page = ygroup.views.POSTS_PER_PAGE

        # Map: thread id -> (thread URL, {post id: 0-based position of the
        # post within the thread, in the Post model's default ordering}).
        # A dict gives a constant-time position lookup per post, where a
        # list.index() scan would be linear in the thread length.
        threads = {}
        self.stdout.write("Processing threads...\n")
        for thread in Thread.objects.iterator():
            post_ids = Post.objects.filter(thread=thread).values_list(
                    'id', flat=True)
            positions = dict((post_id, pos)
                    for pos, post_id in enumerate(post_ids))
            threads[thread.id] = (
                reverse('ygroup-thread_view', args=[thread.id]),
                positions,
            )

        self.stdout.write("Processing posts...\n")
        n = 0
        for post in Post.objects.iterator():
            # Use the raw foreign key column (thread_id) rather than
            # post.thread.id: it needs no extra query per post and it does
            # not blow up when the post has no thread.
            if post.thread_id is None:
                post.thread_url = ''
            else:
                thread_url, positions = threads[post.thread_id]
                # Floor division: the page number must be an integer.
                page = positions[post.id] // per_page + 1
                if page == 1:
                    post.thread_url = '%s#p%d' % (thread_url, post.id)
                else:
                    post.thread_url = '%s?page=%d#p%d' % (
                            thread_url, page, post.id)
            post.save()

            n += 1
            if show_progress and n % 100 == 0:
                self.stdout.write('.')
                self.stdout.flush()

        self.stdout.write('\n')
# File: gpp/ygroup/management/commands/sync_ygroup_threads.py
"""
sync_ygroup_threads.py - A management command to synchronize the yahoo group
archives by recomputing the de-normalized fields in the thread objects.

"""
import optparse

from django.core.management.base import NoArgsCommand, CommandError

from ygroup.models import Thread, Post
import ygroup.views


class Command(NoArgsCommand):
    """Recompute the de-normalized ``post_count`` and ``page`` of every Thread."""

    help = """\
This command synchronizes the ygroup application's thread objects
by updating their de-normalized fields.
"""
    option_list = NoArgsCommand.option_list + (
        optparse.make_option("-p", "--progress", action="store_true",
            help="Output a . after every 50 threads to show progress"),
    )

    def handle_noargs(self, **opts):
        """
        Rebuild ``Thread.post_count`` and ``Thread.page`` for all threads.

        ``post_count`` is the number of posts that reference the thread and
        ``page`` is the 1-based page of the thread index the thread lands on,
        given the Thread model's default ordering and THREADS_PER_PAGE
        threads per page.

        Recognized options:
            progress -- if true, write a '.' to stdout after every 50 threads.

        """
        show_progress = opts.get('progress', False) or False
        per_page = ygroup.views.THREADS_PER_PAGE

        n = 0
        for thread in Thread.objects.iterator():
            thread.post_count = Post.objects.filter(thread=thread).count()
            # Floor division: the page number must be an integer.
            thread.page = n // per_page + 1
            thread.save()

            n += 1
            # Only emit progress dots when the user asked for them.
            if show_progress and n % 50 == 0:
                self.stdout.write('.')
                self.stdout.flush()

        self.stdout.write('\n')
# File: gpp/ygroup/models.py
"""
Models for the ygroup application, which is a read-only archive of messages
from the old Yahoo Group.
"""
from django.db import models


class Thread(models.Model):
    """A discussion thread in the archive; its messages are Post objects."""
    title = models.CharField(max_length=255)
    creation_date = models.DateTimeField()

    # denormalized fields to reduce database hits
    poster = models.CharField(max_length=128)
    # number of posts in this thread; recomputed by the sync_ygroup_threads
    # management command
    post_count = models.IntegerField(blank=True, default=0)
    # 1-based page of the thread index this thread appears on; also
    # recomputed by the sync_ygroup_threads management command
    page = models.IntegerField(blank=True, default=1)

    class Meta:
        # oldest threads first
        ordering = ('creation_date', )

    def __unicode__(self):
        return u'Thread %d, %s' % (self.pk, self.title)

    @models.permalink
    def get_absolute_url(self):
        # the thread id is passed as a positional URL argument
        return ('ygroup-thread_view', [self.id])


class Post(models.Model):
    """A single archived message, optionally attached to a Thread."""
    # The thread is nullable and uses SET_NULL, so a post is kept (with a
    # NULL thread) if its thread is deleted.
    thread = models.ForeignKey(Thread, null=True, blank=True,
            on_delete=models.SET_NULL, related_name='posts')
    title = models.CharField(max_length=255)
    creation_date = models.DateTimeField()
    poster = models.CharField(max_length=128)
    msg = models.TextField()

    # precomputed URL to this post in the parent thread for efficiency;
    # filled in by the sync_ygroup_posts management command
    thread_url = models.URLField(verify_exists=False, blank=True)

    class Meta:
        # oldest posts first
        ordering = ('creation_date', )
        verbose_name = 'yahoo group post'
        verbose_name_plural = 'yahoo group posts'

    def __unicode__(self):
        return u'Post %d, %s' % (self.pk, self.title)

    @models.permalink
    def get_absolute_url(self):
        # the post id is passed as the 'pk' keyword URL argument
        return ('ygroup-post_view', [], {'pk': self.id})

    # NOTE(review): search_title/search_summary are presumably called by the
    # site's search results template -- confirm against that template.
    def search_title(self):
        return self.title

    def search_summary(self):
        return self.msg
# File: gpp/ygroup/search_indexes.py
"""
Haystack search index for the Yahoo Group archives application.

"""
from haystack.indexes import *
from haystack import site

from ygroup.models import Post


class PostIndex(SearchIndex):
    """Search index definition for archived Yahoo Group Post objects."""
    # Main document field; its content is rendered from a template
    # (this changeset adds search/indexes/ygroup/post_text.txt, which emits
    # the post's title and message).
    text = CharField(document=True, use_template=True)
    # The post's creation date, indexed under the name pub_date.
    pub_date = DateTimeField(model_attr='creation_date')

    def get_updated_field(self):
        # NOTE(review): presumably the model field Haystack filters on when
        # re-indexing only recently changed objects -- confirm against the
        # Haystack documentation.
        return 'creation_date'


# Make Post searchable through the PostIndex definition above.
site.register(Post, PostIndex)
# File: gpp/ygroup/tests.py
"""
This file demonstrates writing tests using the unittest module. These will pass
when you run "manage.py test".

Replace this with more appropriate tests for your application.
"""

from django.test import TestCase


class SimpleTest(TestCase):
    """Placeholder test case; the ygroup application has no real tests yet."""

    def test_basic_addition(self):
        """
        Check that adding one and one yields two.
        """
        expected = 2
        actual = 1 + 1
        self.assertEqual(actual, expected)
# File: gpp/ygroup/urls.py
"""
urls.py - URLs for the ygroup application.

"""
from django.conf.urls.defaults import *
from django.views.generic import ListView, DetailView

from ygroup.models import Thread, Post
from ygroup.views import ThreadIndexView, ThreadView


urlpatterns = patterns('',
    # Paginated index of all archived threads.
    url(r'^threads/$',
        ThreadIndexView.as_view(),
        name='ygroup-thread_index'),
    # Posts of one thread; the thread id is captured as a positional
    # argument (ThreadView reads it from self.args[0]).
    url(r'^thread/(\d+)/$',
        ThreadView.as_view(),
        name='ygroup-thread_view'),
    # A single post, served by the generic DetailView; the post is exposed
    # to the template as 'post'.
    url(r'^post/(?P<pk>\d+)/$',
        DetailView.as_view(model=Post, context_object_name='post'),
        name='ygroup-post_view'),
)
# File: gpp/ygroup/views.py
"""
Views for the ygroup (Yahoo Group Archive) application.

"""
from django.shortcuts import get_object_or_404
from django.views.generic import ListView

from ygroup.models import Thread, Post
from core.paginator import DiggPaginator


THREADS_PER_PAGE = 40
POSTS_PER_PAGE = 20


class _DiggPaginatorMixin(object):
    """
    Mixin for list views that paginate with the site's DiggPaginator.

    Both views in this module use the same paginator settings, so the
    override lives here instead of being duplicated in each view.

    """
    def get_paginator(self, queryset, per_page, **kwargs):
        """
        Return an instance of the paginator for this view.

        Any extra keyword arguments supplied by the generic view are passed
        through to the paginator unchanged.
        """
        return DiggPaginator(queryset, per_page, body=5, tail=2,
                margin=3, padding=2, **kwargs)


class ThreadIndexView(_DiggPaginatorMixin, ListView):
    """
    This generic view displays the list of threads available.

    """
    model = Thread
    paginate_by = THREADS_PER_PAGE


class ThreadView(_DiggPaginatorMixin, ListView):
    """
    This generic view displays the posts in a thread.

    The thread id is taken from the first positional URL argument; a 404 is
    raised if no such thread exists.

    """
    context_object_name = "post_list"
    template_name = "ygroup/thread.html"
    paginate_by = POSTS_PER_PAGE

    def get_queryset(self):
        """Look up the requested thread and return its posts."""
        # Keep the thread around so get_context_data() can expose it.
        self.thread = get_object_or_404(Thread, pk=self.args[0])
        return Post.objects.filter(thread=self.thread)

    def get_context_data(self, **kwargs):
        """Add the thread being displayed to the template context."""
        context = super(ThreadView, self).get_context_data(**kwargs)
        context['thread'] = self.thread
        return context
--- a/static/css/base.css Sat Feb 12 21:37:17 2011 +0000 +++ b/static/css/base.css Sun Feb 20 00:31:54 2011 +0000 @@ -392,3 +392,6 @@ div.forum-attachment { margin: 1.0em 1.5em; } +.pointer { + cursor: pointer; +}
# File: tools/load_ygroup.py
r"""
load_ygroup.py

This application reads the Yahoo Group posts database and creates .csv files
for populating the ygroup application tables. The .csv files can be used
with the mysqlimport command to load the ygroup_thread and ygroup_post tables.

E.g.:
mysqlimport --fields-optionally-enclosed-by=\" --fields-terminated-by=, --fields-escaped-by="" --lines-terminated-by="\r\n" --user=root --password --local --default-character-set=utf8 gremmies_portal /path/to/ygroup_thread.csv

"""
import csv
import datetime
import optparse
import re
import sys
from email.utils import parseaddr

import MySQLdb


USAGE = "usage: %prog [options]"
DESCRIPTION = """\
This program reads the Yahoo Groups database and outputs 2 .csv files suitable
for import with mysqlimport for loading the ygroup application tables. Threads
and posts are created from the initial data.
"""
###############################################################################

class ConvertPosts(object):
    """
    Converts rows of the legacy ``post`` table into two .csv files in the
    current directory: ygroup_thread.csv and ygroup_post.csv.

    Call process() to run the conversion and close() when done so the
    output files are flushed and closed.

    """
    REPLY_RE = re.compile(r"^Re:", re.IGNORECASE)
    SG101_REPLY_RE = re.compile(r"^Re:\s*\[SurfGuitar101\]", re.IGNORECASE)

    def __init__(self, db, show_progress=False):
        """
        db -- an open MySQLdb connection to the legacy database
        show_progress -- if true, print a '.' after every 100 rows

        Raises IOError if an output file cannot be opened.

        """
        self.db = db
        self.show_progress = show_progress

        # Keep references to the files so they can be closed explicitly.
        self.thread_file = open('ygroup_thread.csv', 'wb')
        try:
            self.post_file = open('ygroup_post.csv', 'wb')
        except IOError:
            self.thread_file.close()
            raise
        self.thread_writer = csv.writer(self.thread_file)
        self.post_writer = csv.writer(self.post_file)

        # maps a thread title to the id of the thread created for it
        self.thread_cache = {}
        # date of the most recently processed row
        self.last_date = None

    def close(self):
        """Flush and close both .csv output files."""
        self.thread_file.close()
        self.post_file.close()

    def process(self):
        """
        Main processing function. Processes a row at a time from the legacy
        database, creating csv records in the thread and posts files as
        appropriate.

        """
        c = self.db.cursor(MySQLdb.cursors.DictCursor)
        try:
            # query the legacy database
            sql = "SELECT * FROM post ORDER BY id"
            c.execute(sql)

            # convert the old data and write the output to the file
            i = 0
            while True:
                row = c.fetchone()
                if row is None:
                    break
                i += 1
                # only show progress dots when asked to
                if self.show_progress and i % 100 == 0:
                    sys.stdout.write('.')
                    sys.stdout.flush()

                self._process_row(row)
        finally:
            c.close()

        if self.show_progress:
            # terminate the line of progress dots
            sys.stdout.write('\n')

    def _process_row(self, row):
        """
        Process one row from the legacy database, creating a csv record
        in the thread or post files as appropriate.

        Raises ValueError if the very first row has no date, since there is
        no earlier date to derive one from.

        """
        # Create a unified author name from the Yahoo ID and email address
        # fields in the original post:
        row['author'] = self.get_author(row['name'], row['email'])

        # Some posts (mainly from 1 user...) have no date; we'll just
        # make one up by using the last date we saw + 1 second

        if row['date'] is None:
            # Explicit check rather than assert: asserts vanish under -O.
            if self.last_date is None:
                raise ValueError(
                    "post %s has no date and no earlier post to infer it from"
                    % row['id'])
            row['date'] = self.last_date + datetime.timedelta(seconds=1)

        self.last_date = row['date']

        # determine if this is a new thread or a reply

        if self.REPLY_RE.match(row['title']):
            # This appears to be a reply.
            # Remove all the leading Re: etc., cruft
            stripped_title = self._strip_title(row['title'])
            thread_id = self.thread_cache.get(stripped_title)
            if thread_id is not None:
                self._create_post(thread_id, row)
            else:
                # Huh, no parent thread..?; create a new one
                # and cache under stripped name so replies will find it
                self._create_thread(row)
                self.thread_cache[stripped_title] = row['id']
        else:
            # At first glance, not a reply; see if another thread
            # already exists with the exact same title:
            thread_id = self.thread_cache.get(row['title'])
            if thread_id is not None:
                # Duplicate; Yahoo or someone lopped off the Re:
                # or it is just a coincidence. Either way, make it
                # a post of an existing thread.
                self._create_post(thread_id, row)
            else:
                self._create_thread(row)
                self.thread_cache[row['title']] = row['id']

    def _create_thread(self, row):
        """
        Create a new thread from the post data by writing a record in the
        thread .csv file and a record in the post file.

        The thread reuses the id of the post that starts it; the last column
        is written as 0.

        """
        self.thread_writer.writerow((row['id'],
            row['title'].encode('utf-8'),
            row['date'],
            row['author'].encode('utf-8'),
            0))
        self._create_post(row['id'], row)

    def _create_post(self, thread_id, row):
        """
        Create a new post from the post data by writing a record in the
        post .csv file.

        The last column is written as an empty string.

        """
        self.post_writer.writerow((row['id'],
            thread_id,
            row['title'].encode('utf-8'),
            row['date'],
            row['author'].encode('utf-8'),
            row['msg'].encode('utf-8'),
            ''))

    def _strip_title(self, title):
        """
        Strip out all the Re: and [SurfGuitar101] stuff to get a bare
        title.

        Leading prefixes are removed repeatedly, so nested replies such as
        "Re: Re: [SurfGuitar101] Foo" reduce to "Foo".

        """
        s = title
        while self.REPLY_RE.match(s):
            # try the longer "Re: [SurfGuitar101]" prefix first, then a
            # plain "Re:"
            s = self.SG101_REPLY_RE.sub('', s).strip()
            s = self.REPLY_RE.sub('', s).strip()

        return s

    @staticmethod
    def get_author(yahoo_id, email):
        """
        Build a display name for a poster from the Yahoo ID and the
        email field of the legacy row.

        The email field is parsed into a real name and an address. The
        result is "name (yahoo_id)" when both are known and differ, else
        whichever of the two is available, else the email field itself with
        '.' and '@' spelled out to hinder address harvesting.

        """
        def anti_spam(s):
            return s.replace('.', ' dot ').replace('@', ' at ')

        name, addr = parseaddr(email)

        if name == addr:
            name = anti_spam(name)
        else:
            # For some weird reason, sometimes Yahoo (?) put the email address
            # in the name field: "John Doe <doe@example.com" <doe@example.com>"
            name2, addr = parseaddr(name)
            if name2:
                name = name2

        if name and yahoo_id and name != yahoo_id:
            author = "%s (%s)" % (name, yahoo_id)
        elif name:
            author = name
        elif yahoo_id:
            author = yahoo_id
        else:
            author = anti_spam(email)
        return author

###############################################################################

def main(argv=None):
    """
    Parse the command line, connect to the legacy database and run the
    conversion.

    argv -- list of command line arguments; None means use sys.argv[1:]

    Exits with an error message if the database connection fails.

    """
    parser = optparse.OptionParser(usage=USAGE, description=DESCRIPTION)
    parser.set_defaults(
        progress=False,
        host='localhost',
        user='root',
        password='',
        database='sg101_yahoo_group',
    )
    parser.add_option("-s", "--progress", action="store_true",
        help="Output a . after every 100 posts to show progress [default: %default]")
    parser.add_option("-a", "--host",
        help="set MySQL host name [default: %default]")
    parser.add_option("-u", "--user",
        help="set MySQL user name [default: %default]")
    parser.add_option("-p", "--password",
        help="set MySQL user password [default: %default]")
    parser.add_option("-d", "--database",
        help="set MySQL database name [default: %default]")
    opts, args = parser.parse_args(args=argv)

    # connect to the legacy database
    try:
        db = MySQLdb.connect(host=opts.host,
                user=opts.user,
                passwd=opts.password,
                db=opts.database,
                use_unicode=True)
    except MySQLdb.DatabaseError as e:
        sys.exit("Can't connect to database: %s" % e)

    # Always release the connection and the output files, even on error.
    try:
        converter = ConvertPosts(db, opts.progress)
        try:
            converter.process()
        finally:
            converter.close()
    finally:
        db.close()

###############################################################################

if __name__ == '__main__':
    try:
        main()
    except IOError as ex:
        sys.exit("IO Error: %s" % ex)
    except KeyboardInterrupt:
        sys.exit("Control-C interrupt")