annotate tools/load_ygroup.py @ 334:6805d15cda13

Adding a script I had to write on the fly to filter out posts from the posts csv file that had no parent topics. MyISAM let me get away with that, but InnoDB won't.
author Brian Neal <bgneal@gmail.com>
date Sat, 26 Feb 2011 01:28:22 +0000
parents 0c18dfb1da1c
children
rev   line source
bgneal@323 1 """
bgneal@323 2 load_ygroup.py
bgneal@323 3
bgneal@323 4 This application reads the Yahoo Group posts database and creates .csv files
bgneal@323 5 for populating the ygroup application tables. The .csv files can be used
bgneal@323 6 with the mysqlimport command to load the ygroup_thread and ygroup_post tables.
bgneal@323 7
bgneal@323 8 E.g.:
bgneal@323 9 mysqlimport --fields-optionally-enclosed-by=\" --fields-terminated-by=, --fields-escaped-by="" --lines-terminated-by="\r\n" --user=root --password --local --default-character-set=utf8 gremmies_portal /path/to/ygroup_thread.csv
bgneal@323 10
bgneal@323 11 """
bgneal@323 12 import csv
bgneal@323 13 import datetime
bgneal@323 14 import optparse
bgneal@323 15 import re
bgneal@323 16 import sys
bgneal@323 17 from email.utils import parseaddr
bgneal@323 18
bgneal@323 19 import MySQLdb
bgneal@323 20
bgneal@323 21
bgneal@323 22 USAGE = "usage: %prog [options]"
bgneal@323 23 DESCRIPTION = """\
bgneal@323 24 This program reads the Yahoo Groups database and outputs 2 .csv files suitable
bgneal@323 25 for import with mysqlimport for loading the ygroup application tables. Threads
bgneal@323 26 and posts are created from the initial data.
bgneal@323 27 """
bgneal@323 28 ###############################################################################
bgneal@323 29
bgneal@323 30 class ConvertPosts(object):
bgneal@323 31 REPLY_RE = re.compile(r"^Re:", re.IGNORECASE)
bgneal@323 32 SG101_REPLY_RE = re.compile(r"^Re:\s*\[SurfGuitar101\]", re.IGNORECASE)
bgneal@323 33
bgneal@323 34 def __init__(self, db, show_progress=False):
bgneal@323 35 self.db = db
bgneal@323 36 self.show_progress = show_progress
bgneal@323 37 self.thread_writer = csv.writer(open('ygroup_thread.csv', 'wb'))
bgneal@323 38 self.post_writer = csv.writer(open('ygroup_post.csv', 'wb'))
bgneal@323 39 self.thread_cache = {}
bgneal@323 40 self.last_date = None
bgneal@323 41
bgneal@323 42 def process(self):
bgneal@323 43 """
bgneal@323 44 Main processing function. Processes a row at a time from the legacy
bgneal@323 45 database, creating csv records in the thread and posts files as
bgneal@323 46 appropriate.
bgneal@323 47
bgneal@323 48 """
bgneal@323 49 c = self.db.cursor(MySQLdb.cursors.DictCursor)
bgneal@323 50
bgneal@323 51 # query the legacy database
bgneal@323 52 sql = "SELECT * FROM post ORDER BY id"
bgneal@323 53 c.execute(sql)
bgneal@323 54
bgneal@323 55 # convert the old data and write the output to the file
bgneal@323 56 i = 0
bgneal@323 57 while True:
bgneal@323 58 row = c.fetchone()
bgneal@323 59 if row is None:
bgneal@323 60 break
bgneal@323 61 i += 1
bgneal@323 62 if i % 100 == 0:
bgneal@323 63 sys.stdout.write('.')
bgneal@323 64 sys.stdout.flush()
bgneal@323 65
bgneal@323 66 self._process_row(row)
bgneal@323 67
bgneal@323 68 print
bgneal@323 69 c.close()
bgneal@323 70
bgneal@323 71 def _process_row(self, row):
bgneal@323 72 """
bgneal@323 73 Process one row from the legacy database, creating a csv record
bgneal@323 74 in the thread or post files as appropriate.
bgneal@323 75
bgneal@323 76 """
bgneal@323 77 # Create a unified author name from the Yahoo ID and email address
bgneal@323 78 # fields in the original post:
bgneal@323 79 row['author'] = self.get_author(row['name'], row['email'])
bgneal@323 80
bgneal@323 81 # Some posts (mainly from 1 user...) have no date; we'll just
bgneal@323 82 # make one up by using the last date we saw + 1 second
bgneal@323 83
bgneal@323 84 if row['date'] is None:
bgneal@323 85 assert self.last_date is not None
bgneal@323 86 row['date'] = self.last_date + datetime.timedelta(seconds=1)
bgneal@323 87
bgneal@323 88 self.last_date = row['date']
bgneal@323 89
bgneal@323 90 # determine if this is a new thread or a reply
bgneal@323 91
bgneal@323 92 if self.REPLY_RE.match(row['title']):
bgneal@323 93 # This appears to be a reply.
bgneal@323 94 # Remove all the leading Re: etc., cruft
bgneal@323 95 stripped_title = self._strip_title(row['title'])
bgneal@323 96 thread_id = self.thread_cache.get(stripped_title)
bgneal@323 97 if thread_id:
bgneal@323 98 self._create_post(thread_id, row)
bgneal@323 99 else:
bgneal@323 100 # Huh, no parent thread..?; create a new one
bgneal@323 101 # and cache under stripped name so replies will find it
bgneal@323 102 self._create_thread(row)
bgneal@323 103 self.thread_cache[stripped_title] = row['id']
bgneal@323 104 else:
bgneal@323 105 # At first glance, not a reply; see if another thread
bgneal@323 106 # already exists with the exact same title:
bgneal@323 107 thread_id = self.thread_cache.get(row['title'])
bgneal@323 108 if thread_id:
bgneal@323 109 # Duplicate; Yahoo or someone lopped off the Re:
bgneal@323 110 # or it is just a coincidence. Either way, make it
bgneal@323 111 # a post of an existing thread.
bgneal@323 112 self._create_post(thread_id, row)
bgneal@323 113 else:
bgneal@323 114 self._create_thread(row)
bgneal@323 115 self.thread_cache[row['title']] = row['id']
bgneal@323 116
bgneal@323 117
bgneal@323 118 def _create_thread(self, row):
bgneal@323 119 """
bgneal@323 120 Create a new thread from the post data by writing a record in the
bgneal@323 121 thread .csv file and a record in the post file.
bgneal@323 122
bgneal@323 123 """
bgneal@323 124 self.thread_writer.writerow((row['id'],
bgneal@323 125 row['title'].encode('utf-8'),
bgneal@323 126 row['date'],
bgneal@323 127 row['author'].encode('utf-8'),
bgneal@323 128 0))
bgneal@323 129 self._create_post(row['id'], row)
bgneal@323 130
bgneal@323 131 def _create_post(self, thread_id, row):
bgneal@323 132 """
bgneal@323 133 Create a new post from the post data by writing a record in the
bgneal@323 134 post .csv file.
bgneal@323 135
bgneal@323 136 """
bgneal@323 137 self.post_writer.writerow((row['id'],
bgneal@323 138 thread_id,
bgneal@323 139 row['title'].encode('utf-8'),
bgneal@323 140 row['date'],
bgneal@323 141 row['author'].encode('utf-8'),
bgneal@323 142 row['msg'].encode('utf-8'),
bgneal@323 143 ''))
bgneal@323 144
bgneal@323 145 def _strip_title(self, title):
bgneal@323 146 """
bgneal@323 147 Strip out all the Re: and [SurfGuitar101] stuff to get a bare
bgneal@323 148 title.
bgneal@323 149
bgneal@323 150 """
bgneal@323 151 s = title
bgneal@323 152 while self.REPLY_RE.match(s):
bgneal@323 153 s = self.SG101_REPLY_RE.sub('', s).strip()
bgneal@323 154 s = self.REPLY_RE.sub('', s).strip()
bgneal@323 155
bgneal@323 156 return s
bgneal@323 157
bgneal@323 158 @staticmethod
bgneal@323 159 def get_author(yahoo_id, email):
bgneal@323 160
bgneal@323 161 def anti_spam(s):
bgneal@323 162 return s.replace('.', ' dot ').replace('@', ' at ')
bgneal@323 163
bgneal@323 164 name, addr = parseaddr(email)
bgneal@323 165
bgneal@323 166 if name == addr:
bgneal@323 167 name = anti_spam(name)
bgneal@323 168 else:
bgneal@323 169 # For some weird reason, sometimes Yahoo (?) put the email address
bgneal@323 170 # in the name field: "John Doe <doe@example.com" <doe@example.com>"
bgneal@323 171 name2, addr = parseaddr(name)
bgneal@323 172 if name2:
bgneal@323 173 name = name2
bgneal@323 174
bgneal@323 175 if name and yahoo_id and name != yahoo_id:
bgneal@323 176 author = "%s (%s)" % (name, yahoo_id)
bgneal@323 177 elif name:
bgneal@323 178 author = name
bgneal@323 179 elif yahoo_id:
bgneal@323 180 author = yahoo_id
bgneal@323 181 else:
bgneal@323 182 author = anti_spam(email)
bgneal@323 183 return author
bgneal@323 184
bgneal@323 185 ###############################################################################
bgneal@323 186
bgneal@323 187 def main(argv=None):
bgneal@323 188 parser = optparse.OptionParser(usage=USAGE, description=DESCRIPTION)
bgneal@323 189 parser.set_defaults(
bgneal@323 190 progress=False,
bgneal@323 191 host='localhost',
bgneal@323 192 user='root',
bgneal@323 193 password='',
bgneal@323 194 database='sg101_yahoo_group',
bgneal@323 195 )
bgneal@323 196 parser.add_option("-s", "--progress", action="store_true",
bgneal@323 197 help="Output a . after every 100 posts to show progress [default: %default]")
bgneal@323 198 parser.add_option("-a", "--host",
bgneal@323 199 help="set MySQL host name [default: %default]"),
bgneal@323 200 parser.add_option("-u", "--user",
bgneal@323 201 help="set MySQL user name [default: %default]")
bgneal@323 202 parser.add_option("-p", "--password",
bgneal@323 203 help="set MySQL user password [default: %default]"),
bgneal@323 204 parser.add_option("-d", "--database",
bgneal@323 205 help="set MySQL database name [default: %default]")
bgneal@323 206 opts, args = parser.parse_args(args=argv)
bgneal@323 207
bgneal@323 208 # connect to the legacy database
bgneal@323 209 try:
bgneal@323 210 db = MySQLdb.connect(host=opts.host,
bgneal@323 211 user=opts.user,
bgneal@323 212 passwd=opts.password,
bgneal@323 213 db=opts.database,
bgneal@323 214 use_unicode=True)
bgneal@323 215 except MySQLdb.DatabaseError, e:
bgneal@323 216 sys.exit("Can't connect to database: %s" % e)
bgneal@323 217
bgneal@323 218 converter = ConvertPosts(db, opts.progress)
bgneal@323 219 converter.process()
bgneal@323 220 db.close()
bgneal@323 221
bgneal@323 222 ###############################################################################
bgneal@323 223
bgneal@323 224 if __name__ == '__main__':
bgneal@323 225 try:
bgneal@323 226 main()
bgneal@323 227 except IOError, ex:
bgneal@323 228 sys.exit("IO Error: %s" % ex)
bgneal@323 229 except KeyboardInterrupt:
bgneal@323 230 sys.exit("Control-C interrupt")