annotate tools/load_ygroup.py @ 986:26de15fb5a80

Guard against bad image urls in ssl_images.
author Brian Neal <bgneal@gmail.com>
date Sun, 25 Oct 2015 14:47:29 -0500
parents 0c18dfb1da1c
children
rev   line source
bgneal@323 1 """
bgneal@323 2 load_ygroup.py
bgneal@323 3
bgneal@323 4 This application reads the Yahoo Group posts database and creates .csv files
bgneal@323 5 for populating the ygroup application tables. The .csv files can be used
bgneal@323 6 with the mysqlimport command to load the ygroup_thread and ygroup_post tables.
bgneal@323 7
bgneal@323 8 E.g.:
bgneal@323 9 mysqlimport --fields-optionally-enclosed-by=\" --fields-terminated-by=, --fields-escaped-by="" --lines-terminated-by="\r\n" --user=root --password --local --default-character-set=utf8 gremmies_portal /path/to/ygroup_thread.csv
bgneal@323 10
bgneal@323 11 """
bgneal@323 12 import csv
bgneal@323 13 import datetime
bgneal@323 14 import optparse
bgneal@323 15 import re
bgneal@323 16 import sys
bgneal@323 17 from email.utils import parseaddr
bgneal@323 18
bgneal@323 19 import MySQLdb
bgneal@323 20
bgneal@323 21
bgneal@323 22 USAGE = "usage: %prog [options]"
bgneal@323 23 DESCRIPTION = """\
bgneal@323 24 This program reads the Yahoo Groups database and outputs 2 .csv files suitable
bgneal@323 25 for import with mysqlimport for loading the ygroup application tables. Threads
bgneal@323 26 and posts are created from the initial data.
bgneal@323 27 """
bgneal@323 28 ###############################################################################
bgneal@323 29
bgneal@323 30 class ConvertPosts(object):
bgneal@323 31 REPLY_RE = re.compile(r"^Re:", re.IGNORECASE)
bgneal@323 32 SG101_REPLY_RE = re.compile(r"^Re:\s*\[SurfGuitar101\]", re.IGNORECASE)
bgneal@323 33
bgneal@323 34 def __init__(self, db, show_progress=False):
bgneal@323 35 self.db = db
bgneal@323 36 self.show_progress = show_progress
bgneal@323 37 self.thread_writer = csv.writer(open('ygroup_thread.csv', 'wb'))
bgneal@323 38 self.post_writer = csv.writer(open('ygroup_post.csv', 'wb'))
bgneal@323 39 self.thread_cache = {}
bgneal@323 40 self.last_date = None
bgneal@323 41
bgneal@323 42 def process(self):
bgneal@323 43 """
bgneal@323 44 Main processing function. Processes a row at a time from the legacy
bgneal@323 45 database, creating csv records in the thread and posts files as
bgneal@323 46 appropriate.
bgneal@323 47
bgneal@323 48 """
bgneal@323 49 c = self.db.cursor(MySQLdb.cursors.DictCursor)
bgneal@323 50
bgneal@323 51 # query the legacy database
bgneal@323 52 sql = "SELECT * FROM post ORDER BY id"
bgneal@323 53 c.execute(sql)
bgneal@323 54
bgneal@323 55 # convert the old data and write the output to the file
bgneal@323 56 i = 0
bgneal@323 57 while True:
bgneal@323 58 row = c.fetchone()
bgneal@323 59 if row is None:
bgneal@323 60 break
bgneal@323 61 i += 1
bgneal@323 62 if i % 100 == 0:
bgneal@323 63 sys.stdout.write('.')
bgneal@323 64 sys.stdout.flush()
bgneal@323 65
bgneal@323 66 self._process_row(row)
bgneal@323 67
bgneal@323 68 print
bgneal@323 69 c.close()
bgneal@323 70
bgneal@323 71 def _process_row(self, row):
bgneal@323 72 """
bgneal@323 73 Process one row from the legacy database, creating a csv record
bgneal@323 74 in the thread or post files as appropriate.
bgneal@323 75
bgneal@323 76 """
bgneal@323 77 # Create a unified author name from the Yahoo ID and email address
bgneal@323 78 # fields in the original post:
bgneal@323 79 row['author'] = self.get_author(row['name'], row['email'])
bgneal@323 80
bgneal@323 81 # Some posts (mainly from 1 user...) have no date; we'll just
bgneal@323 82 # make one up by using the last date we saw + 1 second
bgneal@323 83
bgneal@323 84 if row['date'] is None:
bgneal@323 85 assert self.last_date is not None
bgneal@323 86 row['date'] = self.last_date + datetime.timedelta(seconds=1)
bgneal@323 87
bgneal@323 88 self.last_date = row['date']
bgneal@323 89
bgneal@323 90 # determine if this is a new thread or a reply
bgneal@323 91
bgneal@323 92 if self.REPLY_RE.match(row['title']):
bgneal@323 93 # This appears to be a reply.
bgneal@323 94 # Remove all the leading Re: etc., cruft
bgneal@323 95 stripped_title = self._strip_title(row['title'])
bgneal@323 96 thread_id = self.thread_cache.get(stripped_title)
bgneal@323 97 if thread_id:
bgneal@323 98 self._create_post(thread_id, row)
bgneal@323 99 else:
bgneal@323 100 # Huh, no parent thread..?; create a new one
bgneal@323 101 # and cache under stripped name so replies will find it
bgneal@323 102 self._create_thread(row)
bgneal@323 103 self.thread_cache[stripped_title] = row['id']
bgneal@323 104 else:
bgneal@323 105 # At first glance, not a reply; see if another thread
bgneal@323 106 # already exists with the exact same title:
bgneal@323 107 thread_id = self.thread_cache.get(row['title'])
bgneal@323 108 if thread_id:
bgneal@323 109 # Duplicate; Yahoo or someone lopped off the Re:
bgneal@323 110 # or it is just a coincidence. Either way, make it
bgneal@323 111 # a post of an existing thread.
bgneal@323 112 self._create_post(thread_id, row)
bgneal@323 113 else:
bgneal@323 114 self._create_thread(row)
bgneal@323 115 self.thread_cache[row['title']] = row['id']
bgneal@323 116
bgneal@323 117
bgneal@323 118 def _create_thread(self, row):
bgneal@323 119 """
bgneal@323 120 Create a new thread from the post data by writing a record in the
bgneal@323 121 thread .csv file and a record in the post file.
bgneal@323 122
bgneal@323 123 """
bgneal@323 124 self.thread_writer.writerow((row['id'],
bgneal@323 125 row['title'].encode('utf-8'),
bgneal@323 126 row['date'],
bgneal@323 127 row['author'].encode('utf-8'),
bgneal@323 128 0))
bgneal@323 129 self._create_post(row['id'], row)
bgneal@323 130
bgneal@323 131 def _create_post(self, thread_id, row):
bgneal@323 132 """
bgneal@323 133 Create a new post from the post data by writing a record in the
bgneal@323 134 post .csv file.
bgneal@323 135
bgneal@323 136 """
bgneal@323 137 self.post_writer.writerow((row['id'],
bgneal@323 138 thread_id,
bgneal@323 139 row['title'].encode('utf-8'),
bgneal@323 140 row['date'],
bgneal@323 141 row['author'].encode('utf-8'),
bgneal@323 142 row['msg'].encode('utf-8'),
bgneal@323 143 ''))
bgneal@323 144
bgneal@323 145 def _strip_title(self, title):
bgneal@323 146 """
bgneal@323 147 Strip out all the Re: and [SurfGuitar101] stuff to get a bare
bgneal@323 148 title.
bgneal@323 149
bgneal@323 150 """
bgneal@323 151 s = title
bgneal@323 152 while self.REPLY_RE.match(s):
bgneal@323 153 s = self.SG101_REPLY_RE.sub('', s).strip()
bgneal@323 154 s = self.REPLY_RE.sub('', s).strip()
bgneal@323 155
bgneal@323 156 return s
bgneal@323 157
bgneal@323 158 @staticmethod
bgneal@323 159 def get_author(yahoo_id, email):
bgneal@323 160
bgneal@323 161 def anti_spam(s):
bgneal@323 162 return s.replace('.', ' dot ').replace('@', ' at ')
bgneal@323 163
bgneal@323 164 name, addr = parseaddr(email)
bgneal@323 165
bgneal@323 166 if name == addr:
bgneal@323 167 name = anti_spam(name)
bgneal@323 168 else:
bgneal@323 169 # For some weird reason, sometimes Yahoo (?) put the email address
bgneal@323 170 # in the name field: "John Doe <doe@example.com" <doe@example.com>"
bgneal@323 171 name2, addr = parseaddr(name)
bgneal@323 172 if name2:
bgneal@323 173 name = name2
bgneal@323 174
bgneal@323 175 if name and yahoo_id and name != yahoo_id:
bgneal@323 176 author = "%s (%s)" % (name, yahoo_id)
bgneal@323 177 elif name:
bgneal@323 178 author = name
bgneal@323 179 elif yahoo_id:
bgneal@323 180 author = yahoo_id
bgneal@323 181 else:
bgneal@323 182 author = anti_spam(email)
bgneal@323 183 return author
bgneal@323 184
bgneal@323 185 ###############################################################################
bgneal@323 186
bgneal@323 187 def main(argv=None):
bgneal@323 188 parser = optparse.OptionParser(usage=USAGE, description=DESCRIPTION)
bgneal@323 189 parser.set_defaults(
bgneal@323 190 progress=False,
bgneal@323 191 host='localhost',
bgneal@323 192 user='root',
bgneal@323 193 password='',
bgneal@323 194 database='sg101_yahoo_group',
bgneal@323 195 )
bgneal@323 196 parser.add_option("-s", "--progress", action="store_true",
bgneal@323 197 help="Output a . after every 100 posts to show progress [default: %default]")
bgneal@323 198 parser.add_option("-a", "--host",
bgneal@323 199 help="set MySQL host name [default: %default]"),
bgneal@323 200 parser.add_option("-u", "--user",
bgneal@323 201 help="set MySQL user name [default: %default]")
bgneal@323 202 parser.add_option("-p", "--password",
bgneal@323 203 help="set MySQL user password [default: %default]"),
bgneal@323 204 parser.add_option("-d", "--database",
bgneal@323 205 help="set MySQL database name [default: %default]")
bgneal@323 206 opts, args = parser.parse_args(args=argv)
bgneal@323 207
bgneal@323 208 # connect to the legacy database
bgneal@323 209 try:
bgneal@323 210 db = MySQLdb.connect(host=opts.host,
bgneal@323 211 user=opts.user,
bgneal@323 212 passwd=opts.password,
bgneal@323 213 db=opts.database,
bgneal@323 214 use_unicode=True)
bgneal@323 215 except MySQLdb.DatabaseError, e:
bgneal@323 216 sys.exit("Can't connect to database: %s" % e)
bgneal@323 217
bgneal@323 218 converter = ConvertPosts(db, opts.progress)
bgneal@323 219 converter.process()
bgneal@323 220 db.close()
bgneal@323 221
bgneal@323 222 ###############################################################################
bgneal@323 223
bgneal@323 224 if __name__ == '__main__':
bgneal@323 225 try:
bgneal@323 226 main()
bgneal@323 227 except IOError, ex:
bgneal@323 228 sys.exit("IO Error: %s" % ex)
bgneal@323 229 except KeyboardInterrupt:
bgneal@323 230 sys.exit("Control-C interrupt")