comparison tools/load_ygroup.py @ 323:0c18dfb1da1c

Fixing #149; adding the ygroup application: an archive of the old Yahoo Group messages.
author Brian Neal <bgneal@gmail.com>
date Sun, 20 Feb 2011 00:31:54 +0000
parents
children
comparison
equal deleted inserted replaced
322:c3d3d7114749 323:0c18dfb1da1c
1 """
2 load_ygroup.py
3
4 This application reads the Yahoo Group posts database and creates .csv files
5 for populating the ygroup application tables. The .csv files can be used
6 with the mysqlimport command to load the ygroup_thread and ygroup_post tables.
7
8 E.g.:
9 mysqlimport --fields-optionally-enclosed-by=\" --fields-terminated-by=, --fields-escaped-by="" --lines-terminated-by="\r\n" --user=root --password --local --default-character-set=utf8 gremmies_portal /path/to/ygroup_thread.csv
10
11 """
12 import csv
13 import datetime
14 import optparse
15 import re
16 import sys
17 from email.utils import parseaddr
18
19 import MySQLdb
20
21
22 USAGE = "usage: %prog [options]"
23 DESCRIPTION = """\
24 This program reads the Yahoo Groups database and outputs 2 .csv files suitable
25 for import with mysqlimport for loading the ygroup application tables. Threads
26 and posts are created from the initial data.
27 """
28 ###############################################################################
29
30 class ConvertPosts(object):
31 REPLY_RE = re.compile(r"^Re:", re.IGNORECASE)
32 SG101_REPLY_RE = re.compile(r"^Re:\s*\[SurfGuitar101\]", re.IGNORECASE)
33
34 def __init__(self, db, show_progress=False):
35 self.db = db
36 self.show_progress = show_progress
37 self.thread_writer = csv.writer(open('ygroup_thread.csv', 'wb'))
38 self.post_writer = csv.writer(open('ygroup_post.csv', 'wb'))
39 self.thread_cache = {}
40 self.last_date = None
41
42 def process(self):
43 """
44 Main processing function. Processes a row at a time from the legacy
45 database, creating csv records in the thread and posts files as
46 appropriate.
47
48 """
49 c = self.db.cursor(MySQLdb.cursors.DictCursor)
50
51 # query the legacy database
52 sql = "SELECT * FROM post ORDER BY id"
53 c.execute(sql)
54
55 # convert the old data and write the output to the file
56 i = 0
57 while True:
58 row = c.fetchone()
59 if row is None:
60 break
61 i += 1
62 if i % 100 == 0:
63 sys.stdout.write('.')
64 sys.stdout.flush()
65
66 self._process_row(row)
67
68 print
69 c.close()
70
71 def _process_row(self, row):
72 """
73 Process one row from the legacy database, creating a csv record
74 in the thread or post files as appropriate.
75
76 """
77 # Create a unified author name from the Yahoo ID and email address
78 # fields in the original post:
79 row['author'] = self.get_author(row['name'], row['email'])
80
81 # Some posts (mainly from 1 user...) have no date; we'll just
82 # make one up by using the last date we saw + 1 second
83
84 if row['date'] is None:
85 assert self.last_date is not None
86 row['date'] = self.last_date + datetime.timedelta(seconds=1)
87
88 self.last_date = row['date']
89
90 # determine if this is a new thread or a reply
91
92 if self.REPLY_RE.match(row['title']):
93 # This appears to be a reply.
94 # Remove all the leading Re: etc., cruft
95 stripped_title = self._strip_title(row['title'])
96 thread_id = self.thread_cache.get(stripped_title)
97 if thread_id:
98 self._create_post(thread_id, row)
99 else:
100 # Huh, no parent thread..?; create a new one
101 # and cache under stripped name so replies will find it
102 self._create_thread(row)
103 self.thread_cache[stripped_title] = row['id']
104 else:
105 # At first glance, not a reply; see if another thread
106 # already exists with the exact same title:
107 thread_id = self.thread_cache.get(row['title'])
108 if thread_id:
109 # Duplicate; Yahoo or someone lopped off the Re:
110 # or it is just a coincidence. Either way, make it
111 # a post of an existing thread.
112 self._create_post(thread_id, row)
113 else:
114 self._create_thread(row)
115 self.thread_cache[row['title']] = row['id']
116
117
118 def _create_thread(self, row):
119 """
120 Create a new thread from the post data by writing a record in the
121 thread .csv file and a record in the post file.
122
123 """
124 self.thread_writer.writerow((row['id'],
125 row['title'].encode('utf-8'),
126 row['date'],
127 row['author'].encode('utf-8'),
128 0))
129 self._create_post(row['id'], row)
130
131 def _create_post(self, thread_id, row):
132 """
133 Create a new post from the post data by writing a record in the
134 post .csv file.
135
136 """
137 self.post_writer.writerow((row['id'],
138 thread_id,
139 row['title'].encode('utf-8'),
140 row['date'],
141 row['author'].encode('utf-8'),
142 row['msg'].encode('utf-8'),
143 ''))
144
145 def _strip_title(self, title):
146 """
147 Strip out all the Re: and [SurfGuitar101] stuff to get a bare
148 title.
149
150 """
151 s = title
152 while self.REPLY_RE.match(s):
153 s = self.SG101_REPLY_RE.sub('', s).strip()
154 s = self.REPLY_RE.sub('', s).strip()
155
156 return s
157
158 @staticmethod
159 def get_author(yahoo_id, email):
160
161 def anti_spam(s):
162 return s.replace('.', ' dot ').replace('@', ' at ')
163
164 name, addr = parseaddr(email)
165
166 if name == addr:
167 name = anti_spam(name)
168 else:
169 # For some weird reason, sometimes Yahoo (?) put the email address
170 # in the name field: "John Doe <doe@example.com" <doe@example.com>"
171 name2, addr = parseaddr(name)
172 if name2:
173 name = name2
174
175 if name and yahoo_id and name != yahoo_id:
176 author = "%s (%s)" % (name, yahoo_id)
177 elif name:
178 author = name
179 elif yahoo_id:
180 author = yahoo_id
181 else:
182 author = anti_spam(email)
183 return author
184
185 ###############################################################################
186
187 def main(argv=None):
188 parser = optparse.OptionParser(usage=USAGE, description=DESCRIPTION)
189 parser.set_defaults(
190 progress=False,
191 host='localhost',
192 user='root',
193 password='',
194 database='sg101_yahoo_group',
195 )
196 parser.add_option("-s", "--progress", action="store_true",
197 help="Output a . after every 100 posts to show progress [default: %default]")
198 parser.add_option("-a", "--host",
199 help="set MySQL host name [default: %default]"),
200 parser.add_option("-u", "--user",
201 help="set MySQL user name [default: %default]")
202 parser.add_option("-p", "--password",
203 help="set MySQL user password [default: %default]"),
204 parser.add_option("-d", "--database",
205 help="set MySQL database name [default: %default]")
206 opts, args = parser.parse_args(args=argv)
207
208 # connect to the legacy database
209 try:
210 db = MySQLdb.connect(host=opts.host,
211 user=opts.user,
212 passwd=opts.password,
213 db=opts.database,
214 use_unicode=True)
215 except MySQLdb.DatabaseError, e:
216 sys.exit("Can't connect to database: %s" % e)
217
218 converter = ConvertPosts(db, opts.progress)
219 converter.process()
220 db.close()
221
222 ###############################################################################
223
224 if __name__ == '__main__':
225 try:
226 main()
227 except IOError, ex:
228 sys.exit("IO Error: %s" % ex)
229 except KeyboardInterrupt:
230 sys.exit("Control-C interrupt")