Mercurial > public > sg101
comparison tools/load_ygroup.py @ 323:0c18dfb1da1c
Fixing #149; adding the ygroup application: an archive of the old Yahoo Group messages.
author | Brian Neal <bgneal@gmail.com> |
---|---|
date | Sun, 20 Feb 2011 00:31:54 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
322:c3d3d7114749 | 323:0c18dfb1da1c |
---|---|
1 """ | |
2 load_ygroup.py | |
3 | |
4 This application reads the Yahoo Group posts database and creates .csv files | |
5 for populating the ygroup application tables. The .csv files can be used | |
6 with the mysqlimport command to load the ygroup_thread and ygroup_post tables. | |
7 | |
8 E.g.: | |
9 mysqlimport --fields-optionally-enclosed-by=\" --fields-terminated-by=, --fields-escaped-by="" --lines-terminated-by="\r\n" --user=root --password --local --default-character-set=utf8 gremmies_portal /path/to/ygroup_thread.csv | |
10 | |
11 """ | |
12 import csv | |
13 import datetime | |
14 import optparse | |
15 import re | |
16 import sys | |
17 from email.utils import parseaddr | |
18 | |
19 import MySQLdb | |
20 | |
21 | |
22 USAGE = "usage: %prog [options]" | |
23 DESCRIPTION = """\ | |
24 This program reads the Yahoo Groups database and outputs 2 .csv files suitable | |
25 for import with mysqlimport for loading the ygroup application tables. Threads | |
26 and posts are created from the initial data. | |
27 """ | |
28 ############################################################################### | |
29 | |
30 class ConvertPosts(object): | |
31 REPLY_RE = re.compile(r"^Re:", re.IGNORECASE) | |
32 SG101_REPLY_RE = re.compile(r"^Re:\s*\[SurfGuitar101\]", re.IGNORECASE) | |
33 | |
34 def __init__(self, db, show_progress=False): | |
35 self.db = db | |
36 self.show_progress = show_progress | |
37 self.thread_writer = csv.writer(open('ygroup_thread.csv', 'wb')) | |
38 self.post_writer = csv.writer(open('ygroup_post.csv', 'wb')) | |
39 self.thread_cache = {} | |
40 self.last_date = None | |
41 | |
42 def process(self): | |
43 """ | |
44 Main processing function. Processes a row at a time from the legacy | |
45 database, creating csv records in the thread and posts files as | |
46 appropriate. | |
47 | |
48 """ | |
49 c = self.db.cursor(MySQLdb.cursors.DictCursor) | |
50 | |
51 # query the legacy database | |
52 sql = "SELECT * FROM post ORDER BY id" | |
53 c.execute(sql) | |
54 | |
55 # convert the old data and write the output to the file | |
56 i = 0 | |
57 while True: | |
58 row = c.fetchone() | |
59 if row is None: | |
60 break | |
61 i += 1 | |
62 if i % 100 == 0: | |
63 sys.stdout.write('.') | |
64 sys.stdout.flush() | |
65 | |
66 self._process_row(row) | |
67 | |
68 print | |
69 c.close() | |
70 | |
71 def _process_row(self, row): | |
72 """ | |
73 Process one row from the legacy database, creating a csv record | |
74 in the thread or post files as appropriate. | |
75 | |
76 """ | |
77 # Create a unified author name from the Yahoo ID and email address | |
78 # fields in the original post: | |
79 row['author'] = self.get_author(row['name'], row['email']) | |
80 | |
81 # Some posts (mainly from 1 user...) have no date; we'll just | |
82 # make one up by using the last date we saw + 1 second | |
83 | |
84 if row['date'] is None: | |
85 assert self.last_date is not None | |
86 row['date'] = self.last_date + datetime.timedelta(seconds=1) | |
87 | |
88 self.last_date = row['date'] | |
89 | |
90 # determine if this is a new thread or a reply | |
91 | |
92 if self.REPLY_RE.match(row['title']): | |
93 # This appears to be a reply. | |
94 # Remove all the leading Re: etc., cruft | |
95 stripped_title = self._strip_title(row['title']) | |
96 thread_id = self.thread_cache.get(stripped_title) | |
97 if thread_id: | |
98 self._create_post(thread_id, row) | |
99 else: | |
100 # Huh, no parent thread..?; create a new one | |
101 # and cache under stripped name so replies will find it | |
102 self._create_thread(row) | |
103 self.thread_cache[stripped_title] = row['id'] | |
104 else: | |
105 # At first glance, not a reply; see if another thread | |
106 # already exists with the exact same title: | |
107 thread_id = self.thread_cache.get(row['title']) | |
108 if thread_id: | |
109 # Duplicate; Yahoo or someone lopped off the Re: | |
110 # or it is just a coincidence. Either way, make it | |
111 # a post of an existing thread. | |
112 self._create_post(thread_id, row) | |
113 else: | |
114 self._create_thread(row) | |
115 self.thread_cache[row['title']] = row['id'] | |
116 | |
117 | |
118 def _create_thread(self, row): | |
119 """ | |
120 Create a new thread from the post data by writing a record in the | |
121 thread .csv file and a record in the post file. | |
122 | |
123 """ | |
124 self.thread_writer.writerow((row['id'], | |
125 row['title'].encode('utf-8'), | |
126 row['date'], | |
127 row['author'].encode('utf-8'), | |
128 0)) | |
129 self._create_post(row['id'], row) | |
130 | |
131 def _create_post(self, thread_id, row): | |
132 """ | |
133 Create a new post from the post data by writing a record in the | |
134 post .csv file. | |
135 | |
136 """ | |
137 self.post_writer.writerow((row['id'], | |
138 thread_id, | |
139 row['title'].encode('utf-8'), | |
140 row['date'], | |
141 row['author'].encode('utf-8'), | |
142 row['msg'].encode('utf-8'), | |
143 '')) | |
144 | |
145 def _strip_title(self, title): | |
146 """ | |
147 Strip out all the Re: and [SurfGuitar101] stuff to get a bare | |
148 title. | |
149 | |
150 """ | |
151 s = title | |
152 while self.REPLY_RE.match(s): | |
153 s = self.SG101_REPLY_RE.sub('', s).strip() | |
154 s = self.REPLY_RE.sub('', s).strip() | |
155 | |
156 return s | |
157 | |
158 @staticmethod | |
159 def get_author(yahoo_id, email): | |
160 | |
161 def anti_spam(s): | |
162 return s.replace('.', ' dot ').replace('@', ' at ') | |
163 | |
164 name, addr = parseaddr(email) | |
165 | |
166 if name == addr: | |
167 name = anti_spam(name) | |
168 else: | |
169 # For some weird reason, sometimes Yahoo (?) put the email address | |
170 # in the name field: "John Doe <doe@example.com" <doe@example.com>" | |
171 name2, addr = parseaddr(name) | |
172 if name2: | |
173 name = name2 | |
174 | |
175 if name and yahoo_id and name != yahoo_id: | |
176 author = "%s (%s)" % (name, yahoo_id) | |
177 elif name: | |
178 author = name | |
179 elif yahoo_id: | |
180 author = yahoo_id | |
181 else: | |
182 author = anti_spam(email) | |
183 return author | |
184 | |
185 ############################################################################### | |
186 | |
187 def main(argv=None): | |
188 parser = optparse.OptionParser(usage=USAGE, description=DESCRIPTION) | |
189 parser.set_defaults( | |
190 progress=False, | |
191 host='localhost', | |
192 user='root', | |
193 password='', | |
194 database='sg101_yahoo_group', | |
195 ) | |
196 parser.add_option("-s", "--progress", action="store_true", | |
197 help="Output a . after every 100 posts to show progress [default: %default]") | |
198 parser.add_option("-a", "--host", | |
199 help="set MySQL host name [default: %default]"), | |
200 parser.add_option("-u", "--user", | |
201 help="set MySQL user name [default: %default]") | |
202 parser.add_option("-p", "--password", | |
203 help="set MySQL user password [default: %default]"), | |
204 parser.add_option("-d", "--database", | |
205 help="set MySQL database name [default: %default]") | |
206 opts, args = parser.parse_args(args=argv) | |
207 | |
208 # connect to the legacy database | |
209 try: | |
210 db = MySQLdb.connect(host=opts.host, | |
211 user=opts.user, | |
212 passwd=opts.password, | |
213 db=opts.database, | |
214 use_unicode=True) | |
215 except MySQLdb.DatabaseError, e: | |
216 sys.exit("Can't connect to database: %s" % e) | |
217 | |
218 converter = ConvertPosts(db, opts.progress) | |
219 converter.process() | |
220 db.close() | |
221 | |
222 ############################################################################### | |
223 | |
224 if __name__ == '__main__': | |
225 try: | |
226 main() | |
227 except IOError, ex: | |
228 sys.exit("IO Error: %s" % ex) | |
229 except KeyboardInterrupt: | |
230 sys.exit("Control-C interrupt") |