bgneal@323
|
1 """
|
bgneal@323
|
2 load_ygroup.py
|
bgneal@323
|
3
|
bgneal@323
|
4 This application reads the Yahoo Group posts database and creates .csv files
|
bgneal@323
|
5 for populating the ygroup application tables. The .csv files can be used
|
bgneal@323
|
6 with the mysqlimport command to load the ygroup_thread and ygroup_post tables.
|
bgneal@323
|
7
|
bgneal@323
|
8 E.g.:
|
bgneal@323
|
9 mysqlimport --fields-optionally-enclosed-by=\" --fields-terminated-by=, --fields-escaped-by="" --lines-terminated-by="\r\n" --user=root --password --local --default-character-set=utf8 gremmies_portal /path/to/ygroup_thread.csv
|
bgneal@323
|
10
|
bgneal@323
|
11 """
|
bgneal@323
|
12 import csv
|
bgneal@323
|
13 import datetime
|
bgneal@323
|
14 import optparse
|
bgneal@323
|
15 import re
|
bgneal@323
|
16 import sys
|
bgneal@323
|
17 from email.utils import parseaddr
|
bgneal@323
|
18
|
bgneal@323
|
19 import MySQLdb
|
bgneal@323
|
20
|
bgneal@323
|
21
|
bgneal@323
|
22 USAGE = "usage: %prog [options]"
|
bgneal@323
|
23 DESCRIPTION = """\
|
bgneal@323
|
24 This program reads the Yahoo Groups database and outputs 2 .csv files suitable
|
bgneal@323
|
25 for import with mysqlimport for loading the ygroup application tables. Threads
|
bgneal@323
|
26 and posts are created from the initial data.
|
bgneal@323
|
27 """
|
bgneal@323
|
28 ###############################################################################
|
bgneal@323
|
29
|
bgneal@323
|
30 class ConvertPosts(object):
|
bgneal@323
|
31 REPLY_RE = re.compile(r"^Re:", re.IGNORECASE)
|
bgneal@323
|
32 SG101_REPLY_RE = re.compile(r"^Re:\s*\[SurfGuitar101\]", re.IGNORECASE)
|
bgneal@323
|
33
|
bgneal@323
|
34 def __init__(self, db, show_progress=False):
|
bgneal@323
|
35 self.db = db
|
bgneal@323
|
36 self.show_progress = show_progress
|
bgneal@323
|
37 self.thread_writer = csv.writer(open('ygroup_thread.csv', 'wb'))
|
bgneal@323
|
38 self.post_writer = csv.writer(open('ygroup_post.csv', 'wb'))
|
bgneal@323
|
39 self.thread_cache = {}
|
bgneal@323
|
40 self.last_date = None
|
bgneal@323
|
41
|
bgneal@323
|
42 def process(self):
|
bgneal@323
|
43 """
|
bgneal@323
|
44 Main processing function. Processes a row at a time from the legacy
|
bgneal@323
|
45 database, creating csv records in the thread and posts files as
|
bgneal@323
|
46 appropriate.
|
bgneal@323
|
47
|
bgneal@323
|
48 """
|
bgneal@323
|
49 c = self.db.cursor(MySQLdb.cursors.DictCursor)
|
bgneal@323
|
50
|
bgneal@323
|
51 # query the legacy database
|
bgneal@323
|
52 sql = "SELECT * FROM post ORDER BY id"
|
bgneal@323
|
53 c.execute(sql)
|
bgneal@323
|
54
|
bgneal@323
|
55 # convert the old data and write the output to the file
|
bgneal@323
|
56 i = 0
|
bgneal@323
|
57 while True:
|
bgneal@323
|
58 row = c.fetchone()
|
bgneal@323
|
59 if row is None:
|
bgneal@323
|
60 break
|
bgneal@323
|
61 i += 1
|
bgneal@323
|
62 if i % 100 == 0:
|
bgneal@323
|
63 sys.stdout.write('.')
|
bgneal@323
|
64 sys.stdout.flush()
|
bgneal@323
|
65
|
bgneal@323
|
66 self._process_row(row)
|
bgneal@323
|
67
|
bgneal@323
|
68 print
|
bgneal@323
|
69 c.close()
|
bgneal@323
|
70
|
bgneal@323
|
71 def _process_row(self, row):
|
bgneal@323
|
72 """
|
bgneal@323
|
73 Process one row from the legacy database, creating a csv record
|
bgneal@323
|
74 in the thread or post files as appropriate.
|
bgneal@323
|
75
|
bgneal@323
|
76 """
|
bgneal@323
|
77 # Create a unified author name from the Yahoo ID and email address
|
bgneal@323
|
78 # fields in the original post:
|
bgneal@323
|
79 row['author'] = self.get_author(row['name'], row['email'])
|
bgneal@323
|
80
|
bgneal@323
|
81 # Some posts (mainly from 1 user...) have no date; we'll just
|
bgneal@323
|
82 # make one up by using the last date we saw + 1 second
|
bgneal@323
|
83
|
bgneal@323
|
84 if row['date'] is None:
|
bgneal@323
|
85 assert self.last_date is not None
|
bgneal@323
|
86 row['date'] = self.last_date + datetime.timedelta(seconds=1)
|
bgneal@323
|
87
|
bgneal@323
|
88 self.last_date = row['date']
|
bgneal@323
|
89
|
bgneal@323
|
90 # determine if this is a new thread or a reply
|
bgneal@323
|
91
|
bgneal@323
|
92 if self.REPLY_RE.match(row['title']):
|
bgneal@323
|
93 # This appears to be a reply.
|
bgneal@323
|
94 # Remove all the leading Re: etc., cruft
|
bgneal@323
|
95 stripped_title = self._strip_title(row['title'])
|
bgneal@323
|
96 thread_id = self.thread_cache.get(stripped_title)
|
bgneal@323
|
97 if thread_id:
|
bgneal@323
|
98 self._create_post(thread_id, row)
|
bgneal@323
|
99 else:
|
bgneal@323
|
100 # Huh, no parent thread..?; create a new one
|
bgneal@323
|
101 # and cache under stripped name so replies will find it
|
bgneal@323
|
102 self._create_thread(row)
|
bgneal@323
|
103 self.thread_cache[stripped_title] = row['id']
|
bgneal@323
|
104 else:
|
bgneal@323
|
105 # At first glance, not a reply; see if another thread
|
bgneal@323
|
106 # already exists with the exact same title:
|
bgneal@323
|
107 thread_id = self.thread_cache.get(row['title'])
|
bgneal@323
|
108 if thread_id:
|
bgneal@323
|
109 # Duplicate; Yahoo or someone lopped off the Re:
|
bgneal@323
|
110 # or it is just a coincidence. Either way, make it
|
bgneal@323
|
111 # a post of an existing thread.
|
bgneal@323
|
112 self._create_post(thread_id, row)
|
bgneal@323
|
113 else:
|
bgneal@323
|
114 self._create_thread(row)
|
bgneal@323
|
115 self.thread_cache[row['title']] = row['id']
|
bgneal@323
|
116
|
bgneal@323
|
117
|
bgneal@323
|
118 def _create_thread(self, row):
|
bgneal@323
|
119 """
|
bgneal@323
|
120 Create a new thread from the post data by writing a record in the
|
bgneal@323
|
121 thread .csv file and a record in the post file.
|
bgneal@323
|
122
|
bgneal@323
|
123 """
|
bgneal@323
|
124 self.thread_writer.writerow((row['id'],
|
bgneal@323
|
125 row['title'].encode('utf-8'),
|
bgneal@323
|
126 row['date'],
|
bgneal@323
|
127 row['author'].encode('utf-8'),
|
bgneal@323
|
128 0))
|
bgneal@323
|
129 self._create_post(row['id'], row)
|
bgneal@323
|
130
|
bgneal@323
|
131 def _create_post(self, thread_id, row):
|
bgneal@323
|
132 """
|
bgneal@323
|
133 Create a new post from the post data by writing a record in the
|
bgneal@323
|
134 post .csv file.
|
bgneal@323
|
135
|
bgneal@323
|
136 """
|
bgneal@323
|
137 self.post_writer.writerow((row['id'],
|
bgneal@323
|
138 thread_id,
|
bgneal@323
|
139 row['title'].encode('utf-8'),
|
bgneal@323
|
140 row['date'],
|
bgneal@323
|
141 row['author'].encode('utf-8'),
|
bgneal@323
|
142 row['msg'].encode('utf-8'),
|
bgneal@323
|
143 ''))
|
bgneal@323
|
144
|
bgneal@323
|
145 def _strip_title(self, title):
|
bgneal@323
|
146 """
|
bgneal@323
|
147 Strip out all the Re: and [SurfGuitar101] stuff to get a bare
|
bgneal@323
|
148 title.
|
bgneal@323
|
149
|
bgneal@323
|
150 """
|
bgneal@323
|
151 s = title
|
bgneal@323
|
152 while self.REPLY_RE.match(s):
|
bgneal@323
|
153 s = self.SG101_REPLY_RE.sub('', s).strip()
|
bgneal@323
|
154 s = self.REPLY_RE.sub('', s).strip()
|
bgneal@323
|
155
|
bgneal@323
|
156 return s
|
bgneal@323
|
157
|
bgneal@323
|
158 @staticmethod
|
bgneal@323
|
159 def get_author(yahoo_id, email):
|
bgneal@323
|
160
|
bgneal@323
|
161 def anti_spam(s):
|
bgneal@323
|
162 return s.replace('.', ' dot ').replace('@', ' at ')
|
bgneal@323
|
163
|
bgneal@323
|
164 name, addr = parseaddr(email)
|
bgneal@323
|
165
|
bgneal@323
|
166 if name == addr:
|
bgneal@323
|
167 name = anti_spam(name)
|
bgneal@323
|
168 else:
|
bgneal@323
|
169 # For some weird reason, sometimes Yahoo (?) put the email address
|
bgneal@323
|
170 # in the name field: "John Doe <doe@example.com" <doe@example.com>"
|
bgneal@323
|
171 name2, addr = parseaddr(name)
|
bgneal@323
|
172 if name2:
|
bgneal@323
|
173 name = name2
|
bgneal@323
|
174
|
bgneal@323
|
175 if name and yahoo_id and name != yahoo_id:
|
bgneal@323
|
176 author = "%s (%s)" % (name, yahoo_id)
|
bgneal@323
|
177 elif name:
|
bgneal@323
|
178 author = name
|
bgneal@323
|
179 elif yahoo_id:
|
bgneal@323
|
180 author = yahoo_id
|
bgneal@323
|
181 else:
|
bgneal@323
|
182 author = anti_spam(email)
|
bgneal@323
|
183 return author
|
bgneal@323
|
184
|
bgneal@323
|
185 ###############################################################################
|
bgneal@323
|
186
|
bgneal@323
|
187 def main(argv=None):
|
bgneal@323
|
188 parser = optparse.OptionParser(usage=USAGE, description=DESCRIPTION)
|
bgneal@323
|
189 parser.set_defaults(
|
bgneal@323
|
190 progress=False,
|
bgneal@323
|
191 host='localhost',
|
bgneal@323
|
192 user='root',
|
bgneal@323
|
193 password='',
|
bgneal@323
|
194 database='sg101_yahoo_group',
|
bgneal@323
|
195 )
|
bgneal@323
|
196 parser.add_option("-s", "--progress", action="store_true",
|
bgneal@323
|
197 help="Output a . after every 100 posts to show progress [default: %default]")
|
bgneal@323
|
198 parser.add_option("-a", "--host",
|
bgneal@323
|
199 help="set MySQL host name [default: %default]"),
|
bgneal@323
|
200 parser.add_option("-u", "--user",
|
bgneal@323
|
201 help="set MySQL user name [default: %default]")
|
bgneal@323
|
202 parser.add_option("-p", "--password",
|
bgneal@323
|
203 help="set MySQL user password [default: %default]"),
|
bgneal@323
|
204 parser.add_option("-d", "--database",
|
bgneal@323
|
205 help="set MySQL database name [default: %default]")
|
bgneal@323
|
206 opts, args = parser.parse_args(args=argv)
|
bgneal@323
|
207
|
bgneal@323
|
208 # connect to the legacy database
|
bgneal@323
|
209 try:
|
bgneal@323
|
210 db = MySQLdb.connect(host=opts.host,
|
bgneal@323
|
211 user=opts.user,
|
bgneal@323
|
212 passwd=opts.password,
|
bgneal@323
|
213 db=opts.database,
|
bgneal@323
|
214 use_unicode=True)
|
bgneal@323
|
215 except MySQLdb.DatabaseError, e:
|
bgneal@323
|
216 sys.exit("Can't connect to database: %s" % e)
|
bgneal@323
|
217
|
bgneal@323
|
218 converter = ConvertPosts(db, opts.progress)
|
bgneal@323
|
219 converter.process()
|
bgneal@323
|
220 db.close()
|
bgneal@323
|
221
|
bgneal@323
|
222 ###############################################################################
|
bgneal@323
|
223
|
bgneal@323
|
224 if __name__ == '__main__':
|
bgneal@323
|
225 try:
|
bgneal@323
|
226 main()
|
bgneal@323
|
227 except IOError, ex:
|
bgneal@323
|
228 sys.exit("IO Error: %s" % ex)
|
bgneal@323
|
229 except KeyboardInterrupt:
|
bgneal@323
|
230 sys.exit("Control-C interrupt")
|