diff tools/import_blogophile.py @ 10:6c03ca07a16d

Renamed my tools directory to "tools". I named it __bgn because I was worried it would clash with a future Pelican update. But it seems like this would only happen if I re-ran the quickstart script. "tools" is a better name. :)
author Brian Neal <bgneal@gmail.com>
date Sun, 02 Feb 2014 11:32:13 -0600
parents __bgn/import_blogophile.py@c3115da3ff73
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/import_blogophile.py	Sun Feb 02 11:32:13 2014 -0600
@@ -0,0 +1,174 @@
+#!/usr/bin/env python
+"""
+A simple script to convert my Blogofile restructured text posts into the format
+expected by Pelican.
+
+"""
+# Copyright (C) 2014 by Brian Neal.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+import datetime
+import os
+import re
+import time
+
+
+# Blogofile posts are read from SRC_DIR; Pelican posts are written to DST_DIR.
+SRC_DIR = os.path.expanduser('~/coding/python/virtualenvs/blogofile/blog/_posts')
+DST_DIR = os.path.expanduser('~/coding/python/venvs/blog/blog-pelican/content/Coding')
+
+# Patterns for the metadata lines this script picks out of a post's meta block.
+TITLE_RE = re.compile(r'^title: (?P<title>.*)$')
+DATE_RE = re.compile(r'^date: (?P<year>\d{4})/(?P<month>\d{1,2})/'
+                     r'(?P<day>\d{1,2})\s*(?P<time>\d{2}:\d{2}:\d{2})\s*$')
+CAT_RE = re.compile(r'^categories: (?P<cats>.*)$')
+
+PELICAN_FMT = """\
+{title}
+{title_underline}
+
+:date: {date}
+:tags: {tags}
+:slug: {slug}
+:author: Brian Neal
+
+{content}
+"""
+
+
+class ConvertError(Exception):
+    """Raised when a Blogofile post cannot be parsed for conversion."""
+
+
+def slugify(s):
+    """Return the URL slug for the string s.
+
+    The result has to match what Blogofile generated so the post URLs stay the
+    same. I had customized Blogofile's slug code based on a tip by Mike Bayer:
+    http://techspot.zzzeek.org/2010/12/06/my-blogofile-hacks/
+
+    """
+    # (pattern, replacement) pairs, applied in order to the lowercased string
+    rules = (
+        (r'\.{2,}', ' '),           # ellipses turn into a space
+        (r'[^0-9a-zA-Z\.]+', '-'),  # run of non-alphanumerics (except .) -> -
+        (r'^-+|-+$', ''),           # drop leading/trailing -
+    )
+
+    slug = s.lower()
+    for pattern, replacement in rules:
+        slug = re.sub(pattern, replacement, slug)
+    return slug
+
+
+def convert(src, dst):
+    """Convert the Blogofile post at path src to a Pelican post at dst."""
+    print('{} -> {}'.format(src, dst))  # call form works on Python 2 and 3
+    meta, content = parse_input(src)
+    write_output(meta, content, dst)
+
+
+def parse_input(src):
+    """Parse the Blogofile .rst post in the file src.
+
+    The metadata is taken from between the first two '---' lines; everything
+    after the second one is the post body.
+
+    Returns a 2-tuple:
+        meta - dict with 'title', 'date' (a datetime) and 'categories' (list)
+        content - blog post body as a string
+
+    Raises ConvertError if the meta block or one of those three entries is
+    missing, or if the date line holds an impossible date or time.
+
+    """
+    with open(src, 'r') as fp:
+        lines = fp.readlines()
+
+    # The first two '---' lines delimit the meta block.
+    markers = [i for i, line in enumerate(lines) if line == '---\n'][:2]
+    if not markers:
+        raise ConvertError("Can't find start of meta block")
+    if len(markers) < 2:
+        raise ConvertError("Can't find end of meta block")
+    meta_start, meta_end = markers
+
+    meta = {}
+    for line in lines[meta_start + 1:meta_end]:
+        m = TITLE_RE.match(line)
+        if m:
+            meta['title'] = m.group('title').strip()
+            continue
+        m = DATE_RE.match(line)
+        if m:
+            hour, minute, second = m.group('time').split(':')
+            try:
+                meta['date'] = datetime.datetime(
+                    int(m.group('year')), int(m.group('month')),
+                    int(m.group('day')), int(hour), int(minute), int(second))
+            except ValueError as ex:
+                # DATE_RE only checks digit counts, so month 13 or hour 25
+                # ends up here. Raise ConvertError so code that catches it
+                # can skip this post rather than the whole run aborting.
+                raise ConvertError("Invalid date in metadata: {}".format(ex))
+            continue
+        m = CAT_RE.match(line)
+        if m:
+            meta['categories'] = m.group('cats').replace(' ', '').split(',')
+
+    for k in ['title', 'date', 'categories']:
+        if k not in meta:
+            raise ConvertError("Missing {} in metadata".format(k))
+
+    content = ''.join(lines[meta_end + 1:]).strip()
+    return meta, content
+
+
+def write_output(meta, content, dst):
+    """Write the Pelican style .rst post to the file specified by dst.
+
+    The header fields are built from the 'title', 'date' and 'categories'
+    entries of the Blogofile metadata in meta; content is the post body.
+
+    """
+    heading = meta['title']
+    fields = {
+        'title': heading,
+        'title_underline': '#' * len(heading),
+        'date': meta['date'].strftime('%Y-%m-%d %H:%M'),
+        'tags': ', '.join(meta['categories']),
+        'slug': slugify(heading),
+        'content': content,
+    }
+    post = PELICAN_FMT.format(**fields)
+    with open(dst, 'w') as fp:
+        fp.write(post)
+
+
+if __name__ == '__main__':
+    # Convert every .rst post in SRC_DIR to a same-named file in DST_DIR.
+    for name in os.listdir(SRC_DIR):
+        if not name.endswith('.rst'):
+            continue
+        try:
+            convert(os.path.join(SRC_DIR, name), os.path.join(DST_DIR, name))
+        except ConvertError as ex:
+            # A bad post is reported and skipped; the rest still convert.
+            print("Error converting {}: {}".format(name, ex))