view tools/import_blogophile.py @ 12:5ff71680269a

Set DISQUS_NO_ID so we don't confuse Disqus. The Pelican Bootstrap3 theme is by default adding data-disqus-identifier to my comment count links. Since my old blog didn't have these, I think Disqus is getting confused. When I removed this, some blog entries where Disqus didn't know what the comment count was suddenly began showing the correct comment counts (mainly 0 I think).
author Brian Neal <bgneal@gmail.com>
date Tue, 04 Feb 2014 18:44:59 -0600
parents 6c03ca07a16d
children
line wrap: on
line source
#!/usr/bin/env python
"""
A simple script to convert my Blogofile restructured text posts into the format
expected by Pelican.

"""
# Copyright (C) 2014 by Brian Neal.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# 
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# 
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

import datetime
import os
import re
import time


# Directory scanned (non-recursively) for Blogofile .rst posts to convert.
SRC_DIR = os.path.expanduser('~/coding/python/virtualenvs/blogofile/blog/_posts')
# Directory the converted Pelican posts are written to; each output file keeps
# the file name of the post it was converted from.
DST_DIR = os.path.expanduser('~/coding/python/venvs/blog/blog-pelican/content/Coding')

# Patterns for the metadata lines that parse_input() recognizes inside the
# '---' delimited block of a Blogofile post.
#
# "title: <text>" -- the post title.
TITLE_RE = re.compile(r'^title: (?P<title>.*)$')
# "date: YYYY/M/D HH:MM:SS" -- the year has four digits, month and day one or
# two, and each time field exactly two. Whitespace between the day and the
# time, and after the time, is optional.
DATE_RE = re.compile(r'^date: (?P<year>\d{4})/'
                     r'(?P<month>\d{1,2})/'
                     r'(?P<day>\d{1,2})\s*'
                     r'(?P<time>\d{2}:\d{2}:\d{2})\s*$')
# "categories: <text>" -- parse_input() splits the text on commas.
CAT_RE = re.compile(r'^categories: (?P<cats>.*)$')

# Template for a generated Pelican post: the title and its underline, the
# :date:/:tags:/:slug:/:author: field list (the author is fixed), then the
# post body. write_output() fills in the placeholders with str.format().
PELICAN_FMT = """\
{title}
{title_underline}

:date: {date}
:tags: {tags}
:slug: {slug}
:author: Brian Neal

{content}
"""


class ConvertError(Exception):
    """Raised when a Blogofile post cannot be converted.

    The exception message says what was wrong with the input, for example a
    missing metadata delimiter or a missing metadata key.

    """


def slugify(s):
    """Build the URL slug for the string s (normally a post title).

    The result has to be identical to the slugs Blogofile produced, otherwise
    the URLs of existing posts would change. That behaviour was itself a
    customization of Blogofile, based on a tip by Mike Bayer:
    http://techspot.zzzeek.org/2010/12/06/my-blogofile-hacks/

    """
    # (pattern, replacement) pairs, applied in this order.
    substitutions = (
        # a run of two or more dots (an ellipsis) becomes a space
        (r'\.{2,}', ' '),
        # every run of characters other than letters, digits and dots
        # collapses into a single hyphen
        (r'[^0-9a-zA-Z\.]+', '-'),
        # hyphens left over at either end are dropped
        (r'^-+|-+$', ''),
    )

    result = s.lower()
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result


def convert(src, dst):
    """Convert one Blogofile post into a Pelican post.

    src - path of the Blogofile .rst file to read
    dst - path of the Pelican .rst file to write

    A "src -> dst" progress line is printed before the conversion starts.
    Exceptions raised by parse_input() or write_output() are not caught here;
    in particular ConvertError propagates to the caller.

    """
    # A parenthesized, single-argument print produces the same output under
    # the Python 2 print statement and the Python 3 print function.
    print('{} -> {}'.format(src, dst))
    meta, content = parse_input(src)
    write_output(meta, content, dst)


def parse_input(src):
    """Parse a Blogofile .rst post.

    The post must contain a metadata block enclosed by two lines consisting
    solely of '---'. Inside that block the 'title:', 'date:' and
    'categories:' lines are recognized; any other line is ignored. Everything
    after the closing delimiter is the post body.

    Returns a 2-tuple:
        meta - dictionary of Blogofile metadata with the keys 'title'
               (string), 'date' (datetime.datetime) and 'categories' (list
               of strings, with all spaces removed)
        content - blog post body as a string, stripped of leading and
                  trailing whitespace

    Raises ConvertError if a delimiter line or one of the three metadata
    entries is missing, or if the date line holds an impossible date or time.

    """
    with open(src, 'r') as fp:
        lines = fp.readlines()

    # Find the meta block. Only the line ending is stripped before comparing,
    # so a delimiter that is the last line of a file without a trailing
    # newline, or that ends in CRLF, is still recognized.
    delimiters = [i for i, line in enumerate(lines)
                  if line.rstrip('\r\n') == '---']
    if not delimiters:
        raise ConvertError("Can't find start of meta block")
    if len(delimiters) < 2:
        raise ConvertError("Can't find end of meta block")
    meta_start, meta_end = delimiters[0], delimiters[1]

    meta = {}
    for line in lines[meta_start + 1 : meta_end]:
        m = TITLE_RE.match(line)
        if m:
            meta['title'] = m.group('title').strip()
            continue

        m = DATE_RE.match(line)
        if m:
            # DATE_RE only checks the number of digits, so values such as
            # month 13 or hour 25 get this far. Report them as ConvertError,
            # the one exception the batch loop handles, instead of letting a
            # ValueError abort the whole run.
            try:
                t = time.strptime(m.group('time'), '%H:%M:%S')
                meta['date'] = datetime.datetime(
                        int(m.group('year')),
                        int(m.group('month')),
                        int(m.group('day')),
                        t.tm_hour, t.tm_min, t.tm_sec)
            except ValueError as ex:
                raise ConvertError(
                        "Invalid date in metadata: {}".format(ex))
            continue

        m = CAT_RE.match(line)
        if m:
            # strip() first so a trailing carriage return or tab does not end
            # up in the last category; spaces are removed everywhere.
            cats = m.group('cats').strip().replace(' ', '')
            meta['categories'] = cats.split(',')
            continue

    for k in ['title', 'date', 'categories']:
        if k not in meta:
            raise ConvertError("Missing {} in metadata".format(k))

    content = ''.join(lines[meta_end + 1:]).strip()
    return meta, content


def write_output(meta, content, dst):
    """Write a Pelican style .rst post to the file named by dst.

    meta is the Blogofile metadata dictionary; its 'title', 'date' and
    'categories' entries are used. content is the post body. The slug is
    derived from the title with slugify(), and the categories become the
    Pelican tags.

    """
    heading = meta['title']
    fields = {
        'title': heading,
        'date': meta['date'].strftime('%Y-%m-%d %H:%M'),
        'tags': ', '.join(meta['categories']),
        'slug': slugify(heading),
        # The underline is exactly as long as the title.
        'title_underline': len(heading) * '#',
        'content': content,
    }
    # Render the post before opening dst for writing.
    rendered = PELICAN_FMT.format(**fields)

    with open(dst, 'w') as out:
        out.write(rendered)


if __name__ == '__main__':
    # Convert every .rst post found directly in SRC_DIR, writing the result to
    # DST_DIR under the same file name.
    for name in os.listdir(SRC_DIR):
        if not name.endswith('.rst'):
            continue

        src = os.path.join(SRC_DIR, name)
        dst = os.path.join(DST_DIR, name)

        try:
            convert(src, dst)
        except ConvertError as ex:
            # A post that can't be converted is reported and skipped so the
            # remaining posts still get converted. The parenthesized,
            # single-argument print gives the same output under the Python 2
            # print statement and the Python 3 print function.
            print("Error converting {}: {}".format(name, ex))