add thread pools for I/O-intensive sections

include some refactoring to simplify parsing; uses a few features
(notably dataclasses) that require Python 3.7+
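
The thread pool use in copy_media and write_generated_files follows the
plain stdlib pattern; a minimal sketch of the idea (copy_files and
pairs are illustrative names, not from this repo):

    from concurrent.futures import ThreadPoolExecutor
    import shutil

    def copy_files(pairs):
        # one copy job per (src, dst) pair; leaving the `with` block
        # waits for every submitted job to finish
        with ThreadPoolExecutor() as executor:
            for src, dst in pairs:
                executor.submit(shutil.copy, src, dst)
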
M quiescent/bootstrap.py +1 -2
@@ 21,14 21,13 @@ logger = logging.getLogger(__name__)
 def bootstrap():
     config = 'config.ini', """
 [STATIC]
-domain = example.com
+domain = http://example.com
 name = Example Name
 author = Example Author
 output directory = build
 posts directory = posts
 media directory = media
 templates directory = templates
-date format = %Y-%m-%d
 feed link = feed.atom
 """.lstrip()
 

M quiescent/command_line.py +3 -3
@@ 16,7 16,7 @@ 
 import argparse
 import logging
 
-from .static import StaticGenerator
+from .static import StaticGenerator, configure
 from .bootstrap import bootstrap
 
 logger = logging.getLogger(__name__)

@@ 35,8 35,8 @@ def main():
     if args.bootstrap:
         bootstrap()
     else:
-        s = StaticGenerator(config_file=args.config)
-        s.configure()
+        configuration = configure(args.config)
+        s = StaticGenerator(configuration)
         s.process_posts()
         s.write_generated_files()
         s.copy_media()

M quiescent/feed.py +2 -3
@@ 15,7 15,6 @@ 
 
 """
 Atom feed[0] generator
-  - update times are reported in UTC with no offset
 
 [0]: https://tools.ietf.org/html/rfc4287
 """

@@ 54,10 53,10 @@ def _feed_entry(parent_element, post, do
     entry_id = ET.SubElement(entry, 'id')
     entry_id.text = urljoin(domain, post.path)
     updated = ET.SubElement(entry, 'updated')
-    updated.text = post._date.isoformat()
+    updated.text = post.date_time.isoformat()
     content = ET.SubElement(entry, 'content')
     content.attrib['type'] = 'html'
-    content.text = post.body
+    content.text = post.html_body
     return entry
 
 def feed(all_posts, date=None, name=None, domain=None, feed_link=None, feed_author=None):

M quiescent/post.py +55 -103
@@ 13,110 13,62 @@ 
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
+from dataclasses import dataclass
 from datetime import datetime, timezone
-import functools
+from functools import partial
 import urllib.parse
 import os
 import re
 
 from mistune import Markdown
 
-
-@functools.total_ordering
-class Post:
-
-    def __init__(self, relative_dir=''):
-        self.relative_dir = relative_dir
-        self.path = None
-        self.title = None
-        self._date = None
-        self.date = None
-        self.leader = None
-        self.body = None
-        self.markup = None
-        self.markdown = Markdown()
+@dataclass(repr=False)
+class RawPost:
+    title: str
+    date: str
+    leader: str
+    body: str
 
-    def __gt__(self, other):
-        '''used for sorting, reverse chronologically'''
-        return other._date > self._date
-
-    def __eq__(self, other):
-        '''
-        this may be a bit ambiguous, but semantically, it seems like a post is
-        "equal" to another if the text body is the same
-        '''
-        return self.body == other.body
-
-    def __repr__(self):
-        return ('<Post: {title}, {date}>'
-                .format(title=self.title, date=self.date))
+def parse(raw_text: str) -> RawPost:
+    """
+    The first paragraph of a post is a "leader" and is split out to
+    generate an index page. Small idiosyncrasy: Python's `re` module
+    won't split on a zero-width match (e.g. `^$`), so we split on
+    the first two newlines.
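+
+    For example, given a post file like:
+
+        title: An Example
+        date: 2017-01-01
+        +++
+        First paragraph, used as the leader.
+
+        The rest of the post body...
+
+    the metadata keys are `title` and `date`, and the leader is the
+    first paragraph of the body.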
+    """
+    head, body = re.split(r'^\+\+\+$', raw_text, maxsplit=1, flags=re.M)
+    metadata = {k.strip().lower(): v.strip()
+                for k, v in (line.split(':', maxsplit=1)
+                             for line in head.strip().split('\n'))}
+    leader, *_ = body.strip().split('\n\n', maxsplit=1)
+    return RawPost(metadata['title'], metadata['date'], leader, body)
 
-    def parse(self, raw_text):
-        '''
-        Args:
-            raw_text: string contents of a post file
-        '''
-        try:
-            post = Post(relative_dir=self.relative_dir)
-            meta, body = self._split(raw_text)
-            post.title = meta['title']
-            post.slug = slugify(post.title)
-            post.path = os.path.join(self.relative_dir,
-                                     '{slug}.html'.format(slug=post.slug))
-            post._date = self._parse_date(meta['date'])
-            post.date = post._date.strftime('%Y-%m-%d')
-            post.body = self.markdown(body)
-            post.leader = self.markdown(self._parse_leader(body))
-            return post
-        except (ValueError, KeyError, TypeError) as e:
-            raise ValueError('Unable to parse post from:\n{text}'
-                             .format(text=raw_text[:50]))
-
-    @staticmethod
-    def _split(text):
-        '''
-        Take as input text comprising a post file:
-
-            title: some text
-            date: 2015-12-01
-            ++++
-            ... post contents ...
+@dataclass(repr=False)
+class RenderedPost:
+    title: str
+    date_time: datetime
+    date_string: str
+    html_leader: str
+    html_body: str
+    path: str
 
-        and return a tuple of a dictionary of the top "metadata" kv-pairs and
-        a string of the rest of the file
-        '''
-        frontmatter, body = re.split(r'^\+\+\+$', text, maxsplit=1, flags=re.M)
-        lines = frontmatter.strip().split('\n')
-        line_pairs = (line.split(':', maxsplit=1) for line in lines)
-        meta = {key.strip().lower(): value.strip()
-                for key, value in line_pairs}
-        return meta, body
-
-    @staticmethod
-    def _parse_date(text, date_spec='%Y-%m-%d'):
-        return (datetime
-                .strptime(text, date_spec)
-                .replace(tzinfo=timezone.utc))
-
-    @staticmethod
-    def _parse_leader(post_body):
-        '''
-        I refer to the first paragraph of a post as a "leader" and like to extract
-        it out automatically to generate an index page. Small idiosyncracy,
-        Python's `re` module won't split on a zero-width match (e.g. `^$`) so
-        we're splitting on the first two newlines ¯\_(ツ)_/¯
-        Args:
-            post_body: string, a post, stripped of frontmatter
-        '''
-        first_paragraph, *_ = (post_body
-                               .strip() # excess whitespace
-                               .split('\n\n', maxsplit=1))
-        return first_paragraph
-
+def process(post: RawPost) -> RenderedPost:
+    md = Markdown()
+    # posts carry a date but no time zone, so assume UTC; the atom
+    # feed needs an offset-aware timestamp (RFC 3339), which a naive
+    # datetime's isoformat() doesn't produce. Not great, but valid
+    date = datetime.strptime(post.date, '%Y-%m-%d').replace(tzinfo=timezone.utc)
+    path = os.path.join(date.strftime("%Y"),
+                        '{slug}.html'.format(slug=slugify(post.title)))
+    return RenderedPost(post.title,
+                        date,
+                        post.date,
+                        md(post.leader),
+                        md(post.body),
+                        path)
 
 def slugify(text):
     '''
-    Build hyphenated post slugs from "unsafe" text. RFC3986 requires percent
+    Build hyphenated post slugs from raw text. RFC3986 requires percent
     encoding for UCS/unicode points.
 
     >>> slugify("Wow, 2015 has \"Come and Gone\" already! It's amazing.")

@@ 125,15 77,15 @@ def slugify(text):
     >>> slugify("λ is a lambda")
     '%CE%BB-is-a-lambda'
     '''
-    QUOTES = re.compile(r'[\"\']')
-    MULTIPLE_DASH = re.compile(r'-+')
-    NOT_CHAR = re.compile(r'[\W]')
-    # my kingdom for a pipe operator or a threading macro...
-    _string = QUOTES.sub('', text)
-    _string = _string.lower()
-    _string = NOT_CHAR.sub('-', _string)
-    _string = MULTIPLE_DASH.sub('-', _string)
-    _string = urllib.parse.quote(_string, safe='-')
-    output_string = _string.strip('-')
-
-    return output_string
+    remove_quotes = partial(re.sub, r'[\"\']', '')
+    replace_nonword = partial(re.sub, r'[\W]', '-')
+    collapse_hyphens = partial(re.sub, r'-+', '-')
+    remove_trailing_hyphens = lambda s: s.strip('-')
+    percent_encode = partial(urllib.parse.quote, safe='-')
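+    # applied innermost-first: lowercase, strip quotes, replace each
+    # non-word character with a hyphen, collapse hyphen runs, trim
+    # the ends, then percent-encode anything outside ASCII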
+    return percent_encode(
+        remove_trailing_hyphens(
+            collapse_hyphens(
+                replace_nonword(
+                    remove_quotes(
+                        text.lower())))))

M quiescent/static.py +82 -68
@@ 15,50 15,72 @@ 
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
+import concurrent.futures
+from dataclasses import dataclass
 from datetime import datetime, timezone
 import configparser
+import functools
 import argparse
 import logging
 import shutil
-import json
 import sys
 import os
 import re
 
-from .post import Post
+from .post import parse, process
 from .feed import feed
 from .templite import Templite
 
+INDEX_TEMPLATE   = 'index.html'
+ARCHIVE_TEMPLATE = 'archive.html'
+POST_TEMPLATE    = 'post.html'
+
 logger = logging.getLogger(__name__)
 
+@dataclass
+class Configuration:
+    output_dir: str
+    posts_dir: str
+    media_dir: str
+    template_dir: str
+    author: str
+    domain: str
+    feed_name: str
+    feed_link: str
+
+def configure(ini_file):
+    try:
+        config_parser = configparser.ConfigParser(interpolation=None)
+        config_parser.read(ini_file)
+        config = config_parser['STATIC']
+        return Configuration(config['output directory'],
+                             config['posts directory'],
+                             config['media directory'],
+                             config['templates directory'],
+                             config['author'],
+                             config['domain'],
+                             config['name'],
+                             config['feed link'])
+    except Exception as e:
+        logger.error("An error occurred in initial configuration, do "
+                     "you have the necessary configuration file and "
+                     "templates?\n\tTry using the --bootstrap command")
+        sys.exit(1)
 
 class StaticGenerator:
-    def __init__(self, config_file=None):
-        self.config_file = config_file
-        self.config = None
+    def __init__(self, configuration: Configuration):
+        self.config = configuration
         self.all_posts = []
-        self.index_template = 'index.html'
-        self.archive_template = 'archive.html'
-        self.post_template = 'post.html'
+        self.post_template = self.load_template(POST_TEMPLATE)
+        self.index_template = self.load_template(INDEX_TEMPLATE)
+        self.archive_template = self.load_template(ARCHIVE_TEMPLATE)
 
-    def configure(self):
-        try:
-            config = configparser.ConfigParser(interpolation=None)
-            config.read(self.config_file)
-            self.config = config['STATIC']
-            self.output_dir = self.config['output directory']
-            self.posts_dir = self.config['posts directory']
-            self.media_dir = self.config['media directory']
-            self.author = self.config['author']
-            self.domain = self.config['domain']
-            self.feed_name = self.config['name']
-            self.feed_link = self.config['feed link']
-            self.template_dir = self.config['templates directory']
-        except Exception as e:
-            logger.error("An error occurred in initial configuration, do "
-                         "you have the necessary configuration file and "
-                         "templates?\n\tTry using the --boostrap command")
-            sys.exit(1)
+    def load_template(self, template_name):
+        template_file = os.path.join(self.config.template_dir, template_name)
+        with open(template_file, encoding='utf-8') as f:
+            template_text = f.read()
+        template = Templite(template_text)
+        return template
 
     def collect_posts(self, from_dir):
         '''

@@ 73,65 95,57 @@ class StaticGenerator:
         return post_files
 
     def find_media_directories(self, directory, media_directory):
-        directory_paths = []
-        for root, directories, _ in os.walk(directory):
-            for dir in directories:
-                if dir == media_directory:
-                    directory_paths.append(os.path.join(root, dir))
-        return directory_paths
+        return [os.path.join(root, name)
+                for root, directories, _ in os.walk(directory)
+                for name in directories
+                if name == media_directory]
 
     def copy_media(self):
-        # There may be some potential for optimization here, everything is
-        # copied every time, which has the nice effect of grabbing updated
-        # files with the same name
-        media_dirs = self.find_media_directories(self.posts_dir, self.media_dir)
-        for each_dir in media_dirs:
-            relative_dest_dir = os.path.relpath(each_dir, self.posts_dir)
-            out_path = os.path.join(self.output_dir, relative_dest_dir)
+        def copies_required(directory):
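+            # yield (source file, destination directory) pairs,
+            # creating the destination directory as a side effect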
+            relative_dest_dir = os.path.relpath(directory, self.config.posts_dir)
+            out_path = os.path.join(self.config.output_dir, relative_dest_dir)
             os.makedirs(out_path, exist_ok=True)
-            for filename in os.listdir(each_dir):
-                shutil.copy(os.path.join(each_dir, filename), out_path)
+            return ((os.path.join(directory, filename), out_path)
+                    for filename in os.listdir(directory))
+
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            media_dirs = self.find_media_directories(self.config.posts_dir,
+                                                     self.config.media_dir)
+            for directory in media_dirs:
+                for paths in copies_required(directory):
+                    executor.submit(shutil.copy, *paths)
 
     def process_posts(self):
-        for directory, filename in self.collect_posts(self.posts_dir):
+        for directory, filename in self.collect_posts(self.config.posts_dir):
             file_path = os.path.join(directory, filename)
             with open(file_path, encoding='utf-8') as f:
                 text = f.read()
             try:
-                relative_dir = os.path.relpath(directory, self.posts_dir)
-                post = Post(relative_dir=relative_dir).parse(text)
+                post = process(parse(text))
                 self.all_posts.append(post)
-            except ValueError as e:
-                logger.warning('Failed to create post: {post}\n\t{e}'
-                               .format(post=post, e=e))
+            except Exception as e:
+                logger.warning('Failed to create post from {path}\n\t{e}'
+                               .format(path=file_path, e=e))
-        self.all_posts = sorted(self.all_posts)
-
-    def render_page(self, template_name, **kwargs):
-        template_file = os.path.join(self.template_dir, template_name)
-        with open(template_file, encoding='utf-8') as f:
-            template_text = f.read()
-        template = Templite(template_text)
-        return template.render(kwargs)
+        self.all_posts = sorted(self.all_posts, key=lambda p: p.date_time, reverse=True)
 
     def write_generated_files(self):
-        for post in self.all_posts:
-            post_page = self.render_page(self.post_template, post=post)
-            output_tree = os.path.join(self.output_dir, post.relative_dir)
+        def write_post(post):
+            post_page = self.post_template.render({'post': post})
+            output_tree = os.path.dirname(os.path.join(self.config.output_dir, post.path))
             # reconstitute the input tree in the output directory
             os.makedirs(output_tree, exist_ok=True)
-            output_path = os.path.join(self.output_dir, post.path)
+            output_path = os.path.join(self.config.output_dir, post.path)
             with open(output_path, 'w', encoding='utf-8') as f:
                 f.write(post_page)
 
-        index_path = os.path.join(self.output_dir, self.index_template)
-        index = self.render_page(self.index_template,
-                                 front_posts=self.all_posts[:10])
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            for post in self.all_posts:
+                executor.submit(write_post, post)
+
+        index_path = os.path.join(self.config.output_dir, INDEX_TEMPLATE)
+        index = self.index_template.render({'front_posts': self.all_posts[:10]})
         with open(index_path, 'w', encoding='utf-8') as f:
             f.write(index)
 
-        archive_path = os.path.join(self.output_dir, self.archive_template)
-        archive = self.render_page(self.archive_template,
-                                   all_posts=self.all_posts)
+        archive_path = os.path.join(self.config.output_dir, ARCHIVE_TEMPLATE)
+        archive = self.archive_template.render({'all_posts': self.all_posts})
         with open(archive_path, 'w', encoding='utf-8') as f:
             f.write(archive)
 

@@ 141,10 155,10 @@ class StaticGenerator:
         recent_posts = self.all_posts[:post_limit]
         feed_string = feed(recent_posts,
                            date=datetime.now(timezone.utc),
-                           name=self.feed_name,
-                           domain=self.domain,
-                           feed_link=self.feed_link,
-                           feed_author=self.author)
-        output_path = os.path.join(self.output_dir, self.feed_link)
+                           name=self.config.feed_name,
+                           domain=self.config.domain,
+                           feed_link=self.config.feed_link,
+                           feed_author=self.config.author)
+        output_path = os.path.join(self.config.output_dir, self.config.feed_link)
         with open(output_path, 'wb') as f:
             f.write(feed_string.encode())

M quiescent/tests/test_feed.py +21 -20
@@ 14,10 14,10 @@ 
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 import unittest
-from datetime import datetime
+from datetime import datetime, timezone
 
 from quiescent.feed import feed
-from quiescent.post import Post
+from quiescent.post import RenderedPost
 
 
 class FeedTests(unittest.TestCase):

@@ 29,42 29,43 @@ class FeedTests(unittest.TestCase):
         '''
         self.maxDiff = None # for long diffs
         f = feed([], # no posts
-                 date=datetime.strptime('12-2017-11', '%m-%Y-%d'),
+                 date=datetime.strptime('12-2017-11', '%m-%Y-%d').replace(tzinfo=timezone.utc),
                  name='test configuration',
-                 domain='example.com',
+                 domain='http://example.com',
                  feed_link='feed.xml',
                  feed_author='unit tester')
         expected_string = (
             '<feed xmlns="http://www.w3.org/2005/Atom">'
             '<title>test configuration</title>'
-            '<link href="example.com" /><link href="example.com" rel="self" />'
-            '<updated>2017-12-11T00:00:00</updated>'
+            '<link href="http://example.com" /><link href="http://example.com" rel="self" />'
+            '<updated>2017-12-11T00:00:00+00:00</updated>'
             '<author><name>unit tester</name></author>'
-            '<id>example.com</id>'
+            '<id>http://example.com</id>'
             '</feed>')
         self.assertEqual(f, expected_string)
 
     def test_feed_with_post_is_valid(self):
         self.maxDiff = None # for long diffs
 
-        p = Post()
-        p.title = 'First Post'
-        p._date = datetime.strptime('12-2017-01', '%m-%Y-%d')
-        p.body = '<h1>not much here</h1>'
-
+        p = RenderedPost('First Post',
+                         datetime.strptime('12-2017-01', '%m-%Y-%d').replace(tzinfo=timezone.utc),
+                         '12-2017-01',
+                         '<h1>not much here</h1>',
+                         '<h1>not much here</h1>',
+                         '2019/a-path.html')
         f = feed([p],
-                 date=datetime.strptime('12-2017-11', '%m-%Y-%d'),
+                 date=datetime.strptime('12-2017-11', '%m-%Y-%d').replace(tzinfo=timezone.utc),
                  name='testing is important',
-                 domain='example.com',
+                 domain='http://example.com',
                  feed_link='feed.xml',
                  feed_author='fizz buzz')
         expected_string = ('<feed xmlns="http://www.w3.org/2005/Atom">'
                            '<title>testing is important</title>'
-                           '<link href="example.com" /><link href="example.com" rel="self" />'
-                           '<updated>2017-12-11T00:00:00</updated>'
+                           '<link href="http://example.com" /><link href="http://example.com" rel="self" />'
+                           '<updated>2017-12-11T00:00:00+00:00</updated>'
                            '<author><name>fizz buzz</name></author>'
-                           '<id>example.com</id>'
-                           '<entry><title>First Post</title><link href="example.com" />'
-                           '<id>example.com</id><updated>2017-12-01T00:00:00</updated>'
-                           '<content type="html">&lt;h1&gt;not much here&lt;/h1&gt;</content></entry></feed>')
+                           '<id>http://example.com</id>'
+                           '<entry>'
+                           '<title>First Post</title>'
+                           '<link href="http://example.com/2019/a-path.html" />'
+                           '<id>http://example.com/2019/a-path.html</id>'
+                           '<updated>2017-12-01T00:00:00+00:00</updated>'
+                           '<content type="html">&lt;h1&gt;not much here&lt;/h1&gt;</content>'
+                           '</entry>'
+                           '</feed>')
         self.assertEqual(f, expected_string)

M quiescent/tests/test_post.py +18 -43
@@ 14,59 14,34 @@ 
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 import unittest
-import datetime
-
-from quiescent.post import Post, slugify
+from quiescent.post import slugify, parse, process
 
 
 class PostsTests(unittest.TestCase):
-
     def test_front_matter_parsing(self):
-        leading_space = ('\n  title       :    test\ndate: 2017-01-01\n+++\n', 'test')
-        mixed_case = ('\nTitle: test\nDate: 2017-01-01\n+++\n', 'test')
-        correct = ('\ntitle: test\ndate: 2017-01-01\n+++\n', 'test')
-        non_greedy = ('\ntitle:: test\ndate: 2017-01-01\n+++\n', ': test')
+        leading_space = ('\n  title       :    test\ndate: 2017-01-01\n+++\nasdf\n\nasdf', 'test')
+        mixed_case = ('\nTitle: test\nDate: 2017-01-01\n+++\nasdf\n\nasdf', 'test')
+        correct = ('\ntitle: test\ndate: 2017-01-01\n+++\nasdf\n\nasdf', 'test')
+        non_greedy = ('\ntitle:: test\ndate: 2017-01-01\n+++\nasdf\n\nasdf', ': test')
         for case, result in (leading_space, mixed_case, correct, non_greedy):
             with self.subTest():
-                frontmatter, _ = Post._split(case)
-                self.assertEqual(frontmatter['title'], result)
-
-    def test_front_matter_parsing_negative(self):
-        too_many = '\ntitle: test\ndate: 2017-01-01\n++++\n'
-        too_few = '\ntitle: test\ndate: 2017-01-01\n++\n'
-        post = Post()
-        for i in (too_many, too_few):
-            with self.subTest():
-                with self.assertRaises(ValueError):
-                    post.parse(i)
-
-    def test_date_parsing(self):
-        raw_text = '\ntitle: test\ndate: 2017-01-02\n+++\n'
-        meta, _ = Post._split(raw_text)
-        date = Post._parse_date(meta['date'])
-        self.assertTrue(all([date.day == 2,
-                             date.month == 1,
-                             date.year == 2017]))
+                post = parse(case)
+                self.assertEqual(post.title, result)
 
     def test_leader_parsing(self):
-        leader = Post._parse_leader('\nfoo bar baz\n\nfoo bar baz\n')
-        self.assertEqual(leader, 'foo bar baz')
+        post = parse('\ntitle: test\ndate: 2017-01-01\n+++\nsample text\n\nbody text')
+        self.assertEqual(post.leader, 'sample text')
 
-    def test_post_parsing_leader(self):
-        _, body = Post._split('\ntitle: test\ndate: 2017-01-01\n+++\nfoo \nfoo \n\nthe rest\n')
-        leader = Post._parse_leader(body)
-        self.assertEqual(leader, 'foo \nfoo ')
+    def test_leader_parsing_with_newlines(self):
+        post = parse('\ntitle: test\ndate: 2017-01-01\n+++\nsample text\nand some other text\n\nbody text')
+        self.assertEqual(post.leader, 'sample text\nand some other text')
 
-    def test_leader_parsing_single_paragraph(self):
-        leader = Post._parse_leader('\nfoo bar baz\n')
-        self.assertEqual(leader, 'foo bar baz')
-
-    def test_post_sort_by_date(self):
-        earlier = Post().parse('\ntitle: test\ndate: 2016-01-01\n+++\nfoo\n')
-        later = Post().parse('\ntitle: test\ndate: 2017-01-01\n+++\nbar\n')
-        latest = Post().parse('\ntitle: test\ndate: 2017-01-02\n+++\nbaz\n')
-        self.assertEqual(sorted([earlier, latest, later]),
-                                [latest, later, earlier])
+    def test_post_sorting(self):
+        earlier = process(parse('\ntitle: test\ndate: 2016-01-01\n+++\nasdf\n\nasdf'))
+        later = process(parse('\ntitle: test\ndate: 2017-01-01\n+++\nasdf\n\nasdf'))
+        latest = process(parse('\ntitle: test\ndate: 2017-01-03\n+++\nasdf\n\nasdf'))
+        self.assertEqual(sorted([earlier, latest, later], key=lambda p: p.date_time),
+                                [earlier, later, latest])
 
 
 class SlugifyTests(unittest.TestCase):