gensite.py (16247B) - raw


      1 #!/usr/bin/env python3
      2 # gensite.py: Static site generator based on makesite.py.
      3 # Copyright (C) 2020-2021 Oscar Benedito <oscar@oscarbenedito.com>
      4 #
      5 # This program is free software: you can redistribute it and/or modify
      6 # it under the terms of the GNU Affero General Public License as published by
      7 # the Free Software Foundation, either version 3 of the License, or
      8 # (at your option) any later version.
      9 #
     10 # This program is distributed in the hope that it will be useful,
     11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
     12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     13 # GNU Affero General Public License for more details.
     14 #
     15 # You should have received a copy of the GNU Affero General Public License
     16 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
     17 #
     18 # This file incorporates work covered by the following copyright and
     19 # permission notice:
     20 #
     21 #     Copyright (c) 2018 Sunaina Pai
     22 #
     23 #     Permission is hereby granted, free of charge, to any person obtaining
     24 #     a copy of this software and associated documentation files (the
     25 #     "Software"), to deal in the Software without restriction, including
     26 #     without limitation the rights to use, copy, modify, merge, publish,
     27 #     distribute, sublicense, and/or sell copies of the Software, and to
     28 #     permit persons to whom the Software is furnished to do so, subject to
     29 #     the following conditions:
     30 #
     31 #     The above copyright notice and this permission notice shall be
     32 #     included in all copies or substantial portions of the Software.
     33 #
     34 #     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
     35 #     EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     36 #     MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
     37 #     IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
     38 #     CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
     39 #     TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
     40 #     SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     41 
     42 
     43 """Static site generator based on makesite.py."""
     44 
     45 
     46 import os
     47 import shutil
     48 import re
     49 import glob
     50 import sys
     51 import datetime
     52 import hashlib
     53 import markdown
     54 
     55 
     56 def fread(filename):
     57     """Read file and close the file."""
     58     with open(filename, 'r') as f:
     59         return f.read()
     60 
     61 
     62 def fwrite(filename, text):
     63     """Write content to file and close the file."""
     64     filename = filename + 'index.html' if filename.endswith('/') or filename == '' else filename
     65     filename = os.path.join('_site', filename)
     66     if os.path.exists(filename):
     67         log('W', 'Overwritting file: {}', filename)
     68 
     69     basedir = os.path.dirname(filename)
     70     if not os.path.isdir(basedir):
     71         os.makedirs(basedir)
     72 
     73     with open(filename, 'w') as f:
     74         f.write(text)
     75 
     76 
     77 def log(type, msg, *args):
     78     """Log message with specified arguments."""
     79     if type == 'E':
     80         sys.stderr.write('Error: ' + msg.format(*args) + '\n')
     81         sys.exit(1)
     82     if type == 'W':
     83         sys.stderr.write('Warning: ' + msg.format(*args) + '\n')
     84     # if type == 'I':
     85     #     sys.stderr.write('Info: ' + msg.format(*args) + '\n')
     86 
     87 
     88 def urlize(name):
     89     """Convert string tu URL."""
     90     return name.lower().replace(' ', '-')
     91 
     92 
     93 def add_to_sitemap(path, lastmod=None, freq=None, priority=None):
     94     """Add URL to sitemap."""
     95     global sitemap
     96     path = '<loc>https://oscarbenedito.com/' + path + '</loc>'
     97     if lastmod == '1970-01-01T00:00:00Z':
     98         lastmod = None
     99     lastmod = '<lastmod>' + lastmod + '</lastmod>' if lastmod else ''
    100     freq = '<changefreq>' + freq + '</changefreq>' if freq else ''
    101     priority = '<priority>' + priority + '</priority>' if priority else ''
    102     sitemap += '<url>' + path + lastmod + freq + priority + '</url>'
    103 
    104 
    105 def set_redirect(src, dst):
    106     """Create HTML redirect."""
    107     fwrite(src, '<!DOCTYPE html><html><head><meta charset="utf-8">'
    108                 '<meta http-equiv="refresh" content="0; url=/{}"/>'
    109                 '<link rel="canonical" href="/{}"/><meta name="robots" content="noindex">'
    110                 '</head><body><p>This page has been moved to '
    111                 '<a href="/{}">https://oscarbenedito.com/{}</a>.</p>'
    112                 '</body></html>'.format(dst, dst, dst, dst))
    113     log('I', 'redirect /{} => /{}', src, dst)
    114     # uncomment next line to print apache redirects
    115     # sys.stdout.write('Redirect permanent "/{}" "/{}"\n'.format(src, dst))
    116 
    117 
    118 def read_headers(text):
    119     """Parse headers in text and yield (key, value, end-index) tuples."""
    120     for match in re.finditer(r'\s*<!--\s*(.+?)\s*:\s*(.+?)\s*-->\s*|.+', text):
    121         if not match.group(1):
    122             break
    123         yield match.group(1), match.group(2), match.end()
    124 
    125 
    126 def prettify_date(date_str):
    127     """Convert ISO 8601 date string to human friendly date string."""
    128     d = datetime.datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%SZ')
    129     return d.strftime('%B %-d, %Y')
    130 
    131 
    132 def render(template, pre=False, **params):
    133     """Replace placeholders in template with values from params."""
    134     if not pre:
    135         template = re.sub(r'{{\s*_if\s+([^}\s]+)\s*}}(.*?){{\s*_fi\s*}}',
    136                           lambda m: m.group(2) if m.group(1) in params else '',
    137                           template, flags=re.DOTALL)
    138     return re.sub(r'{{\s*([^}\s]+)\s*}}',
    139                   lambda m: str(params.get(m.group(1), m.group(0))),
    140                   template)
    141 
    142 
    143 def read_content(filename):
    144     """Read content and metadata from file into a dictionary."""
    145     # read file content
    146     text = fread(filename)
    147 
    148     # read metadata and save it in a dictionary
    149     date_slug = os.path.basename(filename).split('.')[0]
    150     match = re.search(r'^(?:(\d\d\d\d-\d\d-\d\d)-)?(.+)$', date_slug)
    151     content = {
    152         'date': (match.group(1) or '1970-01-01') + 'T00:00:00Z',
    153         'slug': match.group(2)
    154     }
    155 
    156     # read headers
    157     end = 0
    158     for key, val, end in read_headers(text):
    159         content[key] = val
    160 
    161     if 'lastmod' in content:
    162         content['modified'] = '1'
    163     else:
    164         content['lastmod'] = content['date']
    165 
    166     # separate content from headers
    167     text = text[end:]
    168 
    169     # convert Markdown content to HTML
    170     if filename.endswith('.md'):
    171         text = markdown.markdown(text, extensions=['footnotes', 'fenced_code'])
    172 
    173     content.update({
    174         'content': text,
    175         'year': content['date'][:4],
    176         'month': content['date'][5:7],
    177         'day': content['date'][8:10],
    178         'date_nice': prettify_date(content['date']),
    179         'lastmod_nice': prettify_date(content['lastmod'])
    180     })
    181 
    182     if 'categories' in content:
    183         # convert the categories string to array of categories
    184         categories = [c.strip() for c in content['categories'].split(',')]
    185         categories_html = ', '.join(['<a class="p-category" href="/blog/categories/' + urlize(c) + '/">' + c + '</a>' for c in categories])
    186         content.update({
    187             'categories': categories,
    188             'categories_html': categories_html
    189         })
    190 
    191     return content
    192 
    193 
    194 def make_pages(src, dst, layout, blog=False, **params):
    195     """Generate pages from page content."""
    196     items = []
    197     categories = {}
    198 
    199     for src_path in glob.glob(src):
    200         content = read_content(src_path)
    201 
    202         page_params = dict(params, **content)
    203 
    204         # populate placeholders in content if content-rendering is enabled
    205         if page_params.get('render') == 'yes':
    206             rendered_content = render(page_params['content'], **page_params)
    207             page_params['content'] = rendered_content
    208 
    209         if 'url' not in page_params:
    210             page_params['url'] = render(dst, **page_params)
    211         else:   # can be deleted, just to warn since I have never used it
    212             log('W', 'parameter \'url\' set in {}', src_path)
    213 
    214         if blog:
    215             page_params['src_path'] = src_path
    216             items.append(page_params)
    217         else:
    218             fwrite(page_params['url'], render(layout, **page_params))
    219             pri = page_params['priority'] if 'priority' in page_params else None
    220             add_to_sitemap(page_params['url'], lastmod=page_params['lastmod'], priority=pri)
    221             log('I', 'page {} => /{}', src_path, page_params['url'])
    222 
    223     # the following is only executed if blog == True, otherwise items is empty
    224     items.sort(key=lambda x: x['date'], reverse=True)
    225     for i, item in enumerate(items):
    226         if i != 0:
    227             item['next_url'] = items[i-1]['url']
    228             item['next_title'] = items[i-1]['title']
    229             item['multiple_pages'] = '1'
    230         if i < len(items)-1:
    231             item['prev_url'] = items[i+1]['url']
    232             item['prev_title'] = items[i+1]['title']
    233             item['multiple_pages'] = '1'
    234 
    235         for category in item['categories']:
    236             if category not in categories:
    237                 categories[category] = [item]
    238             else:
    239                 categories[category].append(item)
    240 
    241         fwrite(item['url'], render(layout, **item))
    242         pri = item['priority'] if 'priority' in item else None
    243         add_to_sitemap(item['url'], lastmod=item['lastmod'], priority=pri)
    244         log('I', 'post {} => /{}', item['src_path'], item['url'])
    245 
    246     return items, categories
    247 
    248 
    249 def make_lists(posts, dst, l_html, l_html_item, l_feed, l_feed_item, **params):
    250     """Generate HTML lists and Atom feed for a set of posts."""
    251     if os.path.isfile('content/' + dst + '_index.md'):
    252         text = fread('content/' + dst + '_index.md')
    253     else:
    254         text = fread('content/' + dst[:-1] + '.md')
    255     end = 0
    256 
    257     for key, val, end in read_headers(text):
    258         params[key] = val
    259 
    260     params['intro'] = markdown.markdown(text[end:], extensions=['footnotes', 'fenced_code'])
    261 
    262     # make HTML lists
    263     ipp = 5     # items per page
    264     params['content'] = ''
    265     title = params['title']
    266     if dst != 'blog/':  # blog feed appears on all pages already
    267         params['extraheader'] = '<link rel="alternate" type="application/atom+xml" ' \
    268                                 'title="{}" href="/{}index.xml"/>'.format(params['feed_title'], dst)
    269 
    270     for i, post in enumerate(posts):
    271         item_params = dict(params, **post)
    272 
    273         # remove tags and truncate at 50 words
    274         item_params['summary'] = ' '.join(re.sub('(?s)<.*?>', '', post['content']).split()[:50]) + '...'
    275 
    276         params['content'] += render(l_html_item, **item_params)
    277 
    278         if i % ipp == ipp-1 or i == len(posts)-1:
    279             page = i//ipp + 1
    280             curr_dst = dst + ('page/{}/'.format(page) if i >= ipp else '')
    281 
    282             if i != len(posts)-1:
    283                 params['multiple_pages'] = '1'
    284                 params['next_url'] = '{}page/{}/'.format(dst, page + 1)
    285             elif page > 1:
    286                 params.pop('next_url')
    287 
    288             if page != 1:
    289                 params['title'] = '{} (page {} of {})'.format(title, page, ((len(posts)-1)//ipp) + 1)
    290 
    291             fwrite(curr_dst, render(l_html, **params))
    292             log('I', 'list => /{}', curr_dst)
    293 
    294             params['prev_url'] = curr_dst
    295             params['content'] = ''
    296 
    297     set_redirect(dst + 'page/1/', dst)
    298 
    299     # make Atom feed
    300     ipp = 15    # item per feed
    301     params['url'] = dst
    302     page_dst = dst + 'index.xml'
    303     params['content'] = ''
    304     for i, post in enumerate(posts):
    305         if (i == ipp):
    306             break
    307         item_params = dict(params, **post)
    308 
    309         # escape HTML content
    310         item_params['c_escaped'] = post['content'].replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;')
    311 
    312         params['content'] += render(l_feed_item, **item_params)
    313 
    314     params['updated'] = posts[0]['lastmod']
    315     fwrite(page_dst, render(l_feed, **params))
    316     log('I', 'feed => /{}', page_dst)
    317 
    318 
    319 def make_archive(posts, categories, dst, layout, **params):
    320     year = 0
    321     params['content'] = '<h2>Posts ({})</h2>\n'.format(len(posts))
    322     for post in posts:
    323         if post['year'] != year:
    324             params['content'] += '</ul>\n' if year != 0 else ''
    325             params['content'] += '<h3>{}</h3>\n<ul>\n'.format(post['year'])
    326             year = post['year']
    327         params['content'] += '<li><a href="/{}">{}</a> ({})</li>\n' \
    328                              ''.format(post['url'], post['title'], post['date_nice'][:-6])
    329     params['content'] += '</ul>\n'
    330 
    331     params['content'] += '<h2>Categories ({})</h2>\n<ul>\n'.format(len(categories))
    332     for key in sorted(categories):
    333         val = categories[key]
    334         params['content'] += '<li><a href="/{}categories/{}/">{}</a> ({} {})</li>\n' \
    335                              ''.format(dst, urlize(key), key, len(val), 'entry' if len(val) == 1 else 'entries')
    336     params['content'] += '</ul>\n'
    337 
    338     page_dst = dst + 'archive/'
    339     fwrite(page_dst, render(layout, **params))
    340     add_to_sitemap(page_dst, lastmod=posts[0]['lastmod'], priority='0.4')
    341     log('I', 'page => /{}', page_dst)
    342 
    343 
    344 def main():
    345     # create a new _site directory from scratch
    346     if os.path.isdir('_site'):
    347         shutil.rmtree('_site')
    348     shutil.copytree('static', '_site')
    349 
    350     # initialize parameters
    351     params = {}
    352 
    353     # initialize sitemap
    354     global sitemap
    355     sitemap = '<?xml version="1.0" encoding="UTF-8"?>\n' \
    356               '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
    357 
    358     # copy assets adding part of their sha256 value to the filename
    359     for path, _, files in os.walk('assets'):
    360         for name in files:
    361             file = os.path.join(path, name)
    362             rfile = os.path.relpath(file, 'assets')
    363             with open(file, 'r') as c:
    364                 content = c.read()
    365 
    366             # minify css
    367             if os.path.splitext(file)[1] == '.css':
    368                 content = re.sub('\s*/\*(?:.|\n)*?\*/\s*', '', content)
    369                 content = re.sub('\s+', ' ', content)
    370                 content = re.sub('\s*({|}|;|,)\s*', r'\1', content)
    371                 content = re.sub(':\s*', ':', content)
    372                 rfile = '{0}.min{1}'.format(*os.path.splitext(rfile))
    373 
    374             h = hashlib.sha256()
    375             h.update(content.encode('utf-8'))
    376             name, ext = os.path.splitext(rfile)
    377             dst = '{n}.{h}{e}'.format(n=name, h=h.hexdigest()[:8], e=ext)
    378 
    379             params['_asset_' + rfile] = dst
    380             basedir = os.path.dirname(os.path.join('_site', dst))
    381             if not os.path.isdir(basedir):
    382                 os.makedirs(basedir)
    383             with open(os.path.join('_site', dst), 'w') as c:
    384                 c.write(content)
    385 
    386     # load layouts
    387     l_base = fread('layouts/base.html')
    388     l_page = render(l_base, pre=True, content=fread('layouts/page.html'))
    389     l_post = render(l_base, pre=True, content=fread('layouts/post.html'))
    390     l_list = render(l_base, pre=True, content=fread('layouts/list.html'))
    391     l_feed = fread('layouts/feed.xml')
    392     item_html = fread('layouts/item.html')
    393     item_xml = fread('layouts/item.xml')
    394 
    395     # create site pages
    396     make_pages('content/_index.md', '', l_page, **params)
    397     make_pages('content/[!_]*.*', '{{ slug }}/', l_page, **params)
    398     make_pages('content/projects/[!_]*.*', 'projects/{{ slug }}/', l_page, **params)
    399     fwrite('404.html', render(fread('layouts/404.html'), **params))
    400 
    401     # create blog post pages
    402     all_posts, categories = make_pages('content/blog/[!_]*.*',
    403                                        'blog/{{ year }}/{{ month }}/{{ slug }}/',
    404                                        l_post, blog=True, **params)
    405 
    406     # create HTML list pages and Atom feed
    407     make_lists(all_posts, 'blog/', l_list, item_html, l_feed, item_xml, **params)
    408 
    409     add_to_sitemap('blog/', lastmod=all_posts[0]['lastmod'], priority='1.0')
    410 
    411     # create blog archive
    412     make_archive(all_posts, categories, 'blog/', l_page, title='Blog archive', **params)
    413 
    414     # create blog categories
    415     for name, c_posts in categories.items():
    416         dst = 'blog/categories/' + urlize(name) + '/'
    417         make_lists(c_posts, dst, l_list, item_html, l_feed, item_xml, **params)
    418 
    419     # set redirections
    420     set_redirect('licenses/agpl-v3/', 'licenses/agpl-3.0.txt')
    421     set_redirect('licenses/gpl-v3/', 'licenses/gpl-3.0.txt')
    422     set_redirect('licenses/cc-by-4.0/', 'licenses/cc-by-4.0.txt')
    423     set_redirect('composer/', 'projects/composer/composer.html')
    424 
    425     fwrite('sitemap.xml', sitemap + '</urlset>')
    426 
    427 
# build the site only when the file is run as a script, not when imported
if __name__ == '__main__':
    main()