gensite.py (16034B) - raw


      1 #!/usr/bin/env python3
      2 # gensite.py: Static site generator based on makesite.py.
      3 # Copyright (C) 2020-2021 Oscar Benedito <oscar@oscarbenedito.com>
      4 #
      5 # This program is free software: you can redistribute it and/or modify
      6 # it under the terms of the GNU Affero General Public License as published by
      7 # the Free Software Foundation, either version 3 of the License, or
      8 # (at your option) any later version.
      9 #
     10 # This program is distributed in the hope that it will be useful,
     11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
     12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     13 # GNU Affero General Public License for more details.
     14 #
     15 # You should have received a copy of the GNU Affero General Public License
     16 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
     17 #
     18 # This file incorporates work covered by the following copyright and
     19 # permission notice:
     20 #
     21 #     Copyright (c) 2018 Sunaina Pai
     22 #
     23 #     Permission is hereby granted, free of charge, to any person obtaining
     24 #     a copy of this software and associated documentation files (the
     25 #     "Software"), to deal in the Software without restriction, including
     26 #     without limitation the rights to use, copy, modify, merge, publish,
     27 #     distribute, sublicense, and/or sell copies of the Software, and to
     28 #     permit persons to whom the Software is furnished to do so, subject to
     29 #     the following conditions:
     30 #
     31 #     The above copyright notice and this permission notice shall be
     32 #     included in all copies or substantial portions of the Software.
     33 #
     34 #     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
     35 #     EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     36 #     MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
     37 #     IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
     38 #     CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
     39 #     TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
     40 #     SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
     41 
     42 
     43 """Static site generator based on makesite.py."""
     44 
     45 
     46 import os
     47 import shutil
     48 import re
     49 import glob
     50 import sys
     51 import datetime
     52 import hashlib
     53 import markdown
     54 
     55 
     56 def fread(filename):
     57     """Read file and close the file."""
     58     with open(filename, 'r') as f:
     59         return f.read()
     60 
     61 
     62 def fwrite(filename, text):
     63     """Write content to file and close the file."""
     64     filename = filename + 'index.html' if filename.endswith('/') or filename == '' else filename
     65     filename = os.path.join('_site', filename)
     66     if os.path.exists(filename):
     67         log('W', 'Overwritting file: {}', filename)
     68 
     69     basedir = os.path.dirname(filename)
     70     if not os.path.isdir(basedir):
     71         os.makedirs(basedir)
     72 
     73     with open(filename, 'w') as f:
     74         f.write(text)
     75 
     76 
     77 def log(type, msg, *args):
     78     """Log message with specified arguments."""
     79     if type == 'E':
     80         sys.stderr.write('Error: ' + msg.format(*args) + '\n')
     81         sys.exit(1)
     82     if type == 'W':
     83         sys.stderr.write('Warning: ' + msg.format(*args) + '\n')
     84     # if type == 'I':
     85     #     sys.stderr.write('Info: ' + msg.format(*args) + '\n')
     86 
     87 
     88 def urlize(name):
     89     """Convert string tu URL."""
     90     return name.lower().replace(' ', '-')
     91 
     92 
     93 def add_to_sitemap(path, lastmod=None, freq=None, priority=None):
     94     """Add URL to sitemap."""
     95     global sitemap
     96     path = '<loc>https://oscarbenedito.com/' + path + '</loc>'
     97     if lastmod == '1970-01-01T00:00:00Z':
     98         lastmod = None
     99     lastmod = '<lastmod>' + lastmod + '</lastmod>' if lastmod else ''
    100     freq = '<changefreq>' + freq + '</changefreq>' if freq else ''
    101     priority = '<priority>' + priority + '</priority>' if priority else ''
    102     sitemap += '<url>' + path + lastmod + freq + priority + '</url>'
    103 
    104 
    105 def set_redirect(src, dst):
    106     """Create HTML redirect."""
    107     fwrite(src, '<!DOCTYPE html><html><head><meta charset="utf-8">'
    108                 '<meta http-equiv="refresh" content="0; url=/{}"/>'
    109                 '<link rel="canonical" href="/{}"/><meta name="robots" content="noindex">'
    110                 '</head><body><p>This page has been moved to '
    111                 '<a href="/{}">https://oscarbenedito.com/{}</a>.</p>'
    112                 '</body></html>'.format(dst, dst, dst, dst))
    113     log('I', 'redirect /{} => /{}', src, dst)
    114 
    115 
    116 def read_headers(text):
    117     """Parse headers in text and yield (key, value, end-index) tuples."""
    118     for match in re.finditer(r'\s*<!--\s*(.+?)\s*:\s*(.+?)\s*-->\s*|.+', text):
    119         if not match.group(1):
    120             break
    121         yield match.group(1), match.group(2), match.end()
    122 
    123 
    124 def prettify_date(date_str):
    125     """Convert ISO 8601 date string to human friendly date string."""
    126     d = datetime.datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%SZ')
    127     return d.strftime('%B %-d, %Y')
    128 
    129 
    130 def render(template, pre=False, **params):
    131     """Replace placeholders in template with values from params."""
    132     if not pre:
    133         template = re.sub(r'{{\s*_if\s+([^}\s]+)\s*}}(.*?){{\s*_fi\s*}}',
    134                           lambda m: m.group(2) if m.group(1) in params else '',
    135                           template, flags=re.DOTALL)
    136     return re.sub(r'{{\s*([^}\s]+)\s*}}',
    137                   lambda m: str(params.get(m.group(1), m.group(0))),
    138                   template)
    139 
    140 
    141 def read_content(filename):
    142     """Read content and metadata from file into a dictionary."""
    143     # read file content
    144     text = fread(filename)
    145 
    146     # read metadata and save it in a dictionary
    147     date_slug = os.path.basename(filename).split('.')[0]
    148     match = re.search(r'^(?:(\d\d\d\d-\d\d-\d\d)-)?(.+)$', date_slug)
    149     content = {
    150         'date': (match.group(1) or '1970-01-01') + 'T00:00:00Z',
    151         'slug': match.group(2)
    152     }
    153 
    154     # read headers
    155     end = 0
    156     for key, val, end in read_headers(text):
    157         content[key] = val
    158 
    159     if 'lastmod' in content:
    160         content['modified'] = '1'
    161     else:
    162         content['lastmod'] = content['date']
    163 
    164     # separate content from headers
    165     text = text[end:]
    166 
    167     # convert Markdown content to HTML
    168     if filename.endswith('.md'):
    169         text = markdown.markdown(text, extensions=['footnotes', 'fenced_code'])
    170 
    171     content.update({
    172         'content': text,
    173         'year': content['date'][:4],
    174         'month': content['date'][5:7],
    175         'day': content['date'][8:10],
    176         'date_nice': prettify_date(content['date']),
    177         'lastmod_nice': prettify_date(content['lastmod'])
    178     })
    179 
    180     if 'categories' in content:
    181         # convert the categories string to array of categories
    182         categories = [c.strip() for c in content['categories'].split(',')]
    183         categories_html = ', '.join(['<a class="p-category" href="/blog/categories/' + urlize(c) + '/">' + c + '</a>' for c in categories])
    184         content.update({
    185             'categories': categories,
    186             'categories_html': categories_html
    187         })
    188 
    189     return content
    190 
    191 
    192 def make_pages(src, dst, layout, blog=False, **params):
    193     """Generate pages from page content."""
    194     items = []
    195     categories = {}
    196 
    197     for src_path in glob.glob(src):
    198         content = read_content(src_path)
    199 
    200         page_params = dict(params, **content)
    201 
    202         # populate placeholders in content if content-rendering is enabled
    203         if page_params.get('render') == 'yes':
    204             rendered_content = render(page_params['content'], **page_params)
    205             page_params['content'] = rendered_content
    206 
    207         if 'url' not in page_params:
    208             page_params['url'] = render(dst, **page_params)
    209         else:   # can be deleted, just to warn since I have never used it
    210             log('W', 'parameter \'url\' set in {}', src_path)
    211 
    212         if blog:
    213             page_params['src_path'] = src_path
    214             items.append(page_params)
    215         else:
    216             fwrite(page_params['url'], render(layout, **page_params))
    217             pri = page_params['priority'] if 'priority' in page_params else None
    218             add_to_sitemap(page_params['url'], lastmod=page_params['lastmod'], priority=pri)
    219             log('I', 'page {} => /{}', src_path, page_params['url'])
    220 
    221     # the following is only executed if blog == True, otherwise items is empty
    222     items.sort(key=lambda x: x['date'], reverse=True)
    223     for i, item in enumerate(items):
    224         if i != 0:
    225             item['next_url'] = items[i-1]['url']
    226             item['next_title'] = items[i-1]['title']
    227             item['multiple_pages'] = '1'
    228         if i < len(items)-1:
    229             item['prev_url'] = items[i+1]['url']
    230             item['prev_title'] = items[i+1]['title']
    231             item['multiple_pages'] = '1'
    232 
    233         for category in item['categories']:
    234             if category not in categories:
    235                 categories[category] = [item]
    236             else:
    237                 categories[category].append(item)
    238 
    239         fwrite(item['url'], render(layout, **item))
    240         pri = item['priority'] if 'priority' in item else None
    241         add_to_sitemap(item['url'], lastmod=item['lastmod'], priority=pri)
    242         log('I', 'post {} => /{}', item['src_path'], item['url'])
    243 
    244     return items, categories
    245 
    246 
def make_lists(posts, dst, l_html, l_html_item, l_feed, l_feed_item, **params):
    """Generate HTML lists and Atom feed for a set of posts.

    posts is the sequence of post dictionaries to list, in the order they
    should appear. dst is the destination prefix of the list (assumed to end
    with '/', since it is sliced and concatenated as a directory). l_html and
    l_html_item are the layouts of a list page and of one entry in it;
    l_feed and l_feed_item are their feed counterparts. params holds extra
    template values and is extended with the headers of the list's own
    content file; a 'title' value is required, and 'feed_title' too when dst
    is not 'blog/'.

    Writes the paginated list pages, a redirect from page/1/ to dst and the
    feed at dst + 'index.xml'. Raises IndexError when posts is empty.
    """
    # the list's own text lives in content/<dst>_index.md or content/<dst-without-slash>.md
    if os.path.isfile('content/' + dst + '_index.md'):
        text = fread('content/' + dst + '_index.md')
    else:
        text = fread('content/' + dst[:-1] + '.md')
    end = 0

    # headers of that file become template parameters; end marks where its body starts
    for key, val, end in read_headers(text):
        params[key] = val

    params['intro'] = markdown.markdown(text[end:], extensions=['footnotes', 'fenced_code'])

    # make HTML lists
    ipp = 5     # items per page
    params['content'] = ''
    title = params['title']     # base title, kept to build the per-page titles
    if dst != 'blog/':  # blog feed appears on all pages already
        params['extraheader'] = '<link rel="alternate" type="application/atom+xml" ' \
                                'title="{}" href="/{}index.xml"/>'.format(params['feed_title'], dst)

    for i, post in enumerate(posts):
        item_params = dict(params, **post)

        # remove tags and truncate at 50 words ('...' is appended even to shorter texts)
        item_params['summary'] = ' '.join(re.sub('(?s)<.*?>', '', post['content']).split()[:50]) + '...'

        params['content'] += render(l_html_item, **item_params)

        # flush a page when it is full or when this was the last post
        if i % ipp == ipp-1 or i == len(posts)-1:
            page = i//ipp + 1
            # the first page is written to dst itself, later ones to dst/page/N/
            curr_dst = dst + ('page/{}/'.format(page) if i >= ipp else '')

            if i != len(posts)-1:
                params['multiple_pages'] = '1'
                params['next_url'] = '{}page/{}/'.format(dst, page + 1)
            elif page > 1:
                # last of several pages: drop the link set while writing the previous page
                params.pop('next_url')

            if page != 1:
                params['title'] = '{} (page {} of {})'.format(title, page, ((len(posts)-1)//ipp) + 1)

            fwrite(curr_dst, render(l_html, **params))
            log('I', 'list => /{}', curr_dst)

            # state carried over to the following page
            params['prev_url'] = curr_dst
            params['content'] = ''

    set_redirect(dst + 'page/1/', dst)

    # make Atom feed
    # NOTE(review): params still holds the values set by the loop above
    # (e.g. a paged 'title' and 'prev_url') -- confirm the feed layout does
    # not use them
    ipp = 15    # item per feed
    params['url'] = dst
    page_dst = dst + 'index.xml'
    params['content'] = ''
    for i, post in enumerate(posts):
        # only the first ipp posts go into the feed
        if (i == ipp):
            break
        item_params = dict(params, **post)

        # escape HTML content
        item_params['c_escaped'] = post['content'].replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;')

        params['content'] += render(l_feed_item, **item_params)

    # the feed's update time is the lastmod of the first post in the list
    params['updated'] = posts[0]['lastmod']
    fwrite(page_dst, render(l_feed, **params))
    log('I', 'feed => /{}', page_dst)
    315 
    316 
    317 def make_archive(posts, categories, dst, layout, **params):
    318     year = 0
    319     params['content'] = '<h2>Posts ({})</h2>\n'.format(len(posts))
    320     for post in posts:
    321         if post['year'] != year:
    322             params['content'] += '</ul>\n' if year != 0 else ''
    323             params['content'] += '<h3>{}</h3>\n<ul>\n'.format(post['year'])
    324             year = post['year']
    325         params['content'] += '<li><a href="/{}">{}</a> ({})</li>\n' \
    326                              ''.format(post['url'], post['title'], post['date_nice'][:-6])
    327     params['content'] += '</ul>\n'
    328 
    329     params['content'] += '<h2>Categories ({})</h2>\n<ul>\n'.format(len(categories))
    330     for key in sorted(categories):
    331         val = categories[key]
    332         params['content'] += '<li><a href="/{}categories/{}/">{}</a> ({} {})</li>\n' \
    333                              ''.format(dst, urlize(key), key, len(val), 'entry' if len(val) == 1 else 'entries')
    334     params['content'] += '</ul>\n'
    335 
    336     page_dst = dst + 'archive/'
    337     fwrite(page_dst, render(layout, **params))
    338     add_to_sitemap(page_dst, lastmod=posts[0]['lastmod'], priority='0.4')
    339     log('I', 'page => /{}', page_dst)
    340 
    341 
    342 def main():
    343     # create a new _site directory from scratch
    344     if os.path.isdir('_site'):
    345         shutil.rmtree('_site')
    346     shutil.copytree('static', '_site')
    347 
    348     # initialize parameters
    349     params = {}
    350 
    351     # initialize sitemap
    352     global sitemap
    353     sitemap = '<?xml version="1.0" encoding="UTF-8"?>\n' \
    354               '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
    355 
    356     # copy assets adding part of their sha256 value to the filename
    357     for path, _, files in os.walk('assets'):
    358         for name in files:
    359             file = os.path.join(path, name)
    360             rfile = os.path.relpath(file, 'assets')
    361             with open(file, 'r') as c:
    362                 content = c.read()
    363 
    364             # minify css
    365             if os.path.splitext(file)[1] == '.css':
    366                 content = re.sub('\s*/\*(?:.|\n)*?\*/\s*', '', content)
    367                 content = re.sub('\s+', ' ', content)
    368                 content = re.sub('\s*({|}|;|,)\s*', r'\1', content)
    369                 content = re.sub(':\s*', ':', content)
    370                 rfile = '{0}.min{1}'.format(*os.path.splitext(rfile))
    371 
    372             h = hashlib.sha256()
    373             h.update(content.encode('utf-8'))
    374             name, ext = os.path.splitext(rfile)
    375             dst = '{n}.{h}{e}'.format(n=name, h=h.hexdigest()[:8], e=ext)
    376 
    377             params['_asset_' + rfile] = dst
    378             basedir = os.path.dirname(os.path.join('_site', dst))
    379             if not os.path.isdir(basedir):
    380                 os.makedirs(basedir)
    381             with open(os.path.join('_site', dst), 'w') as c:
    382                 c.write(content)
    383 
    384     # load layouts
    385     l_base = fread('layouts/base.html')
    386     l_page = render(l_base, pre=True, content=fread('layouts/page.html'))
    387     l_post = render(l_base, pre=True, content=fread('layouts/post.html'))
    388     l_list = render(l_base, pre=True, content=fread('layouts/list.html'))
    389     l_feed = fread('layouts/feed.xml')
    390     item_html = fread('layouts/item.html')
    391     item_xml = fread('layouts/item.xml')
    392 
    393     # create site pages
    394     make_pages('content/_index.md', '', l_page, **params)
    395     make_pages('content/[!_]*.*', '{{ slug }}/', l_page, **params)
    396     fwrite('404.html', render(fread('layouts/404.html'), **params))
    397 
    398     # create blog post pages
    399     all_posts, categories = make_pages('content/blog/[!_]*.*',
    400                                        'blog/{{ year }}/{{ month }}/{{ slug }}/',
    401                                        l_post, blog=True, **params)
    402 
    403     # create HTML list pages and Atom feed
    404     make_lists(all_posts, 'blog/', l_list, item_html, l_feed, item_xml, **params)
    405 
    406     add_to_sitemap('blog/', lastmod=all_posts[0]['lastmod'], priority='1.0')
    407 
    408     # create blog archive
    409     make_archive(all_posts, categories, 'blog/', l_page, title='Blog archive', **params)
    410 
    411     # create blog categories
    412     for name, c_posts in categories.items():
    413         dst = 'blog/categories/' + urlize(name) + '/'
    414         make_lists(c_posts, dst, l_list, item_html, l_feed, item_xml, **params)
    415 
    416     # set redirections
    417     set_redirect('licenses/agpl-v3/', 'licenses/agpl-3.0.txt')
    418     set_redirect('licenses/gpl-v3/', 'licenses/gpl-3.0.txt')
    419     set_redirect('licenses/cc-by-4.0/', 'licenses/cc-by-4.0.txt')
    420     set_redirect('composer/', 'projects/composer/composer.html')
    421 
    422     fwrite('sitemap.xml', sitemap + '</urlset>')
    423 
    424 
# build the site only when this file is run as a script, not when imported
if __name__ == '__main__':
    main()