gensite.py


#!/usr/bin/env python3
# gensite.py: Static site generator based on makesite.py.
# Copyright (C) 2020 Oscar Benedito <oscar@oscarbenedito.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
# This file incorporates work covered by the following copyright and
# permission notice:
#
#     Copyright (c) 2018 Sunaina Pai
#
#     Permission is hereby granted, free of charge, to any person obtaining
#     a copy of this software and associated documentation files (the
#     "Software"), to deal in the Software without restriction, including
#     without limitation the rights to use, copy, modify, merge, publish,
#     distribute, sublicense, and/or sell copies of the Software, and to
#     permit persons to whom the Software is furnished to do so, subject to
#     the following conditions:
#
#     The above copyright notice and this permission notice shall be
#     included in all copies or substantial portions of the Software.
#
#     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
#     EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
#     MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
#     IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
#     CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
#     TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
#     SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


"""Static site generator based on makesite.py."""


import os
import shutil
import re
import glob
import sys
import datetime
import hashlib
import markdown


def fread(filename):
    """Read file and close the file."""
    with open(filename, 'r') as f:
        return f.read()


def fwrite(filename, text):
    """Write content to file and close the file."""
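    # destinations ending in '/' (or the empty string) become '<dst>index.html',
    # and all output is written under the '_site' directory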
    filename = filename + 'index.html' if filename.endswith('/') or filename == '' else filename
    filename = os.path.join('_site', filename)
    if os.path.exists(filename):
        log('W', 'Warning: Overwriting file: {}', filename)

    basedir = os.path.dirname(filename)
    if not os.path.isdir(basedir):
        os.makedirs(basedir)

    with open(filename, 'w') as f:
        f.write(text)


def log(type, msg, *args):
    """Log message with specified arguments."""
    if type == 'E' or type == 'W':  # or type == 'I':
        sys.stderr.write(msg.format(*args) + '\n')


def truncate(text, words=50):
    """Remove tags and truncate text to the specified number of words."""
    return ' '.join(re.sub('(?s)<.*?>', '', text).split()[:words]) + '...'


def urlize(name):
    """Convert string to URL."""
    return name.lower().replace(' ', '-')


def add_to_sitemap(path, lastmod=None, freq=None, priority=None):
    """Add URL to sitemap."""
    global sitemap
    path = '<loc>https://oscarbenedito.com/' + path + '</loc>'
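    # the epoch timestamp is the placeholder for undated pages, so omit lastmod for it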
    if lastmod == '1970-01-01T00:00:00Z':
        lastmod = None
    lastmod = '<lastmod>' + lastmod + '</lastmod>' if lastmod else ''
    freq = '<changefreq>' + freq + '</changefreq>' if freq else ''
    priority = '<priority>' + priority + '</priority>' if priority else ''
    sitemap += '<url>' + path + lastmod + freq + priority + '</url>'


def set_redirect(src, dst):
    """Create HTML redirect."""
    fwrite(src, '<!DOCTYPE html><html><head><meta charset="utf-8"><meta http-equiv="refresh" content="0; url=/' + dst + '"/><link rel="canonical" href="/' + dst + '"/><meta name="robots" content="noindex"></head><body><p>This page has been moved to <a href="/' + dst + '">https://oscarbenedito.com/' + dst + '</a>.</p></body></html>')
    log('I', 'Info: redirect /{} => /{}', src, dst)


def read_headers(text):
    """Parse headers in text and yield (key, value, end-index) tuples."""
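    # headers are leading HTML comments of the form '<!-- key: value -->';
    # the first line that is not a header ends the scan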
    for match in re.finditer(r'\s*<!--\s*(.+?)\s*:\s*(.+?)\s*-->\s*|.+', text):
        if not match.group(1):
            break
        yield match.group(1), match.group(2), match.end()


def prettify_date(date_str):
    """Convert ISO 8601 date string to human friendly date string."""
    d = datetime.datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%SZ')
    return d.strftime('%B %-d, %Y')


def render(template, pre=False, **params):
    """Replace placeholders in template with values from params."""
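    # '{{ key }}' placeholders are replaced with values from params (unknown keys
    # are left as-is), and '{{ _if key }}...{{ _fi }}' blocks are kept only when
    # 'key' is present in params; pre=True skips the conditional pass so nested
    # layouts can resolve those blocks in a later render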
    if not pre:
        template = re.sub(r'{{\s*_if\s+([^}\s]+)\s*}}(.*?){{\s*_fi\s*}}',
                          lambda m: m.group(2) if m.group(1) in params else '',
                          template, flags=re.DOTALL)
    return re.sub(r'{{\s*([^}\s]+)\s*}}',
                  lambda m: str(params.get(m.group(1), m.group(0))),
                  template)


def read_content(filename):
    """Read content and metadata from file into a dictionary."""
    # read file content
    text = fread(filename)

    # read metadata and save it in a dictionary
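    # a 'YYYY-MM-DD-slug' basename yields both date and slug; without the date
    # prefix the whole basename is the slug and the date falls back to the epoch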
    date_slug = os.path.basename(filename).split('.')[0]
    match = re.search(r'^(?:(\d\d\d\d-\d\d-\d\d)-)?(.+)$', date_slug)
    content = {
        'date': (match.group(1) or '1970-01-01') + 'T00:00:00Z',
        'slug': match.group(2)
    }

    # read headers
    end = 0
    for key, val, end in read_headers(text):
        content[key] = val

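    # an explicit 'lastmod' header marks the entry as modified; otherwise the
    # publication date doubles as the last-modified date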
    if 'lastmod' in content:
        content['modified'] = '1'
    else:
        content['lastmod'] = content['date']

    # separate content from headers
    text = text[end:]

    # convert Markdown content to HTML
    if filename.endswith('.md'):
        text = markdown.markdown(text, extensions=['footnotes', 'fenced_code'])

    content.update({
        'content': text,
        'year': content['date'][:4],
        'month': content['date'][5:7],
        'day': content['date'][8:10],
        'date_nice': prettify_date(content['date']),
        'lastmod_nice': prettify_date(content['lastmod'])
    })

    if 'categories' in content:
        # convert the categories string to an array of categories
        categories = [c.strip() for c in content['categories'].split(',')]
        categories_html = ', '.join(['<a class="p-category" href="/blog/categories/' + urlize(c) + '/">' + c + '</a>' for c in categories])
        content.update({
            'categories': categories,
            'categories_html': categories_html
        })

    return content


def make_pages(src, dst, layout, blog=False, **params):
    """Generate pages from page content."""
    items = []
    categories = {}

    for src_path in glob.glob(src):
        content = read_content(src_path)

        page_params = dict(params, **content)

        # populate placeholders in content if content-rendering is enabled
        if page_params.get('render') == 'yes':
            rendered_content = render(page_params['content'], **page_params)
            page_params['content'] = rendered_content

        page_dst = render(dst, **page_params)

        if 'url' in page_params:
            page_dst = page_params['url']
        else:
            page_params.update({'url': page_dst})

        if blog:
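            # rough reading-time estimate, assuming about 140 words per minute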
            w = int(len(re.sub('(?s)<.*?>', ' ', page_params['content']).split())/140)
            page_params.update({
                'read_time': str(w) + ' minutes' if w > 1 else '1 minute',
                'src_path': src_path,
            })
            items.append(page_params)
        else:
            fwrite(page_dst, render(layout, **page_params))
            pri = page_params['priority'] if 'priority' in page_params else None
            add_to_sitemap(page_dst, lastmod=page_params['lastmod'], priority=pri)
            log('I', 'Info: page {} => /{}', src_path, page_dst)

    items.sort(key=lambda x: x['date'], reverse=True)
    for i, item in enumerate(items):
        if i != 0:
            item['next_url'] = items[i-1]['url']
            item['next_title'] = items[i-1]['title']
            item['more_pages'] = '1'
        if i < len(items)-1:
            item['prev_url'] = items[i+1]['url']
            item['prev_title'] = items[i+1]['title']
            item['more_pages'] = '1'

        for category in item['categories']:
            if category not in categories:
                categories[category] = [item]
            else:
                categories[category].append(item)

        fwrite(item['url'], render(layout, **item))
        pri = item['priority'] if 'priority' in item else None
        add_to_sitemap(item['url'], lastmod=item['lastmod'], priority=pri)
        log('I', 'Info: post {} => /{}', item['src_path'], item['url'])

    return items, categories


def make_lists(posts, dst, list_layout, item_layout, src=None, **params):
    """Generate HTML lists for a blog."""
    item_per_page = 5
    items = []
    count = 1
    page_dst = dst
    text = fread(src) if src else fread('content/' + dst + '_index.md')
    end = 0
    for key, val, end in read_headers(text):
        params[key] = val
    params['intro'] = markdown.markdown(text[end:], extensions=['footnotes', 'fenced_code'])
    for i, post in enumerate(posts):
        item_params = dict(params, **post)
        item_params['summary'] = truncate(post['content'])
        items.append(render(item_layout, **item_params))
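        # flush a page every item_per_page posts; pages after the first go under 'page/<n>/'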
        if i % item_per_page == item_per_page-1 and len(posts)-1 > i:
            params['more_pages'] = '1'
            params['content'] = ''.join(items)
            params['next_url'] = dst + 'page/' + str(count+1) + '/'
            if count != 1:
                params['prev_url'] = dst + ('page/' + str(count-1) + '/' if count != 2 else '')
            fwrite(page_dst, render(list_layout, **params))
            log('I', 'Info: list => /{}', page_dst)
            count = count+1
            page_dst = dst + 'page/' + str(count) + '/'
            items = []

    if count != 1:
        del params['next_url']
        params['prev_url'] = dst + ('page/' + str(count-1) + '/' if count != 2 else '')
    params['content'] = ''.join(items)
    fwrite(page_dst, render(list_layout, **params))
    log('I', 'Info: list => /{}', page_dst)

    set_redirect(dst + 'page/1/', dst)


def make_feed(posts, dst, list_layout, item_layout, **params):
    """Generate feed for a blog."""
    max = 15
    params['url'] = dst
    page_dst = dst + 'index.xml'
    items = []
    for i, post in enumerate(posts):
        if (i == max):
            break
        item_params = dict(params, **post)
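        # escape the rendered HTML so it can be embedded as text inside the feed item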
        item_params['c_escaped'] = post['content'].replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;')
        item = render(item_layout, **item_params)
        items.append(item)

    params['content'] = ''.join(items)
    params['updated'] = posts[0]['lastmod']
    fwrite(page_dst, render(list_layout, **params))
    log('I', 'Info: feed => /{}', page_dst)


def make_archive(posts, categories, dst, layout, **params):
    """Generate archive page for a blog."""
    year = 0
    params['content'] = '<h2>Posts (' + str(len(posts)) + ')</h2>\n'
    for post in posts:
        if post['year'] != year:
            params['content'] += ('</ul>\n' if year != 0 else '') + '<h3>' + post['year'] + '</h3>\n<ul>\n'
            year = post['year']
        params['content'] += '<li><a href="/' + post['url'] + '">' + post['title'] + '</a> (' + post['date_nice'][:-6] + ')</li>\n'
    params['content'] += '</ul>\n'

    params['content'] += '<h2>Categories (' + str(len(categories)) + ')</h2>\n<ul>\n'
    for key in sorted(categories):
        val = categories[key]
        params['content'] += '<li><a href="/' + dst + 'categories/' + urlize(key) + '/">' + key + '</a> (' + str(len(val)) + (' entry' if len(val) == 1 else ' entries') + ')</li>\n'
    params['content'] += '</ul>\n'

    page_dst = dst + 'archive/'
    fwrite(page_dst, render(layout, **params))
    add_to_sitemap(page_dst, lastmod=posts[0]['lastmod'], priority='0.4')
    log('I', 'Info: page => /{}', page_dst)


def main():
    # create a new _site directory from scratch
    if os.path.isdir('_site'):
        shutil.rmtree('_site')
    shutil.copytree('static', '_site')

    # initialize parameters
    params = {}

    # initialize sitemap
    global sitemap
    sitemap = '<?xml version="1.0" encoding="UTF-8"?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'

    # copy assets adding part of their sha256 value to the filename
    for path, _, files in os.walk('assets'):
        for name in files:
            file = os.path.join(path, name)
            rfile = os.path.relpath(file, 'assets')
            with open(file, 'r') as c:
                content = c.read()

            # minify css
            if os.path.splitext(file)[1] == '.css':
                content = re.sub(r'\s*/\*(?:.|\n)*?\*/\s*', '', content)
                content = re.sub(r'\s+', ' ', content)
                content = re.sub(r'\s*({|}|;|,)\s*', r'\1', content)
                content = re.sub(r':\s*', ':', content)
                rfile = '{0}.min{1}'.format(*os.path.splitext(rfile))

            h = hashlib.sha256()
            h.update(content.encode('utf-8'))
            name, ext = os.path.splitext(rfile)
            dst = '{n}.{h}{e}'.format(n=name, h=h.hexdigest()[:8], e=ext)

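            # expose the fingerprinted filename to templates via the '_asset_<relative path>' placeholder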
            params['_asset_' + rfile] = dst
            basedir = os.path.dirname(os.path.join('_site', dst))
            if not os.path.isdir(basedir):
                os.makedirs(basedir)
            with open(os.path.join('_site', dst), 'w') as c:
                c.write(content)

    # load layouts
    base_layout = fread('layouts/base.html')
    page_layout = fread('layouts/page.html')
    post_layout = fread('layouts/post.html')
    list_html = fread('layouts/list.html')
    item_html = fread('layouts/item.html')
    feed_xml = fread('layouts/feed.xml')
    item_xml = fread('layouts/item.xml')
    layout_404 = fread('layouts/404.html')

    # combine layouts to form final layouts
    page_layout = render(base_layout, pre=True, content=page_layout)
    post_layout = render(base_layout, pre=True, content=post_layout)
    list_html = render(base_layout, pre=True, content=list_html)

    # create site pages
    make_pages('content/_index.md', '', page_layout, **params)
    make_pages('content/[!_]*.*', '{{ slug }}/', page_layout, **params)
    fwrite('404.html', render(layout_404, **params))

    # create blog post pages
    blog_posts, categories = make_pages('content/blog/[!_]*.*',
                                        'blog/{{ year }}/{{ month }}/{{ slug }}/',
                                        post_layout, True, **params)
    # create HTML list pages
    make_lists(blog_posts, 'blog/', list_html, item_html, **params)
    add_to_sitemap('blog/', lastmod=blog_posts[0]['lastmod'], priority='1.0')
    # create Atom feeds
    make_feed(blog_posts, 'blog/', feed_xml, item_xml, title='Personal blog',
              long_title='Oscar\'s Blog', **params)
    # create blog archive
    make_archive(blog_posts, categories, 'blog/', page_layout,
                 title='Blog archive', **params)
    # create blog categories
    for name, posts in categories.items():
        dst = 'blog/categories/' + urlize(name) + '/'
        src = 'content/blog/categories/' + urlize(name) + '.md'
        lt = name + ' on Oscar\'s Blog'
        eh = '<link rel="alternate" type="application/atom+xml" title="' + lt + '" href="/' + dst + 'index.xml"/>'
        make_lists(posts, dst, list_html, item_html, src=src, title=name,
                   extraheader=eh, **params)
        make_feed(posts, dst, feed_xml, item_xml, title=name, long_title=lt,
                  **params)

    # set redirections
    set_redirect('licenses/agpl-v3/', 'licenses/agpl-3.0.txt')
    set_redirect('licenses/gpl-v3/', 'licenses/gpl-3.0.txt')
    set_redirect('licenses/cc-by-4.0/', 'licenses/cc-by-4.0.txt')
    set_redirect('composer/', 'projects/composer/composer.html')

    fwrite('sitemap.xml', sitemap + '</urlset>')


if __name__ == '__main__':
    main()