#!/usr/bin/env python3
# gensite.py: Static site generator based on makesite.py.
# Copyright (C) 2020 Oscar Benedito <oscar@oscarbenedito.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
# This file incorporates work covered by the following copyright and
# permission notice:
#
#     Copyright (c) 2018 Sunaina Pai
#
#     Permission is hereby granted, free of charge, to any person obtaining
#     a copy of this software and associated documentation files (the
#     "Software"), to deal in the Software without restriction, including
#     without limitation the rights to use, copy, modify, merge, publish,
#     distribute, sublicense, and/or sell copies of the Software, and to
#     permit persons to whom the Software is furnished to do so, subject to
#     the following conditions:
#
#     The above copyright notice and this permission notice shall be
#     included in all copies or substantial portions of the Software.
#
#     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
#     EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
#     MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
#     IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
#     CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
#     TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
#     SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


"""Static site generator based on makesite.py."""


import os
import shutil
import re
import glob
import sys
import datetime
import hashlib
import markdown


def fread(filename):
    """Read file and close the file."""
    with open(filename, 'r') as f:
        return f.read()


def fwrite(filename, text):
    """Write content to file and close the file."""
    filename = filename + 'index.html' if filename.endswith('/') or filename == '' else filename
    filename = os.path.join('_site', filename)
    if os.path.exists(filename):
        log('W', 'Warning: Overwriting file: {}', filename)

    basedir = os.path.dirname(filename)
    if not os.path.isdir(basedir):
        os.makedirs(basedir)

    with open(filename, 'w') as f:
        f.write(text)


def log(type, msg, *args):
    """Log message with specified arguments."""
    if type == 'E' or type == 'W':  # or type == 'I':
        sys.stderr.write(msg.format(*args) + '\n')


def truncate(text, words=50):
    """Remove tags and truncate text to the specified number of words."""
    return ' '.join(re.sub('(?s)<.*?>', '', text).split()[:words]) + '...'


def urlize(name):
    """Convert string to URL."""
    return name.lower().replace(' ', '-')


def add_to_sitemap(path, lastmod=None, freq=None, priority=None):
    """Add URL to sitemap."""
    global sitemap
    path = '<loc>https://oscarbenedito.com/' + path + '</loc>'
    if lastmod == '1970-01-01T00:00:00Z':
        lastmod = None
    lastmod = '<lastmod>' + lastmod + '</lastmod>' if lastmod else ''
    freq = '<changefreq>' + freq + '</changefreq>' if freq else ''
    priority = '<priority>' + priority + '</priority>' if priority else ''
    sitemap += '<url>' + path + lastmod + freq + priority + '</url>'


def set_redirect(src, dst):
    """Create HTML redirect."""
    fwrite(src, '<!DOCTYPE html><html><head><meta charset="utf-8"><meta http-equiv="refresh" content="0; url=\'/' + dst + '\'"/><link rel="canonical" href="/' + dst + '"/><meta name="robots" content="noindex"></head><body><p>This page has been moved to <a href="/' + dst + '">https://oscarbenedito.com/' + dst + '</a>.</p></body></html>')
    log('I', 'Info: redirect /{} => /{}', src, dst)


def read_headers(text):
    """Parse headers in text and yield (key, value, end-index) tuples."""
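    # headers are HTML comments of the form <!-- key: value --> at the top of
    # the file; iteration stops at the first line that is not such a comment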
    for match in re.finditer(r'\s*<!--\s*(.+?)\s*:\s*(.+?)\s*-->\s*|.+', text):
        if not match.group(1):
            break
        yield match.group(1), match.group(2), match.end()


def prettify_date(date_str):
    """Convert ISO 8601 date string to human friendly date string."""
    d = datetime.datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%SZ')
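    # note: the '-' flag (no zero padding in the day) is a glibc/BSD strftime
    # extension and is not available on every platform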
    return d.strftime('%B %-d, %Y')


def render(template, pre=False, **params):
    """Replace placeholders in template with values from params."""
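    # unless pre-rendering, a {{ _if var }}...{{ _fi }} block is kept only when
    # 'var' is present in params; {{ var }} placeholders are then substituted,
    # leaving unknown placeholders untouched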
    if not pre:
        template = re.sub(r'{{\s*_if\s+([^}\s]+)\s*}}(.*?){{\s*_fi\s*}}',
                          lambda m: m.group(2) if m.group(1) in params else '',
                          template, flags=re.DOTALL)
    return re.sub(r'{{\s*([^}\s]+)\s*}}',
                  lambda m: str(params.get(m.group(1), m.group(0))),
                  template)


def read_content(filename):
    """Read content and metadata from file into a dictionary."""
    # read file content
    text = fread(filename)

    # read metadata and save it in a dictionary
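    # filenames may carry an optional date prefix: YYYY-MM-DD-slug.ext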
    date_slug = os.path.basename(filename).split('.')[0]
    match = re.search(r'^(?:(\d\d\d\d-\d\d-\d\d)-)?(.+)$', date_slug)
    content = {
        'date': (match.group(1) or '1970-01-01') + 'T00:00:00Z',
        'slug': match.group(2)
    }

    # read headers
    end = 0
    for key, val, end in read_headers(text):
        content[key] = val

    if 'lastmod' in content:
        content['modified'] = '1'
    else:
        content['lastmod'] = content['date']

    # separate content from headers
    text = text[end:]

    # convert Markdown content to HTML
    if filename.endswith('.md'):
        text = markdown.markdown(text, extensions=['footnotes', 'fenced_code'])

    content.update({
        'content': text,
        'year': content['date'][:4],
        'month': content['date'][5:7],
        'day': content['date'][8:10],
        'date_nice': prettify_date(content['date']),
        'lastmod_nice': prettify_date(content['lastmod'])
    })

    if 'categories' in content:
        # convert the categories string to array of categories
        categories = [c.strip() for c in content['categories'].split(',')]
        categories_html = ', '.join(['<a class="p-category" href="/blog/categories/' + urlize(c) + '/">' + c + '</a>' for c in categories])
        content.update({
            'categories': categories,
            'categories_html': categories_html
        })

    return content


def make_pages(src, dst, layout, blog=False, **params):
    """Generate pages from page content."""
    items = []
    categories = {}

    for src_path in glob.glob(src):
        content = read_content(src_path)

        page_params = dict(params, **content)

        # populate placeholders in content if content-rendering is enabled
        if page_params.get('render') == 'yes':
            rendered_content = render(page_params['content'], **page_params)
            page_params['content'] = rendered_content

        page_dst = render(dst, **page_params)

        if 'url' in page_params:
            page_dst = page_params['url']
        else:
            page_params.update({'url': page_dst})

        if blog:
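            # estimate reading time from the word count, assuming roughly 140
            # words per minute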
            w = int(len(re.sub('(?s)<.*?>', ' ', page_params['content']).split())/140)
            page_params.update({
                'read_time': str(w) + ' minutes' if w > 1 else '1 minute',
                'src_path': src_path,
            })
            items.append(page_params)
        else:
            fwrite(page_dst, render(layout, **page_params))
            pri = page_params['priority'] if 'priority' in page_params else None
            add_to_sitemap(page_dst, lastmod=page_params['lastmod'], priority=pri)
            log('I', 'Info: page {} => /{}', src_path, page_dst)

    items.sort(key=lambda x: x['date'], reverse=True)
    for i, item in enumerate(items):
        if i != 0:
            item['next_url'] = items[i-1]['url']
            item['next_title'] = items[i-1]['title']
            item['more_pages'] = '1'
        if i < len(items)-1:
            item['prev_url'] = items[i+1]['url']
            item['prev_title'] = items[i+1]['title']
            item['more_pages'] = '1'

        for category in item['categories']:
            if category not in categories:
                categories[category] = [item]
            else:
                categories[category].append(item)

        fwrite(item['url'], render(layout, **item))
        pri = item['priority'] if 'priority' in item else None
        add_to_sitemap(item['url'], lastmod=item['lastmod'], priority=pri)
        log('I', 'Info: post {} => /{}', item['src_path'], item['url'])

    return items, categories


def make_lists(posts, dst, list_layout, item_layout, src=None, **params):
    """Generate HTML lists for a blog."""
    item_per_page = 5
    items = []
    count = 1
    page_dst = dst
    text = fread(src) if src else fread('content/' + dst + '_index.md')
    params['intro'] = markdown.markdown(text, extensions=['footnotes', 'fenced_code'])
    for i, post in enumerate(posts):
        item_params = dict(params, **post)
        item_params['summary'] = truncate(post['content'])
        items.append(render(item_layout, **item_params))
        if i % item_per_page == item_per_page-1 and len(posts)-1 > i:
            params['more_pages'] = '1'
            params['content'] = ''.join(items)
            params['next_url'] = dst + 'page/' + str(count+1) + '/'
            if count != 1:
                params['prev_url'] = dst + ('page/' + str(count-1) + '/' if count != 2 else '')
            fwrite(page_dst, render(list_layout, **params))
            log('I', 'Info: list => /{}', page_dst)
            count = count+1
            page_dst = dst + 'page/' + str(count) + '/'
            items = []

    if count != 1:
        del params['next_url']
        params['prev_url'] = dst + ('page/' + str(count-1) + '/' if count != 2 else '')
    params['content'] = ''.join(items)
    fwrite(page_dst, render(list_layout, **params))
    log('I', 'Info: list => /{}', page_dst)

    set_redirect(dst + 'page/1/', dst)


def make_feed(posts, dst, list_layout, item_layout, **params):
    """Generate feed for a blog."""
    max_posts = 15
    params['url'] = dst
    page_dst = dst + 'index.xml'
    items = []
    for i, post in enumerate(posts):
        if i == max_posts:
            break
        item_params = dict(params, **post)
        item_params['c_escaped'] = post['content'].replace('>', '&gt;').replace('<', '&lt;')
        item = render(item_layout, **item_params)
        items.append(item)

    params['content'] = ''.join(items)
    params['updated'] = posts[0]['lastmod']
    fwrite(page_dst, render(list_layout, **params))
    log('I', 'Info: feed => /{}', page_dst)


def make_archive(posts, categories, dst, layout, **params):
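    """Generate the blog archive page listing posts by year and category."""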
    year = 0
    params['content'] = '<h2>Posts (' + str(len(posts)) + ')</h2>\n'
    for post in posts:
        if post['year'] != year:
            params['content'] += ('</ul>\n' if year != 0 else '') + '<h3>' + post['year'] + '</h3>\n<ul>\n'
            year = post['year']
        params['content'] += '<li><a href="/' + post['url'] + '">' + post['title'] + '</a> (' + post['date_nice'][:-6] + ')</li>\n'
    params['content'] += '</ul>\n'

    params['content'] += '<h2>Categories (' + str(len(categories)) + ')</h2>\n<ul>\n'
    for key in sorted(categories):
        val = categories[key]
        params['content'] += '<li><a href="/' + dst + 'categories/' + urlize(key) + '/">' + key + '</a> (' + str(len(val)) + (' entry' if len(val) == 1 else ' entries') + ')</li>\n'
    params['content'] += '</ul>\n'

    page_dst = dst + 'archive/'
    fwrite(page_dst, render(layout, **params))
    add_to_sitemap(page_dst, lastmod=posts[0]['lastmod'], priority='0.4')
    log('I', 'Info: page => /{}', page_dst)


def main():
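    """Generate the whole site into the _site directory."""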
    # create a new _site directory from scratch
    if os.path.isdir('_site'):
        shutil.rmtree('_site')
    shutil.copytree('static', '_site')

    # initialize parameters
    params = {}

    # initialize sitemap
    global sitemap
    sitemap = '<?xml version="1.0" encoding="UTF-8"?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'

    # copy assets adding part of their sha256 value to the filename
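    # each asset's hashed destination name is exposed to templates through the
    # '_asset_<relative path>' parameter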
    for path, _, files in os.walk('assets'):
        for name in files:
            file = os.path.join(path, name)
            rfile = os.path.relpath(file, 'assets')
            with open(file, 'r') as c:
                content = c.read()

            # minify css
            if os.path.splitext(file)[1] == '.css':
                content = re.sub(r'\s*/\*(?:.|\n)*?\*/\s*', '', content)
                content = re.sub(r'\s+', ' ', content)
                content = re.sub(r'\s*({|}|;|,)\s*', r'\1', content)
                content = re.sub(r':\s*', ':', content)
                rfile = '{0}.min{1}'.format(*os.path.splitext(rfile))

            h = hashlib.sha256()
            h.update(content.encode('utf-8'))
            name, ext = os.path.splitext(rfile)
            dst = '{n}.{h}{e}'.format(n=name, h=h.hexdigest()[:8], e=ext)

            params['_asset_' + rfile] = dst
            basedir = os.path.dirname(os.path.join('_site', dst))
            if not os.path.isdir(basedir):
                os.makedirs(basedir)
            with open(os.path.join('_site', dst), 'w') as c:
                c.write(content)

    # load layouts
    base_layout = fread('layouts/base.html')
    page_layout = fread('layouts/page.html')
    post_layout = fread('layouts/post.html')
    list_html = fread('layouts/list.html')
    item_html = fread('layouts/item.html')
    feed_xml = fread('layouts/feed.xml')
    item_xml = fread('layouts/item.xml')
    layout_404 = fread('layouts/404.html')

    # combine layouts to form final layouts
    page_layout = render(base_layout, pre=True, content=page_layout)
    post_layout = render(base_layout, pre=True, content=post_layout)
    list_html = render(base_layout, pre=True, content=list_html)

    # create site pages
    make_pages('content/_index.md', '', page_layout, **params)
    make_pages('content/[!_]*.*', '{{ slug }}/', page_layout, **params)
    fwrite('404.html', render(layout_404, **params))

    # create blog post pages
    blog_posts, categories = make_pages('content/blog/[!_]*.*',
                                        'blog/{{ year }}/{{ month }}/{{ slug }}/',
                                        post_layout, True, **params)
    # create HTML list pages
    make_lists(blog_posts, 'blog/', list_html, item_html, title='Personal blog',
               **params)
    add_to_sitemap('blog/', lastmod=blog_posts[0]['lastmod'], priority='1.0')
    # create Atom feeds
    make_feed(blog_posts, 'blog/', feed_xml, item_xml, title='Personal blog',
              long_title='Oscar\'s Blog', **params)
    # create blog archive
    make_archive(blog_posts, categories, 'blog/', page_layout,
                 title='Blog archive', **params)
    # create blog categories
    for name, posts in categories.items():
        dst = 'blog/categories/' + urlize(name) + '/'
        src = 'content/blog/categories/' + urlize(name) + '.md'
        lt = name + ' on Oscar\'s Blog'
        eh = '<link rel="alternate" type="application/atom+xml" title="' + lt + '" href="/' + dst + 'index.xml"/>'
        make_lists(posts, dst, list_html, item_html, src=src, title=name,
                   extraheader=eh, **params)
        make_feed(posts, dst, feed_xml, item_xml, title=name, long_title=lt,
                  **params)

    # set redirections
    set_redirect('licenses/agpl-v3/', 'licenses/agpl-3.0.txt')
    set_redirect('licenses/gpl-v3/', 'licenses/gpl-3.0.txt')
    set_redirect('licenses/cc-by-4.0/', 'licenses/cc-by-4.0.txt')
    set_redirect('composer/', 'projects/composer/composer.html')

    fwrite('sitemap.xml', sitemap + '</urlset>')


if __name__ == '__main__':
    main()