gensite.py


#!/usr/bin/env python3
# gensite.py: Static site generator based on makesite.py.
# Copyright (C) 2020 Oscar Benedito <oscar@oscarbenedito.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
# This file incorporates work covered by the following copyright and
# permission notice:
#
#     Copyright (c) 2018 Sunaina Pai
#
#     Permission is hereby granted, free of charge, to any person obtaining
#     a copy of this software and associated documentation files (the
#     "Software"), to deal in the Software without restriction, including
#     without limitation the rights to use, copy, modify, merge, publish,
#     distribute, sublicense, and/or sell copies of the Software, and to
#     permit persons to whom the Software is furnished to do so, subject to
#     the following conditions:
#
#     The above copyright notice and this permission notice shall be
#     included in all copies or substantial portions of the Software.
#
#     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
#     EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
#     MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
#     IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
#     CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
#     TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
#     SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


"""Static site generator based on makesite.py."""


import os
import shutil
import re
import glob
import sys
import datetime
import hashlib
import markdown


def fread(filename):
    """Read file and close the file."""
    with open(filename, 'r') as f:
        return f.read()


def fwrite(filename, text):
    """Write content to file and close the file."""
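    # destinations ending in '/' (or the empty string) become '<dst>index.html',
    # and all output is written under the '_site' directory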
    filename = filename + 'index.html' if filename.endswith('/') or filename == '' else filename
    filename = os.path.join('_site', filename)
    if os.path.exists(filename):
        log('W', 'Warning: Overwriting file: {}', filename)

    basedir = os.path.dirname(filename)
    if not os.path.isdir(basedir):
        os.makedirs(basedir)

    with open(filename, 'w') as f:
        f.write(text)


def log(type, msg, *args):
    """Log message with specified arguments."""
    if type == 'E' or type == 'W':  # or type == 'I':
        sys.stderr.write(msg.format(*args) + '\n')


def truncate(text, words=50):
    """Remove tags and truncate text to the specified number of words."""
    return ' '.join(re.sub('(?s)<.*?>', '', text).split()[:words]) + '...'


def urlize(name):
    """Convert string to URL."""
    return name.lower().replace(' ', '-')


def add_to_sitemap(path, lastmod=None, freq=None, priority=None):
    """Add URL to sitemap."""
    global sitemap
    path = '<loc>https://oscarbenedito.com/' + path + '</loc>'
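    # the epoch timestamp is the placeholder for undated pages, so omit lastmod for it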
    if lastmod == '1970-01-01T00:00:00Z':
        lastmod = None
    lastmod = '<lastmod>' + lastmod + '</lastmod>' if lastmod else ''
    freq = '<changefreq>' + freq + '</changefreq>' if freq else ''
    priority = '<priority>' + priority + '</priority>' if priority else ''
    sitemap += '<url>' + path + lastmod + freq + priority + '</url>'


def set_redirect(src, dst):
    """Create HTML redirect."""
    fwrite(src, '<!DOCTYPE html><html><head><meta charset="utf-8"><meta http-equiv="refresh" content="0; url=/' + dst + '"/><link rel="canonical" href="/' + dst + '"/><meta name="robots" content="noindex"></head><body><p>This page has been moved to <a href="/' + dst + '">https://oscarbenedito.com/' + dst + '</a>.</p></body></html>')
    log('I', 'Info: redirect /{} => /{}', src, dst)


def read_headers(text):
    """Parse headers in text and yield (key, value, end-index) tuples."""
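    # headers are leading HTML comments of the form '<!-- key: value -->';
    # the first line that is not a header ends the scan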
    for match in re.finditer(r'\s*<!--\s*(.+?)\s*:\s*(.+?)\s*-->\s*|.+', text):
        if not match.group(1):
            break
        yield match.group(1), match.group(2), match.end()


def prettify_date(date_str):
    """Convert ISO 8601 date string to human friendly date string."""
    d = datetime.datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%SZ')
    return d.strftime('%B %-d, %Y')


def render(template, pre=False, **params):
    """Replace placeholders in template with values from params."""
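    # '{{ key }}' placeholders are replaced with values from params (unknown keys
    # are left as-is), and '{{ _if key }}...{{ _fi }}' blocks are kept only when
    # 'key' is present in params; pre=True skips the conditional pass so nested
    # layouts can resolve those blocks in a later render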
    if not pre:
        template = re.sub(r'{{\s*_if\s+([^}\s]+)\s*}}(.*?){{\s*_fi\s*}}',
                          lambda m: m.group(2) if m.group(1) in params else '',
                          template, flags=re.DOTALL)
    return re.sub(r'{{\s*([^}\s]+)\s*}}',
                  lambda m: str(params.get(m.group(1), m.group(0))),
                  template)


def read_content(filename):
    """Read content and metadata from file into a dictionary."""
    # read file content
    text = fread(filename)

    # read metadata and save it in a dictionary
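    # a 'YYYY-MM-DD-slug' basename yields both date and slug; without the date
    # prefix the whole basename is the slug and the date falls back to the epoch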
    date_slug = os.path.basename(filename).split('.')[0]
    match = re.search(r'^(?:(\d\d\d\d-\d\d-\d\d)-)?(.+)$', date_slug)
    content = {
        'date': (match.group(1) or '1970-01-01') + 'T00:00:00Z',
        'slug': match.group(2)
    }

    # read headers
    end = 0
    for key, val, end in read_headers(text):
        content[key] = val

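    # an explicit 'lastmod' header marks the entry as modified; otherwise the
    # publication date doubles as the last-modified date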
    if 'lastmod' in content:
        content['modified'] = '1'
    else:
        content['lastmod'] = content['date']

    # separate content from headers
    text = text[end:]

    # convert Markdown content to HTML
    if filename.endswith('.md'):
        text = markdown.markdown(text, extensions=['footnotes', 'fenced_code'])

    content.update({
        'content': text,
        'year': content['date'][:4],
        'month': content['date'][5:7],
        'day': content['date'][8:10],
        'date_nice': prettify_date(content['date']),
        'lastmod_nice': prettify_date(content['lastmod'])
    })

    if 'categories' in content:
        # convert the categories string to an array of categories
        categories = [c.strip() for c in content['categories'].split(',')]
        categories_html = ', '.join(['<a class="p-category" href="/blog/categories/' + urlize(c) + '/">' + c + '</a>' for c in categories])
        content.update({
            'categories': categories,
            'categories_html': categories_html
        })

    return content


def make_pages(src, dst, layout, blog=False, **params):
    """Generate pages from page content."""
    items = []
    categories = {}

    for src_path in glob.glob(src):
        content = read_content(src_path)

        page_params = dict(params, **content)

        # populate placeholders in content if content-rendering is enabled
        if page_params.get('render') == 'yes':
            rendered_content = render(page_params['content'], **page_params)
            page_params['content'] = rendered_content

        page_dst = render(dst, **page_params)

        if 'url' in page_params:
            page_dst = page_params['url']
        else:
            page_params.update({'url': page_dst})

        if blog:
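            # rough reading-time estimate, assuming about 140 words per minute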
            w = int(len(re.sub('(?s)<.*?>', ' ', page_params['content']).split())/140)
            page_params.update({
                'read_time': str(w) + ' minutes' if w > 1 else '1 minute',
                'src_path': src_path,
            })
            items.append(page_params)
        else:
            fwrite(page_dst, render(layout, **page_params))
            pri = page_params['priority'] if 'priority' in page_params else None
            add_to_sitemap(page_dst, lastmod=page_params['lastmod'], priority=pri)
            log('I', 'Info: page {} => /{}', src_path, page_dst)

    items.sort(key=lambda x: x['date'], reverse=True)
    for i, item in enumerate(items):
        if i != 0:
            item['next_url'] = items[i-1]['url']
            item['next_title'] = items[i-1]['title']
            item['more_pages'] = '1'
        if i < len(items)-1:
            item['prev_url'] = items[i+1]['url']
            item['prev_title'] = items[i+1]['title']
            item['more_pages'] = '1'

        for category in item['categories']:
            if category not in categories:
                categories[category] = [item]
            else:
                categories[category].append(item)

        fwrite(item['url'], render(layout, **item))
        pri = item['priority'] if 'priority' in item else None
        add_to_sitemap(item['url'], lastmod=item['lastmod'], priority=pri)
        log('I', 'Info: post {} => /{}', item['src_path'], item['url'])

    return items, categories


def make_lists(posts, dst, list_layout, item_layout, src=None, **params):
    """Generate HTML lists for a blog."""
    item_per_page = 5
    items = []
    count = 1
    page_dst = dst
    text = fread(src) if src else fread('content/' + dst + '_index.md')
    end = 0
    for key, val, end in read_headers(text):
        params[key] = val
    params['intro'] = markdown.markdown(text[end:], extensions=['footnotes', 'fenced_code'])
    for i, post in enumerate(posts):
        item_params = dict(params, **post)
        item_params['summary'] = truncate(post['content'])
        items.append(render(item_layout, **item_params))
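        # flush a page every item_per_page posts; pages after the first go under 'page/<n>/'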
        if i % item_per_page == item_per_page-1 and len(posts)-1 > i:
            params['more_pages'] = '1'
            params['content'] = ''.join(items)
            params['next_url'] = dst + 'page/' + str(count+1) + '/'
            if count != 1:
                params['prev_url'] = dst + ('page/' + str(count-1) + '/' if count != 2 else '')
            fwrite(page_dst, render(list_layout, **params))
            log('I', 'Info: list => /{}', page_dst)
            count = count+1
            page_dst = dst + 'page/' + str(count) + '/'
            items = []

    if count != 1:
        del params['next_url']
        params['prev_url'] = dst + ('page/' + str(count-1) + '/' if count != 2 else '')
    params['content'] = ''.join(items)
    fwrite(page_dst, render(list_layout, **params))
    log('I', 'Info: list => /{}', page_dst)

    set_redirect(dst + 'page/1/', dst)


def make_feed(posts, dst, list_layout, item_layout, **params):
    """Generate feed for a blog."""
    max = 15
    params['url'] = dst
    page_dst = dst + 'index.xml'
    items = []
    for i, post in enumerate(posts):
        if (i == max):
            break
        item_params = dict(params, **post)
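        # escape the rendered HTML so it can be embedded as text inside the feed item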
        item_params['c_escaped'] = post['content'].replace('&', '&amp;').replace('>', '&gt;').replace('<', '&lt;')
        item = render(item_layout, **item_params)
        items.append(item)

    params['content'] = ''.join(items)
    params['updated'] = posts[0]['lastmod']
    fwrite(page_dst, render(list_layout, **params))
    log('I', 'Info: feed => /{}', page_dst)


def make_archive(posts, categories, dst, layout, **params):
    """Generate archive page for a blog."""
    year = 0
    params['content'] = '<h2>Posts (' + str(len(posts)) + ')</h2>\n'
    for post in posts:
        if post['year'] != year:
            params['content'] += ('</ul>\n' if year != 0 else '') + '<h3>' + post['year'] + '</h3>\n<ul>\n'
            year = post['year']
        params['content'] += '<li><a href="/' + post['url'] + '">' + post['title'] + '</a> (' + post['date_nice'][:-6] + ')</li>\n'
    params['content'] += '</ul>\n'

    params['content'] += '<h2>Categories (' + str(len(categories)) + ')</h2>\n<ul>\n'
    for key in sorted(categories):
        val = categories[key]
        params['content'] += '<li><a href="/' + dst + 'categories/' + urlize(key) + '/">' + key + '</a> (' + str(len(val)) + (' entry' if len(val) == 1 else ' entries') + ')</li>\n'
    params['content'] += '</ul>\n'

    page_dst = dst + 'archive/'
    fwrite(page_dst, render(layout, **params))
    add_to_sitemap(page_dst, lastmod=posts[0]['lastmod'], priority='0.4')
    log('I', 'Info: page => /{}', page_dst)


def main():
    # create a new _site directory from scratch
    if os.path.isdir('_site'):
        shutil.rmtree('_site')
    shutil.copytree('static', '_site')

    # initialize parameters
    params = {}

    # initialize sitemap
    global sitemap
    sitemap = '<?xml version="1.0" encoding="UTF-8"?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'

    # copy assets adding part of their sha256 value to the filename
    for path, _, files in os.walk('assets'):
        for name in files:
            file = os.path.join(path, name)
            rfile = os.path.relpath(file, 'assets')
            with open(file, 'r') as c:
                content = c.read()

            # minify css
            if os.path.splitext(file)[1] == '.css':
                content = re.sub(r'\s*/\*(?:.|\n)*?\*/\s*', '', content)
                content = re.sub(r'\s+', ' ', content)
                content = re.sub(r'\s*({|}|;|,)\s*', r'\1', content)
                content = re.sub(r':\s*', ':', content)
                rfile = '{0}.min{1}'.format(*os.path.splitext(rfile))

            h = hashlib.sha256()
            h.update(content.encode('utf-8'))
            name, ext = os.path.splitext(rfile)
            dst = '{n}.{h}{e}'.format(n=name, h=h.hexdigest()[:8], e=ext)

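            # expose the fingerprinted filename to templates via the '_asset_<relative path>' placeholder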
            params['_asset_' + rfile] = dst
            basedir = os.path.dirname(os.path.join('_site', dst))
            if not os.path.isdir(basedir):
                os.makedirs(basedir)
            with open(os.path.join('_site', dst), 'w') as c:
                c.write(content)

    # load layouts
    base_layout = fread('layouts/base.html')
    page_layout = fread('layouts/page.html')
    post_layout = fread('layouts/post.html')
    list_html = fread('layouts/list.html')
    item_html = fread('layouts/item.html')
    feed_xml = fread('layouts/feed.xml')
    item_xml = fread('layouts/item.xml')
    layout_404 = fread('layouts/404.html')

    # combine layouts to form final layouts
    page_layout = render(base_layout, pre=True, content=page_layout)
    post_layout = render(base_layout, pre=True, content=post_layout)
    list_html = render(base_layout, pre=True, content=list_html)

    # create site pages
    make_pages('content/_index.md', '', page_layout, **params)
    make_pages('content/[!_]*.*', '{{ slug }}/', page_layout, **params)
    fwrite('404.html', render(layout_404, **params))

    # create blog post pages
    blog_posts, categories = make_pages('content/blog/[!_]*.*',
                                        'blog/{{ year }}/{{ month }}/{{ slug }}/',
                                        post_layout, True, **params)
    # create HTML list pages
    make_lists(blog_posts, 'blog/', list_html, item_html, **params)
    add_to_sitemap('blog/', lastmod=blog_posts[0]['lastmod'], priority='1.0')
    # create Atom feeds
    make_feed(blog_posts, 'blog/', feed_xml, item_xml, title='Personal blog',
              long_title='Oscar\'s Blog', **params)
    # create blog archive
    make_archive(blog_posts, categories, 'blog/', page_layout,
                 title='Blog archive', **params)
    # create blog categories
    for name, posts in categories.items():
        dst = 'blog/categories/' + urlize(name) + '/'
        src = 'content/blog/categories/' + urlize(name) + '.md'
        lt = name + ' on Oscar\'s Blog'
        eh = '<link rel="alternate" type="application/atom+xml" title="' + lt + '" href="/' + dst + 'index.xml"/>'
        make_lists(posts, dst, list_html, item_html, src=src, title=name,
                   extraheader=eh, **params)
        make_feed(posts, dst, feed_xml, item_xml, title=name, long_title=lt,
                  **params)

    # set redirections
    set_redirect('licenses/agpl-v3/', 'licenses/agpl-3.0.txt')
    set_redirect('licenses/gpl-v3/', 'licenses/gpl-3.0.txt')
    set_redirect('licenses/cc-by-4.0/', 'licenses/cc-by-4.0.txt')
    set_redirect('composer/', 'projects/composer/composer.html')

    fwrite('sitemap.xml', sitemap + '</urlset>')


if __name__ == '__main__':
    main()