#!/usr/bin/env python __doc__ = """ Utility to extract pages from Discourse posts in a Markdown format @author: Zeglius """ import html import json import os import re from argparse import ArgumentParser from datetime import UTC, datetime from string import Template from sys import stderr, stdout from typing import NamedTuple import requests _is_debug: bool = False _BASE_URL = "https://universal-blue.discourse.group" class UrlBatch(NamedTuple): raw_url: str json_url: str source_url: str type HTMLPage = str type Markdown = str def todo(msg: str = "TODO"): """Equivalent to rust `todo!()`""" msg = str.removeprefix(msg, "TODO") raise NotImplementedError(msg) def debug(*msg) -> None: """Print to stderr if `_is_debug` is `True`""" global _is_debug if _is_debug: return print( f"[DEBUG {__file__}, PID={os.getpid()}]:", *(o.__str__() for o in msg), file=stderr, ) session = requests.Session() class DiscourseProcessor: class Patterns: post_sep_markdown = re.compile(r"-------------------------") imgs_urls = re.compile( r"https://(?:[a-zA-Z0-9./_-]+)).*data-base62-sha1=\"(?P[a-zA-Z0-9]+)\".*\">" ) hashed_images_urls = re.compile(r"upload://([a-zA-Z0-9]+)", flags=re.I | re.M) author_header_pttrn = re.compile( r"^(?P\w+)\s\|\s(?P(?P\d{4})-(?P\d{2})-(?P\d{2}))\s(?P