#!/usr/bin/env python
__doc__ = """ Utility to extract pages from Discourse posts in a Markdown format
@author: Zeglius """
import html
import json
import os
import re
from argparse import ArgumentParser
from datetime import UTC, datetime
from string import Template
from sys import stderr, stdout
from typing import NamedTuple
import requests
# Module-wide switch read by `debug()`: nothing is printed while it is False.
_is_debug: bool = False
# Root URL of the Discourse instance.
# NOTE(review): presumably prefixed to post paths further down — confirm.
_BASE_URL = "https://universal-blue.discourse.group"
class UrlBatch(NamedTuple):
    """Immutable group of three URL strings that belong together.

    NOTE(review): the field names suggest the raw, JSON and source variants
    of one Discourse post URL — confirm against the code that builds it.
    """

    # Field order is part of the interface (positional construction/unpacking).
    raw_url: str
    json_url: str
    source_url: str
# Type aliases (`type` statements, Python 3.12+). Both alias `str`; they only
# label, for readers and type checkers, what kind of text a string holds.
type HTMLPage = str
type Markdown = str
def todo(msg: str = "TODO"):
    """Mark an unfinished code path, like rust's `todo!()`.

    Args:
        msg: Text for the error; a leading "TODO" is dropped before use.

    Raises:
        NotImplementedError: Always, carrying `msg` without its "TODO" prefix.
    """
    raise NotImplementedError(str.removeprefix(msg, "TODO"))
def debug(*msg: object) -> None:
    """Print `msg` to stderr, but only while `_is_debug` is `True`.

    Every line is prefixed with this file's path and the current process id.

    Args:
        *msg: Objects to print; `print` converts each one with `str()`.
    """
    # Reading a module-level name needs no `global` declaration.
    if not _is_debug:
        return
    # Hand the objects to `print` untouched: it applies `str()` itself, which
    # also works for class objects, where a direct `o.__str__()` call raises
    # TypeError because the unbound method receives no instance.
    print(f"[DEBUG {__file__}, PID={os.getpid()}]:", *msg, file=stderr)
# One `requests.Session`, created when the module is imported.
# NOTE(review): presumably shared by the HTTP calls further down — confirm.
session = requests.Session()
class DiscourseProcessor:
class Patterns:
post_sep_markdown = re.compile(r"-------------------------")
imgs_urls = re.compile(
r"https://(?:[a-zA-Z0-9./_-]+)).*data-base62-sha1=\"(?P[a-zA-Z0-9]+)\".*\">"
)
hashed_images_urls = re.compile(r"upload://([a-zA-Z0-9]+)", flags=re.I | re.M)
author_header_pttrn = re.compile(
r"^(?P\w+)\s\|\s(?P(?P\d{4})-(?P\d{2})-(?P\d{2}))\s(?P