#!/usr/bin/env python

__doc__ = """
Utility to extract pages from Discourse posts in Markdown format

@author: Zeglius

Quick Glossary:
- HTMLPage: the contents of an HTML page
- Markdown: the contents of a Markdown page

How does this script work:

1. Pass a Discourse docs topic URL as argument:

   ```sh
   ./fetch_discourse_md.py "https://universal-blue.discourse.group/docs?topic=1146"
   ```

2. Create an UrlBatch for each URL passed. An UrlBatch is a named tuple whose contents are:

   - A URL pointing to the raw markdown of the topic
     (ex.: https://universal-blue.discourse.group/raw/1146)
   - A URL pointing to the json format of the topic
     (ex.: https://universal-blue.discourse.group/t/1146.json)

   We have a problem. We could simply use the raw markdown, but in that format image URLs
   point to a route that Discourse uses internally to fetch the images
   (ex.: upload://AliS5ytuj3Rro4xsxfHMnPiJMsR.jpeg).

   The solution lies in the json format of the topic, specifically its field
   '.post_stream.posts[].cooked'. That field contains the already rendered HTML page,
   including the URLs pointing to the images in the CDN.

   Problem is, how do we match the images in the markdown
   (ex.: upload://AliS5ytuj3Rro4xsxfHMnPiJMsR.jpeg) with the ones from the json
   (ex.: https://canada1.discourse-cdn.com/free1/uploads/univeral_blue/original/2X/f/feb6c68dc90b80d9432b6ce74a38f639b05202d5.jpeg)?

   This is an extract of the img element:

   ```html
   <img src="https://canada1.discourse-cdn.com/free1/uploads/univeral_blue/original/2X/f/feb6c68dc90b80d9432b6ce74a38f639b05202d5.jpeg"
        alt="Desktop" data-base62-sha1="AliS5ytuj3Rro4xsxfHMnPiJMsR">
   ```

   Bingo! The `data-base62-sha1` contents match the hash in the markdown image URL
   (ex.: upload://AliS5ytuj3Rro4xsxfHMnPiJMsR.jpeg).

3. Obtain the HTML page contents using the json url stored in the UrlBatch:

   ```python
   @classmethod
   def get_page_from_json(cls, batch: UrlBatch) -> HTMLPage:
       json_content = requests.get(batch.json_url).json()
       return json_content["post_stream"]["posts"][0]["cooked"]
   ```

4. From the HTML page, find the `<img>` tags with the following regex:

   ```regex
   (?P<image_cdn_url>https://(?:[a-zA-Z0-9./_-]+)).*data-base62-sha1=\"(?P<sha1>[a-zA-Z0-9]+)\".*\">
   ```

   With this regex we obtain:
   - the URL used by the CDN to store the image (group 'image_cdn_url')
   - the SHA1 used by the markdown (ex.: upload://<sha1>.jpeg) (group 'sha1')

5. Create an `img_url_assocs: list[tuple[str, str]]` following this schema:
   `[("<sha1>", "<image_cdn_url>"), ...]`

   ```python
   @classmethod
   def get_images_url_assocs_from_page(cls, page: HTMLPage) -> list[tuple[str, str]]:
       result: list[tuple[str, str]] = []
       for match in re.finditer(DiscourseProcessor.Patterns.imgs_urls, page):
           (sha1, image_cdn_url) = match.group("sha1", "image_cdn_url")
           result.append((sha1, image_cdn_url))
       return result
   ```

   Once we have associated each SHA1 with an image_cdn_url, it's time to fetch the Markdown.

6. Obtain the markdown:

   ```python
   @classmethod
   def get_markdown_from_raw(cls, batch: UrlBatch) -> Markdown:
       return requests.get(batch.raw_url).text
   ```

7. For each (sha1, image_cdn_url) pair in `img_url_assocs`, search the markdown for the
   _hashed urls_ (ex.: upload://AliS5ytuj3Rro4xsxfHMnPiJMsR.jpeg) and replace them with
   the image_cdn_url:

   ```python
   for sha1, image_cdn_url in img_url_assocs:
       # upload://<sha1> may carry a file extension; the CDN URL already
       # includes one, so consume the extension as part of the match
       md = re.sub(rf"upload://{re.escape(sha1)}(\.\w+)?", image_cdn_url, md)
   ```
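Putting the steps together, end to end (a minimal sketch, assuming the classmethods
shown above are defined on `DiscourseProcessor`; rate limiting and metadata handling
are left out):

```python
batch = DiscourseProcessor.transform_to_url_batch(
    "https://universal-blue.discourse.group/docs?topic=1146"
)                                                                          # step 2
page = DiscourseProcessor.get_page_from_json(batch)                        # step 3
img_url_assocs = DiscourseProcessor.get_images_url_assocs_from_page(page)  # steps 4-5
md = DiscourseProcessor.get_markdown_from_raw(batch)                       # step 6
for sha1, image_cdn_url in img_url_assocs:                                 # step 7
    md = re.sub(rf"upload://{re.escape(sha1)}(\.\w+)?", image_cdn_url, md)
print(md)
```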
"""

from argparse import ArgumentParser
from datetime import datetime, UTC
import fcntl
import html
import json
import os
import re
from string import Template
from sys import stdout, stderr
from time import sleep
from typing import NamedTuple

import requests

_is_debug: bool = False
_BASE_URL = "https://universal-blue.discourse.group"


class UrlBatch(NamedTuple):
    raw_url: str
    json_url: str
    source_url: str


type HTMLPage = str
type Markdown = str
type ImageUrlAssocs = list[tuple[str, str]]


def todo(msg: str = "TODO"):
    """Equivalent to rust `todo!()`"""
    msg = str.removeprefix(msg, "TODO")
    raise NotImplementedError(msg)


def debug(*msg) -> None:
    """Print to stderr if `_is_debug` is `True`"""
    global _is_debug
    if _is_debug:
        print(
            f"[DEBUG {__file__}, PID={os.getpid()}]:",
            *(str(o) for o in msg),
            file=stderr,
        )


def acquire_lock(lock_file_path="/tmp/mylock.lock"):
    """Open `lock_file_path` and take an exclusive flock on it; the lock is
    held until the returned file object is closed."""
    lock_file = open(lock_file_path, "w")
    fcntl.flock(lock_file, fcntl.LOCK_EX)
    return lock_file


class DiscourseProcessor:
    class Patterns:
        # Separator Discourse places between posts in the raw markdown
        post_sep_markdown = re.compile(r"-------------------------")
        # <img> tags in the cooked HTML: captures the CDN URL and the base62 sha1
        imgs_urls = re.compile(
            r"(?P<image_cdn_url>https://(?:[a-zA-Z0-9./_-]+)).*data-base62-sha1=\"(?P<sha1>[a-zA-Z0-9]+)\".*\">"
        )
        # Internal upload:// image URLs in the raw markdown
        hashed_images_urls = re.compile(r"upload://([a-zA-Z0-9]+)", flags=re.I | re.M)

    @classmethod
    def transform_to_url_batch(cls, url: str) -> UrlBatch:
        """Input a discourse topic url and return a batch of urls such as
        `/raw/{id}` and `/t/{id}.json`

        Args:
            url (str)
        """
        # Get topic id
        id = re.search(rf"{re.escape(_BASE_URL)}/docs\?topic=(\d+)", url)
        if id is None:
            raise Exception("id was not found")
        id = int(id.group(1))

        return UrlBatch(
            json_url=f"{_BASE_URL}/t/{id}.json",
            raw_url=f"{_BASE_URL}/raw/{id}",
            source_url=url,
        )

    @classmethod
    def fetch(cls, url: str) -> requests.Response:
        """GET `url`, retrying when Discourse answers with its rate-limit message."""
        tries = 2
        retry_pattern = r"Slow down, too many requests from this IP address. Please retry again in (\d+) seconds?\. Error code: ip_10_secs_limit\.$"
        while tries > 0:
            res = requests.get(url)
            if re.match(retry_pattern, res.text):
                debug("Timeout was hit: ", res.text)
                tries -= 1
                sleep(12)  # The limit is usually 10 seconds, +2 to be safe
                continue
            else:
                break
        return res

    @staticmethod
    def get_markdown_from_url(url: str):
        return requests.get(url).text

    @staticmethod
    def add_metadata_to_markdown(md: Markdown, url_discourse: str) -> Markdown:
        """Add commented metadata to a markdown page"""
        # Template that wraps the JSON metadata in an HTML comment
        meta_tmpl = Template(
            "\n".join(
                [
                    "<!--",
                    "$metadata",
                    "-->",
                ]
            )
            .lstrip()
            .rstrip()
        )

        metadata = html.escape(
            json.dumps(
                dict(
                    url_discourse=url_discourse,
                    fetched_at=datetime.now(UTC).__str__(),
                ),
            ),
            quote=False,
        )

        md_split = md.splitlines()

        author_header_pttrn = r"^(?P<author>\w+)\s\|\s(?P<date>(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2}))\s(?P