bazzite/docs/utils/fetch_discourse_md.py
Zeglius 0e18978fe1
feat(docs): Replace mdbook with mkdocs workflow ()
* feat: add mdbook docs

* chore: add several articles to docs

* docs: add documentation at surface level

Using Discourse urls as fallback for missing content for now

docs: add missing image files

* docs: Add missing chapter emojis

docs: Add missing warning in Advanced docs in summary

docs: add missing waydroid guide

docs: rename files to avoid spaces

docs: fix badly set docs build params

docs: remove unnecessary placeholders

* docs: Relocate 'Gaming' section under 'General'

* docs: Add 'Introduction' section

This section contains a table of contents of the documentation

* docs: Add unstable documentation warning

* docs: Add missing github url

docs: add missing symlink to resources

* docs: Add discourse scraper utility

* docs: minor discourse scraper docs changes

* docs: Add youtube embedding preprocessor

* minor reformat for youtube-embed

* docs: Add mdbook preprocessor template

* docs: add format-author preprocessor

* docs: add git lib to mdbook toolset

* docs: Always fetch the highest quality image by fetch_discourse_md

* docs: fix youtube-embed ignoring new line requirement

* docs: Add documentation transcription guide

* docs: Missing url in transcription guide

* docs: Remove YAML header from doc guide

* docs: Minor tweaks to transcription guide

* docs: Add utilities preprocessor module

docs: Move debug preprocessor util to utils

* docs: tweak debug function

* docs: Add 'replace-urls' preprocessor

* chore: Move mappings parameter in replace-urls preprocessor

* docs: add ignore field to replace-urls

* docs: add Mdbook python types

* docs: Add ignore field to replace-urls

Now we can exclude files from being processed with glob patterns

* chore(ci): add deploy_docs

* chore(ci): Add dynamic edit url template to deploy_docs

* chore(ci): Add html.site-url to deploy_docs

* chore(readme): Use relative paths for repo_content

* chore(ci): Add README to included paths for deploy_docs

* chore(ci): Disable deploy_docs

* chore(ci): Use main in deploy_docs.on.push.branches

* docs: Rephrase unstable docs warning

* chore(ci): Exclude docs from triggering build workflow

* chore(ci): Enable deploy_docs

* fix(docs): Remove unnecessary imports in preprocessors

* docs: Move unstable docs warning to index.hbs

* docs: Add page metadata inclusion with fetch_discourse_md.py

* docs: Move fetch_discourse_md.py to docs/utils

* docs: Add 'fetched_at' metadata field in fetch_discourse_md.py

* docs: Update fetch_discourse_md.py to format metadata in json

* Revert "chore(readme): Use relative paths for repo_content"

This reverts commit 6a781c659607e0c83c19248241684c5785c7e93b.

* docs: Replace include with a url to repo README

* ci(docs): Add multilanguage doc build support

* docs: add Justfile utility

* docs: update Justfile utility

* ci(docs): Add stricter workflow trigger to deploy_docs

* docs: add 'preview_translation' to Justfile

* docs: add documentation translation guide

* ci(docs): Add mdbook cache

* ci(docs): Add i18n-report

* ci(docs): tweak deploy_docs workflow triggers

* ci(docs): remove unnecessary slash at build.yml

* ci(docs): remove unnecessary slash at deploy_docs.yml

* ci(docs): add docs/book.toml to deploy_docs trigger

* ci(docs): Add schedule trigger

* ci(docs): add github-pages cleaning

* ci(docs): Exclude docs from generate_changelog

* docs: Add dependencies installation script

* ci(docs): Add mdbook pdf build

* docs: Tweak Justfile to support pdf generation

* Revert "docs: Always fetch the highest quality image by fetch_discourse_md"

This reverts commit 74130ee1fe9264dc7a4c4c49fb416ef3dc12e322.

* ci(docs): Exclude deploy_docs.yml from cache-mdbook keys

* docs: Add 'mdbook_build' to Justfile

* docs: Add 'mdbook_serve' to Justfile

* docs: Add debug flag to fetch_discourse_md

* docs: Automate discourse documentation scraping

* docs: Add flock to fetch_discourse_md

* docs: Add translation file generation with Justfile

* docs: Prefix url replacements with site-url in replace-urls.py preprocessor

* docs: Add installation guides

docs: Replace print button

* Revert "docs: Prefix url replacements with site-url in replace-urls.py preprocessor"

This reverts commit a685de4dce54debc900607d743069b79202a26ac.

* Reapply "docs: Prefix url replacements with site-url in replace-urls.py preprocessor"

This reverts commit 777d8055eac7543001200834939c960fb490e666.

* docs: fix replace-urls.py

* docs: fix fetch_discourse_md.py hitting discourse ip_10_secs_limit

* ci(docs): Remove duplicate '/' in build translation step

* ci(docs): Update actions/cache

* ci(docs): Reduce deploy_docs schedule timespan between triggers

* docs: update install-deps.sh

* docs: Update Advanced docs

* docs: Add favicon

* docs: Reword unstable documentation warning

* docs: Change default theme to 'navy'

* ci(docs): Move permissions to job scope

* docs: refactor fetch_discourse_md.py

Now it will export the function 'fetch', which other Python scripts can use.

* docs: Add mkdocs skeleton

* docs: Add cmdrun hook

* ci(docs): Migrate deploy_docs to use mkdocs

* chore: remove mdbook leftover files

* docs: add support for markdown emojis

* docs: add support for i18n translations to mkdocs

* docs: add resource prefetching

* docs: enable navigation indexes in toc

* docs: add unstable documentation warning

* docs: normalize toc

docs: Add markdown magiclinks

* docs: remove unnecessary extensions

* ci(docs): Separate docs build into its own action

* ci(docs): fix build docs action

* ci(docs): Add default parameters to build_mkdocs action

* ci(docs): Clean up leftover mdbook files

* docs: remove leftover mkdocs-print-site-plugin

* chore: add mkdocs offline documentation

* docs: fix list indentation

* ci(docs): Add github links to mkdocs

* ci(docs): Add github authors to mkdocs

* docs: Update documentation guide and scripts to mkdocs

* docs: Add cache capabilities to cmdrun hook

* docs: Enable instant loading

docs: Enable toc in sidebar

* docs: Update summary and add more posts

* docs: Add mkdocs-material social plugin

* docs: Disable instant loading

* docs: Fix section url

* docs: Fix fetch_discourse not fetching images properly

* docs: Disable warning for using absolute links

* docs: Add url replacement hook

* docs: Restore 'General' section

* docs: Remove additional languages for now

* docs: Add missing page titles

* docs: move and rename index.md to docs/src/Handheld_and_HTPC_edition/Steam_Gaming_Mode.md

* docs: remove leftover Bazzite_resources.md

* docs: Add time fallback to git-revision-date

* docs: Add navigation tabs

* docs: Clear cmdrun cache with Justfile

* docs: Add missing dual-boot guide url

* docs: Change to a shorter section name for handheld and HTPC

* docs: Add embed_youtube hook

* docs: Remove leftover resources entry in index

docs: Fix outdated 'Steam Gaming Overview' link in index

* docs: Limit vertical image size

* docs: add more url replacements

* docs: Enable search features
2024-08-28 15:48:19 -07:00


#!/usr/bin/env python
__doc__ = """ Utility to extract pages from Discourse posts in a Markdown format
@author: Zeglius
Quick Glossary:
- HTMLPage: an HTML page contents
- Markdown: a Markdown page contents
How does this script work:
1. Pass Discourse docs topic URL as argument
```sh
./fetch_discourse_md.py "https://universal-blue.discourse.group/docs?topic=1146"
```

2. Create a UrlBatch for each URL passed.

A UrlBatch is a named tuple whose contents are:

- A URL pointing to the raw markdown of the topic (ex.: https://universal-blue.discourse.group/raw/1146)
- A URL pointing to the json format of the topic (ex.: https://universal-blue.discourse.group/t/1146.json)
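
For illustration, here is a sketch of the batch built for the example topic
(`transform_to_url_batch` is defined further down in this file; the urls assume the default BASE_URL):

```python
batch = DiscourseProcessor.transform_to_url_batch(
    "https://universal-blue.discourse.group/docs?topic=1146"
)
assert batch.raw_url == "https://universal-blue.discourse.group/raw/1146"
assert batch.json_url == "https://universal-blue.discourse.group/t/1146.json"
```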

We have a problem. We could simply use the raw markdown, but in that format, image URLs point to a route that Discourse
uses internally to fetch the images (ex.: upload://AliS5ytuj3Rro4xsxfHMnPiJMsR.jpeg).

The solution lies in the json format of the topic, specifically its field '.post_stream.posts[].cooked'.
That field contains the already rendered html page, including the URLs pointing to the images in the CDN.

The problem is, how do we match the images in the markdown (ex.: upload://AliS5ytuj3Rro4xsxfHMnPiJMsR.jpeg) with the ones
from the json (ex.: https://canada1.discourse-cdn.com/free1/uploads/univeral_blue/original/2X/f/feb6c68dc90b80d9432b6ce74a38f639b05202d5.jpeg)?

This is an extract of the img element:

```html
<img src="https://canada1.discourse-cdn.com/free1/uploads/univeral_blue/original/2X/f/feb6c68dc90b80d9432b6ce74a38f639b05202d5.jpeg"
     alt="Desktop"
     data-base62-sha1="AliS5ytuj3Rro4xsxfHMnPiJMsR"
     width="690" height="448" data-dominant-color="4D5A5D">
```

Bingo! The `data-base62-sha1` contents match those of the image URL in the markdown (ex.: upload://AliS5ytuj3Rro4xsxfHMnPiJMsR.jpeg).

3. Obtain the HTML page contents using the json url stored in the UrlBatch:

```python
@classmethod
def get_page_from_json(cls, batch: UrlBatch) -> HTMLPage:
    json_content = requests.get(batch.json_url).json()
    return json_content["post_stream"]["posts"][0]["cooked"]
```

4. From the HTML page, find the `<img>` tags with the following regex:

```regex
<img src=\"(?P<image_cdn_url>https://(?:[a-zA-Z0-9./_-]+)).*data-base62-sha1=\"(?P<sha1>[a-zA-Z0-9]+)\".*\">
```

This regex gives us:

- the URL under which the CDN stores the image (group 'image_cdn_url')
- the SHA1 used by the markdown (ex.: upload://<SHA1>.jpeg) (group 'sha1')

5. Create an `img_url_assocs: ImageUrlAssocs` list of `(sha1, image_cdn_url)` pairs:

```python
@classmethod
def get_images_url_assocs_from_page(cls, page: HTMLPage) -> ImageUrlAssocs:
    result: ImageUrlAssocs = []
    for match in re.finditer(DiscourseProcessor.Patterns.imgs_urls, page):
        (sha1, image_cdn_url) = match.group("sha1", "image_cdn_url")
        result.append((sha1, image_cdn_url))
    return result
```

Once we have associated each SHA1 with an image_cdn_url, it's time to fetch the Markdown.

6. Obtain the markdown:

```python
@classmethod
def get_markdown_from_raw(cls, batch: UrlBatch) -> Markdown:
    return requests.get(batch.raw_url).text
```

7. For each `(sha1, image_cdn_url)` pair in `img_url_assocs`, search the markdown with regex for the _hashed urls_
(ex.: upload://AliS5ytuj3Rro4xsxfHMnPiJMsR.jpeg) and replace them with the image_cdn_url.
A sketch of that replacement (keeping in mind the file extension that follows the hash):

```python
for (sha1, image_cdn_url) in img_url_assocs:
    # "upload://<sha1>.<ext>" -> CDN url (the CDN url already carries the extension)
    markdown = re.sub(rf"upload://{sha1}(\.\w+)?", image_cdn_url, markdown)
```

Note: the implementation below takes a shortcut for this last step. `fetch()` rewrites each hash to
`{_BASE_URL}/uploads/short-url/{hash}` (see `simple_replace_match`) and lets Discourse redirect to
the CDN image, so no json lookup is needed.
"""
from argparse import ArgumentParser
from datetime import datetime, UTC
import fcntl
import html
import json
import os
import re
from string import Template
from sys import stdout, stderr
from time import sleep
from typing import NamedTuple
import requests
_is_debug: bool = False
_BASE_URL = os.getenv("BASE_URL", "https://universal-blue.discourse.group").rstrip("/")
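# BASE_URL may point at a different Discourse instance, e.g. (hypothetical):
#   BASE_URL=https://discourse.example.org ./fetch_discourse_md.py "<topic url>"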


class UrlBatch(NamedTuple):
    raw_url: str
    json_url: str
    source_url: str


type HTMLPage = str
type Markdown = str
type ImageUrlAssocs = list[tuple[str, str]]


def todo(msg: str = "TODO"):
    """Equivalent to rust `todo!()`"""
    msg = msg.removeprefix("TODO")
    raise NotImplementedError(msg)


def debug(*msg) -> None:
    """Print to stderr if `_is_debug` is `True`"""
    global _is_debug
    if _is_debug:
        return print(
            f"[DEBUG {__file__}, PID={os.getpid()}]:",
            *(str(o) for o in msg),
            file=stderr,
        )


def acquire_lock(lock_file_path="/tmp/mylock.lock"):
    """Take an exclusive flock so concurrent runs of this script are serialized.

    The lock is held until the returned file object is closed (see __main__).
    """
    lock_file = open(lock_file_path, "w")
    fcntl.flock(lock_file, fcntl.LOCK_EX)  # blocks until any other holder releases it
    return lock_file


class DiscourseProcessor:
    class Patterns:
        # "-------------------------" separates posts in Discourse's /raw output
        post_sep_markdown = re.compile(r"-------------------------")
        imgs_urls = re.compile(
            r"<img\ssrc=\"(?P<image_cdn_url>https://(?:[a-zA-Z0-9./_-]+)).*data-base62-sha1=\"(?P<sha1>[a-zA-Z0-9]+)\".*\">"
        )
        hashed_images_urls = re.compile(r"upload://([a-zA-Z0-9]{27})")

    @classmethod
    def transform_to_url_batch(cls, url: str) -> UrlBatch:
        """Input a discourse topic url and return a batch of urls such as `/raw/{id}` and `/t/{id}.json`

        Args:
            url (str)
        """
        # Get topic id
        id = re.search(rf"{re.escape(_BASE_URL)}/docs\?topic=(\d+)", url)
        if id is None:
            raise Exception("id was not found")
        id = int(id.group(1))
        return UrlBatch(
            json_url=f"{_BASE_URL}/t/{id}.json",
            raw_url=f"{_BASE_URL}/raw/{id}",
            source_url=url,
        )

    @classmethod
    def fetch(cls, url: str) -> requests.Response:
        """GET `url`, retrying once if Discourse's ip_10_secs_limit rate limit is hit."""
        tries = 2
        retry_pattern = r"Slow down, too many requests from this IP address\. Please retry again in (\d+) seconds?\. Error code: ip_10_secs_limit\.$"
        while tries > 0:
            res = requests.get(url)
            if re.match(retry_pattern, res.text):
                debug("Rate limit was hit: ", res.text)
                tries -= 1
                sleep(12)  # The limit window is usually 10 seconds; +2 to be safe
                continue
            else:
                break
        return res

    @classmethod
    def get_markdown_from_url(cls, url: str) -> Markdown:
        # Route through cls.fetch so the rate-limit retry above actually applies
        return cls.fetch(url).text

    @staticmethod
    def add_metadata_to_markdown(md: Markdown, url_discourse: str) -> Markdown:
        """Add commented metadata to a markdown page"""
        meta_tmpl = Template(
            "\n".join(
                [
                    "<!-- ANCHOR: METADATA -->",
                    "<!--$metadata-->",
                    "<!-- ANCHOR_END: METADATA -->",
                ]
            ).strip()
        )
        metadata = html.escape(
            json.dumps(
                dict(
                    url_discourse=url_discourse,
                    fetched_at=str(datetime.now(UTC)),
                ),
            ),
            quote=False,
        )
        md_split = md.splitlines()
        author_header_pttrn = r"^(?P<username>\w+)\s\|\s(?P<date>(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2}))\s(?P<time>(?P<hour>\d{2}):(?P<min>\d{2}):(?P<sec>\d{2})\s(?P<zone>\w+))\s\|\s#\d+"
        # Insert the metadata comment right below the "author | date | #N" header line
        if re.match(author_header_pttrn, md_split[0]):
            md_split[0] = "\n".join(
                [md_split[0], "", meta_tmpl.substitute(metadata=metadata)]
            )
        return "\n".join(md_split)


def simple_replace_match(match: re.Match) -> str:
    """Rewrite an `upload://<hash>` match into a Discourse short-url the server can resolve."""
    hash = match.group(1)
    if hash:
        return f"{_BASE_URL}/uploads/short-url/{hash}"
    return ""


def fetch(url: str) -> Markdown:
    """Fetch a Discourse topic as a single Markdown page, ready for the docs tree."""
    batch = DiscourseProcessor.transform_to_url_batch(url)
    result = DiscourseProcessor.get_markdown_from_url(batch.raw_url)
    # Replace hashed image urls with resolvable short-urls
    result = re.sub(
        DiscourseProcessor.Patterns.hashed_images_urls,
        simple_replace_match,
        result,
    )
    # Keep only the first post: drop the replies below the post separator
    result = DiscourseProcessor.Patterns.post_sep_markdown.split(result, 1)[0].rstrip()
    # Add metadata
    result = DiscourseProcessor.add_metadata_to_markdown(result, batch.source_url)
    return result
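

# Usage sketch from another script (the commit history notes that `fetch` is
# exported for reuse; the import path assumes the caller sits next to this file):
#
#   from fetch_discourse_md import fetch
#   md = fetch("https://universal-blue.discourse.group/docs?topic=1146")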


def main():
    argparser = ArgumentParser()
    argparser.add_argument(
        "url",
        type=str,
        help="discourse url to be processed",
    )
    argparser.add_argument(
        "-d",
        "--debug",
        help="Show additional info in stderr",
        action="store_true",
        dest="debug",
        default=False,
    )
    args = argparser.parse_args()

    global _is_debug
    _is_debug = os.getenv("DEBUG") == "1" or args.debug

    result = fetch(args.url)
    print(result, file=stdout)


if __name__ == "__main__":
    lock_file = acquire_lock()
    try:
        main()
    finally:
        lock_file.close()