From 74130ee1fe9264dc7a4c4c49fb416ef3dc12e322 Mon Sep 17 00:00:00 2001
From: Zeglius <33781398+Zeglius@users.noreply.github.com>
Date: Sun, 4 Aug 2024 18:50:07 +0200
Subject: [PATCH] docs: Always fetch the highest quality image in
 fetch_discourse_md
---
docs/fetch_discourse_md.py | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/docs/fetch_discourse_md.py b/docs/fetch_discourse_md.py
index e2c85f39..390c8a9b 100755
--- a/docs/fetch_discourse_md.py
+++ b/docs/fetch_discourse_md.py
@@ -109,8 +109,9 @@ class DiscourseProcessor:
class Patterns:
post_sep_markdown = re.compile(r"-------------------------")
imgs_urls = re.compile(
- r"https://(?:[a-zA-Z0-9./_-]+)).*data-base62-sha1=\"(?P[a-zA-Z0-9]+)\".*\">"
+ r"(?Phttps://(?:[a-zA-Z0-9./_-]+))\"\salt=\"(?P(?:\w+\s?)+)\".*data-base62-sha1=\"(?P[a-zA-Z0-9]+)\".*\">)"
)
+ imgs_urls_biggest_img = r"(?<=srcset=\").*1\.5x,\s(https://.*)\s2x\""
hashed_images_urls = re.compile(r"upload://[a-zA-Z0-9]{27}\.(?:jpe?g|png|svg)")
@staticmethod
@@ -191,7 +192,11 @@ class DiscourseProcessor:
def get_images_url_assocs_from_page(cls, page: HTMLPage) -> ImageUrlAssocs:
result: list[tuple] = []
for match in re.finditer(DiscourseProcessor.Patterns.imgs_urls, page):
- (sha1, image_cdn_url) = match.group("sha1", "image_cdn_url")
+ (sha1, image_cdn_url, img_tag) = match.group("sha1", "image_cdn_url", "tag")
+ if img_big := re.search(
+ DiscourseProcessor.Patterns.imgs_urls_biggest_img, img_tag
+ ):
+ image_cdn_url = img_big.group(1)
result.append((sha1, image_cdn_url))
return result