docs: Always fetch the highest quality image by fetch_discourse_md

This commit is contained in:
Zeglius 2024-08-04 18:50:07 +02:00
parent 433dd2d3dd
commit 74130ee1fe

View File

@ -109,8 +109,9 @@ class DiscourseProcessor:
class Patterns: class Patterns:
post_sep_markdown = re.compile(r"-------------------------") post_sep_markdown = re.compile(r"-------------------------")
imgs_urls = re.compile( imgs_urls = re.compile(
r"<img\ssrc=\"(?P<image_cdn_url>https://(?:[a-zA-Z0-9./_-]+)).*data-base62-sha1=\"(?P<sha1>[a-zA-Z0-9]+)\".*\">" r"(?P<tag><img\ssrc=\"(?P<image_cdn_url>https://(?:[a-zA-Z0-9./_-]+))\"\salt=\"(?P<alt>(?:\w+\s?)+)\".*data-base62-sha1=\"(?P<sha1>[a-zA-Z0-9]+)\".*\">)"
) )
imgs_urls_biggest_img = r"(?<=srcset=\").*1\.5x,\s(https://.*)\s2x\""
hashed_images_urls = re.compile(r"upload://[a-zA-Z0-9]{27}\.(?:jpe?g|png|svg)") hashed_images_urls = re.compile(r"upload://[a-zA-Z0-9]{27}\.(?:jpe?g|png|svg)")
@staticmethod @staticmethod
@ -191,7 +192,11 @@ class DiscourseProcessor:
def get_images_url_assocs_from_page(cls, page: HTMLPage) -> ImageUrlAssocs: def get_images_url_assocs_from_page(cls, page: HTMLPage) -> ImageUrlAssocs:
result: list[tuple] = [] result: list[tuple] = []
for match in re.finditer(DiscourseProcessor.Patterns.imgs_urls, page): for match in re.finditer(DiscourseProcessor.Patterns.imgs_urls, page):
(sha1, image_cdn_url) = match.group("sha1", "image_cdn_url") (sha1, image_cdn_url, img_tag) = match.group("sha1", "image_cdn_url", "tag")
if img_big := re.search(
DiscourseProcessor.Patterns.imgs_urls_biggest_img, img_tag
):
image_cdn_url = img_big.group(1)
result.append((sha1, image_cdn_url)) result.append((sha1, image_cdn_url))
return result return result