diff --git a/docs/fetch_discourse_md.py b/docs/fetch_discourse_md.py index e2c85f39..390c8a9b 100755 --- a/docs/fetch_discourse_md.py +++ b/docs/fetch_discourse_md.py @@ -109,8 +109,9 @@ class DiscourseProcessor: class Patterns: post_sep_markdown = re.compile(r"-------------------------") imgs_urls = re.compile( - r"https://(?:[a-zA-Z0-9./_-]+)).*data-base62-sha1=\"(?P[a-zA-Z0-9]+)\".*\">" + r"(?Phttps://(?:[a-zA-Z0-9./_-]+))\"\salt=\"(?P(?:\w+\s?)+)\".*data-base62-sha1=\"(?P[a-zA-Z0-9]+)\".*\">)" ) + imgs_urls_biggest_img = r"(?<=srcset=\").*1\.5x,\s(https://.*)\s2x\"" hashed_images_urls = re.compile(r"upload://[a-zA-Z0-9]{27}\.(?:jpe?g|png|svg)") @staticmethod @@ -191,7 +192,11 @@ class DiscourseProcessor: def get_images_url_assocs_from_page(cls, page: HTMLPage) -> ImageUrlAssocs: result: list[tuple] = [] for match in re.finditer(DiscourseProcessor.Patterns.imgs_urls, page): - (sha1, image_cdn_url) = match.group("sha1", "image_cdn_url") + (sha1, image_cdn_url, img_tag) = match.group("sha1", "image_cdn_url", "tag") + if img_big := re.search( + DiscourseProcessor.Patterns.imgs_urls_biggest_img, img_tag + ): + image_cdn_url = img_big.group(1) result.append((sha1, image_cdn_url)) return result