from collections import deque
import os.path

from bs4 import BeautifulSoup
import httpx


def extract(source):
    """Build one string from the accordion ``<div>`` elements in *source*.

    Each ``<div>`` found in the parsed document is dispatched on its
    first CSS class:

    * ``accordion-title-text``    -> ``extract_title`` then
      ``extract_title_text``
    * ``accordion-content-image`` -> ``extract_content_image`` (called with
      ``as_local=True``)
    * ``accordion-content-text``  -> ``extract_content_text``

    Divs with any other first class, or with no class at all, are ignored.

    Args:
        source: HTML markup to parse.

    Returns:
        The collected blocks concatenated with ``"".join``.
    """
    doc = BeautifulSoup(source, "html.parser")
    blocks = deque()
    for div_el in doc.find_all("div"):
        # A <div> without a class attribute must be skipped, not crash the
        # whole extraction with a KeyError.
        classes = div_el.get("class")
        if not classes:
            continue
        # Only the first class is used for dispatch.
        css_class = classes[0]
        if css_class == "accordion-title-text":
            # A title div contributes two blocks, in this order.
            blocks = append(blocks, extract_title(div_el))
            blocks = append(blocks, extract_title_text(div_el))
        elif css_class == "accordion-content-image":
            blocks = append(
                blocks, extract_content_image(div_el, as_local=True)
            )
        elif css_class == "accordion-content-text":
            blocks = append(blocks, extract_content_text(div_el))
    return "".join(blocks)


def extract_images(source):
    """Build one string from the ``accordion-content-image`` divs only.

    Unlike ``extract``, ``extract_content_image`` is called here without
    ``as_local``, so that helper's default applies.

    Args:
        source: HTML markup to parse.

    Returns:
        The collected image blocks concatenated with ``"".join``.
    """
    doc = BeautifulSoup(source, "html.parser")
    images = deque()
    for div_el in doc.find_all("div", class_="accordion-content-image"):
        images = append(images, extract_content_image(div_el))
    return "".join(images)


def download_images(source, to):
    """Download every ``<img>`` referenced in *source* into directory *to*.

    The local file name is the lower-cased basename of the image URL.
    Files that already exist are not downloaded again.

    Args:
        source: HTML markup to scan for ``<img>`` tags.
        to: Directory the images are written to.

    Raises:
        httpx.HTTPError: If a request fails or the server answers with an
            error status.
    """
    doc = BeautifulSoup(source, "html.parser")
    for img_el in doc.find_all("img"):
        url = img_el.get("src")
        if not url:
            # <img> without a usable src: nothing to download.
            continue
        filename = os.path.basename(url).lower()
        path = os.path.join(to, filename)
        if os.path.exists(path):
            continue
        # Fetch *before* opening the target file. Opening first would leave
        # an empty file behind when the request fails, and the existence
        # check above would then skip that image on every later run.
        response = httpx.get(url)
        # Do not store an HTTP error page under the image's file name.
        response.raise_for_status()
        with open(path, "wb") as file:
            file.write(response.content)