dataviz/scripts/engine.py

92 lines
2.5 KiB
Python
Raw Permalink Normal View History

2021-04-10 22:10:25 +00:00
from collections import deque
import os.path
from bs4 import BeautifulSoup
import httpx
def extract(source):
doc = BeautifulSoup(source, "html.parser")
blocks = deque()
div_elems = doc.find_all("div")
for div_el in div_elems:
css_class = div_el["class"][0]
if css_class == "accordion-title-text":
blocks = append(blocks, extract_title(div_el))
blocks = append(blocks, extract_title_text(div_el))
elif css_class == "accordion-content-image":
blocks = append(blocks, extract_content_image(div_el, as_local=True))
elif css_class == "accordion-content-text":
blocks = append(blocks, extract_content_text(div_el))
else:
continue
return "".join(blocks)
def extract_images(source):
doc = BeautifulSoup(source, "html.parser")
images = deque()
div_elems = doc.find_all("div", class_="accordion-content-image")
for div_el in div_elems:
image = extract_content_image(div_el)
images = append(images, image)
return "".join(images)
def download_images(source, to):
doc = BeautifulSoup(source, "html.parser")
for img_el in doc.find_all("img"):
url = img_el["src"]
filename = os.path.basename(url).lower()
path = os.path.join(to, filename)
if os.path.exists(path):
continue
with open(path, "wb") as file:
file.write(httpx.get(url).content)
def extract_title(div_el):
title_el = div_el.find("h2")
if not title_el:
return None
title = "<h2>" + inner_text(title_el) + "</h2>"
return title
def extract_title_text(div_el):
paragraph_elems = div_el.find_all("p")
if not paragraph_elems:
return None
paragraphs = [str(el) for el in paragraph_elems]
return "\n".join(paragraphs)
def extract_content_image(div_el, as_local=False):
image_el = div_el.find("img")
if as_local:
image_path = os.path.join("img", os.path.basename(image_el["src"]).lower())
else:
image_path = image_el["src"]
image = '<img alt="{0}" src="{1}">'.format(image_el["alt"], image_path)
return image
def extract_content_text(div_el):
content = inner_html(div_el)
return content
def inner_html(el):
return "".join([str(x) for x in el.contents])
def inner_text(el):
return el.find(text=True, recursive=False).strip()
def append(container, elem):
if not elem:
return container
container.append(elem)
container.append("\n")
return container