docs and scripts
This commit is contained in:
0
scripts/__init__.py
Normal file
0
scripts/__init__.py
Normal file
5
scripts/build.sh
Executable file
5
scripts/build.sh
Executable file
@@ -0,0 +1,5 @@
|
||||
#!/bin/sh
# Build the EPUB from the concatenated markdown file in build/ using pandoc.

# Stop here if the build directory is missing; otherwise pandoc would run
# in whatever directory the script was started from.
cd build || exit 1

pandoc --css epub.css -o data-visualization-guide.epub data-visualization-guide.md

# NOTE(review): the two commented lines below are kept verbatim from the
# original script. They look like reference notes (a regex and the
# attribution paragraph) rather than commands - confirm before removing.
# (</(?:h2|p|img|ul|pre)>)\s*
# <p>Based on <a href="https://www.ibcs.com/standards/">International Business Communication Standards</a> 1.1 by <a href="https://www.ibcs.com/">IBCS Association</a>, licensed under <a href="https://creativecommons.org/licenses/by-sa/4.0/">CC BY-SA 4.0</a>. Adapted for the web and other formats by <a href="https://antonz.org/">Anton Zhiyanov</a>.</p>
|
17
scripts/concat.sh
Executable file
17
scripts/concat.sh
Executable file
@@ -0,0 +1,17 @@
|
||||
#!/bin/sh
# Concatenate the chapter sources from docs/ into a single markdown file
# under build/, together with the images and stylesheet it references.

# Abort on the first failing command instead of continuing with partial output.
set -e

# Without an existing build/ directory, `cp -r docs/img build` would create
# `build` as a copy of the image directory instead of copying into it.
mkdir -p build

cp -r docs/img build
cp docs/epub.css build

cat \
    docs/title.md \
    docs/01-say.md \
    docs/02-structure.md \
    docs/04-express.md \
    docs/05-simplify.md \
    docs/06-condense.md \
    docs/07-check.md \
    docs/09-unify.md \
    docs/epilogue.md \
    > build/data-visualization-guide.md

cd build

# Post-process the combined file, applying the expressions in order per line:
#   1. drop every "docs/" path prefix;
#   2. replace any line containing an arrow (← or →) with a single space
#      (presumably the previous/next navigation lines - confirm);
#   3. remove chapter file names such as "01-say.md".
# A temporary file is used instead of `sed -i ''`, which only works with
# BSD sed and fails with GNU sed.
sed -E \
    -e 's/docs\///g' \
    -e 's/^.+(←|→).+$/ /g' \
    -e 's/[0-9]+-[a-z]+\.md//g' \
    data-visualization-guide.md > data-visualization-guide.md.tmp
mv data-visualization-guide.md.tmp data-visualization-guide.md
|
21
scripts/convert-to-markdown.py
Normal file
21
scripts/convert-to-markdown.py
Normal file
@@ -0,0 +1,21 @@
|
||||
import os.path
|
||||
import markdownify
|
||||
|
||||
# Directory that holds the input and output files of this script.
BASE_PATH = "build"
# HTML file read by main().
SOURCE_PATH = os.path.join(BASE_PATH, "raw.extract.html")
# Markdown file written by main().
TARGET_PATH = os.path.join(BASE_PATH, "raw.md")


def main():
    """Convert the HTML at SOURCE_PATH to markdown and write it to TARGET_PATH."""
    # Context managers close the files even when reading, converting
    # or writing raises, so no handle is leaked.
    with open(SOURCE_PATH) as file:
        source = file.read()

    target = markdownify.markdownify(source)

    with open(TARGET_PATH, "w") as file:
        file.write(target)


if __name__ == "__main__":
    main()
|
17
scripts/download-images.py
Normal file
17
scripts/download-images.py
Normal file
@@ -0,0 +1,17 @@
|
||||
import os.path
|
||||
from . import engine
|
||||
|
||||
# Directory that holds the input file and the image output directory.
BASE_PATH = "build"
# HTML file whose <img> tags list the images to download.
SOURCE_PATH = os.path.join(BASE_PATH, "raw.images.html")
# Directory the downloaded images are stored in.
TARGET_PATH = os.path.join(BASE_PATH, "img")


def main():
    """Read the HTML at SOURCE_PATH and download its images into TARGET_PATH."""
    # The context manager closes the file even when reading raises.
    with open(SOURCE_PATH) as file:
        source = file.read()

    engine.download_images(source, to=TARGET_PATH)


if __name__ == "__main__":
    main()
|
91
scripts/engine.py
Normal file
91
scripts/engine.py
Normal file
@@ -0,0 +1,91 @@
|
||||
from collections import deque
|
||||
import os.path
|
||||
from bs4 import BeautifulSoup
|
||||
import httpx
|
||||
|
||||
|
||||
def extract(source):
    """Extract titles, images and texts from accordion markup.

    Every ``<div>`` in ``source`` is inspected and dispatched on its first
    CSS class:

    * ``accordion-title-text``: the ``<h2>`` title and the ``<p>`` paragraphs;
    * ``accordion-content-image``: an ``<img>`` tag pointing to the local copy;
    * ``accordion-content-text``: the inner HTML of the div.

    Divs with any other class, or without a class, are ignored.

    Args:
        source: HTML document as a string.

    Returns:
        The extracted fragments as one string, each followed by a newline.
    """
    doc = BeautifulSoup(source, "html.parser")
    blocks = deque()
    for div_el in doc.find_all("div"):
        # A <div> without a class attribute has nothing to dispatch on;
        # indexing div_el["class"] directly would raise KeyError for it.
        css_classes = div_el.get("class")
        if not css_classes:
            continue
        css_class = css_classes[0]
        if css_class == "accordion-title-text":
            blocks = append(blocks, extract_title(div_el))
            blocks = append(blocks, extract_title_text(div_el))
        elif css_class == "accordion-content-image":
            blocks = append(blocks, extract_content_image(div_el, as_local=True))
        elif css_class == "accordion-content-text":
            blocks = append(blocks, extract_content_text(div_el))
    return "".join(blocks)
|
||||
|
||||
|
||||
def extract_images(source):
    """Collect the image tags of all ``accordion-content-image`` divs.

    Args:
        source: HTML document as a string.

    Returns:
        One string with an ``<img>`` tag per matching div, each followed
        by a newline. The original (remote) ``src`` values are kept.
    """
    soup = BeautifulSoup(source, "html.parser")
    collected = deque()
    for container in soup.find_all("div", class_="accordion-content-image"):
        collected = append(collected, extract_content_image(container))
    return "".join(collected)
|
||||
|
||||
|
||||
def download_images(source, to):
    """Download every image referenced in ``source`` into the ``to`` directory.

    The local file name is the lowercased base name of the image URL.
    Images whose target file already exists are skipped.

    Args:
        source: HTML string containing ``<img>`` tags.
        to: Path of the directory to store the images in; it is created
            when missing.

    Raises:
        httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
    """
    doc = BeautifulSoup(source, "html.parser")
    # open() below cannot create missing parent directories.
    os.makedirs(to, exist_ok=True)
    for img_el in doc.find_all("img"):
        url = img_el.get("src")
        if not url:
            # An <img> without a source has nothing to download.
            continue
        filename = os.path.basename(url).lower()
        path = os.path.join(to, filename)
        if os.path.exists(path):
            continue
        # Fetch before opening the target file: a failed request must not
        # leave an empty file behind, because the existence check above
        # would then skip this image on every following run.
        response = httpx.get(url)
        # Do not store an error page as if it were image data.
        response.raise_for_status()
        with open(path, "wb") as file:
            file.write(response.content)
|
||||
|
||||
|
||||
def extract_title(div_el):
    """Build an ``<h2>`` heading from the first ``<h2>`` inside ``div_el``.

    Returns:
        ``"<h2>...</h2>"`` containing the heading's own stripped text,
        or ``None`` when the div has no ``<h2>``.
    """
    heading = div_el.find("h2")
    if heading:
        return "<h2>" + inner_text(heading) + "</h2>"
    return None
|
||||
|
||||
|
||||
def extract_title_text(div_el):
    """Return the ``<p>`` elements of ``div_el`` as HTML, one per line.

    Returns:
        The serialized paragraphs joined by newlines, or ``None`` when
        the div contains no ``<p>``.
    """
    found = div_el.find_all("p")
    if found:
        return "\n".join(str(paragraph) for paragraph in found)
    return None
|
||||
|
||||
|
||||
def extract_content_image(div_el, as_local=False):
    """Build a minimal ``<img>`` tag from the first image inside ``div_el``.

    Args:
        div_el: Element to search for an ``<img>``.
        as_local: When true, ``src`` points to ``img/<lowercased file name>``
            instead of the original URL.

    Returns:
        ``'<img alt="..." src="...">'`` with escaped attribute values, or
        ``None`` when the div has no ``<img>`` or the image has no ``src``.
        A missing ``alt`` is emitted as an empty string.
    """
    from html import escape

    image_el = div_el.find("img")
    if image_el is None:
        return None
    src = image_el.get("src")
    if not src:
        return None
    if as_local:
        # HTML paths always use forward slashes; os.path.join would emit
        # a backslash on Windows.
        image_path = "img/" + os.path.basename(src).lower()
    else:
        image_path = src
    alt = image_el.get("alt", "")
    # The parser hands back decoded attribute values, so quotes and
    # ampersands must be escaped again before re-embedding them in markup.
    return '<img alt="{0}" src="{1}">'.format(
        escape(alt, quote=True), escape(image_path, quote=True)
    )
|
||||
|
||||
|
||||
def extract_content_text(div_el):
    """Return the inner HTML of ``div_el``, i.e. its children without the div itself."""
    return inner_html(div_el)
|
||||
|
||||
|
||||
def inner_html(el):
    """Serialize the children of ``el`` and concatenate them into one string."""
    return "".join(map(str, el.contents))
|
||||
|
||||
|
||||
def inner_text(el):
    """Return the first direct text node of ``el`` with surrounding whitespace removed.

    Text inside nested elements is not considered (``recursive=False``).
    """
    first_text = el.find(text=True, recursive=False)
    return first_text.strip()
|
||||
|
||||
|
||||
def append(container, elem):
    """Append ``elem`` and a trailing newline to ``container``.

    Falsy elements (``None``, empty string) are skipped entirely, so no
    stray newline is added for them.

    Returns:
        The same ``container`` object, modified in place.
    """
    if elem:
        for piece in (elem, "\n"):
            container.append(piece)
    return container
|
21
scripts/extract-images.py
Normal file
21
scripts/extract-images.py
Normal file
@@ -0,0 +1,21 @@
|
||||
import os.path
|
||||
from . import engine
|
||||
|
||||
# Directory that holds the input and output files of this script.
BASE_PATH = "build"
# HTML file read by main().
SOURCE_PATH = os.path.join(BASE_PATH, "raw.html")
# HTML file with the extracted image tags, written by main().
TARGET_PATH = os.path.join(BASE_PATH, "raw.images.html")


def main():
    """Extract the image tags from SOURCE_PATH and write them to TARGET_PATH."""
    # Context managers close the files even when reading, extracting
    # or writing raises, so no handle is leaked.
    with open(SOURCE_PATH) as file:
        source = file.read()

    target = engine.extract_images(source)

    with open(TARGET_PATH, "w") as file:
        file.write(target)


if __name__ == "__main__":
    main()
|
21
scripts/extract.py
Normal file
21
scripts/extract.py
Normal file
@@ -0,0 +1,21 @@
|
||||
import os.path
|
||||
from . import engine
|
||||
|
||||
# Directory that holds the input and output files of this script.
BASE_PATH = "build"
# HTML file read by main().
SOURCE_PATH = os.path.join(BASE_PATH, "raw.html")
# HTML file with the extracted content, written by main().
TARGET_PATH = os.path.join(BASE_PATH, "raw.extract.html")


def main():
    """Extract the content from SOURCE_PATH and write it to TARGET_PATH."""
    # Context managers close the files even when reading, extracting
    # or writing raises, so no handle is leaked.
    with open(SOURCE_PATH) as file:
        source = file.read()

    target = engine.extract(source)

    with open(TARGET_PATH, "w") as file:
        file.write(target)


if __name__ == "__main__":
    main()
|
Reference in New Issue
Block a user