docs and scripts

This commit is contained in:
Anton
2021-04-11 01:10:25 +03:00
commit a9641468b8
219 changed files with 3655 additions and 0 deletions

0
scripts/__init__.py Normal file
View File

5
scripts/build.sh Executable file
View File

@@ -0,0 +1,5 @@
#!/bin/sh
# Build the EPUB edition of the guide with pandoc.
# Expects build/ to already contain data-visualization-guide.md and epub.css.

# Abort on the first failing command, so pandoc never runs in the wrong
# directory when `cd build` fails.
set -e

cd build
pandoc --css epub.css -o data-visualization-guide.epub data-visualization-guide.md

# NOTE(review): the two commented lines below are not executed. They look like
# leftover notes (a regex and the attribution paragraph) for a manual
# post-processing step - confirm before removing.
# (</(?:h2|p|img|ul|pre)>)\s*
# <p>Based on <a href="https://www.ibcs.com/standards/">International Business Communication Standards</a> 1.1 by <a href="https://www.ibcs.com/">IBCS Association</a>, licensed under <a href="https://creativecommons.org/licenses/by-sa/4.0/">CC BY-SA 4.0</a>. Adapted for the web and other formats by <a href="https://antonz.org/">Anton Zhiyanov</a>.</p>

17
scripts/concat.sh Executable file
View File

@@ -0,0 +1,17 @@
#!/bin/sh
# Assemble the chapter files from docs/ into one Markdown file under build/,
# alongside the images and the EPUB stylesheet, then clean up links that only
# make sense in the multi-file layout.

# Abort on the first failing command instead of continuing with partial output.
set -e

# Create build/ up front: if it were missing, `cp -r docs/img build` would copy
# the image directory *as* build rather than into build/img.
mkdir -p build
cp -r docs/img build
cp docs/epub.css build

cat \
    docs/title.md \
    docs/01-say.md \
    docs/02-structure.md \
    docs/04-express.md \
    docs/05-simplify.md \
    docs/06-condense.md \
    docs/07-check.md \
    docs/09-unify.md \
    docs/epilogue.md \
    > build/data-visualization-guide.md

cd build

# One sed pass applies the three substitutions to each line in order:
#   1. drop the "docs/" prefix from paths,
#   2. replace lines containing a left/right arrow with a single space,
#   3. remove references to chapter files such as "01-say.md".
# `-i.bak` (suffix attached) is accepted by both BSD and GNU sed, unlike the
# BSD-only `-i ''`; the backup file is removed afterwards.
sed -E -i.bak \
    -e 's/docs\///g' \
    -e 's/^.+(←|→).+$/ /g' \
    -e 's/[0-9]+-[a-z]+\.md//g' \
    data-visualization-guide.md
rm -f data-visualization-guide.md.bak

View File

@@ -0,0 +1,21 @@
import os.path
import markdownify
BASE_PATH = "build"
SOURCE_PATH = os.path.join(BASE_PATH, "raw.extract.html")
TARGET_PATH = os.path.join(BASE_PATH, "raw.md")


def main():
    """Convert the HTML in SOURCE_PATH to Markdown and write it to TARGET_PATH.

    Files are opened with context managers so they are closed even when
    reading, converting or writing fails, and with an explicit UTF-8 encoding
    so the result does not depend on the platform's default locale.
    """
    with open(SOURCE_PATH, encoding="utf-8") as file:
        source = file.read()
    target = markdownify.markdownify(source)
    with open(TARGET_PATH, "w", encoding="utf-8") as file:
        file.write(target)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,17 @@
import os.path
from . import engine
BASE_PATH = "build"
SOURCE_PATH = os.path.join(BASE_PATH, "raw.images.html")
TARGET_PATH = os.path.join(BASE_PATH, "img")


def main():
    """Download every image referenced in SOURCE_PATH into TARGET_PATH.

    The HTML is read with a context manager (closed even on error) and an
    explicit UTF-8 encoding, then handed to ``engine.download_images``.
    """
    with open(SOURCE_PATH, encoding="utf-8") as file:
        source = file.read()
    engine.download_images(source, to=TARGET_PATH)


if __name__ == "__main__":
    main()

91
scripts/engine.py Normal file
View File

@@ -0,0 +1,91 @@
from collections import deque
import os.path
from bs4 import BeautifulSoup
import httpx
def extract(source):
    """Extract titles, images and text from accordion-style HTML.

    Every ``<div>`` in ``source`` is dispatched on its first CSS class:

    * ``accordion-title-text``    -> ``<h2>`` title plus its paragraphs
    * ``accordion-content-image`` -> ``<img>`` tag pointing at the local copy
    * ``accordion-content-text``  -> inner HTML of the div

    Divs with any other class, or with no class at all, are ignored.

    Args:
        source: HTML markup as a string.

    Returns:
        The extracted fragments, each followed by a newline, joined into a
        single string.
    """
    doc = BeautifulSoup(source, "html.parser")
    blocks = deque()
    for div_el in doc.find_all("div"):
        # Use .get(): indexing div_el["class"] raises KeyError for a <div>
        # that has no class attribute.
        css_classes = div_el.get("class")
        if not css_classes:
            continue
        css_class = css_classes[0]
        if css_class == "accordion-title-text":
            append(blocks, extract_title(div_el))
            append(blocks, extract_title_text(div_el))
        elif css_class == "accordion-content-image":
            append(blocks, extract_content_image(div_el, as_local=True))
        elif css_class == "accordion-content-text":
            append(blocks, extract_content_text(div_el))
    return "".join(blocks)
def extract_images(source):
    """Collect the image tags of all ``accordion-content-image`` divs.

    Args:
        source: HTML markup as a string.

    Returns:
        One ``<img>`` tag per matching div (keeping the original ``src``),
        each followed by a newline, joined into a single string.
    """
    soup = BeautifulSoup(source, "html.parser")
    collected = deque()
    for holder in soup.find_all("div", class_="accordion-content-image"):
        # append() mutates and returns the same container.
        append(collected, extract_content_image(holder))
    return "".join(collected)
def download_images(source, to):
    """Download every ``<img>`` referenced in ``source`` into directory ``to``.

    Each file is stored under the lower-cased basename of its URL. Images
    whose target file already exists are skipped, so the function can be
    re-run to resume an interrupted download.

    Args:
        source: HTML markup as a string.
        to: Path of the target directory; created if it does not exist.

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status.
        httpx.HTTPError: On other request failures raised by ``httpx.get``.
    """
    doc = BeautifulSoup(source, "html.parser")
    os.makedirs(to, exist_ok=True)
    for img_el in doc.find_all("img"):
        url = img_el.get("src")
        if not url:
            # An <img> without a src has nothing to download.
            continue
        filename = os.path.basename(url).lower()
        path = os.path.join(to, filename)
        if os.path.exists(path):
            continue
        # Fetch and validate BEFORE opening the file: otherwise a failed
        # request leaves an empty or bogus file behind that the exists()
        # check above would silently skip on every later run.
        response = httpx.get(url)
        response.raise_for_status()
        with open(path, "wb") as file:
            file.write(response.content)
def extract_title(div_el):
    """Build an ``<h2>`` tag from the first ``<h2>`` found inside ``div_el``.

    Returns:
        ``"<h2>...</h2>"`` wrapping the heading's own stripped text, or
        ``None`` when ``div_el`` contains no ``<h2>``.
    """
    heading = div_el.find("h2")
    if heading:
        return "<h2>{0}</h2>".format(inner_text(heading))
    return None
def extract_title_text(div_el):
    """Return the markup of all ``<p>`` elements in ``div_el``, one per line.

    Returns:
        The string form of each paragraph joined with newlines, or ``None``
        when ``div_el`` contains no paragraphs.
    """
    found = div_el.find_all("p")
    if found:
        return "\n".join(map(str, found))
    return None
def extract_content_image(div_el, as_local=False):
    """Build an ``<img>`` tag for the first image found inside ``div_el``.

    Args:
        div_el: Element to search for an ``<img>``.
        as_local: When true, point ``src`` at ``img/<lower-cased basename>``
            (the name ``download_images`` stores the file under) instead of
            the original URL.

    Returns:
        The ``<img alt="..." src="...">`` markup, or ``None`` when ``div_el``
        has no ``<img>`` or the image has no ``src``. A missing ``alt`` is
        rendered as an empty string.
    """
    import html  # local import keeps this block self-contained

    image_el = div_el.find("img")
    if image_el is None:
        return None
    src = image_el.get("src")
    if not src:
        return None
    if as_local:
        image_path = os.path.join("img", os.path.basename(src).lower())
    else:
        image_path = src
    # Attribute values read from the parsed element are unescaped; re-escape
    # them so quotes or ampersands cannot break the generated markup.
    return '<img alt="{0}" src="{1}">'.format(
        html.escape(image_el.get("alt", "")), html.escape(image_path)
    )
def extract_content_text(div_el):
    """Return the inner HTML of ``div_el`` (its children, without the div)."""
    return inner_html(div_el)
def inner_html(el):
    """Concatenate the string form of every item in ``el.contents``."""
    return "".join(str(child) for child in el.contents)
def inner_text(el):
    """Return the stripped text of the first direct text child of ``el``.

    Text inside nested elements is not considered (``recursive=False``).

    Returns:
        The stripped text, or ``""`` when ``el`` has no direct text child.
    """
    text_node = el.find(text=True, recursive=False)
    if text_node is None:
        # find() returns None when nothing matches; calling .strip() on it
        # would raise AttributeError.
        return ""
    return text_node.strip()
def append(container, elem):
    """Add ``elem`` followed by a newline to ``container``, skipping falsy values.

    The container is modified in place and also returned, so calls can be
    written as ``blocks = append(blocks, value)``.
    """
    if elem:
        container.extend((elem, "\n"))
    return container

21
scripts/extract-images.py Normal file
View File

@@ -0,0 +1,21 @@
import os.path
from . import engine
BASE_PATH = "build"
SOURCE_PATH = os.path.join(BASE_PATH, "raw.html")
TARGET_PATH = os.path.join(BASE_PATH, "raw.images.html")


def main():
    """Write the image tags found in SOURCE_PATH to TARGET_PATH.

    Files are opened with context managers so they are closed even when
    reading, extraction or writing fails, and with an explicit UTF-8 encoding
    so the result does not depend on the platform's default locale.
    """
    with open(SOURCE_PATH, encoding="utf-8") as file:
        source = file.read()
    target = engine.extract_images(source)
    with open(TARGET_PATH, "w", encoding="utf-8") as file:
        file.write(target)


if __name__ == "__main__":
    main()

21
scripts/extract.py Normal file
View File

@@ -0,0 +1,21 @@
import os.path
from . import engine
BASE_PATH = "build"
SOURCE_PATH = os.path.join(BASE_PATH, "raw.html")
TARGET_PATH = os.path.join(BASE_PATH, "raw.extract.html")


def main():
    """Write the content extracted from SOURCE_PATH to TARGET_PATH.

    Files are opened with context managers so they are closed even when
    reading, extraction or writing fails, and with an explicit UTF-8 encoding
    so the result does not depend on the platform's default locale.
    """
    with open(SOURCE_PATH, encoding="utf-8") as file:
        source = file.read()
    target = engine.extract(source)
    with open(TARGET_PATH, "w", encoding="utf-8") as file:
        file.write(target)


if __name__ == "__main__":
    main()