docs and scripts

2021-04-11 01:10:25 +03:00
commit a9641468b8
219 changed files with 3655 additions and 0 deletions
--- a/scripts/init.py
+++ b/scripts/init.py
--- a/scripts/build.sh
+++ b/scripts/build.sh
@@ -0,0 +1,5 @@
+# Render the concatenated Markdown guide to EPUB. Both pandoc inputs are given
+# relative to build/, so stop instead of running pandoc in the wrong directory
+# when build/ does not exist.
+cd build || exit 1
+pandoc --css epub.css -o data-visualization-guide.epub data-visualization-guide.md
+
+# (</(?:h2|p|img|ul|pre)>)\s*
+# <p>Based on <a href="https://www.ibcs.com/standards/">International Business Communication Standards</a> 1.1 by <a href="https://www.ibcs.com/">IBCS Association</a>, licensed under <a href="https://creativecommons.org/licenses/by-sa/4.0/">CC BY-SA 4.0</a>. Adapted for the web and other formats by <a href="https://antonz.org/">Anton Zhiyanov</a>.</p>
--- a/scripts/concat.sh
+++ b/scripts/concat.sh
@@ -0,0 +1,17 @@
+# Assemble the single-file guide in build/ from the chapter sources in docs/.
+# All paths are relative, so this is expected to run from the repository root.
+
+# Without an existing build/ directory, `cp -r docs/img build` would create
+# build as a copy of img itself instead of producing build/img.
+mkdir -p build
+cp -r docs/img build
+cp docs/epub.css build
+cat \
+  docs/title.md \
+  docs/01-say.md \
+  docs/02-structure.md \
+  docs/04-express.md \
+  docs/05-simplify.md \
+  docs/06-condense.md \
+  docs/07-check.md \
+  docs/09-unify.md \
+  docs/epilogue.md \
+  > build/data-visualization-guide.md
+cd build || exit 1
+# Post-process every line, in this order:
+#   1. drop the "docs/" path prefix;
+#   2. replace any line that has text on both sides of a "←" or "→" with a
+#      single space (presumably the prev/next navigation links - confirm);
+#   3. remove "NN-name.md" chapter file names.
+# The result goes through a temporary file rather than `sed -i ''`, because
+# the in-place flag is spelled differently by BSD and GNU sed.
+sed -E \
+  -e 's/docs\///g' \
+  -e 's/^.+(←|→).+$/ /g' \
+  -e 's/[0-9]+-[a-z]+\.md//g' \
+  data-visualization-guide.md > data-visualization-guide.md.tmp \
+  && mv data-visualization-guide.md.tmp data-visualization-guide.md
--- a/scripts/convert-to-markdown.py
+++ b/scripts/convert-to-markdown.py
@@ -0,0 +1,21 @@
+import os.path
+import markdownify
+
+# Directory holding the pipeline's intermediate files; the path is relative,
+# so it is resolved against the current working directory.
+BASE_PATH = "build"
+# Input read by main(): the HTML file that scripts/extract.py writes.
+SOURCE_PATH = os.path.join(BASE_PATH, "raw.extract.html")
+# Output written by main(): the Markdown conversion of SOURCE_PATH.
+TARGET_PATH = os.path.join(BASE_PATH, "raw.md")
+
+
+def main():
+    """Convert the HTML in SOURCE_PATH to Markdown and write it to TARGET_PATH.
+
+    The whole source file is read into memory, converted with
+    ``markdownify.markdownify`` and written out, replacing any existing
+    TARGET_PATH.
+    """
+    # Context managers close the handles even when reading, converting or
+    # writing raises, which the manual open()/close() pairs did not.
+    # NOTE(review): both files use the platform default encoding; if UTF-8 is
+    # to be pinned, do it in every pipeline script at once so the files they
+    # hand to each other stay consistent.
+    with open(SOURCE_PATH) as file:
+        source = file.read()
+
+    target = markdownify.markdownify(source)
+    with open(TARGET_PATH, "w") as file:
+        file.write(target)
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/download-images.py
+++ b/scripts/download-images.py
@@ -0,0 +1,17 @@
+import os.path
+from . import engine
+
+# Directory holding the pipeline's intermediate files; the path is relative,
+# so it is resolved against the current working directory.
+BASE_PATH = "build"
+# Input read by main(): the HTML file that scripts/extract-images.py writes.
+SOURCE_PATH = os.path.join(BASE_PATH, "raw.images.html")
+# Directory handed to engine.download_images() as the download target.
+TARGET_PATH = os.path.join(BASE_PATH, "img")
+
+
+def main():
+    """Download every image referenced in SOURCE_PATH into TARGET_PATH.
+
+    Reads the HTML file as text and passes it to ``engine.download_images``,
+    which does the actual fetching.
+    """
+    # The context manager closes the handle even if read() raises.
+    with open(SOURCE_PATH) as file:
+        source = file.read()
+    engine.download_images(source, to=TARGET_PATH)
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/engine.py
+++ b/scripts/engine.py
@@ -0,0 +1,91 @@
+from collections import deque
+import os.path
+from bs4 import BeautifulSoup
+import httpx
+
+
+def extract(source):
+    """Reduce raw page HTML to a flat run of headings, paragraphs and images.
+
+    Every ``<div>`` in the document is visited in document order and dispatched
+    on its *first* CSS class:
+
+    * ``accordion-title-text``    -> its ``<h2>`` title, then its ``<p>`` elements
+    * ``accordion-content-image`` -> an ``<img>`` tag pointing at the local copy
+    * ``accordion-content-text``  -> the div's inner HTML
+
+    Any other div is ignored.
+
+    Args:
+        source: HTML markup as a string.
+
+    Returns:
+        The extracted fragments concatenated into one string, each fragment
+        followed by a newline.
+    """
+    doc = BeautifulSoup(source, "html.parser")
+    blocks = deque()
+    div_elems = doc.find_all("div")
+    for div_el in div_elems:
+        # A div with no class attribute (or an empty one) must simply be
+        # skipped; indexing div_el["class"][0] directly raises on such divs.
+        css_classes = div_el.get("class") or [None]
+        css_class = css_classes[0]
+        if css_class == "accordion-title-text":
+            blocks = append(blocks, extract_title(div_el))
+            blocks = append(blocks, extract_title_text(div_el))
+        elif css_class == "accordion-content-image":
+            blocks = append(blocks, extract_content_image(div_el, as_local=True))
+        elif css_class == "accordion-content-text":
+            blocks = append(blocks, extract_content_text(div_el))
+    return "".join(blocks)
+
+
+def extract_images(source):
+    """Collect an ``<img>`` tag for every image container found in *source*.
+
+    Looks up each ``<div>`` carrying the ``accordion-content-image`` class and
+    renders its image with the original (remote) ``src``.
+
+    Args:
+        source: HTML markup as a string.
+
+    Returns:
+        The rendered ``<img>`` tags joined into one string, each followed by a
+        newline.
+    """
+    soup = BeautifulSoup(source, "html.parser")
+    collected = deque()
+    for holder in soup.find_all("div", class_="accordion-content-image"):
+        collected = append(collected, extract_content_image(holder))
+    return "".join(collected)
+
+
+def download_images(source, to):
+    """Download every ``<img>`` referenced in *source* into the directory *to*.
+
+    Each file is stored under the lower-cased base name of its URL. A file
+    that already exists is not downloaded again, so the function can be
+    re-run to resume an interrupted download.
+
+    Args:
+        source: HTML markup as a string.
+        to: Target directory; created if it does not exist yet.
+
+    Raises:
+        httpx.HTTPStatusError: If a request is answered with an error status.
+        httpx.HTTPError: If a request fails at the transport level.
+    """
+    doc = BeautifulSoup(source, "html.parser")
+    os.makedirs(to, exist_ok=True)
+    for img_el in doc.find_all("img"):
+        url = img_el.get("src")
+        if not url:
+            # Nothing to fetch for an <img> without a src attribute.
+            continue
+        filename = os.path.basename(url).lower()
+        path = os.path.join(to, filename)
+        if os.path.exists(path):
+            continue
+        # Fetch and validate BEFORE creating the file: opening it first would
+        # leave an empty or bogus file behind on failure, and the exists()
+        # check above would then skip that image on every later run.
+        response = httpx.get(url)
+        response.raise_for_status()
+        with open(path, "wb") as file:
+            file.write(response.content)
+
+
+def extract_title(div_el):
+    """Render the first ``<h2>`` inside *div_el* as an ``<h2>...</h2>`` string.
+
+    Only the heading's own text (as returned by ``inner_text``) is kept.
+
+    Returns:
+        The heading markup, or ``None`` when *div_el* contains no ``<h2>``.
+    """
+    heading = div_el.find("h2")
+    if heading:
+        return "<h2>{0}</h2>".format(inner_text(heading))
+    return None
+
+
+def extract_title_text(div_el):
+    """Serialize every ``<p>`` found inside *div_el*.
+
+    Returns:
+        The paragraphs' markup joined with newlines, or ``None`` when
+        *div_el* contains no ``<p>`` at all.
+    """
+    found = div_el.find_all("p")
+    if found:
+        return "\n".join(map(str, found))
+    return None
+
+
+def extract_content_image(div_el, as_local=False):
+    """Render the first ``<img>`` inside *div_el* as a minimal ``<img>`` tag.
+
+    Args:
+        div_el: Element to search for an image.
+        as_local: When true, point ``src`` at ``img/<lower-cased base name>``,
+            the same file name ``download_images`` stores the image under,
+            instead of the original URL.
+
+    Returns:
+        ``'<img alt="..." src="...">'``, or ``None`` when *div_el* has no
+        ``<img>`` or the image has no ``src``. A missing ``alt`` is rendered
+        as an empty string.
+    """
+    import html
+
+    image_el = div_el.find("img")
+    if image_el is None:
+        return None
+    src = image_el.get("src")
+    if not src:
+        return None
+    if as_local:
+        image_path = os.path.join("img", os.path.basename(src).lower())
+    else:
+        image_path = src
+    alt = image_el.get("alt") or ""
+    # The parser hands attribute values over already unescaped, so they have
+    # to be escaped again; otherwise a quote in the alt text ends the
+    # attribute early and corrupts the generated markup.
+    return '<img alt="{0}" src="{1}">'.format(
+        html.escape(alt, quote=True), html.escape(image_path, quote=True)
+    )
+
+
+def extract_content_text(div_el):
+    """Return the markup nested inside *div_el*, without the div's own tag."""
+    return inner_html(div_el)
+
+
+def inner_html(el):
+    """Serialize the children of *el* and concatenate them into one string."""
+    return "".join(map(str, el.contents))
+
+
+def inner_text(el):
+    """Return the first text node directly under *el*, stripped of whitespace.
+
+    Text nested inside child elements is not considered
+    (``recursive=False``).
+
+    Returns:
+        The stripped text, or an empty string when *el* has no direct text
+        node, e.g. when all of its text sits inside a child element.
+    """
+    direct_text = el.find(text=True, recursive=False)
+    if direct_text is None:
+        # find() found no direct text node; calling .strip() on None would
+        # raise AttributeError.
+        return ""
+    return direct_text.strip()
+
+
+def append(container, elem):
+    """Add *elem* and a trailing newline to *container*, skipping falsy values.
+
+    The container is modified in place and also returned, so the call can be
+    used either as a statement or in an assignment.
+    """
+    if elem:
+        container.extend((elem, "\n"))
+    return container
--- a/scripts/extract-images.py
+++ b/scripts/extract-images.py
@@ -0,0 +1,21 @@
+import os.path
+from . import engine
+
+# Directory holding the pipeline's intermediate files; the path is relative,
+# so it is resolved against the current working directory.
+BASE_PATH = "build"
+# Input read by main(): the raw HTML page (how it gets there is not part of
+# this script).
+SOURCE_PATH = os.path.join(BASE_PATH, "raw.html")
+# Output written by main(): the result of engine.extract_images().
+TARGET_PATH = os.path.join(BASE_PATH, "raw.images.html")
+
+
+def main():
+    """Write the image tags found in SOURCE_PATH to TARGET_PATH.
+
+    The whole source file is read into memory, passed through
+    ``engine.extract_images`` and the result written out, replacing any
+    existing TARGET_PATH.
+    """
+    # Context managers close the handles even when reading, extracting or
+    # writing raises, which the manual open()/close() pairs did not.
+    with open(SOURCE_PATH) as file:
+        source = file.read()
+
+    target = engine.extract_images(source)
+    with open(TARGET_PATH, "w") as file:
+        file.write(target)
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/extract.py
+++ b/scripts/extract.py
@@ -0,0 +1,21 @@
+import os.path
+from . import engine
+
+# Directory holding the pipeline's intermediate files; the path is relative,
+# so it is resolved against the current working directory.
+BASE_PATH = "build"
+# Input read by main(): the raw HTML page (how it gets there is not part of
+# this script).
+SOURCE_PATH = os.path.join(BASE_PATH, "raw.html")
+# Output written by main(): the result of engine.extract().
+TARGET_PATH = os.path.join(BASE_PATH, "raw.extract.html")
+
+
+def main():
+    """Write the content extracted from SOURCE_PATH to TARGET_PATH.
+
+    The whole source file is read into memory, passed through
+    ``engine.extract`` and the result written out, replacing any existing
+    TARGET_PATH.
+    """
+    # Context managers close the handles even when reading, extracting or
+    # writing raises, which the manual open()/close() pairs did not.
+    with open(SOURCE_PATH) as file:
+        source = file.read()
+
+    target = engine.extract(source)
+    with open(TARGET_PATH, "w") as file:
+        file.write(target)
+
+
+if __name__ == "__main__":
+    main()