102 lines
2.4 KiB
Python
102 lines
2.4 KiB
Python
# -*- coding: utf-8 -*-
|
|
"""Helpers to encode Japanese characters.
|
|
|
|
I doubt that this currently works correctly.
|
|
"""
|
|
|
|
|
|
try:
|
|
import jaconv
|
|
except ImportError:
|
|
jaconv = None
|
|
|
|
|
|
def encode_katakana(text):
    """Encode *text* into bytes using ``TXT_ENC_KATAKANA_MAP``.

    When the optional ``jaconv`` package is importable, every character is
    first converted from hiragana to katakana and then to its half-width form;
    without ``jaconv`` the characters are looked up unchanged.

    NOTE: this helper is still considered experimental by its authors.

    :param text: iterable of characters (normally a ``str``) to encode
    :returns: the concatenated single-byte codes of all characters that have an
        entry in ``TXT_ENC_KATAKANA_MAP``; characters without an entry are
        silently skipped
    """
    encoded = []
    for char in text:
        if jaconv:
            # try to convert japanese text to half-katakanas
            char = jaconv.z2h(jaconv.hira2kata(char))

        # The conversion may turn one character into several (e.g. a voiced
        # kana becomes base kana + half-width voiced mark). Looking the whole
        # result up as a single key would never match, so every resulting
        # character is looked up on its own.
        for half_char in char:
            code = TXT_ENC_KATAKANA_MAP.get(half_char)
            if code is not None:
                encoded.append(code)
            # TODO: characters that are not in the map are discarded here.
            # Raising an exception when encoding is not possible may be
            # preferable, but callers currently rely on best-effort output.
    return b"".join(encoded)
|
|
|
|
|
|
TXT_ENC_KATAKANA_MAP = {
|
|
# Maps UTF-8 Katakana symbols to KATAKANA Page Codes
|
|
# TODO: has this really to be hardcoded?
|
|
# Half-Width Katakanas
|
|
"。": b"\xa1",
|
|
"「": b"\xa2",
|
|
"」": b"\xa3",
|
|
"、": b"\xa4",
|
|
"・": b"\xa5",
|
|
"ヲ": b"\xa6",
|
|
"ァ": b"\xa7",
|
|
"ィ": b"\xa8",
|
|
"ゥ": b"\xa9",
|
|
"ェ": b"\xaa",
|
|
"ォ": b"\xab",
|
|
"ャ": b"\xac",
|
|
"ュ": b"\xad",
|
|
"ョ": b"\xae",
|
|
"ッ": b"\xaf",
|
|
"ー": b"\xb0",
|
|
"ア": b"\xb1",
|
|
"イ": b"\xb2",
|
|
"ウ": b"\xb3",
|
|
"エ": b"\xb4",
|
|
"オ": b"\xb5",
|
|
"カ": b"\xb6",
|
|
"キ": b"\xb7",
|
|
"ク": b"\xb8",
|
|
"ケ": b"\xb9",
|
|
"コ": b"\xba",
|
|
"サ": b"\xbb",
|
|
"シ": b"\xbc",
|
|
"ス": b"\xbd",
|
|
"セ": b"\xbe",
|
|
"ソ": b"\xbf",
|
|
"タ": b"\xc0",
|
|
"チ": b"\xc1",
|
|
"ツ": b"\xc2",
|
|
"テ": b"\xc3",
|
|
"ト": b"\xc4",
|
|
"ナ": b"\xc5",
|
|
"ニ": b"\xc6",
|
|
"ヌ": b"\xc7",
|
|
"ネ": b"\xc8",
|
|
"ノ": b"\xc9",
|
|
"ハ": b"\xca",
|
|
"ヒ": b"\xcb",
|
|
"フ": b"\xcc",
|
|
"ヘ": b"\xcd",
|
|
"ホ": b"\xce",
|
|
"マ": b"\xcf",
|
|
"ミ": b"\xd0",
|
|
"ム": b"\xd1",
|
|
"メ": b"\xd2",
|
|
"モ": b"\xd3",
|
|
"ヤ": b"\xd4",
|
|
"ユ": b"\xd5",
|
|
"ヨ": b"\xd6",
|
|
"ラ": b"\xd7",
|
|
"リ": b"\xd8",
|
|
"ル": b"\xd9",
|
|
"レ": b"\xda",
|
|
"ロ": b"\xdb",
|
|
"ワ": b"\xdc",
|
|
"ン": b"\xdd",
|
|
"゙": b"\xde",
|
|
"゚": b"\xdf",
|
|
}
|