108 lines
2.6 KiB
Python
108 lines
2.6 KiB
Python
# -*- coding: utf-8 -*-
|
|
"""Helpers to encode Japanese characters.
|
|
|
|
Warning: this module is not known to work correctly yet.
|
|
"""
|
|
|
|
from __future__ import absolute_import
|
|
from __future__ import division
|
|
from __future__ import print_function
|
|
from __future__ import unicode_literals
|
|
|
|
|
|
try:
|
|
import jaconv
|
|
except ImportError:
|
|
jaconv = None
|
|
|
|
|
|
def encode_katakana(text):
    """Encode Japanese text as bytes of the printer's KATAKANA code page.

    When the optional ``jaconv`` package is available, hiragana is first
    turned into katakana and the result is converted to half-width
    katakana. Every resulting character is then looked up in
    ``TXT_ENC_KATAKANA_MAP``.

    :param text: unicode string to encode
    :returns: byte string holding the code of every character that has an
        entry in ``TXT_ENC_KATAKANA_MAP``; characters without an entry are
        skipped silently
    """
    if jaconv:
        # The conversion may yield more characters than it was given: a
        # voiced kana such as "ガ" becomes the base kana followed by a
        # separate sound mark. Converting the whole text up front and
        # looking up the result character by character keeps both parts
        # instead of discarding the pair as one unknown key.
        text = jaconv.z2h(jaconv.hira2kata(text))

    encoded = []
    for char in text:
        code = TXT_ENC_KATAKANA_MAP.get(char)
        if code is not None:
            encoded.append(code)
        # TODO: characters without a mapping are dropped. Raising here would
        # change what existing callers observe, so this stays best-effort.
    return b"".join(encoded)
|
|
|
|
|
|
# Maps katakana characters (unicode strings) to their single-byte codes in
# the KATAKANA code page.
# TODO: does this really have to be hardcoded?
TXT_ENC_KATAKANA_MAP = {
    # Punctuation
    '。': b'\xa1',
    '「': b'\xa2',
    '」': b'\xa3',
    '、': b'\xa4',
    '・': b'\xa5',
    # Small kana and the prolonged sound mark
    'ヲ': b'\xa6',
    'ァ': b'\xa7',
    'ィ': b'\xa8',
    'ゥ': b'\xa9',
    'ェ': b'\xaa',
    'ォ': b'\xab',
    'ャ': b'\xac',
    'ュ': b'\xad',
    'ョ': b'\xae',
    'ッ': b'\xaf',
    'ー': b'\xb0',
    # Regular kana
    'ア': b'\xb1',
    'イ': b'\xb2',
    'ウ': b'\xb3',
    'エ': b'\xb4',
    'オ': b'\xb5',
    'カ': b'\xb6',
    'キ': b'\xb7',
    'ク': b'\xb8',
    'ケ': b'\xb9',
    'コ': b'\xba',
    'サ': b'\xbb',
    'シ': b'\xbc',
    'ス': b'\xbd',
    'セ': b'\xbe',
    'ソ': b'\xbf',
    'タ': b'\xc0',
    'チ': b'\xc1',
    'ツ': b'\xc2',
    'テ': b'\xc3',
    'ト': b'\xc4',
    'ナ': b'\xc5',
    'ニ': b'\xc6',
    'ヌ': b'\xc7',
    'ネ': b'\xc8',
    'ノ': b'\xc9',
    'ハ': b'\xca',
    'ヒ': b'\xcb',
    'フ': b'\xcc',
    'ヘ': b'\xcd',
    'ホ': b'\xce',
    'マ': b'\xcf',
    'ミ': b'\xd0',
    'ム': b'\xd1',
    'メ': b'\xd2',
    'モ': b'\xd3',
    'ヤ': b'\xd4',
    'ユ': b'\xd5',
    'ヨ': b'\xd6',
    'ラ': b'\xd7',
    'リ': b'\xd8',
    'ル': b'\xd9',
    'レ': b'\xda',
    'ロ': b'\xdb',
    'ワ': b'\xdc',
    'ン': b'\xdd',
    # Voiced and semi-voiced sound marks
    '゙': b'\xde',
    '゚': b'\xdf',
}

# jaconv.z2h() returns the half-width forms (U+FF61..U+FF9F), so register
# those as keys too; otherwise converted text never finds an entry. The
# codes 0xA1..0xDF above follow the JIS X 0201 katakana order, which is
# also what the single-byte range of the stdlib ``shift_jis`` codec decodes
# to, so the half-width key for each code is obtained by decoding it.
TXT_ENC_KATAKANA_MAP.update(
    (bytes(bytearray([code])).decode('shift_jis'), bytes(bytearray([code])))
    for code in range(0xA1, 0xE0)
)
|