# -*- coding: utf-8 -*-
"""Helpers to encode Japanese text for a printer's single-byte Katakana code page.

The code page only contains katakana, a handful of punctuation marks and the
two (semi-)voiced sound marks.  Everything else cannot be represented.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import unicodedata

try:
    import jaconv
except ImportError:
    jaconv = None


def encode_katakana(text):
    """Encode *text* into the bytes of the Katakana code page.

    The input is processed in three steps:

    1. If the optional ``jaconv`` package is installed, hiragana is converted
       to katakana first.  Without ``jaconv`` hiragana is not converted and is
       therefore dropped in step 3.
    2. The text is NFKD-normalised.  This folds half-width katakana and
       punctuation into the full-width forms used as keys of
       ``TXT_ENC_KATAKANA_MAP`` and splits voiced / semi-voiced kana into the
       base kana followed by a separate sound mark, which is how the code page
       represents them (two bytes).
    3. Every resulting character is looked up in ``TXT_ENC_KATAKANA_MAP``.

    :param text: unicode string to encode
    :returns: the encoded bytes; characters that have no representation in
        the code page are silently skipped (best-effort encoding), so the
        result may be shorter than the input or empty
    """
    if jaconv:
        text = jaconv.hira2kata(text)

    # One input character may expand to several (e.g. a voiced kana becomes
    # base + sound mark), so the lookup has to run on the normalised string
    # character by character rather than on the original characters.
    normalized = unicodedata.normalize("NFKD", text)

    return b"".join(
        TXT_ENC_KATAKANA_MAP[char]
        for char in normalized
        if char in TXT_ENC_KATAKANA_MAP
    )


# Maps Unicode characters to their single-byte value in the Katakana code page.
# Keys are the full-width (NFKD-normalised) forms; ``encode_katakana`` folds
# half-width input into these before the lookup.
# TODO: does this really have to be hardcoded?
TXT_ENC_KATAKANA_MAP = {
    # Punctuation
    '。': b'\xa1',
    '「': b'\xa2',
    '」': b'\xa3',
    '、': b'\xa4',
    '・': b'\xa5',
    # Katakana
    'ヲ': b'\xa6',
    'ァ': b'\xa7',
    'ィ': b'\xa8',
    'ゥ': b'\xa9',
    'ェ': b'\xaa',
    'ォ': b'\xab',
    'ャ': b'\xac',
    'ュ': b'\xad',
    'ョ': b'\xae',
    'ッ': b'\xaf',
    'ー': b'\xb0',
    'ア': b'\xb1',
    'イ': b'\xb2',
    'ウ': b'\xb3',
    'エ': b'\xb4',
    'オ': b'\xb5',
    'カ': b'\xb6',
    'キ': b'\xb7',
    'ク': b'\xb8',
    'ケ': b'\xb9',
    'コ': b'\xba',
    'サ': b'\xbb',
    'シ': b'\xbc',
    'ス': b'\xbd',
    'セ': b'\xbe',
    'ソ': b'\xbf',
    'タ': b'\xc0',
    'チ': b'\xc1',
    'ツ': b'\xc2',
    'テ': b'\xc3',
    'ト': b'\xc4',
    'ナ': b'\xc5',
    'ニ': b'\xc6',
    'ヌ': b'\xc7',
    'ネ': b'\xc8',
    'ノ': b'\xc9',
    'ハ': b'\xca',
    'ヒ': b'\xcb',
    'フ': b'\xcc',
    'ヘ': b'\xcd',
    'ホ': b'\xce',
    'マ': b'\xcf',
    'ミ': b'\xd0',
    'ム': b'\xd1',
    'メ': b'\xd2',
    'モ': b'\xd3',
    'ヤ': b'\xd4',
    'ユ': b'\xd5',
    'ヨ': b'\xd6',
    'ラ': b'\xd7',
    'リ': b'\xd8',
    'ル': b'\xd9',
    'レ': b'\xda',
    'ロ': b'\xdb',
    'ワ': b'\xdc',
    'ン': b'\xdd',
    # Sound marks, written as escapes because the combining characters are
    # practically invisible in source code.
    '\u3099': b'\xde',  # voiced sound mark (dakuten)
    '\u309a': b'\xdf',  # semi-voiced sound mark (handakuten)
}