efficiency improvements for backwards encoding

This commit is contained in:
Michael Billington 2016-09-11 20:21:30 +10:00
parent b5bf1125db
commit d9a6960f07
1 changed files with 44 additions and 7 deletions

View File

@ -17,7 +17,7 @@ from __future__ import division
from __future__ import print_function from __future__ import print_function
from __future__ import unicode_literals from __future__ import unicode_literals
from builtins import bytes, chr from builtins import bytes
from .constants import CODEPAGE_CHANGE from .constants import CODEPAGE_CHANGE
from .exceptions import CharCodeError, Error from .exceptions import CharCodeError, Error
from .capabilities import get_profile from .capabilities import get_profile
@ -46,6 +46,7 @@ class Encoder(object):
def __init__(self, codepage_map): def __init__(self, codepage_map):
self.codepages = codepage_map self.codepages = codepage_map
self.available_encodings = set(codepage_map.keys()) self.available_encodings = set(codepage_map.keys())
self.available_characters = {}
self.used_encodings = set() self.used_encodings = set()
def get_sequence(self, encoding): def get_sequence(self, encoding):
@ -66,8 +67,15 @@ class Encoder(object):
).format(encoding, ','.join(self.codepages.keys()))) ).format(encoding, ','.join(self.codepages.keys())))
return encoding return encoding
def can_encode(self, encoding, char): def _get_codepage_char_list(self, encoding):
# Compute the encodable characters in the upper half of this code page """Get codepage character list
Gets characters 128-255 for a given code page, as an array.
:param encoding: The name of the encoding. This must be a valid python encoding.
"""
# Compute the encodable characters as an array (this is the format
# that for non-standard codings come in)
encodable_chars = [u" "] * 128 encodable_chars = [u" "] * 128
for i in range(0, 128): for i in range(0, 128):
codepoint = i + 128 codepoint = i + 128
@ -76,13 +84,42 @@ class Encoder(object):
except UnicodeDecodeError: except UnicodeDecodeError:
# Non-encodable character # Non-encodable character
pass pass
return encodable_chars
def _get_codepage_char_map(self, encoding):
""" Get codepage character map
Process an encoding and return a map of UTF-characters to code points
in this encoding.
This is generated once only, and returned from a cache.
:param encoding: The name of the encoding.
"""
# Skip things that were loaded previously
if encoding in self.available_characters:
return self.available_characters[encoding]
codepage_char_list = self._get_codepage_char_list(encoding)
codepage_char_map = dict((utf8, i + 128) for (i, utf8) in enumerate(codepage_char_list))
self.available_characters[encoding] = codepage_char_map
return codepage_char_map
def can_encode(self, encoding, char):
"""Determine if a character is encodeable in the given code page.
:param encoding: The name of the encoding.
:param char: The character to attempt to encode.
"""
available_map = {}
try:
available_map = self._get_codepage_char_map(encoding)
except LookupError: except LookupError:
# We don't have this encoding
return False return False
# Decide whether this character is encodeable in this code page # Decide whether this character is encodeable in this code page
is_ascii = ord(char) < 128 is_ascii = ord(char) < 128
return is_ascii or char in encodable_chars is_encodable = char in available_map
return is_ascii or is_encodable
def __encoding_sort_func(self, item): def __encoding_sort_func(self, item):
key, index = item key, index = item