3 回答

TA貢獻1801條經驗 獲得超16個贊
回想一下,字符串可以編碼為字節,字節又可以編碼為整數。之後可以反轉編碼,先取回字節,再還原出原始字符串。
此編碼器使用 binascii 生成與 charel-f 的答案中相同的整數編碼。我相信兩者是相同的,因為我對其進行了廣泛的測試。
from binascii import hexlify, unhexlify
class BytesIntEncoder:
    """Convert between ``bytes`` and the integer spelled by their hex digits.

    The integer is the value obtained by reading the bytes as one big
    hexadecimal number, so the first byte is the most significant.
    Leading zero-valued bytes add nothing to that number and therefore
    cannot be recovered by ``decode``.
    """

    @staticmethod
    def encode(b: bytes) -> int:
        """Return the integer whose hexadecimal digits are the bytes of *b*.

        Empty input maps to ``0``.
        """
        # int('', 16) raises ValueError, so the empty case is special-cased.
        return int(hexlify(b), 16) if b != b'' else 0

    @staticmethod
    def decode(i: int) -> bytes:
        """Return the bytes whose hexadecimal digits spell the integer *i*.

        ``0`` maps to ``b''``.  A negative *i* is rejected by ``unhexlify``
        because its formatted form contains a minus sign.
        """
        if i == 0:
            return b''
        hex_digits = '%x' % i
        # '%x' drops a leading zero nibble (e.g. 0x0f -> 'f'), but unhexlify
        # requires an even number of digits; restore the dropped nibble.
        if len(hex_digits) % 2:
            hex_digits = '0' + hex_digits
        return unhexlify(hex_digits)
如果您使用的是 Python <3.6,請刪除可選的類型注釋。
快速測試:
>>> s = 'Test123'
>>> b = s.encode()
>>> b
b'Test123'
>>> BytesIntEncoder.encode(b)
23755444588720691
>>> BytesIntEncoder.decode(_)
b'Test123'
>>> _.decode()
'Test123'

TA貢獻1802條經驗 獲得超5個贊
假設字符集只是字母數字,即 a-z、A-Z、0-9,則每個字符需要 6 位。因此,使用 8 位字節編碼在理論上是對內存的低效使用。
此答案將輸入字節轉換為 6 位整數序列,並使用按位運算將這些小整數編碼為一個大整數。這是否真的能轉化為現實世界中的存儲效率,要用 sys.getsizeof 來衡量;對於較大的字符串,這種可能性更大。
此實現會根據所選的字符集自定義編碼。例如,如果您只使用 string.ascii_lowercase(5 位)而不是 string.ascii_uppercase + string.digits(6 位),則編碼也會相應地更高效。
單元測試也包括在內。
import string
class BytesIntEncoder:
    """Pack bytes drawn from a fixed alphabet into a single integer.

    Each symbol of the alphabet is assigned a code from 1 to ``len(chars)``.
    Codes are stored in fixed-width bit fields, with the first input byte in
    the least significant field.  Codes start at 1 so that the last byte of
    the input always sets a bit in the top field, which lets ``decode``
    recover the length from ``int.bit_length()``.
    """

    def __init__(self, chars: bytes = (string.ascii_letters + string.digits).encode()):
        """Build the translation tables for the alphabet *chars*.

        Args:
            chars: The byte values that may appear in encoded input.

        Raises:
            ValueError: If *chars* has more than 255 symbols; codes must fit
                in a single byte and code 0 is reserved.
        """
        num_chars = len(chars)
        if num_chars > 255:
            raise ValueError(
                f'alphabet has {num_chars} symbols; at most 255 are supported')
        # Build the codes directly as bytes.  Going through chr(i).encode()
        # yields multi-byte UTF-8 for codes >= 128, which breaks maketrans.
        translation = bytes(range(1, num_chars + 1))
        self._chars = chars
        self._translation_table = bytes.maketrans(chars, translation)
        self._reverse_translation_table = bytes.maketrans(translation, chars)
        self._num_bits_per_char = (num_chars + 1).bit_length()

    def encode(self, chars: bytes) -> int:
        """Return the integer that packs the bytes of *chars*.

        Empty input encodes to ``0``.

        Raises:
            ValueError: If *chars* contains a byte that is not in the
                alphabet.
        """
        # Deleting every alphabet byte leaves only the offending ones.  They
        # would otherwise pass through translate() unchanged and corrupt the
        # neighbouring bit fields.
        stray = chars.translate(None, self._chars)
        if stray:
            raise ValueError(
                f'byte {stray[:1]!r} is not in the encoder alphabet')
        field_width = self._num_bits_per_char
        packed = 0
        codes = chars.translate(self._translation_table)
        for position, code in enumerate(codes):
            packed |= code << (position * field_width)
        return packed

    def decode(self, i: int) -> bytes:
        """Return the bytes that ``encode`` packed into *i*.

        Field values that are not assigned codes are not validated; they
        pass through the reverse table unchanged.

        Raises:
            ValueError: If *i* is negative.
        """
        if i < 0:
            raise ValueError('cannot decode a negative integer')
        field_width = self._num_bits_per_char
        field_mask = (1 << field_width) - 1
        codes = bytes(
            (i >> offset) & field_mask
            for offset in range(0, i.bit_length(), field_width)
        )
        return codes.translate(self._reverse_translation_table)
# Test
import itertools
import random
import unittest
class TestBytesIntEncoder(unittest.TestCase):
    """Round-trip tests for BytesIntEncoder over the alphanumeric alphabet."""

    chars = string.ascii_letters + string.digits
    encoder = BytesIntEncoder(chars.encode())

    def _test_encoding(self, b_in: bytes):
        """Encode *b_in*, decode the result, and check types and equality."""
        encoded = self.encoder.encode(b_in)
        self.assertIsInstance(encoded, int)
        decoded = self.encoder.decode(encoded)
        self.assertIsInstance(decoded, bytes)
        self.assertEqual(b_in, decoded)

    def test_thoroughly_with_small_str(self):
        """Check every combination-with-replacement of lengths 0 to 3."""
        for length in range(4):
            combos = itertools.combinations_with_replacement(self.chars, length)
            for combo in combos:
                self._test_encoding(''.join(combo).encode())

    def test_randomly_with_large_str(self):
        """Check random strings of lengths 0 to 255."""
        for length in range(256):
            # The number of samples per length is tiered so that short
            # strings are sampled heavily and long ones only lightly.
            if length <= 16:
                sample_count = 2 ** length
            elif length <= 32:
                sample_count = length ** 2
            elif length <= 64:
                sample_count = length * 2
            elif length <= 128:
                sample_count = length
            else:
                sample_count = 2
            for _ in range(sample_count):
                text = ''.join(random.choices(self.chars, k=length))
                self._test_encoding(text.encode())
# Run the unittest test runner when this file is executed as a script.
if __name__ == '__main__':
unittest.main()
用法示例:
>>> encoder = BytesIntEncoder()
>>> s = 'Test123'
>>> b = s.encode()
>>> b
b'Test123'
>>> encoder.encode(b)
3908257788270
>>> encoder.decode(_)
b'Test123'
添加回答
舉報