diff --git a/Doc/library/binascii.rst b/Doc/library/binascii.rst index 70ba036756ff32..3facb139e17d43 100644 --- a/Doc/library/binascii.rst +++ b/Doc/library/binascii.rst @@ -182,6 +182,33 @@ The :mod:`!binascii` module defines the following functions: .. versionadded:: 3.15 +.. function:: a2b_base32(string, /, *, alphabet=BASE32_ALPHABET) + + Convert base32 data back to binary and return the binary data. + + Valid base32 data: + + * Conforms to :rfc:`4648`. + * Contains only characters from the base32 alphabet. + * Contains no excess data after padding (including excess padding, newlines, etc.). + * Does not start with padding. + + Optional *alphabet* must be a :class:`bytes` object of length 32 which + specifies an alternative alphabet. + + Invalid base32 data will raise :exc:`binascii.Error`. + + .. versionadded:: next + +.. function:: b2a_base32(data, /, *, alphabet=BASE32_ALPHABET) + + Convert binary data to a line(s) of ASCII characters in base32 coding, + as specified in :rfc:`4648`. The return value is the converted line. + + Optional *alphabet* must be a :term:`bytes-like object` of length 32 which + specifies an alternative alphabet. + + .. versionadded:: next .. function:: a2b_qp(data, header=False) @@ -327,6 +354,18 @@ The :mod:`!binascii` module defines the following functions: .. versionadded:: next +.. data:: BASE32_ALPHABET + + The base32 alphabet according to :rfc:`4648`. + + .. versionadded:: next + +.. data:: BASE32HEX_ALPHABET + + The "Extended Hex" base32hex alphabet according to :rfc:`4648`. + + .. versionadded:: next + .. seealso:: diff --git a/Lib/base64.py b/Lib/base64.py index a429760da79f2a..8c88add3c2595b 100644 --- a/Lib/base64.py +++ b/Lib/base64.py @@ -206,51 +206,8 @@ def urlsafe_b64decode(s): the letter O). For security purposes the default is None, so that 0 and 1 are not allowed in the input. ''' -_b32alphabet = b'ABCDEFGHIJKLMNOPQRSTUVWXYZ234567' -_b32hexalphabet = b'0123456789ABCDEFGHIJKLMNOPQRSTUV' -_b32tab2 = {} -_b32rev = {} - -def _b32encode(alphabet, s): - # Delay the initialization of the table to not waste memory - # if the function is never called - if alphabet not in _b32tab2: - b32tab = [bytes((i,)) for i in alphabet] - _b32tab2[alphabet] = [a + b for a in b32tab for b in b32tab] - b32tab = None - - if not isinstance(s, bytes_types): - s = memoryview(s).tobytes() - leftover = len(s) % 5 - # Pad the last quantum with zero bits if necessary - if leftover: - s = s + b'\0' * (5 - leftover) # Don't use += ! - encoded = bytearray() - from_bytes = int.from_bytes - b32tab2 = _b32tab2[alphabet] - for i in range(0, len(s), 5): - c = from_bytes(s[i: i + 5]) # big endian - encoded += (b32tab2[c >> 30] + # bits 1 - 10 - b32tab2[(c >> 20) & 0x3ff] + # bits 11 - 20 - b32tab2[(c >> 10) & 0x3ff] + # bits 21 - 30 - b32tab2[c & 0x3ff] # bits 31 - 40 - ) - # Adjust for any leftover partial quanta - if leftover == 1: - encoded[-6:] = b'======' - elif leftover == 2: - encoded[-4:] = b'====' - elif leftover == 3: - encoded[-3:] = b'===' - elif leftover == 4: - encoded[-1:] = b'=' - return encoded.take_bytes() - -def _b32decode(alphabet, s, casefold=False, map01=None): - # Delay the initialization of the table to not waste memory - # if the function is never called - if alphabet not in _b32rev: - _b32rev[alphabet] = {v: k for k, v in enumerate(alphabet)} + +def _b32decode_prepare(s, casefold=False, map01=None): s = _bytes_from_decode_data(s) if len(s) % 8: raise binascii.Error('Incorrect padding') @@ -263,51 +220,27 @@ def _b32decode(alphabet, s, casefold=False, map01=None): s = s.translate(bytes.maketrans(b'01', b'O' + map01)) if casefold: s = s.upper() - # Strip off pad characters from the right. We need to count the pad - # characters because this will tell us how many null bytes to remove from - # the end of the decoded string. - l = len(s) - s = s.rstrip(b'=') - padchars = l - len(s) - # Now decode the full quanta - decoded = bytearray() - b32rev = _b32rev[alphabet] - for i in range(0, len(s), 8): - quanta = s[i: i + 8] - acc = 0 - try: - for c in quanta: - acc = (acc << 5) + b32rev[c] - except KeyError: - raise binascii.Error('Non-base32 digit found') from None - decoded += acc.to_bytes(5) # big endian - # Process the last, partial quanta - if l % 8 or padchars not in {0, 1, 3, 4, 6}: - raise binascii.Error('Incorrect padding') - if padchars and decoded: - acc <<= 5 * padchars - last = acc.to_bytes(5) # big endian - leftover = (43 - 5 * padchars) // 8 # 1: 4, 3: 3, 4: 2, 6: 1 - decoded[-5:] = last[:leftover] - return decoded.take_bytes() + return s def b32encode(s): - return _b32encode(_b32alphabet, s) + return binascii.b2a_base32(s) b32encode.__doc__ = _B32_ENCODE_DOCSTRING.format(encoding='base32') def b32decode(s, casefold=False, map01=None): - return _b32decode(_b32alphabet, s, casefold, map01) + s = _b32decode_prepare(s, casefold, map01) + return binascii.a2b_base32(s) b32decode.__doc__ = _B32_DECODE_DOCSTRING.format(encoding='base32', extra_args=_B32_DECODE_MAP01_DOCSTRING) def b32hexencode(s): - return _b32encode(_b32hexalphabet, s) + return binascii.b2a_base32(s, alphabet=binascii.BASE32HEX_ALPHABET) b32hexencode.__doc__ = _B32_ENCODE_DOCSTRING.format(encoding='base32hex') def b32hexdecode(s, casefold=False): # base32hex does not have the 01 mapping - return _b32decode(_b32hexalphabet, s, casefold) + s = _b32decode_prepare(s, casefold) + return binascii.a2b_base32(s, alphabet=binascii.BASE32HEX_ALPHABET) b32hexdecode.__doc__ = _B32_DECODE_DOCSTRING.format(encoding='base32hex', extra_args='') diff --git a/Lib/test/test_binascii.py b/Lib/test/test_binascii.py index 667ec9b5241aa9..7fedd8e1a8b3ca 100644 --- a/Lib/test/test_binascii.py +++ b/Lib/test/test_binascii.py @@ -10,10 +10,10 @@ # Note: "*_hex" functions are aliases for "(un)hexlify" -b2a_functions = ['b2a_ascii85', 'b2a_base64', 'b2a_base85', +b2a_functions = ['b2a_ascii85', 'b2a_base32', 'b2a_base64', 'b2a_base85', 'b2a_hex', 'b2a_qp', 'b2a_uu', 'hexlify'] -a2b_functions = ['a2b_ascii85', 'a2b_base64', 'a2b_base85', +a2b_functions = ['a2b_ascii85', 'a2b_base32', 'a2b_base64', 'a2b_base85', 'a2b_hex', 'a2b_qp', 'a2b_uu', 'unhexlify'] all_functions = a2b_functions + b2a_functions + ['crc32', 'crc_hqx'] @@ -670,6 +670,182 @@ def test_base85_alphabet(self): with self.assertRaises(TypeError): binascii.a2b_base64(data, alphabet=bytearray(alphabet)) + def test_base32_valid(self): + # Test base32 with valid data + lines = [] + step = 0 + i = 0 + while i < len(self.rawdata): + b = self.type2test(self.rawdata[i:i + step]) + a = binascii.b2a_base32(b) + lines.append(a) + i += step + step += 1 + res = bytes() + for line in lines: + a = self.type2test(line) + b = binascii.a2b_base32(a) + res += b + self.assertEqual(res, self.rawdata) + + def test_base32_errors(self): + def _fixPadding(data): + fixed = data.replace(b"=", b"") + len_8 = len(fixed) % 8 + p = 8 - len_8 if len_8 else 0 + return fixed + b"=" * p + + def _assertRegexTemplate(assert_regex, data, good_padding_result=None): + with self.assertRaisesRegex(binascii.Error, assert_regex): + binascii.a2b_base32(self.type2test(data)) + if good_padding_result: + fixed = self.type2test(_fixPadding(data)) + self.assertEqual(binascii.a2b_base32(fixed), good_padding_result) + + def assertNonBase32Data(*args): + _assertRegexTemplate(r"(?i)Only base32 data", *args) + + def assertExcessData(*args): + _assertRegexTemplate(r"(?i)Excess data", *args) + + def assertExcessPadding(*args): + _assertRegexTemplate(r"(?i)Excess padding", *args) + + def assertLeadingPadding(*args): + _assertRegexTemplate(r"(?i)Leading padding", *args) + + def assertIncorrectPadding(*args): + _assertRegexTemplate(r"(?i)Incorrect padding", *args) + + def assertDiscontinuousPadding(*args): + _assertRegexTemplate(r"(?i)Discontinuous padding", *args) + + def assertInvalidLength(*args): + _assertRegexTemplate(r"(?i)Invalid.+number of data characters", *args) + + assertNonBase32Data(b"a") + assertNonBase32Data(b"AA-") + assertNonBase32Data(b"ABCDE==!") + assertNonBase32Data(b"ab:(){:|:&};:==") + + assertExcessData(b"AB======C") + assertExcessData(b"AB======CD") + assertExcessData(b"ABCD====E") + assertExcessData(b"ABCDE===FGH") + assertExcessData(b"ABCDEFG=H") + assertExcessData(b"432Z====55555555") + + assertExcessData(b"BE======EF", b"\t\x08") + assertExcessData(b"BEEF====C", b"\t\x08Q") + assertExcessData(b"BEEFC===AK", b"\t\x08Q\x01") + assertExcessData(b"BEEFCAK=E", b"\t\x08Q\x01D") + + assertExcessPadding(b"BE=======", b"\t") + assertExcessPadding(b"BE========", b"\t") + assertExcessPadding(b"BEEF=====", b"\t\x08") + assertExcessPadding(b"BEEF======", b"\t\x08") + assertExcessPadding(b"BEEFC====", b"\t\x08Q") + assertExcessPadding(b"BEEFC=====", b"\t\x08Q") + assertExcessPadding(b"BEEFCAK==", b"\t\x08Q\x01") + assertExcessPadding(b"BEEFCAK===", b"\t\x08Q\x01") + assertExcessPadding(b"BEEFCAKE=", b"\t\x08Q\x01D") + assertExcessPadding(b"BEEFCAKE==", b"\t\x08Q\x01D") + assertExcessPadding(b"BEEFCAKE===", b"\t\x08Q\x01D") + assertExcessPadding(b"BEEFCAKE====", b"\t\x08Q\x01D") + assertExcessPadding(b"BEEFCAKE=====", b"\t\x08Q\x01D") + assertExcessPadding(b"BEEFCAKE======", b"\t\x08Q\x01D") + assertExcessPadding(b"BEEFCAKE=======", b"\t\x08Q\x01D") + assertExcessPadding(b"BEEFCAKE========", b"\t\x08Q\x01D") + assertExcessPadding(b"BEEFCAKE=========", b"\t\x08Q\x01D") + + assertLeadingPadding(b"=", b"") + assertLeadingPadding(b"==", b"") + assertLeadingPadding(b"===", b"") + assertLeadingPadding(b"====", b"") + assertLeadingPadding(b"=====", b"") + assertLeadingPadding(b"======", b"") + assertLeadingPadding(b"=======", b"") + assertLeadingPadding(b"========", b"") + assertLeadingPadding(b"=========", b"") + assertLeadingPadding(b"=BEEFCAKE", b"\t\x08Q\x01D") + assertLeadingPadding(b"==BEEFCAKE", b"\t\x08Q\x01D") + assertLeadingPadding(b"===BEEFCAKE", b"\t\x08Q\x01D") + assertLeadingPadding(b"====BEEFCAKE", b"\t\x08Q\x01D") + assertLeadingPadding(b"=====BEEFCAKE", b"\t\x08Q\x01D") + assertLeadingPadding(b"======BEEFCAKE", b"\t\x08Q\x01D") + assertLeadingPadding(b"=======BEEFCAKE", b"\t\x08Q\x01D") + assertLeadingPadding(b"========BEEFCAKE", b"\t\x08Q\x01D") + assertLeadingPadding(b"=========BEEFCAKE", b"\t\x08Q\x01D") + + assertIncorrectPadding(b"A") + assertIncorrectPadding(b"AB") + assertIncorrectPadding(b"ABC") + assertIncorrectPadding(b"ABCD") + assertIncorrectPadding(b"ABCDE") + assertIncorrectPadding(b"ABCDEF") + assertIncorrectPadding(b"ABCDEFG") + + assertIncorrectPadding(b"BE=", b"\t") + assertIncorrectPadding(b"BE==", b"\t") + assertIncorrectPadding(b"BE===", b"\t") + assertIncorrectPadding(b"BE====", b"\t") + assertIncorrectPadding(b"BE=====", b"\t") + assertIncorrectPadding(b"BEEF=", b"\t\x08") + assertIncorrectPadding(b"BEEF==", b"\t\x08") + assertIncorrectPadding(b"BEEF===", b"\t\x08") + assertIncorrectPadding(b"BEEFC=", b"\t\x08Q") + assertIncorrectPadding(b"BEEFC==", b"\t\x08Q") + + assertDiscontinuousPadding(b"BE=EF===", b"\t\x08") + assertDiscontinuousPadding(b"BE==EF==", b"\t\x08") + assertDiscontinuousPadding(b"BEEF=C==", b"\t\x08Q") + assertDiscontinuousPadding(b"BEEFC=AK", b"\t\x08Q\x01") + + assertInvalidLength(b"A=") + assertInvalidLength(b"A==") + assertInvalidLength(b"A===") + assertInvalidLength(b"A====") + assertInvalidLength(b"A=====") + assertInvalidLength(b"A======") + assertInvalidLength(b"ABC=") + assertInvalidLength(b"ABC==") + assertInvalidLength(b"ABC===") + assertInvalidLength(b"ABC====") + assertInvalidLength(b"ABCDEF=") + + assertInvalidLength(b"B=E=====", b"\t") + assertInvalidLength(b"B==E====", b"\t") + assertInvalidLength(b"BEE=F===", b"\t\x08") + assertInvalidLength(b"BEE==F==", b"\t\x08") + assertInvalidLength(b"BEEFCA=K", b"\t\x08Q\x01") + assertInvalidLength(b"BEEFCA=====K", b"\t\x08Q\x01") + + def test_base32_alphabet(self): + alphabet = b'0Aa1Bb2Cc3Dd4Ee5Ff6Gg7Hh8Ii9JjKk' + data = self.type2test(self.rawdata) + encoded = binascii.b2a_base32(data, alphabet=alphabet) + trans = bytes.maketrans(binascii.BASE32_ALPHABET, alphabet) + expected = binascii.b2a_base32(data).translate(trans) + self.assertEqual(encoded, expected) + self.assertEqual(binascii.a2b_base32(encoded, alphabet=alphabet), self.rawdata) + self.assertEqual(binascii.b2a_base32(data, alphabet=self.type2test(alphabet)), expected) + + data = self.type2test(b'') + self.assertEqual(binascii.b2a_base32(data, alphabet=alphabet), b'') + self.assertEqual(binascii.a2b_base32(data, alphabet=alphabet), b'') + + for func in binascii.b2a_base32, binascii.a2b_base32: + with self.assertRaises(TypeError): + func(data, alphabet=None) + with self.assertRaises(TypeError): + func(data, alphabet=alphabet.decode()) + with self.assertRaises(ValueError): + func(data, alphabet=alphabet[:-1]) + with self.assertRaises(ValueError): + func(data, alphabet=alphabet+b'?') + with self.assertRaises(TypeError): + binascii.a2b_base32(data, alphabet=bytearray(alphabet)) + def test_uu(self): MAX_UU = 45 for backtick in (True, False): diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-03-20-00-39-25.gh-issue-146192.8aQ6sC.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-03-20-00-39-25.gh-issue-146192.8aQ6sC.rst new file mode 100644 index 00000000000000..a27639d2908651 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-03-20-00-39-25.gh-issue-146192.8aQ6sC.rst @@ -0,0 +1,2 @@ +Add base32 support to :mod:`binascii` and improve the performance of the +base-32 converters in :mod:`base64`. Patch by James Seo. diff --git a/Modules/binascii.c b/Modules/binascii.c index f85f32b32e962c..44d7986b6c0415 100644 --- a/Modules/binascii.c +++ b/Modules/binascii.c @@ -244,6 +244,129 @@ static const unsigned char table_b2a_base85_a85[] Py_ALIGNED(64) = #define BASE85_A85_Z 0x00000000 #define BASE85_A85_Y 0x20202020 + +static const unsigned char table_a2b_base32[] Py_ALIGNED(64) = { + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,26,27, 28,29,30,31, -1,-1,-1,-1, -1,-1,-1,-1, + -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 11,12,13,14, + 15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, +}; + +static const unsigned char table_b2a_base32[] Py_ALIGNED(64) = + "ABCDEFGHIJKLMNOPQRSTUVWXYZ234567"; + +#define BASE32_PAD '=' + +/* + * Fast base32 encoding/decoding helpers. + * + * Analogous to the helpers for base64. + */ + +/* Encode 5 bytes into 8 base32 characters. */ +static inline void +base32_encode_quintet(const unsigned char *in, unsigned char *out, + const unsigned char table[]) +{ + uint64_t combined = ((uint64_t)in[0] << 32) | + ((uint64_t)in[1] << 24) | + ((uint64_t)in[2] << 16) | + ((uint64_t)in[3] << 8) | + (uint64_t)in[4]; + out[0] = table[(combined >> 35) & 0x1f]; + out[1] = table[(combined >> 30) & 0x1f]; + out[2] = table[(combined >> 25) & 0x1f]; + out[3] = table[(combined >> 20) & 0x1f]; + out[4] = table[(combined >> 15) & 0x1f]; + out[5] = table[(combined >> 10) & 0x1f]; + out[6] = table[(combined >> 5) & 0x1f]; + out[7] = table[combined & 0x1f]; +} + +/* + * Encode multiple complete 5-byte groups. + * Returns the number of input bytes processed (always a multiple of 5). + */ +static inline Py_ssize_t +base32_encode_fast(const unsigned char *in, Py_ssize_t in_len, + unsigned char *out, const unsigned char table[]) +{ + Py_ssize_t n_quintets = in_len / 5; + const unsigned char *in_end = in + n_quintets * 5; + + while (in < in_end) { + base32_encode_quintet(in, out, table); + in += 5; + out += 8; + } + + return n_quintets * 5; +} + +/* + * Decode 8 base32 characters into 5 bytes. + * Returns 1 on success, 0 if any character is invalid. + */ +static inline int +base32_decode_octet(const unsigned char *in, unsigned char *out, + const unsigned char table[]) +{ + unsigned char v0 = table[in[0]]; + unsigned char v1 = table[in[1]]; + unsigned char v2 = table[in[2]]; + unsigned char v3 = table[in[3]]; + unsigned char v4 = table[in[4]]; + unsigned char v5 = table[in[5]]; + unsigned char v6 = table[in[6]]; + unsigned char v7 = table[in[7]]; + + if ((v0 | v1 | v2 | v3 | v4 | v5 | v6 | v7) & 0xe0) { + return 0; + } + + out[0] = (v0 << 3) | (v1 >> 2); + out[1] = (v1 << 6) | (v2 << 1) | (v3 >> 4); + out[2] = (v3 << 4) | (v4 >> 1); + out[3] = (v4 << 7) | (v5 << 2) | (v6 >> 3); + out[4] = (v6 << 5) | v7; + return 1; +} + +/* + * Decode multiple complete 8-character groups (no padding allowed). + * Returns the number of input characters processed. + * Stops at the first invalid character, padding, or incomplete group. + */ +static inline Py_ssize_t +base32_decode_fast(const unsigned char *in, Py_ssize_t in_len, + unsigned char *out, const unsigned char table[]) +{ + Py_ssize_t n_quintets = in_len / 8; + Py_ssize_t i; + + for (i = 0; i < n_quintets; i++) { + if (!base32_decode_octet(in + i * 8, out + i * 5, table)) { + break; + } + } + + return i * 8; +} + + static const unsigned short crctab_hqx[256] = { 0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50a5, 0x60c6, 0x70e7, 0x8108, 0x9129, 0xa14a, 0xb16b, 0xc18c, 0xd1ad, 0xe1ce, 0xf1ef, @@ -1367,6 +1490,294 @@ binascii_b2a_base85_impl(PyObject *module, Py_buffer *data, int pad, return PyBytesWriter_FinishWithPointer(writer, ascii_data); } +/*[clinic input] +binascii.a2b_base32 + + data: ascii_buffer + / + * + alphabet: PyBytesObject(c_default="NULL") = BASE32_ALPHABET + +Decode a line of base32 data. +[clinic start generated code]*/ + +static PyObject * +binascii_a2b_base32_impl(PyObject *module, Py_buffer *data, + PyBytesObject *alphabet) +/*[clinic end generated code: output=12cb58bf547237e2 input=426055ea49ac147e]*/ +{ + const unsigned char *ascii_data = data->buf; + Py_ssize_t ascii_len = data->len; + binascii_state *state = NULL; + PyObject *table_obj = NULL; + const unsigned char *table_a2b = table_a2b_base32; + + assert(ascii_len >= 0); + + if (alphabet != NULL) { + state = get_binascii_state(module); + table_obj = get_reverse_table(state, (PyObject *)alphabet, 32, BASE32_PAD); + if (table_obj == NULL) { + return NULL; + } + table_a2b = (const unsigned char *)PyBytes_AS_STRING(table_obj); + } + + /* Allocate output buffer. */ + size_t bin_len = ((size_t)ascii_len + 7) / 8 * 5; + PyBytesWriter *writer = PyBytesWriter_Create(bin_len); + if (writer == NULL) { + Py_XDECREF(table_obj); + return NULL; + } + unsigned char *bin_data = PyBytesWriter_GetData(writer); + + /* + * Fast path: use optimized decoder for complete octets (groups of 8 bytes). + * The fast path stops at padding, invalid chars, or incomplete octets. + */ + if (ascii_len >= 8) { + Py_ssize_t fast_chars = base32_decode_fast(ascii_data, ascii_len, + bin_data, table_a2b); + if (fast_chars > 0) { + ascii_data += fast_chars; + ascii_len -= fast_chars; + bin_data += (fast_chars / 8) * 5; + } + } + + /* Slow path: handle remaining input (padding, invalid chars, incomplete octets). */ + unsigned char leftchar = 0; + int octet_pos = 0; + int pads = 0; + for (; ascii_len; ascii_len--, ascii_data++) { + unsigned char this_ch = *ascii_data; + + /* Check for pad sequences. They may only occur at certain positions. */ + if (this_ch == BASE32_PAD) { + pads++; + + if ((octet_pos == 2 || octet_pos == 4 + || octet_pos == 5 || octet_pos == 7) + && octet_pos + pads <= 8) + { + continue; + } + + state = get_binascii_state(module); + if (state) { + if (octet_pos == 1 || octet_pos == 3 || octet_pos == 6) { + const unsigned char *ascii_data_start = data->buf; + PyErr_Format(state->Error, + "Invalid base32-encoded string: " + "number of data characters (%zd) " + "cannot be 1, 3, or 6 more " + "than a multiple of 8", + ascii_data - ascii_data_start); + } + else { + PyErr_SetString(state->Error, + (octet_pos == 0 && ascii_data == data->buf) + ? "Leading padding not allowed" + : "Excess padding not allowed"); + } + } + goto error; + } + + unsigned char v = table_a2b[this_ch]; + if (v >= 32) { + state = get_binascii_state(module); + if (state) { + PyErr_SetString(state->Error, "Only base32 data is allowed"); + } + goto error; + } + + /* Data in the middle of/after the padding is not allowed. */ + if (pads) { + state = get_binascii_state(module); + if (state) { + PyErr_SetString(state->Error, (octet_pos + pads == 8) + ? "Excess data after padding" + : "Discontinuous padding not allowed"); + } + goto error; + } + + switch (octet_pos) { + case 0: + octet_pos = 1; + leftchar = v; + break; + case 1: + octet_pos = 2; + *bin_data++ = (leftchar << 3) | (v >> 2); + leftchar = v & 0x03; + break; + case 2: + octet_pos = 3; + leftchar = (leftchar << 5) | v; + break; + case 3: + octet_pos = 4; + *bin_data++ = (leftchar << 1) | (v >> 4); + leftchar = v & 0x0f; + break; + case 4: + octet_pos = 5; + *bin_data++ = (leftchar << 4) | (v >> 1); + leftchar = v & 0x01; + break; + case 5: + octet_pos = 6; + leftchar = (leftchar << 5) | v; + break; + case 6: + octet_pos = 7; + *bin_data++ = (leftchar << 2) | (v >> 3); + leftchar = v & 0x07; + break; + case 7: + octet_pos = 0; + *bin_data++ = (leftchar << 5) | v; + leftchar = 0; + } + } + + if ((octet_pos != 0 && octet_pos + pads != 8) + || (octet_pos == 0 && pads != 0)) + { + state = get_binascii_state(module); + if (state) { + PyErr_SetString(state->Error, "Incorrect padding"); + } + goto error; + } + + Py_XDECREF(table_obj); + return PyBytesWriter_FinishWithPointer(writer, bin_data); + +error: + PyBytesWriter_Discard(writer); + Py_XDECREF(table_obj); + return NULL; +} + +/*[clinic input] +binascii.b2a_base32 + + data: Py_buffer + / + * + alphabet: Py_buffer(c_default="{NULL, NULL}") = BASE32_ALPHABET + +base32-code line of data. +[clinic start generated code]*/ + +static PyObject * +binascii_b2a_base32_impl(PyObject *module, Py_buffer *data, + Py_buffer *alphabet) +/*[clinic end generated code: output=058d0d1aeb014d3b input=ffd4fa162a6e1cb5]*/ +{ + const unsigned char *table_b2a = table_b2a_base32; + const unsigned char *bin_data = data->buf; + Py_ssize_t bin_len = data->len; + binascii_state *state = NULL; + + assert(bin_len >= 0); + + if (alphabet->buf != NULL) { + if (alphabet->len != 32) { + PyErr_SetString(PyExc_ValueError, "alphabet must have length 32"); + return NULL; + } + table_b2a = alphabet->buf; + } + + /* + * Each group of 5 bytes (rounded up) gets encoded as 8 characters. + * Use unsigned integer arithmetic to avoid signed integer overflow. + */ + size_t ascii_len = ((size_t)bin_len + 4u) / 5u * 8u; + if (ascii_len > PY_SSIZE_T_MAX) { + state = get_binascii_state(module); + if (state) { + PyErr_SetString(state->Error, "Too much data for base32"); + } + return NULL; + } + PyBytesWriter *writer = PyBytesWriter_Create(ascii_len); + if (writer == NULL) { + return NULL; + } + unsigned char *ascii_data = PyBytesWriter_GetData(writer); + + /* Use the optimized fast path for complete 5-byte groups. */ + Py_ssize_t fast_bytes = base32_encode_fast(bin_data, bin_len, ascii_data, + table_b2a); + bin_data += fast_bytes; + ascii_data += (fast_bytes / 5) * 8; + bin_len -= fast_bytes; + + /* Handle the remaining 0-4 bytes. */ + if (bin_len == 1) { + /* 1 byte remaining: produces 2 encoded + 6 padding chars. */ + uint32_t val = bin_data[0]; + *ascii_data++ = table_b2a[(val >> 3) & 0x1f]; + *ascii_data++ = table_b2a[(val << 2) & 0x1f]; + *ascii_data++ = BASE32_PAD; + *ascii_data++ = BASE32_PAD; + *ascii_data++ = BASE32_PAD; + *ascii_data++ = BASE32_PAD; + *ascii_data++ = BASE32_PAD; + *ascii_data++ = BASE32_PAD; + } + else if (bin_len == 2) { + /* 2 bytes remaining: produces 4 encoded + 4 padding chars. */ + uint32_t val = ((uint32_t)bin_data[0] << 8) | bin_data[1]; + *ascii_data++ = table_b2a[(val >> 11) & 0x1f]; + *ascii_data++ = table_b2a[(val >> 6) & 0x1f]; + *ascii_data++ = table_b2a[(val >> 1) & 0x1f]; + *ascii_data++ = table_b2a[(val << 4) & 0x1f]; + *ascii_data++ = BASE32_PAD; + *ascii_data++ = BASE32_PAD; + *ascii_data++ = BASE32_PAD; + *ascii_data++ = BASE32_PAD; + } + else if (bin_len == 3) { + /* 3 bytes remaining: produces 5 encoded + 3 padding chars. */ + uint32_t val = ((uint32_t)bin_data[0] << 16) + | ((uint32_t)bin_data[1] << 8) + | bin_data[2]; + *ascii_data++ = table_b2a[(val >> 19) & 0x1f]; + *ascii_data++ = table_b2a[(val >> 14) & 0x1f]; + *ascii_data++ = table_b2a[(val >> 9) & 0x1f]; + *ascii_data++ = table_b2a[(val >> 4) & 0x1f]; + *ascii_data++ = table_b2a[(val << 1) & 0x1f]; + *ascii_data++ = BASE32_PAD; + *ascii_data++ = BASE32_PAD; + *ascii_data++ = BASE32_PAD; + } + else if (bin_len == 4) { + /* 4 bytes remaining: produces 7 encoded + 1 padding chars. */ + uint32_t val = ((uint32_t)bin_data[0] << 24) + | ((uint32_t)bin_data[1] << 16) + | ((uint32_t)bin_data[2] << 8) + | bin_data[3]; + *ascii_data++ = table_b2a[(val >> 27) & 0x1f]; + *ascii_data++ = table_b2a[(val >> 22) & 0x1f]; + *ascii_data++ = table_b2a[(val >> 17) & 0x1f]; + *ascii_data++ = table_b2a[(val >> 12) & 0x1f]; + *ascii_data++ = table_b2a[(val >> 7) & 0x1f]; + *ascii_data++ = table_b2a[(val >> 2) & 0x1f]; + *ascii_data++ = table_b2a[(val << 3) & 0x1f]; + *ascii_data++ = BASE32_PAD; + } + + return PyBytesWriter_FinishWithPointer(writer, ascii_data); +} + /*[clinic input] binascii.crc_hqx @@ -2028,6 +2439,8 @@ static struct PyMethodDef binascii_module_methods[] = { BINASCII_A2B_ASCII85_METHODDEF BINASCII_A2B_BASE85_METHODDEF BINASCII_B2A_BASE85_METHODDEF + BINASCII_A2B_BASE32_METHODDEF + BINASCII_B2A_BASE32_METHODDEF BINASCII_A2B_HEX_METHODDEF BINASCII_B2A_HEX_METHODDEF BINASCII_HEXLIFY_METHODDEF @@ -2114,6 +2527,16 @@ binascii_exec(PyObject *module) { return -1; } + if (PyModule_Add(module, "BASE32_ALPHABET", + PyBytes_FromStringAndSize((const char *)table_b2a_base32, 32)) < 0) + { + return -1; + } + if (PyModule_Add(module, "BASE32HEX_ALPHABET", + PyBytes_FromString("0123456789ABCDEFGHIJKLMNOPQRSTUV")) < 0) + { + return -1; + } state->reverse_table_cache = PyDict_New(); if (state->reverse_table_cache == NULL) { diff --git a/Modules/clinic/binascii.c.h b/Modules/clinic/binascii.c.h index 2fdecc2efbf9d4..8057d94a1fb934 100644 --- a/Modules/clinic/binascii.c.h +++ b/Modules/clinic/binascii.c.h @@ -711,6 +711,161 @@ binascii_b2a_base85(PyObject *module, PyObject *const *args, Py_ssize_t nargs, P return return_value; } +PyDoc_STRVAR(binascii_a2b_base32__doc__, +"a2b_base32($module, data, /, *, alphabet=BASE32_ALPHABET)\n" +"--\n" +"\n" +"Decode a line of base32 data."); + +#define BINASCII_A2B_BASE32_METHODDEF \ + {"a2b_base32", _PyCFunction_CAST(binascii_a2b_base32), METH_FASTCALL|METH_KEYWORDS, binascii_a2b_base32__doc__}, + +static PyObject * +binascii_a2b_base32_impl(PyObject *module, Py_buffer *data, + PyBytesObject *alphabet); + +static PyObject * +binascii_a2b_base32(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 1 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + Py_hash_t ob_hash; + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_hash = -1, + .ob_item = { &_Py_ID(alphabet), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"", "alphabet", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "a2b_base32", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[2]; + Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 1; + Py_buffer data = {NULL, NULL}; + PyBytesObject *alphabet = NULL; + + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, + /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf); + if (!args) { + goto exit; + } + if (!ascii_buffer_converter(args[0], &data)) { + goto exit; + } + if (!noptargs) { + goto skip_optional_kwonly; + } + if (!PyBytes_Check(args[1])) { + _PyArg_BadArgument("a2b_base32", "argument 'alphabet'", "bytes", args[1]); + goto exit; + } + alphabet = (PyBytesObject *)args[1]; +skip_optional_kwonly: + return_value = binascii_a2b_base32_impl(module, &data, alphabet); + +exit: + /* Cleanup for data */ + if (data.obj) + PyBuffer_Release(&data); + + return return_value; +} + +PyDoc_STRVAR(binascii_b2a_base32__doc__, +"b2a_base32($module, data, /, *, alphabet=BASE32_ALPHABET)\n" +"--\n" +"\n" +"base32-code line of data."); + +#define BINASCII_B2A_BASE32_METHODDEF \ + {"b2a_base32", _PyCFunction_CAST(binascii_b2a_base32), METH_FASTCALL|METH_KEYWORDS, binascii_b2a_base32__doc__}, + +static PyObject * +binascii_b2a_base32_impl(PyObject *module, Py_buffer *data, + Py_buffer *alphabet); + +static PyObject * +binascii_b2a_base32(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 1 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + Py_hash_t ob_hash; + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_hash = -1, + .ob_item = { &_Py_ID(alphabet), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"", "alphabet", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "b2a_base32", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[2]; + Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 1; + Py_buffer data = {NULL, NULL}; + Py_buffer alphabet = {NULL, NULL}; + + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, + /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf); + if (!args) { + goto exit; + } + if (PyObject_GetBuffer(args[0], &data, PyBUF_SIMPLE) != 0) { + goto exit; + } + if (!noptargs) { + goto skip_optional_kwonly; + } + if (PyObject_GetBuffer(args[1], &alphabet, PyBUF_SIMPLE) != 0) { + goto exit; + } +skip_optional_kwonly: + return_value = binascii_b2a_base32_impl(module, &data, &alphabet); + +exit: + /* Cleanup for data */ + if (data.obj) { + PyBuffer_Release(&data); + } + /* Cleanup for alphabet */ + if (alphabet.obj) { + PyBuffer_Release(&alphabet); + } + + return return_value; +} + PyDoc_STRVAR(binascii_crc_hqx__doc__, "crc_hqx($module, data, crc, /)\n" "--\n" @@ -1256,4 +1411,4 @@ binascii_b2a_qp(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObj return return_value; } -/*[clinic end generated code: output=84c97096b0fb3819 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=5be0c5d9b116ee17 input=a9049054013a1b77]*/