diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index abbcffbe3fcee9..93d57399640759 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -89,9 +89,9 @@ class UnicodeFunctionsTest(unittest.TestCase): # Update this if the database changes. Make sure to do a full rebuild # (e.g. 'make distclean && make') to get the correct checksum. - expectedchecksum = ('35e842600fa7ae2db93739db08ef201b726a2374' + expectedchecksum = ('1ba453ec456896f1190d849b6e9b7c2e1a4128e0' if quicktest else - '23ab09ed4abdf93db23b97359108ed630dd8311d') + '46ca89d9fe34881d0be3a4a4b29f5aa8c019640c') def test_function_checksum(self): db = self.db @@ -346,6 +346,12 @@ def test_decomposition(self): # New in 16.0.0 self.assertEqual(self.db.decomposition('\U0001CCD6'), '' if self.old else ' 0041') + # Hangul characters + self.assertEqual(self.db.decomposition('\uAC00'), '1100 1161') + self.assertEqual(self.db.decomposition('\uD4DB'), '1111 1171 11B6') + self.assertEqual(self.db.decomposition('\uC2F8'), '110A 1161') + self.assertEqual(self.db.decomposition('\uD7A3'), '1112 1175 11C2') + self.assertRaises(TypeError, self.db.decomposition) self.assertRaises(TypeError, self.db.decomposition, 'xx') @@ -649,9 +655,9 @@ def test_east_asian_width_unassigned(self): class Unicode_3_2_0_FunctionsTest(UnicodeFunctionsTest): db = unicodedata.ucd_3_2_0 old = True - expectedchecksum = ('4154d8d1232837e255edf3cdcbb5ab184d71f4a4' + expectedchecksum = ('883824cb6c0ccf994e4451ebf281e2d6d479af47' if quicktest else - 'b0a8df4ce8cf910def4e75f2d03c93defcc9bb09') + 'caf1a7f2f380f927461837f1901ef20683f98683') class UnicodeMiscTest(unittest.TestCase): diff --git a/Misc/NEWS.d/next/Library/2026-02-19-10-57-40.gh-issue-88091.N7qGV-.rst b/Misc/NEWS.d/next/Library/2026-02-19-10-57-40.gh-issue-88091.N7qGV-.rst new file mode 100644 index 00000000000000..15cf25052bbb46 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-02-19-10-57-40.gh-issue-88091.N7qGV-.rst @@ -0,0 +1 @@ +Fix :func:`unicodedata.decomposition` for Hangul characters. diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 05470463944854..83de1be56a7faf 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -388,6 +388,17 @@ unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr) return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]); } +// For Hangul decomposition +#define SBase 0xAC00 +#define LBase 0x1100 +#define VBase 0x1161 +#define TBase 0x11A7 +#define LCount 19 +#define VCount 21 +#define TCount 28 +#define NCount (VCount*TCount) +#define SCount (LCount*NCount) + /*[clinic input] unicodedata.UCD.decomposition @@ -418,6 +429,25 @@ unicodedata_UCD_decomposition_impl(PyObject *self, int chr) return Py_GetConstant(Py_CONSTANT_EMPTY_STR); /* unassigned */ } + // Hangul Decomposition. + // See section 3.12.2, "Hangul Syllable Decomposition" + // https://www.unicode.org/versions/latest/core-spec/chapter-3/#G56669 + if (SBase <= code && code < (SBase + SCount)) { + int SIndex = code - SBase; + int L = LBase + SIndex / NCount; + int V = VBase + (SIndex % NCount) / TCount; + int T = TBase + SIndex % TCount; + if (T != TBase) { + PyOS_snprintf(decomp, sizeof(decomp), + "%04X %04X %04X", L, V, T); + } + else { + PyOS_snprintf(decomp, sizeof(decomp), + "%04X %04X", L, V); + } + return PyUnicode_FromString(decomp); + } + if (code < 0 || code >= 0x110000) index = 0; else { @@ -480,16 +510,6 @@ get_decomp_record(PyObject *self, Py_UCS4 code, (*index)++; } -#define SBase 0xAC00 -#define LBase 0x1100 -#define VBase 0x1161 -#define TBase 0x11A7 -#define LCount 19 -#define VCount 21 -#define TCount 28 -#define NCount (VCount*TCount) -#define SCount (LCount*NCount) - static PyObject* nfd_nfkd(PyObject *self, PyObject *input, int k) { @@ -543,7 +563,9 @@ nfd_nfkd(PyObject *self, PyObject *input, int k) } output = new_output; } - /* Hangul Decomposition. */ + // Hangul Decomposition. + // See section 3.12.2, "Hangul Syllable Decomposition" + // https://www.unicode.org/versions/latest/core-spec/chapter-3/#G56669 if (SBase <= code && code < (SBase+SCount)) { int SIndex = code - SBase; int L = LBase + SIndex / NCount;