From 0609eb29586bb79c1c69ff69d495bda27d0a0d17 Mon Sep 17 00:00:00 2001 From: Pierre Le Marre Date: Wed, 26 Jul 2023 06:34:00 +0200 Subject: [PATCH 01/10] unicodedata: Fix Tangut Ideograph names --- Lib/test/test_unicodedata.py | 2 +- Modules/unicodedata.c | 51 +++++++++++++++++++++++++++++--- Tools/unicode/makeunicodedata.py | 20 ++++++++++--- 3 files changed, 64 insertions(+), 9 deletions(-) diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index 3dc0790ca15b41..7b49a5aa6c4210 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -71,7 +71,7 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest): # Update this if the database changes. Make sure to do a full rebuild # (e.g. 'make distclean && make') to get the correct checksum. - expectedchecksum = '26ff0d31c14194b4606a5b3a81ac36df3a14e331' + expectedchecksum = '95cc75e49b140c61b884c16d0a9fbbb0b93a7fa9' @requires_resource('cpu') def test_function_checksum(self): diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index c1e22f3868931f..7359a2740615e0 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -1025,7 +1025,7 @@ static const char * const hangul_syllables[][3] = { /* These ranges need to match makeunicodedata.py:cjk_ranges. */ static int -is_unified_ideograph(Py_UCS4 code) +is_cjk_unified_ideograph(Py_UCS4 code) { return (0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */ @@ -1039,6 +1039,15 @@ is_unified_ideograph(Py_UCS4 code) (0x31350 <= code && code <= 0x323AF); /* CJK Ideograph Extension H */ } +/* These ranges need to match makeunicodedata.py:tangut_ranges. */ +static int +is_tangut_ideograph(Py_UCS4 code) +{ + return + (0x17000 <= code && code <= 0x187F7) || /* Tangut */ + (0x18D00 <= code && code <= 0x18D08); /* Tangut Supplement */ +} + /* macros used to determine if the given code point is in the PUA range that * we are using to store aliases and named sequences */ #define IS_ALIAS(cp) ((cp >= aliases_start) && (cp < aliases_end)) @@ -1098,7 +1107,7 @@ _getucname(PyObject *self, return 1; } - if (is_unified_ideograph(code)) { + if (is_cjk_unified_ideograph(code)) { if (buflen < 28) /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */ return 0; @@ -1106,6 +1115,14 @@ _getucname(PyObject *self, return 1; } + if (is_tangut_ideograph(code)) { + if (buflen < 23) + /* Worst case: TANGUT IDEOGRAPH-18D08 */ + return 0; + sprintf(buffer, "TANGUT IDEOGRAPH-%X", code); + return 1; + } + /* get offset into phrasebook */ offset = phrasebook_offset1[(code>>phrasebook_shift)]; offset = phrasebook_offset2[(offset<= '0' && *name <= '9') + v += *name - '0'; + else if (*name >= 'A' && *name <= 'F') + v += *name - 'A' + 10; + else + return 0; + name++; + } + if (!is_tangut_ideograph(v)) return 0; *code = v; return 1; } + /* the following is the same as python's dictionary lookup, with only minor changes. see the makeunicodedata script for more details */ diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py index 034642db06e48b..6c69ba2b946709 100644 --- a/Tools/unicode/makeunicodedata.py +++ b/Tools/unicode/makeunicodedata.py @@ -99,7 +99,7 @@ CASED_MASK = 0x2000 EXTENDED_CASE_MASK = 0x4000 -# these ranges need to match unicodedata.c:is_unified_ideograph +# these ranges need to match unicodedata.c:is_cjk_unified_ideograph cjk_ranges = [ ('3400', '4DBF'), ('4E00', '9FFF'), @@ -112,6 +112,12 @@ ('31350', '323AF'), ] +# these ranges need to match unicodedata.c:is_tangut_ideograph +tangut_ranges = [ + ('17000', '187F7'), + ('18D00', '18D08') +] + def maketables(trace=0): @@ -123,7 +129,7 @@ def maketables(trace=0): for version in old_versions: print("--- Reading", UNICODE_DATA % ("-"+version), "...") - old_unicode = UnicodeData(version, cjk_check=False) + old_unicode = UnicodeData(version, ideograph_check=False) print(len(list(filter(None, old_unicode.table))), "characters") merge_old_version(version, unicode, old_unicode) @@ -1020,7 +1026,7 @@ def from_row(row: List[str]) -> UcdRecord: class UnicodeData: # table: List[Optional[UcdRecord]] # index is codepoint; None means unassigned - def __init__(self, version, cjk_check=True): + def __init__(self, version, ideograph_check=True): self.changed = [] table = [None] * 0x110000 for s in UcdFile(UNICODE_DATA, version): @@ -1028,6 +1034,7 @@ def __init__(self, version, cjk_check=True): table[char] = from_row(s) cjk_ranges_found = [] + tangut_ranges_found = [] # expand first-last ranges field = None @@ -1044,12 +1051,17 @@ def __init__(self, version, cjk_check=True): if s.name.startswith(" Date: Wed, 26 Jul 2023 06:34:01 +0200 Subject: [PATCH 02/10] News entry --- .../next/Library/2023-02-05-20-02-30.gh-issue-80667.7LmzeA.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Library/2023-02-05-20-02-30.gh-issue-80667.7LmzeA.rst diff --git a/Misc/NEWS.d/next/Library/2023-02-05-20-02-30.gh-issue-80667.7LmzeA.rst b/Misc/NEWS.d/next/Library/2023-02-05-20-02-30.gh-issue-80667.7LmzeA.rst new file mode 100644 index 00000000000000..bbdcb4ffa0998b --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-02-05-20-02-30.gh-issue-80667.7LmzeA.rst @@ -0,0 +1 @@ +unicodedata: Fix missing Tangut Ideographs names. From b2c4e9276786cf9a230a605193a0fed7b3eaa9d4 Mon Sep 17 00:00:00 2001 From: Pierre Le Marre Date: Wed, 26 Jul 2023 06:34:01 +0200 Subject: [PATCH 03/10] Add test --- Lib/test/test_unicodedata.py | 45 ++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index 7b49a5aa6c4210..9a38ca8d89f488 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -97,6 +97,50 @@ def test_function_checksum(self): result = h.hexdigest() self.assertEqual(result, self.expectedchecksum) + @requires_resource('network') + def test_name(self): + TESTBASEURL = "https://www.unicode.org/Public" + TESTDATAFILE = "extracted/DerivedName.txt" + TESTDATAURL = f"{TESTBASEURL}/{unicodedata.unidata_version}/ucd/{TESTDATAFILE}" + + # Hit the exception early + try: + testdata = open_urlresource(TESTDATAURL, encoding="utf-8") + except PermissionError: + self.skipTest(f"Permission error when downloading {TESTDATAURL} " + f"into the test data directory") + except (OSError, HTTPException) as exc: + self.skipTest(f"Failed to download {TESTDATAURL}: {exc}") + + with testdata: + self.run_name_tests(testdata) + + def run_name_tests(self, testdata): + names_ref = {} + + def parse_cp(s): + return int(s, 16) + + # Parse data + for line in testdata: + line = line.strip() + if not line or line.startswith("#"): + continue + raw_cp, name = line.split("; ") + # Check for a range + if ".." in raw_cp: + cp1, cp2 = map(parse_cp, raw_cp.split("..")) + # remove ‘*’ at the end + name = name[:-1] + for cp in range(cp1, cp2 + 1): + names_ref[cp] = f"{name}{cp:0>4X}" + else: + cp = parse_cp(raw_cp) + names_ref[cp] = name + + for cp in range(0, sys.maxunicode + 1): + self.assertEqual(self.db.name(chr(cp), None), names_ref.get(cp)) + @requires_resource('cpu') def test_name_inverse_lookup(self): for i in range(sys.maxunicode + 1): @@ -104,6 +148,7 @@ def test_name_inverse_lookup(self): if looked_name := self.db.name(char, None): self.assertEqual(self.db.lookup(looked_name), char) + def test_digit(self): self.assertEqual(self.db.digit('A', None), None) self.assertEqual(self.db.digit('9'), 9) From 2ae016c6927b11668fd327dd0b993c6e1a665c49 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 13 Feb 2026 16:44:29 +0200 Subject: [PATCH 04/10] Fix code and tests. --- Lib/test/test_unicodedata.py | 86 +++++++++++++++----------------- Modules/unicodedata.c | 6 +-- Tools/unicode/makeunicodedata.py | 4 +- 3 files changed, 46 insertions(+), 50 deletions(-) diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index 99d3d7970ef199..89f006165f2f07 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -128,50 +128,6 @@ def test_function_checksum(self): result = h.hexdigest() self.assertEqual(result, self.expectedchecksum) - @requires_resource('network') - def test_name(self): - TESTBASEURL = "https://www.unicode.org/Public" - TESTDATAFILE = "extracted/DerivedName.txt" - TESTDATAURL = f"{TESTBASEURL}/{unicodedata.unidata_version}/ucd/{TESTDATAFILE}" - - # Hit the exception early - try: - testdata = open_urlresource(TESTDATAURL, encoding="utf-8") - except PermissionError: - self.skipTest(f"Permission error when downloading {TESTDATAURL} " - f"into the test data directory") - except (OSError, HTTPException) as exc: - self.skipTest(f"Failed to download {TESTDATAURL}: {exc}") - - with testdata: - self.run_name_tests(testdata) - - def run_name_tests(self, testdata): - names_ref = {} - - def parse_cp(s): - return int(s, 16) - - # Parse data - for line in testdata: - line = line.strip() - if not line or line.startswith("#"): - continue - raw_cp, name = line.split("; ") - # Check for a range - if ".." in raw_cp: - cp1, cp2 = map(parse_cp, raw_cp.split("..")) - # remove ‘*’ at the end - name = name[:-1] - for cp in range(cp1, cp2 + 1): - names_ref[cp] = f"{name}{cp:0>4X}" - else: - cp = parse_cp(raw_cp) - names_ref[cp] = name - - for cp in range(0, sys.maxunicode + 1): - self.assertEqual(self.db.name(chr(cp), None), names_ref.get(cp)) - @requires_resource('cpu') def test_name_inverse_lookup(self): for char in iterallchars(): @@ -658,7 +614,47 @@ class UnicodeFunctionsTest(unittest.TestCase, BaseUnicodeFunctionsTest): # (e.g. 'make distclean && make') to get the correct checksum. expectedchecksum = ('83cc43a2fbb779185832b4c049217d80b05bf349' if quicktest else - '65670ae03a324c5f9e826a4de3e25bae4d73c9b7') + '180bdc91143d8aa2eb9dd6726e66d37606205942') + + @requires_resource('network') + def test_name(self): + TESTDATAFILE = "DerivedName.txt" + testdata = download_test_data_file(TESTDATAFILE) + + with testdata: + self.run_name_tests(testdata) + + def run_name_tests(self, testdata): + names_ref = {} + + def parse_cp(s): + return int(s, 16) + + # Parse data + for line in testdata: + line = line.strip() + if not line or line.startswith("#"): + continue + raw_cp, name = line.split("; ") + # Check for a range + if ".." in raw_cp: + cp1, cp2 = map(parse_cp, raw_cp.split("..")) + # remove ‘*’ at the end + assert name[-1] == '*', (raw_cp, name) + name = name[:-1] + for cp in range(cp1, cp2 + 1): + names_ref[cp] = f"{name}{cp:04X}" + elif name[-1] == '*': + cp = parse_cp(raw_cp) + name = name[:-1] + names_ref[cp] = f"{name}{cp:04X}" + else: + assert '*' not in name, (raw_cp, name) + cp = parse_cp(raw_cp) + names_ref[cp] = name + + for cp in range(0, sys.maxunicode + 1): + self.assertEqual(self.db.name(chr(cp), None), names_ref.get(cp)) def test_isxidstart(self): self.assertTrue(self.db.isxidstart('S')) diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index b8ab06b8fb1906..5eec24b594b102 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -1075,8 +1075,8 @@ static int is_tangut_ideograph(Py_UCS4 code) { return - (0x17000 <= code && code <= 0x187F7) || /* Tangut */ - (0x18D00 <= code && code <= 0x18D08); /* Tangut Supplement */ + (0x17000 <= code && code <= 0x187FF) || /* Tangut */ + (0x18D00 <= code && code <= 0x18D1E); /* Tangut Supplement */ } /* macros used to determine if the given code point is in the PUA range that @@ -1500,7 +1500,7 @@ _getcode(const char* name, int namelen, Py_UCS4* code) /* Check for Tangut ideographs. */ if (strncmp(name, "TANGUT IDEOGRAPH-", 17) == 0) { /* Five hexdigits must follow. */ - v = 0; + unsigned int v = 0; name += 17; namelen -= 17; if (namelen != 5) diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py index 7eb6b879f68ab6..2627eebb2ee24c 100644 --- a/Tools/unicode/makeunicodedata.py +++ b/Tools/unicode/makeunicodedata.py @@ -126,8 +126,8 @@ # these ranges need to match unicodedata.c:is_tangut_ideograph tangut_ranges = [ - ('17000', '187F7'), - ('18D00', '18D08') + ('17000', '187FF'), + ('18D00', '18D1E') ] From b69fdfc483e45cf8b72804547a939a5f57a43e8d Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 13 Feb 2026 17:07:44 +0200 Subject: [PATCH 05/10] Add lookup tests and fix case-insensitivity for Tangut ideographs. --- Lib/test/test_ucn.py | 24 ++++++++++++++++++++++++ Modules/unicodedata.c | 11 ++++++----- 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/Lib/test/test_ucn.py b/Lib/test/test_ucn.py index 0c641a455c0747..5f4f1b8e52ccef 100644 --- a/Lib/test/test_ucn.py +++ b/Lib/test/test_ucn.py @@ -111,6 +111,30 @@ def test_cjk_unified_ideographs(self): self.checkletter("cjK UniFIeD idEogRAph-2aBcD", "\U0002abcd") self.checkletter("CJk uNIfiEd IDeOGraPH-2AbCd", "\U0002abcd") + def test_tangut_ideographs(self): + self.checkletter("TANGUT IDEOGRAPH-17000", "\U00017000") + self.checkletter("TANGUT IDEOGRAPH-187FF", "\U000187ff") + self.checkletter("TANGUT IDEOGRAPH-18D00", "\U00018D00") + self.checkletter("TANGUT IDEOGRAPH-18D1E", "\U00018d1e") + self.checkletter("tangut ideograph-18d1e", "\U00018d1e") + + def test_egyptian_hieroglyphs(self): + self.checkletter("EGYPTIAN HIEROGLYPH-13460", "\U00013460") + self.checkletter("EGYPTIAN HIEROGLYPH-143FA", "\U000143fa") + self.checkletter("egyptian hieroglyph-143fa", "\U000143fa") + + def test_khitan_small_script_characters(self): + self.checkletter("KHITAN SMALL SCRIPT CHARACTER-18B00", "\U00018b00") + self.checkletter("KHITAN SMALL SCRIPT CHARACTER-18CD5", "\U00018cd5") + self.checkletter("KHITAN SMALL SCRIPT CHARACTER-18CFF", "\U00018cff") + self.checkletter("KHITAN SMALL SCRIPT CHARACTER-18CFF", "\U00018cff") + self.checkletter("khitan small script character-18cff", "\U00018cff") + + def test_nushu_characters(self): + self.checkletter("NUSHU CHARACTER-1B170", "\U0001b170") + self.checkletter("NUSHU CHARACTER-1B2FB", "\U0001b2fb") + self.checkletter("nushu character-1b2fb", "\U0001b2fb") + def test_bmp_characters(self): for code in range(0x10000): char = chr(code) diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 5eec24b594b102..8c378ba576ba23 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -1498,7 +1498,7 @@ _getcode(const char* name, int namelen, Py_UCS4* code) } /* Check for Tangut ideographs. */ - if (strncmp(name, "TANGUT IDEOGRAPH-", 17) == 0) { + if (PyOS_strnicmp(name, "TANGUT IDEOGRAPH-", 17) == 0) { /* Five hexdigits must follow. */ unsigned int v = 0; name += 17; @@ -1507,10 +1507,11 @@ _getcode(const char* name, int namelen, Py_UCS4* code) return 0; while (namelen--) { v *= 16; - if (*name >= '0' && *name <= '9') - v += *name - '0'; - else if (*name >= 'A' && *name <= 'F') - v += *name - 'A' + 10; + Py_UCS1 c = Py_TOUPPER(*name); + if (c >= '0' && c <= '9') + v += c - '0'; + else if (c >= 'A' && c <= 'F') + v += c - 'A' + 10; else return 0; name++; From accd1500f59b7485fbdd2cf0327c6d2971685710 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 13 Feb 2026 17:57:36 +0200 Subject: [PATCH 06/10] Add tests that work without network and with UCD 3.2.0. --- Lib/test/test_unicodedata.py | 55 +++++++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index 89f006165f2f07..9769528c7b9ac9 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -128,6 +128,59 @@ def test_function_checksum(self): result = h.hexdigest() self.assertEqual(result, self.expectedchecksum) + def test_name(self): + name = self.db.name + self.assertRaises(ValueError, name, '\0') + self.assertRaises(ValueError, name, '\n') + self.assertRaises(ValueError, name, '\x1F') + self.assertRaises(ValueError, name, '\x7F') + self.assertRaises(ValueError, name, '\x9F') + self.assertRaises(ValueError, name, '\uFFFE') + self.assertRaises(ValueError, name, '\uFFFF') + self.assertRaises(ValueError, name, '\U0010FFFF') + self.assertEqual(name('\U0010FFFF', 42), 42) + + self.assertEqual(name(' '), 'SPACE') + self.assertEqual(name('1'), 'DIGIT ONE') + self.assertEqual(name('A'), 'LATIN CAPITAL LETTER A') + self.assertEqual(name('\xA0'), 'NO-BREAK SPACE') + self.assertEqual(name('\u0221', None), None if self.old else + 'LATIN SMALL LETTER D WITH CURL') + self.assertEqual(name('\u3400'), 'CJK UNIFIED IDEOGRAPH-3400') + self.assertEqual(name('\u9FA5'), 'CJK UNIFIED IDEOGRAPH-9FA5') + self.assertEqual(name('\uAC00'), 'HANGUL SYLLABLE GA') + self.assertEqual(name('\uD7A3'), 'HANGUL SYLLABLE HIH') + self.assertEqual(name('\uF900'), 'CJK COMPATIBILITY IDEOGRAPH-F900') + self.assertEqual(name('\uFA6A'), 'CJK COMPATIBILITY IDEOGRAPH-FA6A') + self.assertEqual(name('\uFBF9'), + 'ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA ' + 'ABOVE WITH ALEF MAKSURA ISOLATED FORM') + self.assertEqual(name('\U00013460', None), None if self.old else + 'EGYPTIAN HIEROGLYPH-13460') + self.assertEqual(name('\U000143FA', None), None if self.old else + 'EGYPTIAN HIEROGLYPH-143FA') + self.assertEqual(name('\U00017000', None), None if self.old else + 'TANGUT IDEOGRAPH-17000') + self.assertEqual(name('\U00018B00', None), None if self.old else + 'KHITAN SMALL SCRIPT CHARACTER-18B00') + self.assertEqual(name('\U00018CD5', None), None if self.old else + 'KHITAN SMALL SCRIPT CHARACTER-18CD5') + self.assertEqual(name('\U00018CFF', None), None if self.old else + 'KHITAN SMALL SCRIPT CHARACTER-18CFF') + self.assertEqual(name('\U00018D1E', None), None if self.old else + 'TANGUT IDEOGRAPH-18D1E') + self.assertEqual(name('\U0001B170', None), None if self.old else + 'NUSHU CHARACTER-1B170') + self.assertEqual(name('\U0001B2FB', None), None if self.old else + 'NUSHU CHARACTER-1B2FB') + self.assertEqual(name('\U0001FBA8', None), None if self.old else + 'BOX DRAWINGS LIGHT DIAGONAL UPPER CENTRE TO ' + 'MIDDLE LEFT AND MIDDLE RIGHT TO LOWER CENTRE') + self.assertEqual(name('\U0002A6D6'), 'CJK UNIFIED IDEOGRAPH-2A6D6') + self.assertEqual(name('\U0002FA1D'), 'CJK COMPATIBILITY IDEOGRAPH-2FA1D') + self.assertEqual(name('\U00033479', None), None if self.old else + 'CJK UNIFIED IDEOGRAPH-33479') + @requires_resource('cpu') def test_name_inverse_lookup(self): for char in iterallchars(): @@ -617,7 +670,7 @@ class UnicodeFunctionsTest(unittest.TestCase, BaseUnicodeFunctionsTest): '180bdc91143d8aa2eb9dd6726e66d37606205942') @requires_resource('network') - def test_name(self): + def test_all_names(self): TESTDATAFILE = "DerivedName.txt" testdata = download_test_data_file(TESTDATAFILE) From c09400c4f005fed5512df0094fb17b053d2a2369 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 13 Feb 2026 18:37:40 +0200 Subject: [PATCH 07/10] Share some common code. --- Modules/unicodedata.c | 66 +++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index 8c378ba576ba23..c2402e15bb4610 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -1445,6 +1445,35 @@ _check_alias_and_seq(Py_UCS4* code, int with_named_seq) return 1; } +static Py_UCS4 +parse_hex_code(const char *name, int namelen) +{ + if (namelen < 4 || namelen > 6) { + return (Py_UCS4)-1; + } + if (*name == '0') { + return (Py_UCS4)-1; + } + int v = 0; + while (namelen--) { + v *= 16; + Py_UCS1 c = Py_TOUPPER(*name); + if (c >= '0' && c <= '9') { + v += c - '0'; + } + else if (c >= 'A' && c <= 'F') { + v += c - 'A' + 10; + } + else { + return (Py_UCS4)-1; + } + name++; + } + if (v > 0x10ffff) { + return (Py_UCS4)-1; + } + return v; +} static int _getcode(const char* name, int namelen, Py_UCS4* code) @@ -1474,25 +1503,10 @@ _getcode(const char* name, int namelen, Py_UCS4* code) /* Check for CJK unified ideographs. */ if (PyOS_strnicmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) { /* Four or five hexdigits must follow. */ - unsigned int v; - v = 0; - name += 22; - namelen -= 22; - if (namelen != 4 && namelen != 5) + Py_UCS4 v = parse_hex_code(name + 22, namelen - 22); + if (!is_cjk_unified_ideograph(v)) { return 0; - while (namelen--) { - v *= 16; - Py_UCS1 c = Py_TOUPPER(*name); - if (c >= '0' && c <= '9') - v += c - '0'; - else if (c >= 'A' && c <= 'F') - v += c - 'A' + 10; - else - return 0; - name++; } - if (!is_cjk_unified_ideograph(v)) - return 0; *code = v; return 1; } @@ -1500,24 +1514,10 @@ _getcode(const char* name, int namelen, Py_UCS4* code) /* Check for Tangut ideographs. */ if (PyOS_strnicmp(name, "TANGUT IDEOGRAPH-", 17) == 0) { /* Five hexdigits must follow. */ - unsigned int v = 0; - name += 17; - namelen -= 17; - if (namelen != 5) + Py_UCS4 v = parse_hex_code(name + 17, namelen - 17); + if (!is_tangut_ideograph(v)) { return 0; - while (namelen--) { - v *= 16; - Py_UCS1 c = Py_TOUPPER(*name); - if (c >= '0' && c <= '9') - v += c - '0'; - else if (c >= 'A' && c <= 'F') - v += c - 'A' + 10; - else - return 0; - name++; } - if (!is_tangut_ideograph(v)) - return 0; *code = v; return 1; } From 629531f0314cc7a5664460ccf4f765b9bf67ee57 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 14 Feb 2026 14:49:44 +0200 Subject: [PATCH 08/10] Generalize code for Hangul syllables and CJK and Tangut ideographs. --- Modules/unicodedata.c | 87 ++++++++++++-------------------- Modules/unicodename_db.h | 28 ++++++++++ Tools/unicode/makeunicodedata.py | 61 +++++++++++----------- 3 files changed, 91 insertions(+), 85 deletions(-) diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index c2402e15bb4610..1ed9760874b2a6 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -1052,31 +1052,18 @@ static const char * const hangul_syllables[][3] = { { 0, 0, "H" } }; -/* These ranges need to match makeunicodedata.py:cjk_ranges. */ static int -is_cjk_unified_ideograph(Py_UCS4 code) -{ - return - (0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */ - (0x4E00 <= code && code <= 0x9FFF) || /* CJK Ideograph */ - (0x20000 <= code && code <= 0x2A6DF) || /* CJK Ideograph Extension B */ - (0x2A700 <= code && code <= 0x2B73F) || /* CJK Ideograph Extension C */ - (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */ - (0x2B820 <= code && code <= 0x2CEAD) || /* CJK Ideograph Extension E */ - (0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */ - (0x2EBF0 <= code && code <= 0x2EE5D) || /* CJK Ideograph Extension I */ - (0x30000 <= code && code <= 0x3134A) || /* CJK Ideograph Extension G */ - (0x31350 <= code && code <= 0x323AF) || /* CJK Ideograph Extension H */ - (0x323B0 <= code && code <= 0x33479); /* CJK Ideograph Extension J */ -} - -/* These ranges need to match makeunicodedata.py:tangut_ranges. */ -static int -is_tangut_ideograph(Py_UCS4 code) +find_prefix_id(Py_UCS4 code) { - return - (0x17000 <= code && code <= 0x187FF) || /* Tangut */ - (0x18D00 <= code && code <= 0x18D1E); /* Tangut Supplement */ + for (int i = 0; i < (int)Py_ARRAY_LENGTH(derived_name_ranges); i++) { + if (code < derived_name_ranges[i].first) { + return -1; + } + if (code <= derived_name_ranges[i].last) { + return derived_name_ranges[i].prefixid; + } + } + return -1; } /* macros used to determine if the given code point is in the PUA range that @@ -1354,7 +1341,9 @@ _getucname(PyObject *self, } } - if (SBase <= code && code < SBase+SCount) { + int prefixid = find_prefix_id(code); + if (prefixid == 0) { + assert(SBase <= code && code < SBase+SCount); /* Hangul syllable. */ int SIndex = code - SBase; int L = SIndex / NCount; @@ -1376,19 +1365,11 @@ _getucname(PyObject *self, return 1; } - if (is_cjk_unified_ideograph(code)) { - if (buflen < 28) - /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */ - return 0; - sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code); - return 1; - } - - if (is_tangut_ideograph(code)) { - if (buflen < 23) - /* Worst case: TANGUT IDEOGRAPH-18D08 */ + if (prefixid > 0) { + const char *prefix = derived_name_prefixes[prefixid]; + if (snprintf(buffer, buflen, "%s%04X", prefix, code) >= buflen) { return 0; - sprintf(buffer, "TANGUT IDEOGRAPH-%X", code); + } return 1; } @@ -1482,8 +1463,19 @@ _getcode(const char* name, int namelen, Py_UCS4* code) * Named aliases are not resolved, they are returned as a code point in the * PUA */ - /* Check for hangul syllables. */ - if (PyOS_strnicmp(name, "HANGUL SYLLABLE ", 16) == 0) { + int i = 0; + size_t prefixlen; + for (; i < (int)Py_ARRAY_LENGTH(derived_name_prefixes); i++) { + const char *prefix = derived_name_prefixes[i]; + prefixlen = strlen(derived_name_prefixes[i]); + if (PyOS_strnicmp(name, prefix, prefixlen) == 0) { + break; + } + } + + if (i == 0) { + /* Hangul syllables. */ + assert(PyOS_strnicmp(name, "HANGUL SYLLABLE ", 16) == 0); int len, L = -1, V = -1, T = -1; const char *pos = name + 16; find_syllable(pos, &len, &L, LCount, 0); @@ -1500,22 +1492,9 @@ _getcode(const char* name, int namelen, Py_UCS4* code) return 0; } - /* Check for CJK unified ideographs. */ - if (PyOS_strnicmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) { - /* Four or five hexdigits must follow. */ - Py_UCS4 v = parse_hex_code(name + 22, namelen - 22); - if (!is_cjk_unified_ideograph(v)) { - return 0; - } - *code = v; - return 1; - } - - /* Check for Tangut ideographs. */ - if (PyOS_strnicmp(name, "TANGUT IDEOGRAPH-", 17) == 0) { - /* Five hexdigits must follow. */ - Py_UCS4 v = parse_hex_code(name + 17, namelen - 17); - if (!is_tangut_ideograph(v)) { + if (i < (int)Py_ARRAY_LENGTH(derived_name_prefixes)) { + Py_UCS4 v = parse_hex_code(name + prefixlen, namelen - prefixlen); + if (find_prefix_id(v) != i) { return 0; } *code = v; diff --git a/Modules/unicodename_db.h b/Modules/unicodename_db.h index d67e968e7a01ae..d9d062a2345974 100644 --- a/Modules/unicodename_db.h +++ b/Modules/unicodename_db.h @@ -19684,3 +19684,31 @@ static const named_sequence named_sequences[] = { {2, {0x02E5, 0x02E9}}, {2, {0x02E9, 0x02E5}}, }; + +typedef struct { + Py_UCS4 first; + Py_UCS4 last; + int prefixid; +} derived_name_range; + +static const derived_name_range derived_name_ranges[] = { + {0x3400, 0x4DBF, 1}, + {0x4E00, 0x9FFF, 1}, + {0xAC00, 0xD7A3, 0}, + {0x17000, 0x187FF, 2}, + {0x18D00, 0x18D1E, 2}, + {0x20000, 0x2A6DF, 1}, + {0x2A700, 0x2B73F, 1}, + {0x2B740, 0x2B81D, 1}, + {0x2B820, 0x2CEAD, 1}, + {0x2CEB0, 0x2EBE0, 1}, + {0x2EBF0, 0x2EE5D, 1}, + {0x30000, 0x3134A, 1}, + {0x31350, 0x323AF, 1}, + {0x323B0, 0x33479, 1}, +}; +static const char * const derived_name_prefixes[] = { + "HANGUL SYLLABLE ", + "CJK UNIFIED IDEOGRAPH-", + "TANGUT IDEOGRAPH-", +}; diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py index 2627eebb2ee24c..432dc3a68bf5ed 100644 --- a/Tools/unicode/makeunicodedata.py +++ b/Tools/unicode/makeunicodedata.py @@ -109,25 +109,13 @@ CASED_MASK = 0x2000 EXTENDED_CASE_MASK = 0x4000 -# these ranges need to match unicodedata.c:is_cjk_unified_ideograph -cjk_ranges = [ - ('3400', '4DBF'), # CJK Ideograph Extension A CJK - ('4E00', '9FFF'), # CJK Ideograph - ('20000', '2A6DF'), # CJK Ideograph Extension B - ('2A700', '2B73F'), # CJK Ideograph Extension C - ('2B740', '2B81D'), # CJK Ideograph Extension D - ('2B820', '2CEAD'), # CJK Ideograph Extension E - ('2CEB0', '2EBE0'), # CJK Ideograph Extension F - ('2EBF0', '2EE5D'), # CJK Ideograph Extension I - ('30000', '3134A'), # CJK Ideograph Extension G - ('31350', '323AF'), # CJK Ideograph Extension H - ('323B0', '33479'), # CJK Ideograph Extension J -] - -# these ranges need to match unicodedata.c:is_tangut_ideograph -tangut_ranges = [ - ('17000', '187FF'), - ('18D00', '18D1E') +# Maps the range names in UnicodeData.txt to prefixes for +# derived names specified by rule NR2. +# Hangul should always be at index 0, since it uses special format. +derived_name_range_names = [ + ("Hangul Syllable", "HANGUL SYLLABLE "), + ("CJK Ideograph", "CJK UNIFIED IDEOGRAPH-"), + ("Tangut Ideograph", "TANGUT IDEOGRAPH-"), ] @@ -737,6 +725,23 @@ def makeunicodename(unicode, trace): fprint(' {%d, {%s}},' % (len(sequence), seq_str)) fprint('};') + fprint(dedent(""" + typedef struct { + Py_UCS4 first; + Py_UCS4 last; + int prefixid; + } derived_name_range; + """)) + + fprint('static const derived_name_range derived_name_ranges[] = {') + for name_range in unicode.derived_name_ranges: + fprint(' {0x%s, 0x%s, %d},' % name_range) + fprint('};') + + fprint('static const char * const derived_name_prefixes[] = {') + for _, prefix in derived_name_range_names: + fprint(' "%s",' % prefix) + fprint('};') def merge_old_version(version, new, old): # Changes to exclusion file not implemented yet @@ -959,8 +964,7 @@ def __init__(self, version, ideograph_check=True): char = int(s[0], 16) table[char] = from_row(s) - cjk_ranges_found = [] - tangut_ranges_found = [] + self.derived_name_ranges = [] # expand first-last ranges field = None @@ -974,20 +978,15 @@ def __init__(self, version, ideograph_check=True): s.name = "" field = dataclasses.astuple(s)[:15] elif s.name[-5:] == "Last>": - if s.name.startswith(" Date: Sat, 14 Feb 2026 14:51:58 +0200 Subject: [PATCH 09/10] Update the NEWS file. --- .../next/Library/2023-02-05-20-02-30.gh-issue-80667.7LmzeA.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2023-02-05-20-02-30.gh-issue-80667.7LmzeA.rst b/Misc/NEWS.d/next/Library/2023-02-05-20-02-30.gh-issue-80667.7LmzeA.rst index bbdcb4ffa0998b..f82f1eeb0589c6 100644 --- a/Misc/NEWS.d/next/Library/2023-02-05-20-02-30.gh-issue-80667.7LmzeA.rst +++ b/Misc/NEWS.d/next/Library/2023-02-05-20-02-30.gh-issue-80667.7LmzeA.rst @@ -1 +1 @@ -unicodedata: Fix missing Tangut Ideographs names. +Add support for Tangut Ideographs names in :mod:`unicodedata`. From 163e6c950dcba3df6d1ff8d93d76089bbb1c3f83 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 14 Feb 2026 15:08:36 +0200 Subject: [PATCH 10/10] Add tests for invalid names. --- Lib/test/test_unicodedata.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index 9769528c7b9ac9..d100dae1110b7f 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -205,6 +205,17 @@ def test_lookup_nonexistant(self): "HANDBUG", "MODIFIER LETTER CYRILLIC SMALL QUESTION MARK", "???", + "CJK UNIFIED IDEOGRAPH-03400", + "CJK UNIFIED IDEOGRAPH-020000", + "CJK UNIFIED IDEOGRAPH-33FF", + "CJK UNIFIED IDEOGRAPH-F900", + "CJK UNIFIED IDEOGRAPH-13460", + "CJK UNIFIED IDEOGRAPH-17000", + "CJK UNIFIED IDEOGRAPH-18B00", + "CJK UNIFIED IDEOGRAPH-1B170", + "CJK COMPATIBILITY IDEOGRAPH-3400", + "TANGUT IDEOGRAPH-3400", + "HANGUL SYLLABLE AC00", ]: self.assertRaises(KeyError, self.db.lookup, nonexistent)