Skip to content
Open
24 changes: 24 additions & 0 deletions Lib/test/test_ucn.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,30 @@ def test_cjk_unified_ideographs(self):
self.checkletter("cjK UniFIeD idEogRAph-2aBcD", "\U0002abcd")
self.checkletter("CJk uNIfiEd IDeOGraPH-2AbCd", "\U0002abcd")

def test_tangut_ideographs(self):
self.checkletter("TANGUT IDEOGRAPH-17000", "\U00017000")
self.checkletter("TANGUT IDEOGRAPH-187FF", "\U000187ff")
self.checkletter("TANGUT IDEOGRAPH-18D00", "\U00018D00")
self.checkletter("TANGUT IDEOGRAPH-18D1E", "\U00018d1e")
self.checkletter("tangut ideograph-18d1e", "\U00018d1e")

def test_egyptian_hieroglyphs(self):
self.checkletter("EGYPTIAN HIEROGLYPH-13460", "\U00013460")
self.checkletter("EGYPTIAN HIEROGLYPH-143FA", "\U000143fa")
self.checkletter("egyptian hieroglyph-143fa", "\U000143fa")

def test_khitan_small_script_characters(self):
self.checkletter("KHITAN SMALL SCRIPT CHARACTER-18B00", "\U00018b00")
self.checkletter("KHITAN SMALL SCRIPT CHARACTER-18CD5", "\U00018cd5")
self.checkletter("KHITAN SMALL SCRIPT CHARACTER-18CFF", "\U00018cff")
self.checkletter("KHITAN SMALL SCRIPT CHARACTER-18CFF", "\U00018cff")
self.checkletter("khitan small script character-18cff", "\U00018cff")

def test_nushu_characters(self):
self.checkletter("NUSHU CHARACTER-1B170", "\U0001b170")
self.checkletter("NUSHU CHARACTER-1B2FB", "\U0001b2fb")
self.checkletter("nushu character-1b2fb", "\U0001b2fb")

def test_bmp_characters(self):
for code in range(0x10000):
char = chr(code)
Expand Down
107 changes: 106 additions & 1 deletion Lib/test/test_unicodedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,60 @@ def test_function_checksum(self):
result = h.hexdigest()
self.assertEqual(result, self.expectedchecksum)

def test_name(self):
name = self.db.name
self.assertRaises(ValueError, name, '\0')
self.assertRaises(ValueError, name, '\n')
self.assertRaises(ValueError, name, '\x1F')
self.assertRaises(ValueError, name, '\x7F')
self.assertRaises(ValueError, name, '\x9F')
self.assertRaises(ValueError, name, '\uFFFE')
self.assertRaises(ValueError, name, '\uFFFF')
self.assertRaises(ValueError, name, '\U0010FFFF')
self.assertEqual(name('\U0010FFFF', 42), 42)

self.assertEqual(name(' '), 'SPACE')
self.assertEqual(name('1'), 'DIGIT ONE')
self.assertEqual(name('A'), 'LATIN CAPITAL LETTER A')
self.assertEqual(name('\xA0'), 'NO-BREAK SPACE')
self.assertEqual(name('\u0221', None), None if self.old else
'LATIN SMALL LETTER D WITH CURL')
self.assertEqual(name('\u3400'), 'CJK UNIFIED IDEOGRAPH-3400')
self.assertEqual(name('\u9FA5'), 'CJK UNIFIED IDEOGRAPH-9FA5')
self.assertEqual(name('\uAC00'), 'HANGUL SYLLABLE GA')
self.assertEqual(name('\uD7A3'), 'HANGUL SYLLABLE HIH')
self.assertEqual(name('\uF900'), 'CJK COMPATIBILITY IDEOGRAPH-F900')
self.assertEqual(name('\uFA6A'), 'CJK COMPATIBILITY IDEOGRAPH-FA6A')
self.assertEqual(name('\uFBF9'),
'ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA '
'ABOVE WITH ALEF MAKSURA ISOLATED FORM')
self.assertEqual(name('\U00013460', None), None if self.old else
'EGYPTIAN HIEROGLYPH-13460')
self.assertEqual(name('\U000143FA', None), None if self.old else
'EGYPTIAN HIEROGLYPH-143FA')
self.assertEqual(name('\U00017000', None), None if self.old else
'TANGUT IDEOGRAPH-17000')
self.assertEqual(name('\U00018B00', None), None if self.old else
'KHITAN SMALL SCRIPT CHARACTER-18B00')
self.assertEqual(name('\U00018CD5', None), None if self.old else
'KHITAN SMALL SCRIPT CHARACTER-18CD5')
self.assertEqual(name('\U00018CFF', None), None if self.old else
'KHITAN SMALL SCRIPT CHARACTER-18CFF')
self.assertEqual(name('\U00018D1E', None), None if self.old else
'TANGUT IDEOGRAPH-18D1E')
self.assertEqual(name('\U0001B170', None), None if self.old else
'NUSHU CHARACTER-1B170')
self.assertEqual(name('\U0001B2FB', None), None if self.old else
'NUSHU CHARACTER-1B2FB')
self.assertEqual(name('\U0001FBA8', None), None if self.old else
'BOX DRAWINGS LIGHT DIAGONAL UPPER CENTRE TO '
'MIDDLE LEFT AND MIDDLE RIGHT TO LOWER CENTRE')
self.assertEqual(name('\U0002A6D6'), 'CJK UNIFIED IDEOGRAPH-2A6D6')
self.assertEqual(name('\U0002FA1D'), 'CJK COMPATIBILITY IDEOGRAPH-2FA1D')
self.assertEqual(name('\U00033479', None), None if self.old else
'CJK UNIFIED IDEOGRAPH-33479')

@requires_resource('cpu')
def test_name_inverse_lookup(self):
for char in iterallchars():
looked_name = self.db.name(char, None)
Expand All @@ -151,6 +205,17 @@ def test_lookup_nonexistant(self):
"HANDBUG",
"MODIFIER LETTER CYRILLIC SMALL QUESTION MARK",
"???",
"CJK UNIFIED IDEOGRAPH-03400",
"CJK UNIFIED IDEOGRAPH-020000",
"CJK UNIFIED IDEOGRAPH-33FF",
"CJK UNIFIED IDEOGRAPH-F900",
"CJK UNIFIED IDEOGRAPH-13460",
"CJK UNIFIED IDEOGRAPH-17000",
"CJK UNIFIED IDEOGRAPH-18B00",
"CJK UNIFIED IDEOGRAPH-1B170",
"CJK COMPATIBILITY IDEOGRAPH-3400",
"TANGUT IDEOGRAPH-3400",
"HANGUL SYLLABLE AC00",
]:
self.assertRaises(KeyError, self.db.lookup, nonexistent)

Expand Down Expand Up @@ -613,7 +678,47 @@ class UnicodeFunctionsTest(unittest.TestCase, BaseUnicodeFunctionsTest):
# (e.g. 'make distclean && make') to get the correct checksum.
expectedchecksum = ('83cc43a2fbb779185832b4c049217d80b05bf349'
if quicktest else
'65670ae03a324c5f9e826a4de3e25bae4d73c9b7')
'180bdc91143d8aa2eb9dd6726e66d37606205942')

@requires_resource('network')
def test_all_names(self):
    # Obtain DerivedName.txt through download_test_data_file() and pass it
    # to run_name_tests().  The returned object is used as a context
    # manager around the whole check.
    testdata = download_test_data_file("DerivedName.txt")
    with testdata:
        self.run_name_tests(testdata)

def run_name_tests(self, testdata):
    """Compare self.db.name() with reference data for every code point.

    *testdata* is an iterable of text lines.  Blank lines and lines
    starting with "#" are ignored.  Every other line has the form
    "<code point> ; <name>" or "<first>..<last> ; <name>", with code
    points in hexadecimal.  A name ending in "*" is a prefix that is
    completed with the code point formatted as at least four upper-case
    hex digits; range entries must use that form.

    Code points that are not listed are expected to have no name, i.e.
    self.db.name(char, None) must return None for them.
    """
    names_ref = {}

    for line in testdata:
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        # Split on the bare ";" and strip both fields so that the amount
        # of padding around the separator does not matter.
        raw_cp, sep, name = line.partition(";")
        self.assertTrue(sep, f"missing ';' in {line!r}")
        raw_cp = raw_cp.strip()
        name = name.strip()

        # A single code point is handled as a one-element range.
        first, is_range, last = raw_cp.partition("..")
        cp1 = int(first, 16)
        cp2 = int(last, 16) if is_range else cp1

        if name.endswith("*"):
            # Derived name: the "*" stands for the code point in hex.
            prefix = name[:-1]
            for cp in range(cp1, cp2 + 1):
                names_ref[cp] = f"{prefix}{cp:04X}"
        else:
            # A literal name can only belong to a single code point.
            # Use assert* methods rather than bare assert statements so
            # that the checks are still performed under -O.
            self.assertFalse(is_range, (raw_cp, name))
            self.assertNotIn("*", name, (raw_cp, name))
            names_ref[cp1] = name

    for cp in range(sys.maxunicode + 1):
        expected = names_ref.get(cp)
        actual = self.db.name(chr(cp), None)
        if actual != expected:
            # Build the message only on failure: this loop runs more than
            # a million times.  The message identifies the code point.
            self.assertEqual(actual, expected, f"U+{cp:04X}")

def test_isxidstart(self):
self.assertTrue(self.db.isxidstart('S'))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add support for Tangut ideograph names in :mod:`unicodedata`.
105 changes: 63 additions & 42 deletions Modules/unicodedata.c
Original file line number Diff line number Diff line change
Expand Up @@ -1052,22 +1052,18 @@
{ 0, 0, "H" }
};

/* These ranges need to match makeunicodedata.py:cjk_ranges. */
static int
is_unified_ideograph(Py_UCS4 code)
{
return
(0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */
(0x4E00 <= code && code <= 0x9FFF) || /* CJK Ideograph */
(0x20000 <= code && code <= 0x2A6DF) || /* CJK Ideograph Extension B */
(0x2A700 <= code && code <= 0x2B73F) || /* CJK Ideograph Extension C */
(0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
(0x2B820 <= code && code <= 0x2CEAD) || /* CJK Ideograph Extension E */
(0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
(0x2EBF0 <= code && code <= 0x2EE5D) || /* CJK Ideograph Extension I */
(0x30000 <= code && code <= 0x3134A) || /* CJK Ideograph Extension G */
(0x31350 <= code && code <= 0x323AF) || /* CJK Ideograph Extension H */
(0x323B0 <= code && code <= 0x33479); /* CJK Ideograph Extension J */
find_prefix_id(Py_UCS4 code)
{
for (int i = 0; i < (int)Py_ARRAY_LENGTH(derived_name_ranges); i++) {
if (code < derived_name_ranges[i].first) {
return -1;
}
if (code <= derived_name_ranges[i].last) {
return derived_name_ranges[i].prefixid;
}
}
return -1;
}

/* macros used to determine if the given code point is in the PUA range that
Expand Down Expand Up @@ -1345,7 +1341,9 @@
}
}

if (SBase <= code && code < SBase+SCount) {
int prefixid = find_prefix_id(code);
if (prefixid == 0) {
assert(SBase <= code && code < SBase+SCount);
/* Hangul syllable. */
int SIndex = code - SBase;
int L = SIndex / NCount;
Expand All @@ -1367,11 +1365,11 @@
return 1;
}

if (is_unified_ideograph(code)) {
if (buflen < 28)
/* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
if (prefixid > 0) {
const char *prefix = derived_name_prefixes[prefixid];
if (snprintf(buffer, buflen, "%s%04X", prefix, code) >= buflen) {
return 0;
sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
}
return 1;
}

Expand Down Expand Up @@ -1428,6 +1426,35 @@
return 1;
}

/* Parse the hexadecimal code point that follows a derived-name prefix.
 *
 * name points at the first character after the prefix and namelen is the
 * number of characters to parse; no terminating NUL is required.  The text
 * must consist of 4 to 6 hex digits (either case), must not start with '0',
 * and must not exceed 0x10FFFF.
 *
 * Returns the parsed value, or (Py_UCS4)-1 if any of these conditions is
 * not met.
 */
static Py_UCS4
parse_hex_code(const char *name, int namelen)
{
    const Py_UCS4 invalid = (Py_UCS4)-1;

    /* The length test comes first so that name[0] is only read when there
       is at least one character. */
    if (namelen < 4 || namelen > 6 || name[0] == '0') {
        return invalid;
    }

    /* At most six hex digits are accumulated, so the value fits in an int. */
    int value = 0;
    for (const char *p = name, *end = name + namelen; p != end; p++) {
        Py_UCS1 ch = Py_TOUPPER(*p);
        int digit;
        if ('0' <= ch && ch <= '9') {
            digit = ch - '0';
        }
        else if ('A' <= ch && ch <= 'F') {
            digit = ch - 'A' + 10;
        }
        else {
            return invalid;
        }
        value = value * 16 + digit;
    }

    if (value > 0x10ffff) {
        return invalid;
    }
    return value;
}

static int
_getcode(const char* name, int namelen, Py_UCS4* code)
Expand All @@ -1436,8 +1463,19 @@
* Named aliases are not resolved, they are returned as a code point in the
* PUA */

/* Check for hangul syllables. */
if (PyOS_strnicmp(name, "HANGUL SYLLABLE ", 16) == 0) {
int i = 0;
size_t prefixlen;
for (; i < (int)Py_ARRAY_LENGTH(derived_name_prefixes); i++) {
const char *prefix = derived_name_prefixes[i];
prefixlen = strlen(derived_name_prefixes[i]);
if (PyOS_strnicmp(name, prefix, prefixlen) == 0) {
break;
}
}

if (i == 0) {
/* Hangul syllables. */
assert(PyOS_strnicmp(name, "HANGUL SYLLABLE ", 16) == 0);
int len, L = -1, V = -1, T = -1;
const char *pos = name + 16;
find_syllable(pos, &len, &L, LCount, 0);
Expand All @@ -1454,28 +1492,11 @@
return 0;
}

/* Check for unified ideographs. */
if (PyOS_strnicmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
/* Four or five hexdigits must follow. */
unsigned int v;
v = 0;
name += 22;
namelen -= 22;
if (namelen != 4 && namelen != 5)
if (i < (int)Py_ARRAY_LENGTH(derived_name_prefixes)) {
Py_UCS4 v = parse_hex_code(name + prefixlen, namelen - prefixlen);

Check warning on line 1496 in Modules/unicodedata.c

View workflow job for this annotation

GitHub Actions / Windows / Build and test (arm64)

'function': conversion from 'size_t' to 'int', possible loss of data [C:\a\cpython\cpython\PCbuild\unicodedata.vcxproj]

Check warning on line 1496 in Modules/unicodedata.c

View workflow job for this annotation

GitHub Actions / Windows (free-threading) / Build and test (x64)

'function': conversion from 'size_t' to 'int', possible loss of data [D:\a\cpython\cpython\PCbuild\unicodedata.vcxproj]

Check warning on line 1496 in Modules/unicodedata.c

View workflow job for this annotation

GitHub Actions / Windows (free-threading) / Build and test (arm64)

'function': conversion from 'size_t' to 'int', possible loss of data [C:\a\cpython\cpython\PCbuild\unicodedata.vcxproj]

Check warning on line 1496 in Modules/unicodedata.c

View workflow job for this annotation

GitHub Actions / Windows / Build and test (x64)

'function': conversion from 'size_t' to 'int', possible loss of data [D:\a\cpython\cpython\PCbuild\unicodedata.vcxproj]
if (find_prefix_id(v) != i) {
return 0;
while (namelen--) {
v *= 16;
Py_UCS1 c = Py_TOUPPER(*name);
if (c >= '0' && c <= '9')
v += c - '0';
else if (c >= 'A' && c <= 'F')
v += c - 'A' + 10;
else
return 0;
name++;
}
if (!is_unified_ideograph(v))
return 0;
*code = v;
return 1;
}
Expand Down
28 changes: 28 additions & 0 deletions Modules/unicodename_db.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading