From ca152d3222aff84796a1c35e2751586422605493 Mon Sep 17 00:00:00 2001 From: kovan Date: Fri, 27 Feb 2026 23:44:39 +0100 Subject: [PATCH 1/2] gh-142162: Make PyREPL navigate and edit by grapheme cluster MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backspace, delete, and arrow keys now operate on grapheme clusters (e.g. base character + combining accents) rather than individual Unicode codepoints. This uses unicodedata.iter_graphemes() (new in 3.15) to find cluster boundaries, so one keypress deletes or skips an entire visual character like é (e + U+0301). Changes: - Add prev_grapheme_boundary/next_grapheme_boundary helpers to Reader - Update left, right, backspace, delete commands in commands.py - Update backspace_dedent in readline.py for the non-dedent case Co-Authored-By: Claude Opus 4.6 --- Lib/_pyrepl/commands.py | 18 +++---- Lib/_pyrepl/reader.py | 35 ++++++++++++ Lib/_pyrepl/readline.py | 10 +++- Lib/test/test_pyrepl/test_reader.py | 84 +++++++++++++++++++++++++++++ 4 files changed, 136 insertions(+), 11 deletions(-) diff --git a/Lib/_pyrepl/commands.py b/Lib/_pyrepl/commands.py index 10127e58897a58..966614ae41fcf9 100644 --- a/Lib/_pyrepl/commands.py +++ b/Lib/_pyrepl/commands.py @@ -312,9 +312,8 @@ class left(MotionCommand): def do(self) -> None: r = self.reader for _ in range(r.get_arg()): - p = r.pos - 1 - if p >= 0: - r.pos = p + if r.pos > 0: + r.pos = r.prev_grapheme_boundary() else: self.reader.error("start of buffer") @@ -324,9 +323,8 @@ def do(self) -> None: r = self.reader b = r.buffer for _ in range(r.get_arg()): - p = r.pos + 1 - if p <= len(b): - r.pos = p + if r.pos < len(b): + r.pos = r.next_grapheme_boundary() else: self.reader.error("end of buffer") @@ -409,8 +407,9 @@ def do(self) -> None: b = r.buffer for i in range(r.get_arg()): if r.pos > 0: - r.pos -= 1 - del b[r.pos] + prev = r.prev_grapheme_boundary() + del b[prev:r.pos] + r.pos = prev r.dirty = True else: self.reader.error("can't backspace at start") @@ -433,7 +432,8 @@ def do(self) -> None: for i in range(r.get_arg()): if r.pos != len(b): - del b[r.pos] + next_pos = r.next_grapheme_boundary() + del b[r.pos:next_pos] r.dirty = True else: self.reader.error("end of buffer") diff --git a/Lib/_pyrepl/reader.py b/Lib/_pyrepl/reader.py index 9ab92f64d1ef63..27349da68a6ead 100644 --- a/Lib/_pyrepl/reader.py +++ b/Lib/_pyrepl/reader.py @@ -22,6 +22,7 @@ from __future__ import annotations import sys +import unicodedata import _colorize from contextlib import contextmanager @@ -458,6 +459,40 @@ def eol(self, p: int | None = None) -> int: p += 1 return p + def prev_grapheme_boundary(self, pos: int | None = None) -> int: + """Return the position at the start of the grapheme cluster + preceding pos (or self.pos). + + For plain ASCII this is just pos - 1. For combining characters + (e.g. 'e' + U+0301 COMBINING ACUTE ACCENT) it skips the whole + cluster so that one Backspace/Left deletes the visual character. + """ + if pos is None: + pos = self.pos + bol = self.bol(pos) + if pos <= bol: + return pos + line = "".join(self.buffer[bol:pos]) + # Find the last grapheme cluster in the line up to pos + *_, last = unicodedata.iter_graphemes(line) + return bol + last.start + + def next_grapheme_boundary(self, pos: int | None = None) -> int: + """Return the position just past the grapheme cluster starting + at pos (or self.pos). + + For plain ASCII this is just pos + 1. For combining characters + it skips the whole cluster. + """ + if pos is None: + pos = self.pos + eol = self.eol(pos) + if pos >= eol: + return pos + tail = "".join(self.buffer[pos:eol]) + first = next(unicodedata.iter_graphemes(tail)) + return pos + first.end + def max_column(self, y: int) -> int: """Return the last x-offset for line y""" return self.screeninfo[y][0] + sum(self.screeninfo[y][1]) diff --git a/Lib/_pyrepl/readline.py b/Lib/_pyrepl/readline.py index 23b8fa6b9c7625..e9d0ea5d42a8d8 100644 --- a/Lib/_pyrepl/readline.py +++ b/Lib/_pyrepl/readline.py @@ -334,8 +334,14 @@ def do(self) -> None: if pi is not None and pi < indent: repeat = indent - pi break - r.pos -= repeat - del b[r.pos : r.pos + repeat] + if repeat == 1: + # Use grapheme-aware deletion for non-dedent case + prev = r.prev_grapheme_boundary() + del b[prev:r.pos] + r.pos = prev + else: + r.pos -= repeat + del b[r.pos : r.pos + repeat] r.dirty = True else: self.reader.error("can't backspace at start") diff --git a/Lib/test/test_pyrepl/test_reader.py b/Lib/test/test_pyrepl/test_reader.py index b1b6ae16a1e592..ca13416a286bae 100644 --- a/Lib/test/test_pyrepl/test_reader.py +++ b/Lib/test/test_pyrepl/test_reader.py @@ -558,3 +558,87 @@ def test_control_characters(self): reader, _ = handle_all_events(events) self.assert_screen_equal(reader, 'flag = "🏳️\\u200d🌈"', clean=True) self.assert_screen_equal(reader, 'flag {o}={z} {s}"🏳️\\u200d🌈"{z}'.format(**colors)) + + # -- grapheme cluster (combining character) tests -- + + def test_backspace_combining_character(self): + # 'e' + combining acute accent U+0301 = one visual char + events = itertools.chain( + code_to_events("e\u0301"), + [Event(evt="key", data="backspace", raw=bytearray(b"\x7f"))], + ) + reader, _ = handle_all_events(events) + self.assertEqual(reader.buffer, []) + self.assertEqual(reader.pos, 0) + + def test_backspace_combining_in_middle(self): + # "ae\u0301z" → backspace should remove "e\u0301", leaving "az" + events = itertools.chain( + code_to_events("ae\u0301z"), + [ + Event(evt="key", data="left", raw=bytearray(b"\x1bOD")), + Event(evt="key", data="backspace", raw=bytearray(b"\x7f")), + ], + ) + reader, _ = handle_all_events(events) + self.assertEqual(reader.buffer, ["a", "z"]) + self.assertEqual(reader.pos, 1) + + def test_delete_combining_character(self): + # Cursor at start, delete should remove entire "e\u0301" + events = itertools.chain( + code_to_events("e\u0301"), + [ + Event(evt="key", data="home", raw=bytearray(b"\x1b[H")), + Event(evt="key", data="delete", raw=bytearray(b"\x7f")), + ], + ) + reader, _ = handle_all_events(events) + self.assertEqual(reader.buffer, []) + self.assertEqual(reader.pos, 0) + + def test_left_skips_combining_character(self): + # After typing "e\u0301", left should move past both codepoints + events = itertools.chain( + code_to_events("ae\u0301"), + [Event(evt="key", data="left", raw=bytearray(b"\x1bOD"))], + ) + reader, _ = handle_all_events(events) + # Should land before 'e', not between 'e' and combining accent + self.assertEqual(reader.pos, 1) + + def test_right_skips_combining_character(self): + # Move to start, then right should skip "e\u0301" as one unit + events = itertools.chain( + code_to_events("e\u0301z"), + [ + Event(evt="key", data="home", raw=bytearray(b"\x1b[H")), + Event(evt="key", data="right", raw=bytearray(b"\x1bOC")), + ], + ) + reader, _ = handle_all_events(events) + # Should be past both 'e' and combining accent, before 'z' + self.assertEqual(reader.pos, 2) + + def test_backspace_plain_ascii(self): + # Regression: plain ASCII should still work as before + events = itertools.chain( + code_to_events("abc"), + [Event(evt="key", data="backspace", raw=bytearray(b"\x7f"))], + ) + reader, _ = handle_all_events(events) + self.assertEqual(reader.buffer, ["a", "b"]) + self.assertEqual(reader.pos, 2) + + def test_left_right_plain_ascii(self): + # Regression: plain ASCII left/right still move one char at a time + events = itertools.chain( + code_to_events("abc"), + [ + Event(evt="key", data="left", raw=bytearray(b"\x1bOD")), + Event(evt="key", data="left", raw=bytearray(b"\x1bOD")), + Event(evt="key", data="right", raw=bytearray(b"\x1bOC")), + ], + ) + reader, _ = handle_all_events(events) + self.assertEqual(reader.pos, 2) From 09b7c30f93954a7d591a74e4370a3c1fa34f9108 Mon Sep 17 00:00:00 2001 From: kovan Date: Fri, 27 Feb 2026 23:55:59 +0100 Subject: [PATCH 2/2] gh-142162: Add mypy type: ignore for unicodedata.iter_graphemes The iter_graphemes API is new in 3.15 and not yet in mypy's typeshed stubs, so mypy reports attr-defined and no-any-return errors. Co-Authored-By: Claude Opus 4.6 --- Lib/_pyrepl/reader.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Lib/_pyrepl/reader.py b/Lib/_pyrepl/reader.py index 27349da68a6ead..b28b4ef659f64b 100644 --- a/Lib/_pyrepl/reader.py +++ b/Lib/_pyrepl/reader.py @@ -474,8 +474,8 @@ def prev_grapheme_boundary(self, pos: int | None = None) -> int: return pos line = "".join(self.buffer[bol:pos]) # Find the last grapheme cluster in the line up to pos - *_, last = unicodedata.iter_graphemes(line) - return bol + last.start + *_, last = unicodedata.iter_graphemes(line) # type: ignore[attr-defined] + return bol + last.start # type: ignore[no-any-return] def next_grapheme_boundary(self, pos: int | None = None) -> int: """Return the position just past the grapheme cluster starting @@ -490,8 +490,8 @@ def next_grapheme_boundary(self, pos: int | None = None) -> int: if pos >= eol: return pos tail = "".join(self.buffer[pos:eol]) - first = next(unicodedata.iter_graphemes(tail)) - return pos + first.end + first = next(unicodedata.iter_graphemes(tail)) # type: ignore[attr-defined] + return pos + first.end # type: ignore[no-any-return] def max_column(self, y: int) -> int: """Return the last x-offset for line y"""