From f166c47da1981e31277445eb8ca49550a073472a Mon Sep 17 00:00:00 2001 From: Lee Dogeon Date: Sun, 11 Jan 2026 18:05:35 +0900 Subject: [PATCH 1/9] Parse JSON in Rust --- crates/stdlib/src/json.rs | 357 ++++++++++++++++++++++++++-- crates/stdlib/src/json/machinery.rs | 2 +- 2 files changed, 339 insertions(+), 20 deletions(-) diff --git a/crates/stdlib/src/json.rs b/crates/stdlib/src/json.rs index cc98ad912cc..d576e0190b1 100644 --- a/crates/stdlib/src/json.rs +++ b/crates/stdlib/src/json.rs @@ -16,6 +16,26 @@ mod _json { use malachite_bigint::BigInt; use rustpython_common::wtf8::Wtf8Buf; + /// Skip JSON whitespace characters (space, tab, newline, carriage return). + /// Works with a character iterator and returns the number of characters skipped. + #[inline] + fn skip_whitespace_chars(chars: &mut std::iter::Peekable) -> usize + where + I: Iterator, + { + let mut count = 0; + while let Some(&c) = chars.peek() { + match c { + ' ' | '\t' | '\n' | '\r' => { + chars.next(); + count += 1; + } + _ => break, + } + } + count + } + #[pyattr(name = "make_scanner")] #[pyclass(name = "Scanner", traverse)] #[derive(Debug, PyPayload)] @@ -90,27 +110,16 @@ mod _json { .map(|x| PyIterReturn::Return(x.to_pyobject(vm))); } '{' => { - // TODO: parse the object in rust - let parse_obj = self.ctx.get_attr("parse_object", vm)?; - let result = parse_obj.call( - ( - (pystr, next_idx), - self.strict, - scan_once, - self.object_hook.clone(), - self.object_pairs_hook.clone(), - ), - vm, - ); - return PyIterReturn::from_pyresult(result, vm); + // Parse object in Rust + return self + .parse_object(pystr, next_idx, &scan_once, vm) + .map(|(obj, end)| PyIterReturn::Return(vm.new_tuple((obj, end)).into())); } '[' => { - // TODO: parse the array in rust - let parse_array = self.ctx.get_attr("parse_array", vm)?; - return PyIterReturn::from_pyresult( - parse_array.call(((pystr, next_idx), scan_once), vm), - vm, - ); + // Parse array in Rust + return self + .parse_array(pystr, next_idx, &scan_once, vm) + .map(|(obj, end)| PyIterReturn::Return(vm.new_tuple((obj, end)).into())); } _ => {} } @@ -189,6 +198,316 @@ mod _json { }; Some((ret, buf.len())) } + + /// Parse a JSON object starting after the opening '{'. + /// Returns (parsed_object, end_character_index). + fn parse_object( + &self, + pystr: PyStrRef, + start_idx: usize, // Character index right after '{' + scan_once: &PyObjectRef, + vm: &VirtualMachine, + ) -> PyResult<(PyObjectRef, usize)> { + flame_guard!("JsonScanner::parse_object"); + + let s = pystr.as_str(); + let mut chars = s.chars().skip(start_idx).peekable(); + let mut idx = start_idx; + + // Skip initial whitespace + idx += skip_whitespace_chars(&mut chars); + + // Check for empty object + match chars.peek() { + Some('}') => { + return self.finalize_object(vec![], idx + 1, vm); + } + Some('"') => { + // Continue to parse first key + } + Some(_) | None => { + return Err(self.make_decode_error( + "Expecting property name enclosed in double quotes", + pystr, + idx, + vm, + )); + } + } + + let mut pairs: Vec<(PyObjectRef, PyObjectRef)> = Vec::new(); + + loop { + // We're now at '"', skip it + chars.next(); + idx += 1; + + // Parse key string using existing scanstring + let (key_wtf8, key_end) = machinery::scanstring(pystr.as_wtf8(), idx, self.strict) + .map_err(|e| py_decode_error(e, pystr.clone(), vm))?; + + let key_str = key_wtf8.to_string(); + let key: PyObjectRef = vm.ctx.new_str(key_str).into(); + + // Update position and rebuild iterator + idx = key_end; + chars = s.chars().skip(idx).peekable(); + + // Skip whitespace after key + idx += skip_whitespace_chars(&mut chars); + + // Expect ':' delimiter + match chars.peek() { + Some(':') => { + chars.next(); + idx += 1; + } + _ => { + return Err(self.make_decode_error( + "Expecting ':' delimiter", + pystr, + idx, + vm, + )); + } + } + + // Skip whitespace after ':' + idx += skip_whitespace_chars(&mut chars); + + // Parse value recursively using scan_once + let (value, value_end) = self.call_scan_once(scan_once, pystr.clone(), idx, vm)?; + + pairs.push((key, value)); + idx = value_end; + chars = s.chars().skip(idx).peekable(); + + // Skip whitespace after value + idx += skip_whitespace_chars(&mut chars); + + // Check for ',' or '}' + match chars.peek() { + Some('}') => { + idx += 1; + break; + } + Some(',') => { + let comma_idx = idx; + chars.next(); + idx += 1; + + // Skip whitespace after comma + idx += skip_whitespace_chars(&mut chars); + + // Next must be '"' + match chars.peek() { + Some('"') => { + // Continue to next key-value pair + } + Some('}') => { + // Trailing comma before end of object + return Err(self.make_decode_error( + "Illegal trailing comma before end of object", + pystr, + comma_idx, + vm, + )); + } + _ => { + return Err(self.make_decode_error( + "Expecting property name enclosed in double quotes", + pystr, + idx, + vm, + )); + } + } + } + _ => { + return Err(self.make_decode_error( + "Expecting ',' delimiter", + pystr, + idx, + vm, + )); + } + } + } + + self.finalize_object(pairs, idx, vm) + } + + /// Parse a JSON array starting after the opening '['. + /// Returns (parsed_array, end_character_index). + fn parse_array( + &self, + pystr: PyStrRef, + start_idx: usize, // Character index right after '[' + scan_once: &PyObjectRef, + vm: &VirtualMachine, + ) -> PyResult<(PyObjectRef, usize)> { + flame_guard!("JsonScanner::parse_array"); + + let s = pystr.as_str(); + let mut chars = s.chars().skip(start_idx).peekable(); + let mut idx = start_idx; + + // Skip initial whitespace + idx += skip_whitespace_chars(&mut chars); + + // Check for empty array + if chars.peek() == Some(&']') { + return Ok((vm.ctx.new_list(vec![]).into(), idx + 1)); + } + + let mut values: Vec = Vec::new(); + + loop { + // Parse value + let (value, value_end) = self.call_scan_once(scan_once, pystr.clone(), idx, vm)?; + + values.push(value); + idx = value_end; + chars = s.chars().skip(idx).peekable(); + + // Skip whitespace after value + idx += skip_whitespace_chars(&mut chars); + + match chars.peek() { + Some(']') => { + idx += 1; + break; + } + Some(',') => { + let comma_idx = idx; + chars.next(); + idx += 1; + // Skip whitespace after comma + idx += skip_whitespace_chars(&mut chars); + + // Check for trailing comma + if chars.peek() == Some(&']') { + return Err(self.make_decode_error( + "Illegal trailing comma before end of array", + pystr, + comma_idx, + vm, + )); + } + } + _ => { + return Err(self.make_decode_error( + "Expecting ',' delimiter", + pystr, + idx, + vm, + )); + } + } + } + + Ok((vm.ctx.new_list(values).into(), idx)) + } + + /// Finalize object construction with hooks. + fn finalize_object( + &self, + pairs: Vec<(PyObjectRef, PyObjectRef)>, + end_idx: usize, + vm: &VirtualMachine, + ) -> PyResult<(PyObjectRef, usize)> { + let result = if let Some(ref pairs_hook) = self.object_pairs_hook { + // object_pairs_hook takes priority - pass list of tuples + let pairs_list: Vec = pairs + .into_iter() + .map(|(k, v)| vm.new_tuple((k, v)).into()) + .collect(); + pairs_hook.call((vm.ctx.new_list(pairs_list),), vm)? + } else { + // Build a dict from pairs + let dict = vm.ctx.new_dict(); + for (key, value) in pairs { + dict.set_item(&*key, value, vm)?; + } + + // Apply object_hook if present + let dict_obj: PyObjectRef = dict.into(); + if let Some(ref hook) = self.object_hook { + hook.call((dict_obj,), vm)? + } else { + dict_obj + } + }; + + Ok((result, end_idx)) + } + + /// Call scan_once and handle the result. + fn call_scan_once( + &self, + scan_once: &PyObjectRef, + pystr: PyStrRef, + idx: usize, + vm: &VirtualMachine, + ) -> PyResult<(PyObjectRef, usize)> { + // First try to handle common cases directly in Rust + let s = pystr.as_str(); + let mut chars = s.chars().skip(idx).peekable(); + + match chars.peek() { + Some('"') => { + // String - parse directly in Rust + let (wtf8, end) = machinery::scanstring(pystr.as_wtf8(), idx + 1, self.strict) + .map_err(|e| py_decode_error(e, pystr.clone(), vm))?; + let py_str = vm.ctx.new_str(wtf8.to_string()); + return Ok((py_str.into(), end)); + } + Some('{') => { + // Nested object - parse recursively in Rust + return self.parse_object(pystr, idx + 1, scan_once, vm); + } + Some('[') => { + // Nested array - parse recursively in Rust + return self.parse_array(pystr, idx + 1, scan_once, vm); + } + _ => { + // For other cases (numbers, null, true, false, etc.) + // fall through to call scan_once + } + } + + // Fall back to scan_once for other value types + let result = scan_once.call((pystr.clone(), idx as isize), vm); + + match result { + Ok(tuple) => { + use crate::vm::builtins::PyTupleRef; + let tuple: PyTupleRef = tuple.try_into_value(vm)?; + if tuple.len() != 2 { + return Err(vm.new_value_error("scan_once must return 2-tuple")); + } + let value = tuple.as_slice()[0].clone(); + let end_idx: isize = tuple.as_slice()[1].try_to_value(vm)?; + Ok((value, end_idx as usize)) + } + Err(err) if err.fast_isinstance(vm.ctx.exceptions.stop_iteration) => { + Err(self.make_decode_error("Expecting value", pystr, idx, vm)) + } + Err(err) => Err(err), + } + } + + /// Create a decode error. + fn make_decode_error( + &self, + msg: &str, + s: PyStrRef, + pos: usize, + vm: &VirtualMachine, + ) -> PyBaseExceptionRef { + let err = machinery::DecodeError::new(msg, pos); + py_decode_error(err, s, vm) + } } impl Callable for JsonScanner { diff --git a/crates/stdlib/src/json/machinery.rs b/crates/stdlib/src/json/machinery.rs index 57b8ae441f7..de3c1d8547f 100644 --- a/crates/stdlib/src/json/machinery.rs +++ b/crates/stdlib/src/json/machinery.rs @@ -108,7 +108,7 @@ pub struct DecodeError { pub pos: usize, } impl DecodeError { - fn new(msg: impl Into, pos: usize) -> Self { + pub fn new(msg: impl Into, pos: usize) -> Self { let msg = msg.into(); Self { msg, pos } } From 14243d4646e58207324b8859889bc2d7f4cde810 Mon Sep 17 00:00:00 2001 From: Lee Dogeon Date: Wed, 14 Jan 2026 20:21:12 +0900 Subject: [PATCH 2/9] Reuse key when decoding JSON --- crates/stdlib/src/json.rs | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/crates/stdlib/src/json.rs b/crates/stdlib/src/json.rs index d576e0190b1..adf77775c72 100644 --- a/crates/stdlib/src/json.rs +++ b/crates/stdlib/src/json.rs @@ -15,6 +15,7 @@ mod _json { use core::str::FromStr; use malachite_bigint::BigInt; use rustpython_common::wtf8::Wtf8Buf; + use std::collections::HashMap; /// Skip JSON whitespace characters (space, tab, newline, carriage return). /// Works with a character iterator and returns the number of characters skipped. @@ -111,14 +112,16 @@ mod _json { } '{' => { // Parse object in Rust + let mut memo = HashMap::new(); return self - .parse_object(pystr, next_idx, &scan_once, vm) + .parse_object(pystr, next_idx, &scan_once, &mut memo, vm) .map(|(obj, end)| PyIterReturn::Return(vm.new_tuple((obj, end)).into())); } '[' => { // Parse array in Rust + let mut memo = HashMap::new(); return self - .parse_array(pystr, next_idx, &scan_once, vm) + .parse_array(pystr, next_idx, &scan_once, &mut memo, vm) .map(|(obj, end)| PyIterReturn::Return(vm.new_tuple((obj, end)).into())); } _ => {} @@ -206,6 +209,7 @@ mod _json { pystr: PyStrRef, start_idx: usize, // Character index right after '{' scan_once: &PyObjectRef, + memo: &mut HashMap, vm: &VirtualMachine, ) -> PyResult<(PyObjectRef, usize)> { flame_guard!("JsonScanner::parse_object"); @@ -246,8 +250,16 @@ mod _json { let (key_wtf8, key_end) = machinery::scanstring(pystr.as_wtf8(), idx, self.strict) .map_err(|e| py_decode_error(e, pystr.clone(), vm))?; + // Key memoization - reuse existing key strings let key_str = key_wtf8.to_string(); - let key: PyObjectRef = vm.ctx.new_str(key_str).into(); + let key: PyObjectRef = match memo.get(&key_str) { + Some(cached) => cached.clone().into(), + None => { + let py_key = vm.ctx.new_str(key_str.clone()); + memo.insert(key_str, py_key.clone()); + py_key.into() + } + }; // Update position and rebuild iterator idx = key_end; @@ -276,7 +288,8 @@ mod _json { idx += skip_whitespace_chars(&mut chars); // Parse value recursively using scan_once - let (value, value_end) = self.call_scan_once(scan_once, pystr.clone(), idx, vm)?; + let (value, value_end) = + self.call_scan_once(scan_once, pystr.clone(), idx, memo, vm)?; pairs.push((key, value)); idx = value_end; @@ -344,6 +357,7 @@ mod _json { pystr: PyStrRef, start_idx: usize, // Character index right after '[' scan_once: &PyObjectRef, + memo: &mut HashMap, vm: &VirtualMachine, ) -> PyResult<(PyObjectRef, usize)> { flame_guard!("JsonScanner::parse_array"); @@ -364,7 +378,8 @@ mod _json { loop { // Parse value - let (value, value_end) = self.call_scan_once(scan_once, pystr.clone(), idx, vm)?; + let (value, value_end) = + self.call_scan_once(scan_once, pystr.clone(), idx, memo, vm)?; values.push(value); idx = value_end; @@ -448,6 +463,7 @@ mod _json { scan_once: &PyObjectRef, pystr: PyStrRef, idx: usize, + memo: &mut HashMap, vm: &VirtualMachine, ) -> PyResult<(PyObjectRef, usize)> { // First try to handle common cases directly in Rust @@ -464,11 +480,11 @@ mod _json { } Some('{') => { // Nested object - parse recursively in Rust - return self.parse_object(pystr, idx + 1, scan_once, vm); + return self.parse_object(pystr, idx + 1, scan_once, memo, vm); } Some('[') => { // Nested array - parse recursively in Rust - return self.parse_array(pystr, idx + 1, scan_once, vm); + return self.parse_array(pystr, idx + 1, scan_once, memo, vm); } _ => { // For other cases (numbers, null, true, false, etc.) From 099c8a3128d948a3ef32a86e56715ac6783ec65f Mon Sep 17 00:00:00 2001 From: Lee Dogeon Date: Wed, 14 Jan 2026 02:01:17 +0900 Subject: [PATCH 3/9] Unmark resolved test --- Lib/test/test_json/test_decode.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/Lib/test/test_json/test_decode.py b/Lib/test/test_json/test_decode.py index f07f7d55339..ad37d47f083 100644 --- a/Lib/test/test_json/test_decode.py +++ b/Lib/test/test_json/test_decode.py @@ -138,8 +138,6 @@ def test_limit_int(self): class TestPyDecode(TestDecode, PyTest): pass class TestCDecode(TestDecode, CTest): - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_keys_reuse(self): return super().test_keys_reuse() From 788ecb3acabb21854453f76af08b7e594d07221d Mon Sep 17 00:00:00 2001 From: Lee Dogeon Date: Wed, 14 Jan 2026 20:31:40 +0900 Subject: [PATCH 4/9] Parse null/true/false directly in call_scan_once Parse JSON constants (null, true, false) directly in Rust within call_scan_once() instead of falling back to Python scan_once. This reduces Python-Rust boundary crossings for array/object values. Co-Authored-By: Claude Opus 4.5 --- crates/stdlib/src/json.rs | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/crates/stdlib/src/json.rs b/crates/stdlib/src/json.rs index adf77775c72..e4d1e6fc99c 100644 --- a/crates/stdlib/src/json.rs +++ b/crates/stdlib/src/json.rs @@ -470,6 +470,8 @@ mod _json { let s = pystr.as_str(); let mut chars = s.chars().skip(idx).peekable(); + let remaining = &s[idx..]; + match chars.peek() { Some('"') => { // String - parse directly in Rust @@ -486,8 +488,26 @@ mod _json { // Nested array - parse recursively in Rust return self.parse_array(pystr, idx + 1, scan_once, memo, vm); } + Some('n') => { + // null - parse directly in Rust + if remaining.starts_with("null") { + return Ok((vm.ctx.none(), idx + 4)); + } + } + Some('t') => { + // true - parse directly in Rust + if remaining.starts_with("true") { + return Ok((vm.ctx.new_bool(true).into(), idx + 4)); + } + } + Some('f') => { + // false - parse directly in Rust + if remaining.starts_with("false") { + return Ok((vm.ctx.new_bool(false).into(), idx + 5)); + } + } _ => { - // For other cases (numbers, null, true, false, etc.) + // For other cases (numbers, NaN, Infinity, etc.) // fall through to call scan_once } } From e79ab96f4636d89bdf88021f497b60e09141ddd1 Mon Sep 17 00:00:00 2001 From: Lee Dogeon Date: Wed, 14 Jan 2026 20:33:00 +0900 Subject: [PATCH 5/9] Parse numbers directly in call_scan_once Parse JSON numbers starting with digits (0-9) directly in Rust within call_scan_once() by reusing the existing parse_number() method. This reduces Python-Rust boundary crossings for array/object values. Co-Authored-By: Claude Opus 4.5 --- crates/stdlib/src/json.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/crates/stdlib/src/json.rs b/crates/stdlib/src/json.rs index e4d1e6fc99c..e83b8099e5d 100644 --- a/crates/stdlib/src/json.rs +++ b/crates/stdlib/src/json.rs @@ -506,8 +506,14 @@ mod _json { return Ok((vm.ctx.new_bool(false).into(), idx + 5)); } } + Some(c) if c.is_ascii_digit() => { + // Number starting with digit - parse directly in Rust + if let Some((result, len)) = self.parse_number(remaining, vm) { + return Ok((result?, idx + len)); + } + } _ => { - // For other cases (numbers, NaN, Infinity, etc.) + // For other cases (NaN, Infinity, -Infinity, negative numbers, etc.) // fall through to call scan_once } } From 529e3a61dfd26727405018da557209cf4a50ec0b Mon Sep 17 00:00:00 2001 From: Lee Dogeon Date: Wed, 14 Jan 2026 20:35:20 +0900 Subject: [PATCH 6/9] Parse NaN/Infinity/-Infinity in call_scan_once Parse special JSON constants (NaN, Infinity, -Infinity) and negative numbers directly in Rust within call_scan_once(). This handles: - 'N' -> NaN via parse_constant callback - 'I' -> Infinity via parse_constant callback - '-' -> -Infinity or negative numbers via parse_constant/parse_number This reduces Python-Rust boundary crossings for array/object values. Co-Authored-By: Claude Opus 4.5 --- crates/stdlib/src/json.rs | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/crates/stdlib/src/json.rs b/crates/stdlib/src/json.rs index e83b8099e5d..c3b5ee01c37 100644 --- a/crates/stdlib/src/json.rs +++ b/crates/stdlib/src/json.rs @@ -512,8 +512,32 @@ mod _json { return Ok((result?, idx + len)); } } + Some('N') => { + // NaN - parse directly in Rust + if remaining.starts_with("NaN") { + let result = self.parse_constant.call(("NaN",), vm)?; + return Ok((result, idx + 3)); + } + } + Some('I') => { + // Infinity - parse directly in Rust + if remaining.starts_with("Infinity") { + let result = self.parse_constant.call(("Infinity",), vm)?; + return Ok((result, idx + 8)); + } + } + Some('-') => { + // -Infinity or negative number + if remaining.starts_with("-Infinity") { + let result = self.parse_constant.call(("-Infinity",), vm)?; + return Ok((result, idx + 9)); + } + // Try parsing as negative number + if let Some((result, len)) = self.parse_number(remaining, vm) { + return Ok((result?, idx + len)); + } + } _ => { - // For other cases (NaN, Infinity, -Infinity, negative numbers, etc.) // fall through to call scan_once } } From 3adc21dcb3e16e9ed46cc023b2a40ce0da52aeea Mon Sep 17 00:00:00 2001 From: Lee Dogeon Date: Wed, 14 Jan 2026 22:48:29 +0900 Subject: [PATCH 7/9] Correct wrong index access --- crates/stdlib/src/json.rs | 197 ++++++++++++++++++++++++-------------- 1 file changed, 127 insertions(+), 70 deletions(-) diff --git a/crates/stdlib/src/json.rs b/crates/stdlib/src/json.rs index c3b5ee01c37..2e410d8df88 100644 --- a/crates/stdlib/src/json.rs +++ b/crates/stdlib/src/json.rs @@ -37,6 +37,22 @@ mod _json { count } + /// Check if a character iterator starts with a given pattern. + /// This avoids byte/char index mismatch issues with non-ASCII strings. + #[inline] + fn starts_with_chars(mut chars: I, pattern: &str) -> bool + where + I: Iterator, + { + for expected in pattern.chars() { + match chars.next() { + Some(c) if c == expected => continue, + _ => return false, + } + } + true + } + #[pyattr(name = "make_scanner")] #[pyclass(name = "Scanner", traverse)] #[derive(Debug, PyPayload)] @@ -202,6 +218,54 @@ mod _json { Some((ret, buf.len())) } + /// Parse a number from a character iterator. + /// Returns (result, character_count) where character_count is the number of chars consumed. + fn parse_number_from_chars( + &self, + chars: I, + vm: &VirtualMachine, + ) -> Option<(PyResult, usize)> + where + I: Iterator, + { + let mut buf = String::new(); + let mut has_neg = false; + let mut has_decimal = false; + let mut has_exponent = false; + let mut has_e_sign = false; + + for c in chars { + let i = buf.len(); + match c { + '-' if i == 0 => has_neg = true, + n if n.is_ascii_digit() => {} + '.' if !has_decimal => has_decimal = true, + 'e' | 'E' if !has_exponent => has_exponent = true, + '+' | '-' if !has_e_sign => has_e_sign = true, + _ => break, + } + buf.push(c); + } + + let len = buf.len(); + if len == 0 || (len == 1 && has_neg) { + return None; + } + + let ret = if has_decimal || has_exponent { + if let Some(ref parse_float) = self.parse_float { + parse_float.call((&buf,), vm) + } else { + Ok(vm.ctx.new_float(f64::from_str(&buf).unwrap()).into()) + } + } else if let Some(ref parse_int) = self.parse_int { + parse_int.call((&buf,), vm) + } else { + Ok(vm.new_pyobj(BigInt::from_str(&buf).unwrap())) + }; + Some((ret, len)) + } + /// Parse a JSON object starting after the opening '{'. /// Returns (parsed_object, end_character_index). fn parse_object( @@ -458,6 +522,7 @@ mod _json { } /// Call scan_once and handle the result. + /// Uses character iterators to avoid byte/char index mismatch with non-ASCII strings. fn call_scan_once( &self, scan_once: &PyObjectRef, @@ -466,100 +531,92 @@ mod _json { memo: &mut HashMap, vm: &VirtualMachine, ) -> PyResult<(PyObjectRef, usize)> { - // First try to handle common cases directly in Rust let s = pystr.as_str(); - let mut chars = s.chars().skip(idx).peekable(); + let chars = s.chars().skip(idx).peekable(); - let remaining = &s[idx..]; + let first_char = match chars.clone().next() { + Some(c) => c, + None => return Err(self.make_decode_error("Expecting value", pystr, idx, vm)), + }; - match chars.peek() { - Some('"') => { - // String - parse directly in Rust + match first_char { + '"' => { + // String let (wtf8, end) = machinery::scanstring(pystr.as_wtf8(), idx + 1, self.strict) .map_err(|e| py_decode_error(e, pystr.clone(), vm))?; let py_str = vm.ctx.new_str(wtf8.to_string()); - return Ok((py_str.into(), end)); + Ok((py_str.into(), end)) } - Some('{') => { - // Nested object - parse recursively in Rust - return self.parse_object(pystr, idx + 1, scan_once, memo, vm); + '{' => { + // Object + self.parse_object(pystr, idx + 1, scan_once, memo, vm) } - Some('[') => { - // Nested array - parse recursively in Rust - return self.parse_array(pystr, idx + 1, scan_once, memo, vm); + '[' => { + // Array + self.parse_array(pystr, idx + 1, scan_once, memo, vm) } - Some('n') => { - // null - parse directly in Rust - if remaining.starts_with("null") { - return Ok((vm.ctx.none(), idx + 4)); - } + 'n' if starts_with_chars(chars.clone(), "null") => { + // null + Ok((vm.ctx.none(), idx + 4)) } - Some('t') => { - // true - parse directly in Rust - if remaining.starts_with("true") { - return Ok((vm.ctx.new_bool(true).into(), idx + 4)); - } + 't' if starts_with_chars(chars.clone(), "true") => { + // true + Ok((vm.ctx.new_bool(true).into(), idx + 4)) } - Some('f') => { - // false - parse directly in Rust - if remaining.starts_with("false") { - return Ok((vm.ctx.new_bool(false).into(), idx + 5)); - } + 'f' if starts_with_chars(chars.clone(), "false") => { + // false + Ok((vm.ctx.new_bool(false).into(), idx + 5)) } - Some(c) if c.is_ascii_digit() => { - // Number starting with digit - parse directly in Rust - if let Some((result, len)) = self.parse_number(remaining, vm) { - return Ok((result?, idx + len)); - } + 'N' if starts_with_chars(chars.clone(), "NaN") => { + // NaN + let result = self.parse_constant.call(("NaN",), vm)?; + Ok((result, idx + 3)) } - Some('N') => { - // NaN - parse directly in Rust - if remaining.starts_with("NaN") { - let result = self.parse_constant.call(("NaN",), vm)?; - return Ok((result, idx + 3)); - } + 'I' if starts_with_chars(chars.clone(), "Infinity") => { + // Infinity + let result = self.parse_constant.call(("Infinity",), vm)?; + Ok((result, idx + 8)) } - Some('I') => { - // Infinity - parse directly in Rust - if remaining.starts_with("Infinity") { - let result = self.parse_constant.call(("Infinity",), vm)?; - return Ok((result, idx + 8)); - } - } - Some('-') => { + '-' => { // -Infinity or negative number - if remaining.starts_with("-Infinity") { + if starts_with_chars(chars.clone(), "-Infinity") { let result = self.parse_constant.call(("-Infinity",), vm)?; return Ok((result, idx + 9)); } - // Try parsing as negative number - if let Some((result, len)) = self.parse_number(remaining, vm) { + // Negative number - collect number characters + if let Some((result, len)) = self.parse_number_from_chars(chars, vm) { return Ok((result?, idx + len)); } + Err(self.make_decode_error("Expecting value", pystr, idx, vm)) } - _ => { - // fall through to call scan_once - } - } - - // Fall back to scan_once for other value types - let result = scan_once.call((pystr.clone(), idx as isize), vm); - - match result { - Ok(tuple) => { - use crate::vm::builtins::PyTupleRef; - let tuple: PyTupleRef = tuple.try_into_value(vm)?; - if tuple.len() != 2 { - return Err(vm.new_value_error("scan_once must return 2-tuple")); + c if c.is_ascii_digit() => { + // Positive number + if let Some((result, len)) = self.parse_number_from_chars(chars, vm) { + return Ok((result?, idx + len)); } - let value = tuple.as_slice()[0].clone(); - let end_idx: isize = tuple.as_slice()[1].try_to_value(vm)?; - Ok((value, end_idx as usize)) - } - Err(err) if err.fast_isinstance(vm.ctx.exceptions.stop_iteration) => { Err(self.make_decode_error("Expecting value", pystr, idx, vm)) } - Err(err) => Err(err), + _ => { + // Fall back to scan_once for unrecognized input + let result = scan_once.call((pystr.clone(), idx as isize), vm); + + match result { + Ok(tuple) => { + use crate::vm::builtins::PyTupleRef; + let tuple: PyTupleRef = tuple.try_into_value(vm)?; + if tuple.len() != 2 { + return Err(vm.new_value_error("scan_once must return 2-tuple")); + } + let value = tuple.as_slice()[0].clone(); + let end_idx: isize = tuple.as_slice()[1].try_to_value(vm)?; + Ok((value, end_idx as usize)) + } + Err(err) if err.fast_isinstance(vm.ctx.exceptions.stop_iteration) => { + Err(self.make_decode_error("Expecting value", pystr, idx, vm)) + } + Err(err) => Err(err), + } + } } } From e39c6b9adc7506ef0b978dc3778cd8488005200a Mon Sep 17 00:00:00 2001 From: Lee Dogeon Date: Thu, 15 Jan 2026 01:33:29 +0900 Subject: [PATCH 8/9] Leave more flame span --- crates/stdlib/src/json.rs | 2 + crates/stdlib/src/json/machinery.rs | 184 ++++++++++++++++++---------- 2 files changed, 118 insertions(+), 68 deletions(-) diff --git a/crates/stdlib/src/json.rs b/crates/stdlib/src/json.rs index 2e410d8df88..a07acc5bd29 100644 --- a/crates/stdlib/src/json.rs +++ b/crates/stdlib/src/json.rs @@ -24,6 +24,7 @@ mod _json { where I: Iterator, { + flame_guard!("_json::skip_whitespace_chars"); let mut count = 0; while let Some(&c) = chars.peek() { match c { @@ -228,6 +229,7 @@ mod _json { where I: Iterator, { + flame_guard!("JsonScanner::parse_number_from_chars"); let mut buf = String::new(); let mut has_neg = false; let mut has_decimal = false; diff --git a/crates/stdlib/src/json/machinery.rs b/crates/stdlib/src/json/machinery.rs index de3c1d8547f..9f379a962ac 100644 --- a/crates/stdlib/src/json/machinery.rs +++ b/crates/stdlib/src/json/machinery.rs @@ -30,6 +30,7 @@ use std::io; use itertools::Itertools; +use memchr::memchr2; use rustpython_common::wtf8::{CodePoint, Wtf8, Wtf8Buf}; static ESCAPE_CHARS: [&str; 0x20] = [ @@ -131,79 +132,125 @@ pub fn scanstring<'a>( end: usize, strict: bool, ) -> Result<(Wtf8Buf, usize), DecodeError> { - let mut chunks: Vec> = Vec::new(); - let mut output_len = 0usize; - let mut push_chunk = |chunk: StrOrChar<'a>| { - output_len += chunk.len(); - chunks.push(chunk); - }; + flame_guard!("machinery::scanstring"); let unterminated_err = || DecodeError::new("Unterminated string starting at", end - 1); - let mut chars = s.code_point_indices().enumerate().skip(end).peekable(); - let &(_, (mut chunk_start, _)) = chars.peek().ok_or_else(unterminated_err)?; - while let Some((char_i, (byte_i, c))) = chars.next() { - match c.to_char_lossy() { - '"' => { - push_chunk(StrOrChar::Str(&s[chunk_start..byte_i])); - let mut out = Wtf8Buf::with_capacity(output_len); - for x in chunks { - match x { - StrOrChar::Str(s) => out.push_wtf8(s), - StrOrChar::Char(c) => out.push(c), + + // Get byte index for character position `end` + let byte_start = { + flame_guard!("machinery::scanstring::byte_start_initialization"); + s.code_point_indices() + .nth(end) + .ok_or_else(unterminated_err)? + .0 + }; + + let bytes = s.as_bytes(); + let search_bytes = &bytes[byte_start..]; + + // Fast path: use memchr to find " or \ quickly + if let Some(pos) = { + flame_guard!("machinery::scanstring::memchr2"); + memchr2(b'"', b'\\', search_bytes) + } { + flame_guard!("machinery::scanstring::memchr2::condition_some"); + if search_bytes[pos] == b'"' { + flame_guard!("machinery::scanstring::memchr2::condition_some::condition_if"); + let content_bytes = &search_bytes[..pos]; + + // In strict mode, check for control characters (0x00-0x1F) + let has_control_char = strict && content_bytes.iter().any(|&b| b < 0x20); + + if !has_control_char { + flame_guard!("machinery::scanstring::fast_path"); + let result_slice = &s[byte_start..byte_start + pos]; + let char_count = result_slice.code_points().count(); + let mut out = Wtf8Buf::with_capacity(pos); + out.push_wtf8(result_slice); + return Ok((out, end + char_count + 1)); + } + } + } + + // Slow path: chunk-based parsing for strings with escapes or control chars + { + flame_guard!("machinery::scanstring::slow_path"); + let mut chunks: Vec> = Vec::new(); + let mut output_len = 0usize; + let mut push_chunk = |chunk: StrOrChar<'a>| { + output_len += chunk.len(); + chunks.push(chunk); + }; + let mut chars = s.code_point_indices().enumerate().skip(end).peekable(); + let &(_, (mut chunk_start, _)) = chars.peek().ok_or_else(unterminated_err)?; + while let Some((char_i, (byte_i, c))) = chars.next() { + match c.to_char_lossy() { + '"' => { + push_chunk(StrOrChar::Str(&s[chunk_start..byte_i])); + flame_guard!("machinery::scanstring::assemble_chunks"); + let mut out = Wtf8Buf::with_capacity(output_len); + for x in chunks { + match x { + StrOrChar::Str(s) => out.push_wtf8(s), + StrOrChar::Char(c) => out.push(c), + } } + return Ok((out, char_i + 1)); } - return Ok((out, char_i + 1)); - } - '\\' => { - push_chunk(StrOrChar::Str(&s[chunk_start..byte_i])); - let (_, (_, c)) = chars.next().ok_or_else(unterminated_err)?; - let esc = match c.to_char_lossy() { - '"' => "\"", - '\\' => "\\", - '/' => "/", - 'b' => "\x08", - 'f' => "\x0c", - 'n' => "\n", - 'r' => "\r", - 't' => "\t", - 'u' => { - let mut uni = decode_unicode(&mut chars, char_i)?; - chunk_start = byte_i + 6; - if let Some(lead) = uni.to_lead_surrogate() { - // uni is a surrogate -- try to find its pair - let mut chars2 = chars.clone(); - if let Some(((pos2, _), (_, _))) = chars2 - .next_tuple() - .filter(|((_, (_, c1)), (_, (_, c2)))| *c1 == '\\' && *c2 == 'u') - { - let uni2 = decode_unicode(&mut chars2, pos2)?; - if let Some(trail) = uni2.to_trail_surrogate() { - // ok, we found what we were looking for -- \uXXXX\uXXXX, both surrogates - uni = lead.merge(trail).into(); - chunk_start = pos2 + 6; - chars = chars2; + '\\' => { + push_chunk(StrOrChar::Str(&s[chunk_start..byte_i])); + let (_, (_, c)) = chars.next().ok_or_else(unterminated_err)?; + let esc = + match c.to_char_lossy() { + '"' => "\"", + '\\' => "\\", + '/' => "/", + 'b' => "\x08", + 'f' => "\x0c", + 'n' => "\n", + 'r' => "\r", + 't' => "\t", + 'u' => { + let mut uni = decode_unicode(&mut chars, char_i)?; + chunk_start = byte_i + 6; + if let Some(lead) = uni.to_lead_surrogate() { + // uni is a surrogate -- try to find its pair + let mut chars2 = chars.clone(); + if let Some(((pos2, _), (_, _))) = chars2.next_tuple().filter( + |((_, (_, c1)), (_, (_, c2)))| *c1 == '\\' && *c2 == 'u', + ) { + let uni2 = decode_unicode(&mut chars2, pos2)?; + if let Some(trail) = uni2.to_trail_surrogate() { + // ok, we found what we were looking for -- \uXXXX\uXXXX, both surrogates + uni = lead.merge(trail).into(); + chunk_start = pos2 + 6; + chars = chars2; + } + } } + push_chunk(StrOrChar::Char(uni)); + continue; } - } - push_chunk(StrOrChar::Char(uni)); - continue; - } - _ => { - return Err(DecodeError::new(format!("Invalid \\escape: {c:?}"), char_i)); - } - }; - chunk_start = byte_i + 2; - push_chunk(StrOrChar::Str(esc.as_ref())); - } - '\x00'..='\x1f' if strict => { - return Err(DecodeError::new( - format!("Invalid control character {c:?} at"), - char_i, - )); + _ => { + return Err(DecodeError::new( + format!("Invalid \\escape: {c:?}"), + char_i, + )); + } + }; + chunk_start = byte_i + 2; + push_chunk(StrOrChar::Str(esc.as_ref())); + } + '\x00'..='\x1f' if strict => { + return Err(DecodeError::new( + format!("Invalid control character {c:?} at"), + char_i, + )); + } + _ => {} } - _ => {} } + Err(unterminated_err()) } - Err(unterminated_err()) } #[inline] @@ -211,12 +258,13 @@ fn decode_unicode(it: &mut I, pos: usize) -> Result where I: Iterator, { + flame_guard!("machinery::decode_unicode"); let err = || DecodeError::new("Invalid \\uXXXX escape", pos); - let mut uni = 0; - for x in (0..4).rev() { + let mut uni = 0u16; + for _ in 0..4 { let (_, (_, c)) = it.next().ok_or_else(err)?; let d = c.to_char().and_then(|c| c.to_digit(16)).ok_or_else(err)? as u16; - uni += d * 16u16.pow(x); + uni = (uni << 4) | d; } Ok(uni.into()) } From 088fd8300fd7e6d0cdc2ac8ecea41d2221fbfa60 Mon Sep 17 00:00:00 2001 From: Lee Dogeon Date: Thu, 15 Jan 2026 02:29:11 +0900 Subject: [PATCH 9/9] Refactor json scanstring with byte index --- crates/stdlib/src/json.rs | 500 +++++++++++++++------------- crates/stdlib/src/json/machinery.rs | 204 ++++++------ 2 files changed, 364 insertions(+), 340 deletions(-) diff --git a/crates/stdlib/src/json.rs b/crates/stdlib/src/json.rs index a07acc5bd29..4bdf4533533 100644 --- a/crates/stdlib/src/json.rs +++ b/crates/stdlib/src/json.rs @@ -7,7 +7,7 @@ mod _json { use crate::vm::{ AsObject, Py, PyObjectRef, PyPayload, PyResult, VirtualMachine, builtins::{PyBaseExceptionRef, PyStrRef, PyType}, - convert::{ToPyObject, ToPyResult}, + convert::ToPyResult, function::{IntoFuncArgs, OptionalArg}, protocol::PyIterReturn, types::{Callable, Constructor}, @@ -18,40 +18,25 @@ mod _json { use std::collections::HashMap; /// Skip JSON whitespace characters (space, tab, newline, carriage return). - /// Works with a character iterator and returns the number of characters skipped. + /// Works with a byte slice and returns the number of bytes skipped. + /// Since all JSON whitespace chars are ASCII, bytes == chars. #[inline] - fn skip_whitespace_chars(chars: &mut std::iter::Peekable) -> usize - where - I: Iterator, - { - flame_guard!("_json::skip_whitespace_chars"); + fn skip_whitespace(bytes: &[u8]) -> usize { + flame_guard!("_json::skip_whitespace"); let mut count = 0; - while let Some(&c) = chars.peek() { - match c { - ' ' | '\t' | '\n' | '\r' => { - chars.next(); - count += 1; - } + for &b in bytes { + match b { + b' ' | b'\t' | b'\n' | b'\r' => count += 1, _ => break, } } count } - /// Check if a character iterator starts with a given pattern. - /// This avoids byte/char index mismatch issues with non-ASCII strings. + /// Check if a byte slice starts with a given ASCII pattern. #[inline] - fn starts_with_chars(mut chars: I, pattern: &str) -> bool - where - I: Iterator, - { - for expected in pattern.chars() { - match chars.next() { - Some(c) if c == expected => continue, - _ => return false, - } - } - true + fn starts_with_bytes(bytes: &[u8], pattern: &[u8]) -> bool { + bytes.len() >= pattern.len() && &bytes[..pattern.len()] == pattern } #[pyattr(name = "make_scanner")] @@ -106,49 +91,64 @@ mod _json { impl JsonScanner { fn parse( &self, - s: &str, pystr: PyStrRef, - idx: usize, + char_idx: usize, + byte_idx: usize, scan_once: PyObjectRef, vm: &VirtualMachine, ) -> PyResult { flame_guard!("JsonScanner::parse"); - let c = match s.chars().next() { - Some(c) => c, + let bytes = pystr.as_str().as_bytes(); + let wtf8 = pystr.as_wtf8(); + + let first_byte = match bytes.get(byte_idx) { + Some(&b) => b, None => { return Ok(PyIterReturn::StopIteration(Some( - vm.ctx.new_int(idx).into(), + vm.ctx.new_int(char_idx).into(), ))); } }; - let next_idx = idx + c.len_utf8(); - match c { - '"' => { - return scanstring(pystr, next_idx, OptionalArg::Present(self.strict), vm) - .map(|x| PyIterReturn::Return(x.to_pyobject(vm))); + + match first_byte { + b'"' => { + // Parse string - pass slice starting after the quote + let (wtf8_result, chars_consumed, _bytes_consumed) = + machinery::scanstring(&wtf8[byte_idx + 1..], char_idx + 1, self.strict) + .map_err(|e| py_decode_error(e, pystr.clone(), vm))?; + let end_char_idx = char_idx + 1 + chars_consumed; + return Ok(PyIterReturn::Return( + vm.new_tuple((wtf8_result, end_char_idx)).into(), + )); } - '{' => { + b'{' => { // Parse object in Rust let mut memo = HashMap::new(); return self - .parse_object(pystr, next_idx, &scan_once, &mut memo, vm) - .map(|(obj, end)| PyIterReturn::Return(vm.new_tuple((obj, end)).into())); + .parse_object(pystr, char_idx + 1, byte_idx + 1, &scan_once, &mut memo, vm) + .map(|(obj, end_char, _end_byte)| { + PyIterReturn::Return(vm.new_tuple((obj, end_char)).into()) + }); } - '[' => { + b'[' => { // Parse array in Rust let mut memo = HashMap::new(); return self - .parse_array(pystr, next_idx, &scan_once, &mut memo, vm) - .map(|(obj, end)| PyIterReturn::Return(vm.new_tuple((obj, end)).into())); + .parse_array(pystr, char_idx + 1, byte_idx + 1, &scan_once, &mut memo, vm) + .map(|(obj, end_char, _end_byte)| { + PyIterReturn::Return(vm.new_tuple((obj, end_char)).into()) + }); } _ => {} } + let s = &pystr.as_str()[byte_idx..]; + macro_rules! parse_const { ($s:literal, $val:expr) => { if s.starts_with($s) { return Ok(PyIterReturn::Return( - vm.new_tuple(($val, idx + $s.len())).into(), + vm.new_tuple(($val, char_idx + $s.len())).into(), )); } }; @@ -159,15 +159,20 @@ mod _json { parse_const!("false", false); if let Some((res, len)) = self.parse_number(s, vm) { - return Ok(PyIterReturn::Return(vm.new_tuple((res?, idx + len)).into())); + return Ok(PyIterReturn::Return( + vm.new_tuple((res?, char_idx + len)).into(), + )); } macro_rules! parse_constant { ($s:literal) => { if s.starts_with($s) { return Ok(PyIterReturn::Return( - vm.new_tuple((self.parse_constant.call(($s,), vm)?, idx + $s.len())) - .into(), + vm.new_tuple(( + self.parse_constant.call(($s,), vm)?, + char_idx + $s.len(), + )) + .into(), )); } }; @@ -178,7 +183,7 @@ mod _json { parse_constant!("-Infinity"); Ok(PyIterReturn::StopIteration(Some( - vm.ctx.new_int(idx).into(), + vm.ctx.new_int(char_idx).into(), ))) } @@ -219,87 +224,42 @@ mod _json { Some((ret, buf.len())) } - /// Parse a number from a character iterator. - /// Returns (result, character_count) where character_count is the number of chars consumed. - fn parse_number_from_chars( - &self, - chars: I, - vm: &VirtualMachine, - ) -> Option<(PyResult, usize)> - where - I: Iterator, - { - flame_guard!("JsonScanner::parse_number_from_chars"); - let mut buf = String::new(); - let mut has_neg = false; - let mut has_decimal = false; - let mut has_exponent = false; - let mut has_e_sign = false; - - for c in chars { - let i = buf.len(); - match c { - '-' if i == 0 => has_neg = true, - n if n.is_ascii_digit() => {} - '.' if !has_decimal => has_decimal = true, - 'e' | 'E' if !has_exponent => has_exponent = true, - '+' | '-' if !has_e_sign => has_e_sign = true, - _ => break, - } - buf.push(c); - } - - let len = buf.len(); - if len == 0 || (len == 1 && has_neg) { - return None; - } - - let ret = if has_decimal || has_exponent { - if let Some(ref parse_float) = self.parse_float { - parse_float.call((&buf,), vm) - } else { - Ok(vm.ctx.new_float(f64::from_str(&buf).unwrap()).into()) - } - } else if let Some(ref parse_int) = self.parse_int { - parse_int.call((&buf,), vm) - } else { - Ok(vm.new_pyobj(BigInt::from_str(&buf).unwrap())) - }; - Some((ret, len)) - } - /// Parse a JSON object starting after the opening '{'. - /// Returns (parsed_object, end_character_index). + /// Returns (parsed_object, end_char_index, end_byte_index). fn parse_object( &self, pystr: PyStrRef, - start_idx: usize, // Character index right after '{' + start_char_idx: usize, + start_byte_idx: usize, scan_once: &PyObjectRef, memo: &mut HashMap, vm: &VirtualMachine, - ) -> PyResult<(PyObjectRef, usize)> { + ) -> PyResult<(PyObjectRef, usize, usize)> { flame_guard!("JsonScanner::parse_object"); - let s = pystr.as_str(); - let mut chars = s.chars().skip(start_idx).peekable(); - let mut idx = start_idx; + let bytes = pystr.as_str().as_bytes(); + let wtf8 = pystr.as_wtf8(); + let mut char_idx = start_char_idx; + let mut byte_idx = start_byte_idx; // Skip initial whitespace - idx += skip_whitespace_chars(&mut chars); + let ws = skip_whitespace(&bytes[byte_idx..]); + char_idx += ws; + byte_idx += ws; // Check for empty object - match chars.peek() { - Some('}') => { - return self.finalize_object(vec![], idx + 1, vm); + match bytes.get(byte_idx) { + Some(b'}') => { + return self.finalize_object(vec![], char_idx + 1, byte_idx + 1, vm); } - Some('"') => { + Some(b'"') => { // Continue to parse first key } - Some(_) | None => { + _ => { return Err(self.make_decode_error( "Expecting property name enclosed in double quotes", pystr, - idx, + char_idx, vm, )); } @@ -309,12 +269,16 @@ mod _json { loop { // We're now at '"', skip it - chars.next(); - idx += 1; + char_idx += 1; + byte_idx += 1; + + // Parse key string using scanstring with byte slice + let (key_wtf8, chars_consumed, bytes_consumed) = + machinery::scanstring(&wtf8[byte_idx..], char_idx, self.strict) + .map_err(|e| py_decode_error(e, pystr.clone(), vm))?; - // Parse key string using existing scanstring - let (key_wtf8, key_end) = machinery::scanstring(pystr.as_wtf8(), idx, self.strict) - .map_err(|e| py_decode_error(e, pystr.clone(), vm))?; + char_idx += chars_consumed; + byte_idx += bytes_consumed; // Key memoization - reuse existing key strings let key_str = key_wtf8.to_string(); @@ -327,68 +291,73 @@ mod _json { } }; - // Update position and rebuild iterator - idx = key_end; - chars = s.chars().skip(idx).peekable(); - // Skip whitespace after key - idx += skip_whitespace_chars(&mut chars); + let ws = skip_whitespace(&bytes[byte_idx..]); + char_idx += ws; + byte_idx += ws; // Expect ':' delimiter - match chars.peek() { - Some(':') => { - chars.next(); - idx += 1; + match bytes.get(byte_idx) { + Some(b':') => { + char_idx += 1; + byte_idx += 1; } _ => { return Err(self.make_decode_error( "Expecting ':' delimiter", pystr, - idx, + char_idx, vm, )); } } // Skip whitespace after ':' - idx += skip_whitespace_chars(&mut chars); + let ws = skip_whitespace(&bytes[byte_idx..]); + char_idx += ws; + byte_idx += ws; - // Parse value recursively using scan_once - let (value, value_end) = - self.call_scan_once(scan_once, pystr.clone(), idx, memo, vm)?; + // Parse value recursively + let (value, value_char_end, value_byte_end) = + self.call_scan_once(scan_once, pystr.clone(), char_idx, byte_idx, memo, vm)?; pairs.push((key, value)); - idx = value_end; - chars = s.chars().skip(idx).peekable(); + char_idx = value_char_end; + byte_idx = value_byte_end; // Skip whitespace after value - idx += skip_whitespace_chars(&mut chars); + let ws = skip_whitespace(&bytes[byte_idx..]); + char_idx += ws; + byte_idx += ws; // Check for ',' or '}' - match chars.peek() { - Some('}') => { - idx += 1; + match bytes.get(byte_idx) { + Some(b'}') => { + char_idx += 1; + byte_idx += 1; break; } - Some(',') => { - let comma_idx = idx; - chars.next(); - idx += 1; + Some(b',') => { + let comma_char_idx = char_idx; + char_idx += 1; + byte_idx += 1; // Skip whitespace after comma - idx += skip_whitespace_chars(&mut chars); + let ws = skip_whitespace(&bytes[byte_idx..]); + char_idx += ws; + byte_idx += ws; // Next must be '"' - match chars.peek() { - Some('"') => { + match bytes.get(byte_idx) { + Some(b'"') => { // Continue to next key-value pair } - Some('}') => { + Some(b'}') => { // Trailing comma before end of object return Err(self.make_decode_error( "Illegal trailing comma before end of object", pystr, - comma_idx, + comma_char_idx, vm, )); } @@ -396,7 +365,7 @@ mod _json { return Err(self.make_decode_error( "Expecting property name enclosed in double quotes", pystr, - idx, + char_idx, vm, )); } @@ -406,72 +375,81 @@ mod _json { return Err(self.make_decode_error( "Expecting ',' delimiter", pystr, - idx, + char_idx, vm, )); } } } - self.finalize_object(pairs, idx, vm) + self.finalize_object(pairs, char_idx, byte_idx, vm) } /// Parse a JSON array starting after the opening '['. - /// Returns (parsed_array, end_character_index). + /// Returns (parsed_array, end_char_index, end_byte_index). fn parse_array( &self, pystr: PyStrRef, - start_idx: usize, // Character index right after '[' + start_char_idx: usize, + start_byte_idx: usize, scan_once: &PyObjectRef, memo: &mut HashMap, vm: &VirtualMachine, - ) -> PyResult<(PyObjectRef, usize)> { + ) -> PyResult<(PyObjectRef, usize, usize)> { flame_guard!("JsonScanner::parse_array"); - let s = pystr.as_str(); - let mut chars = s.chars().skip(start_idx).peekable(); - let mut idx = start_idx; + let bytes = pystr.as_str().as_bytes(); + let mut char_idx = start_char_idx; + let mut byte_idx = start_byte_idx; // Skip initial whitespace - idx += skip_whitespace_chars(&mut chars); + let ws = skip_whitespace(&bytes[byte_idx..]); + char_idx += ws; + byte_idx += ws; // Check for empty array - if chars.peek() == Some(&']') { - return Ok((vm.ctx.new_list(vec![]).into(), idx + 1)); + if bytes.get(byte_idx) == Some(&b']') { + return Ok((vm.ctx.new_list(vec![]).into(), char_idx + 1, byte_idx + 1)); } let mut values: Vec = Vec::new(); loop { // Parse value - let (value, value_end) = - self.call_scan_once(scan_once, pystr.clone(), idx, memo, vm)?; + let (value, value_char_end, value_byte_end) = + self.call_scan_once(scan_once, pystr.clone(), char_idx, byte_idx, memo, vm)?; values.push(value); - idx = value_end; - chars = s.chars().skip(idx).peekable(); + char_idx = value_char_end; + byte_idx = value_byte_end; // Skip whitespace after value - idx += skip_whitespace_chars(&mut chars); - - match chars.peek() { - Some(']') => { - idx += 1; + let ws = skip_whitespace(&bytes[byte_idx..]); + char_idx += ws; + byte_idx += ws; + + match bytes.get(byte_idx) { + Some(b']') => { + char_idx += 1; + byte_idx += 1; break; } - Some(',') => { - let comma_idx = idx; - chars.next(); - idx += 1; + Some(b',') => { + let comma_char_idx = char_idx; + char_idx += 1; + byte_idx += 1; + // Skip whitespace after comma - idx += skip_whitespace_chars(&mut chars); + let ws = skip_whitespace(&bytes[byte_idx..]); + char_idx += ws; + byte_idx += ws; // Check for trailing comma - if chars.peek() == Some(&']') { + if bytes.get(byte_idx) == Some(&b']') { return Err(self.make_decode_error( "Illegal trailing comma before end of array", pystr, - comma_idx, + comma_char_idx, vm, )); } @@ -480,23 +458,24 @@ mod _json { return Err(self.make_decode_error( "Expecting ',' delimiter", pystr, - idx, + char_idx, vm, )); } } } - Ok((vm.ctx.new_list(values).into(), idx)) + Ok((vm.ctx.new_list(values).into(), char_idx, byte_idx)) } /// Finalize object construction with hooks. fn finalize_object( &self, pairs: Vec<(PyObjectRef, PyObjectRef)>, - end_idx: usize, + end_char_idx: usize, + end_byte_idx: usize, vm: &VirtualMachine, - ) -> PyResult<(PyObjectRef, usize)> { + ) -> PyResult<(PyObjectRef, usize, usize)> { let result = if let Some(ref pairs_hook) = self.object_pairs_hook { // object_pairs_hook takes priority - pass list of tuples let pairs_list: Vec = pairs @@ -520,87 +499,95 @@ mod _json { } }; - Ok((result, end_idx)) + Ok((result, end_char_idx, end_byte_idx)) } /// Call scan_once and handle the result. - /// Uses character iterators to avoid byte/char index mismatch with non-ASCII strings. + /// Returns (value, end_char_idx, end_byte_idx). fn call_scan_once( &self, scan_once: &PyObjectRef, pystr: PyStrRef, - idx: usize, + char_idx: usize, + byte_idx: usize, memo: &mut HashMap, vm: &VirtualMachine, - ) -> PyResult<(PyObjectRef, usize)> { + ) -> PyResult<(PyObjectRef, usize, usize)> { let s = pystr.as_str(); - let chars = s.chars().skip(idx).peekable(); + let bytes = s.as_bytes(); + let wtf8 = pystr.as_wtf8(); - let first_char = match chars.clone().next() { - Some(c) => c, - None => return Err(self.make_decode_error("Expecting value", pystr, idx, vm)), + let first_byte = match bytes.get(byte_idx) { + Some(&b) => b, + None => return Err(self.make_decode_error("Expecting value", pystr, char_idx, vm)), }; - match first_char { - '"' => { - // String - let (wtf8, end) = machinery::scanstring(pystr.as_wtf8(), idx + 1, self.strict) - .map_err(|e| py_decode_error(e, pystr.clone(), vm))?; - let py_str = vm.ctx.new_str(wtf8.to_string()); - Ok((py_str.into(), end)) + match first_byte { + b'"' => { + // String - pass slice starting after the quote + let (wtf8_result, chars_consumed, bytes_consumed) = + machinery::scanstring(&wtf8[byte_idx + 1..], char_idx + 1, self.strict) + .map_err(|e| py_decode_error(e, pystr.clone(), vm))?; + let py_str = vm.ctx.new_str(wtf8_result.to_string()); + Ok(( + py_str.into(), + char_idx + 1 + chars_consumed, + byte_idx + 1 + bytes_consumed, + )) } - '{' => { + b'{' => { // Object - self.parse_object(pystr, idx + 1, scan_once, memo, vm) + self.parse_object(pystr, char_idx + 1, byte_idx + 1, scan_once, memo, vm) } - '[' => { + b'[' => { // Array - self.parse_array(pystr, idx + 1, scan_once, memo, vm) + self.parse_array(pystr, char_idx + 1, byte_idx + 1, scan_once, memo, vm) } - 'n' if starts_with_chars(chars.clone(), "null") => { + b'n' if starts_with_bytes(&bytes[byte_idx..], b"null") => { // null - Ok((vm.ctx.none(), idx + 4)) + Ok((vm.ctx.none(), char_idx + 4, byte_idx + 4)) } - 't' if starts_with_chars(chars.clone(), "true") => { + b't' if starts_with_bytes(&bytes[byte_idx..], b"true") => { // true - Ok((vm.ctx.new_bool(true).into(), idx + 4)) + Ok((vm.ctx.new_bool(true).into(), char_idx + 4, byte_idx + 4)) } - 'f' if starts_with_chars(chars.clone(), "false") => { + b'f' if starts_with_bytes(&bytes[byte_idx..], b"false") => { // false - Ok((vm.ctx.new_bool(false).into(), idx + 5)) + Ok((vm.ctx.new_bool(false).into(), char_idx + 5, byte_idx + 5)) } - 'N' if starts_with_chars(chars.clone(), "NaN") => { + b'N' if starts_with_bytes(&bytes[byte_idx..], b"NaN") => { // NaN let result = self.parse_constant.call(("NaN",), vm)?; - Ok((result, idx + 3)) + Ok((result, char_idx + 3, byte_idx + 3)) } - 'I' if starts_with_chars(chars.clone(), "Infinity") => { + b'I' if starts_with_bytes(&bytes[byte_idx..], b"Infinity") => { // Infinity let result = self.parse_constant.call(("Infinity",), vm)?; - Ok((result, idx + 8)) + Ok((result, char_idx + 8, byte_idx + 8)) } - '-' => { + b'-' => { // -Infinity or negative number - if starts_with_chars(chars.clone(), "-Infinity") { + if starts_with_bytes(&bytes[byte_idx..], b"-Infinity") { let result = self.parse_constant.call(("-Infinity",), vm)?; - return Ok((result, idx + 9)); + return Ok((result, char_idx + 9, byte_idx + 9)); } - // Negative number - collect number characters - if let Some((result, len)) = self.parse_number_from_chars(chars, vm) { - return Ok((result?, idx + len)); + // Negative number - numbers are ASCII so len == bytes + if let Some((result, len)) = self.parse_number(&s[byte_idx..], vm) { + return Ok((result?, char_idx + len, byte_idx + len)); } - Err(self.make_decode_error("Expecting value", pystr, idx, vm)) + Err(self.make_decode_error("Expecting value", pystr, char_idx, vm)) } - c if c.is_ascii_digit() => { - // Positive number - if let Some((result, len)) = self.parse_number_from_chars(chars, vm) { - return Ok((result?, idx + len)); + b'0'..=b'9' => { + // Positive number - numbers are ASCII so len == bytes + if let Some((result, len)) = self.parse_number(&s[byte_idx..], vm) { + return Ok((result?, char_idx + len, byte_idx + len)); } - Err(self.make_decode_error("Expecting value", pystr, idx, vm)) + Err(self.make_decode_error("Expecting value", pystr, char_idx, vm)) } _ => { // Fall back to scan_once for unrecognized input - let result = scan_once.call((pystr.clone(), idx as isize), vm); + // Note: This path requires char_idx for Python compatibility + let result = scan_once.call((pystr.clone(), char_idx as isize), vm); match result { Ok(tuple) => { @@ -610,11 +597,18 @@ mod _json { return Err(vm.new_value_error("scan_once must return 2-tuple")); } let value = tuple.as_slice()[0].clone(); - let end_idx: isize = tuple.as_slice()[1].try_to_value(vm)?; - Ok((value, end_idx as usize)) + let end_char_idx: isize = tuple.as_slice()[1].try_to_value(vm)?; + // For fallback, we need to calculate byte_idx from char_idx + // This is expensive but fallback should be rare + let end_byte_idx = s + .char_indices() + .nth(end_char_idx as usize) + .map(|(i, _)| i) + .unwrap_or(s.len()); + Ok((value, end_char_idx as usize, end_byte_idx)) } Err(err) if err.fast_isinstance(vm.ctx.exceptions.stop_iteration) => { - Err(self.make_decode_error("Expecting value", pystr, idx, vm)) + Err(self.make_decode_error("Expecting value", pystr, char_idx, vm)) } Err(err) => Err(err), } @@ -637,24 +631,35 @@ mod _json { impl Callable for JsonScanner { type Args = (PyStrRef, isize); - fn call(zelf: &Py, (pystr, idx): Self::Args, vm: &VirtualMachine) -> PyResult { - if idx < 0 { + fn call(zelf: &Py, (pystr, char_idx): Self::Args, vm: &VirtualMachine) -> PyResult { + if char_idx < 0 { return Err(vm.new_value_error("idx cannot be negative")); } - let idx = idx as usize; - let mut chars = pystr.as_str().chars(); - if idx > 0 && chars.nth(idx - 1).is_none() { - PyIterReturn::StopIteration(Some(vm.ctx.new_int(idx).into())).to_pyresult(vm) + let char_idx = char_idx as usize; + let s = pystr.as_str(); + + // Calculate byte index from char index (O(char_idx) but only at entry point) + let byte_idx = if char_idx == 0 { + 0 } else { - zelf.parse( - chars.as_str(), - pystr.clone(), - idx, - zelf.to_owned().into(), - vm, - ) - .and_then(|x| x.to_pyresult(vm)) - } + match s.char_indices().nth(char_idx) { + Some((byte_i, _)) => byte_i, + None => { + // char_idx is beyond the string length + return PyIterReturn::StopIteration(Some(vm.ctx.new_int(char_idx).into())) + .to_pyresult(vm); + } + } + }; + + zelf.parse( + pystr.clone(), + char_idx, + byte_idx, + zelf.to_owned().into(), + vm, + ) + .and_then(|x| x.to_pyresult(vm)) } } @@ -701,7 +706,28 @@ mod _json { vm: &VirtualMachine, ) -> PyResult<(Wtf8Buf, usize)> { flame_guard!("_json::scanstring"); - machinery::scanstring(s.as_wtf8(), end, strict.unwrap_or(true)) - .map_err(|e| py_decode_error(e, s, vm)) + let wtf8 = s.as_wtf8(); + + // Convert char index `end` to byte index + let byte_idx = if end == 0 { + 0 + } else { + wtf8.code_point_indices() + .nth(end) + .map(|(i, _)| i) + .ok_or_else(|| { + py_decode_error( + machinery::DecodeError::new("Unterminated string starting at", end - 1), + s.clone(), + vm, + ) + })? + }; + + let (result, chars_consumed, _bytes_consumed) = + machinery::scanstring(&wtf8[byte_idx..], end, strict.unwrap_or(true)) + .map_err(|e| py_decode_error(e, s, vm))?; + + Ok((result, end + chars_consumed)) } } diff --git a/crates/stdlib/src/json/machinery.rs b/crates/stdlib/src/json/machinery.rs index 9f379a962ac..f33a135ab20 100644 --- a/crates/stdlib/src/json/machinery.rs +++ b/crates/stdlib/src/json/machinery.rs @@ -127,130 +127,128 @@ impl StrOrChar<'_> { } } } +/// Scan a JSON string starting right after the opening quote. +/// +/// # Arguments +/// * `s` - The string slice starting at the first character after the opening `"` +/// * `char_offset` - The character index where this slice starts (for error messages) +/// * `strict` - Whether to reject control characters +/// +/// # Returns +/// * `Ok((result, chars_consumed, bytes_consumed))` - The decoded string and how much was consumed +/// * `Err(DecodeError)` - If the string is malformed pub fn scanstring<'a>( s: &'a Wtf8, - end: usize, + char_offset: usize, strict: bool, -) -> Result<(Wtf8Buf, usize), DecodeError> { +) -> Result<(Wtf8Buf, usize, usize), DecodeError> { flame_guard!("machinery::scanstring"); - let unterminated_err = || DecodeError::new("Unterminated string starting at", end - 1); - - // Get byte index for character position `end` - let byte_start = { - flame_guard!("machinery::scanstring::byte_start_initialization"); - s.code_point_indices() - .nth(end) - .ok_or_else(unterminated_err)? - .0 - }; + let unterminated_err = || DecodeError::new("Unterminated string starting at", char_offset - 1); let bytes = s.as_bytes(); - let search_bytes = &bytes[byte_start..]; // Fast path: use memchr to find " or \ quickly - if let Some(pos) = { - flame_guard!("machinery::scanstring::memchr2"); - memchr2(b'"', b'\\', search_bytes) - } { - flame_guard!("machinery::scanstring::memchr2::condition_some"); - if search_bytes[pos] == b'"' { - flame_guard!("machinery::scanstring::memchr2::condition_some::condition_if"); - let content_bytes = &search_bytes[..pos]; + if let Some(pos) = memchr2(b'"', b'\\', bytes) + && bytes[pos] == b'"' + { + let content_bytes = &bytes[..pos]; - // In strict mode, check for control characters (0x00-0x1F) - let has_control_char = strict && content_bytes.iter().any(|&b| b < 0x20); + // In strict mode, check for control characters (0x00-0x1F) + let has_control_char = strict && content_bytes.iter().any(|&b| b < 0x20); - if !has_control_char { - flame_guard!("machinery::scanstring::fast_path"); - let result_slice = &s[byte_start..byte_start + pos]; - let char_count = result_slice.code_points().count(); - let mut out = Wtf8Buf::with_capacity(pos); - out.push_wtf8(result_slice); - return Ok((out, end + char_count + 1)); - } + if !has_control_char { + flame_guard!("machinery::scanstring::fast_path"); + let result_slice = &s[..pos]; + let char_count = result_slice.code_points().count(); + let mut out = Wtf8Buf::with_capacity(pos); + out.push_wtf8(result_slice); + // +1 for the closing quote + return Ok((out, char_count + 1, pos + 1)); } } // Slow path: chunk-based parsing for strings with escapes or control chars - { - flame_guard!("machinery::scanstring::slow_path"); - let mut chunks: Vec> = Vec::new(); - let mut output_len = 0usize; - let mut push_chunk = |chunk: StrOrChar<'a>| { - output_len += chunk.len(); - chunks.push(chunk); - }; - let mut chars = s.code_point_indices().enumerate().skip(end).peekable(); - let &(_, (mut chunk_start, _)) = chars.peek().ok_or_else(unterminated_err)?; - while let Some((char_i, (byte_i, c))) = chars.next() { - match c.to_char_lossy() { - '"' => { - push_chunk(StrOrChar::Str(&s[chunk_start..byte_i])); - flame_guard!("machinery::scanstring::assemble_chunks"); - let mut out = Wtf8Buf::with_capacity(output_len); - for x in chunks { - match x { - StrOrChar::Str(s) => out.push_wtf8(s), - StrOrChar::Char(c) => out.push(c), - } + flame_guard!("machinery::scanstring::slow_path"); + let mut chunks: Vec> = Vec::new(); + let mut output_len = 0usize; + let mut push_chunk = |chunk: StrOrChar<'a>| { + output_len += chunk.len(); + chunks.push(chunk); + }; + + let mut chars = s.code_point_indices().enumerate().peekable(); + let mut chunk_start: usize = 0; + + while let Some((char_i, (byte_i, c))) = chars.next() { + match c.to_char_lossy() { + '"' => { + push_chunk(StrOrChar::Str(&s[chunk_start..byte_i])); + flame_guard!("machinery::scanstring::assemble_chunks"); + let mut out = Wtf8Buf::with_capacity(output_len); + for x in chunks { + match x { + StrOrChar::Str(s) => out.push_wtf8(s), + StrOrChar::Char(c) => out.push(c), } - return Ok((out, char_i + 1)); } - '\\' => { - push_chunk(StrOrChar::Str(&s[chunk_start..byte_i])); - let (_, (_, c)) = chars.next().ok_or_else(unterminated_err)?; - let esc = - match c.to_char_lossy() { - '"' => "\"", - '\\' => "\\", - '/' => "/", - 'b' => "\x08", - 'f' => "\x0c", - 'n' => "\n", - 'r' => "\r", - 't' => "\t", - 'u' => { - let mut uni = decode_unicode(&mut chars, char_i)?; - chunk_start = byte_i + 6; - if let Some(lead) = uni.to_lead_surrogate() { - // uni is a surrogate -- try to find its pair - let mut chars2 = chars.clone(); - if let Some(((pos2, _), (_, _))) = chars2.next_tuple().filter( - |((_, (_, c1)), (_, (_, c2)))| *c1 == '\\' && *c2 == 'u', - ) { - let uni2 = decode_unicode(&mut chars2, pos2)?; - if let Some(trail) = uni2.to_trail_surrogate() { - // ok, we found what we were looking for -- \uXXXX\uXXXX, both surrogates - uni = lead.merge(trail).into(); - chunk_start = pos2 + 6; - chars = chars2; - } - } + // +1 for the closing quote + return Ok((out, char_i + 1, byte_i + 1)); + } + '\\' => { + push_chunk(StrOrChar::Str(&s[chunk_start..byte_i])); + let (next_char_i, (_, c)) = chars.next().ok_or_else(unterminated_err)?; + let esc = match c.to_char_lossy() { + '"' => "\"", + '\\' => "\\", + '/' => "/", + 'b' => "\x08", + 'f' => "\x0c", + 'n' => "\n", + 'r' => "\r", + 't' => "\t", + 'u' => { + let mut uni = decode_unicode(&mut chars, char_offset + char_i)?; + chunk_start = byte_i + 6; + if let Some(lead) = uni.to_lead_surrogate() { + // uni is a surrogate -- try to find its pair + let mut chars2 = chars.clone(); + if let Some(((_, (byte_pos2, _)), (_, _))) = chars2 + .next_tuple() + .filter(|((_, (_, c1)), (_, (_, c2)))| *c1 == '\\' && *c2 == 'u') + { + let uni2 = + decode_unicode(&mut chars2, char_offset + next_char_i + 1)?; + if let Some(trail) = uni2.to_trail_surrogate() { + // ok, we found what we were looking for -- \uXXXX\uXXXX, both surrogates + uni = lead.merge(trail).into(); + chunk_start = byte_pos2 + 6; + chars = chars2; } - push_chunk(StrOrChar::Char(uni)); - continue; - } - _ => { - return Err(DecodeError::new( - format!("Invalid \\escape: {c:?}"), - char_i, - )); } - }; - chunk_start = byte_i + 2; - push_chunk(StrOrChar::Str(esc.as_ref())); - } - '\x00'..='\x1f' if strict => { - return Err(DecodeError::new( - format!("Invalid control character {c:?} at"), - char_i, - )); - } - _ => {} + } + push_chunk(StrOrChar::Char(uni)); + continue; + } + _ => { + return Err(DecodeError::new( + format!("Invalid \\escape: {c:?}"), + char_offset + char_i, + )); + } + }; + chunk_start = byte_i + 2; + push_chunk(StrOrChar::Str(esc.as_ref())); + } + '\x00'..='\x1f' if strict => { + return Err(DecodeError::new( + format!("Invalid control character {c:?} at"), + char_offset + char_i, + )); } + _ => {} } - Err(unterminated_err()) } + Err(unterminated_err()) } #[inline]