diff --git a/Lib/test/test_json/test_decode.py b/Lib/test/test_json/test_decode.py
index f07f7d5533..ad37d47f08 100644
--- a/Lib/test/test_json/test_decode.py
+++ b/Lib/test/test_json/test_decode.py
@@ -138,8 +138,6 @@ def test_limit_int(self):
 class TestPyDecode(TestDecode, PyTest): pass
 
 class TestCDecode(TestDecode, CTest):
-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
     def test_keys_reuse(self):
         return super().test_keys_reuse()
 
diff --git a/crates/stdlib/src/json.rs b/crates/stdlib/src/json.rs
index cc98ad912c..4bdf453353 100644
--- a/crates/stdlib/src/json.rs
+++ b/crates/stdlib/src/json.rs
@@ -7,7 +7,7 @@ mod _json {
     use crate::vm::{
         AsObject, Py, PyObjectRef, PyPayload, PyResult, VirtualMachine,
         builtins::{PyBaseExceptionRef, PyStrRef, PyType},
-        convert::{ToPyObject, ToPyResult},
+        convert::ToPyResult,
         function::{IntoFuncArgs, OptionalArg},
         protocol::PyIterReturn,
         types::{Callable, Constructor},
@@ -15,6 +15,29 @@ mod _json {
     use core::str::FromStr;
     use malachite_bigint::BigInt;
     use rustpython_common::wtf8::Wtf8Buf;
+    use std::collections::HashMap;
+
+    /// Skip JSON whitespace characters (space, tab, newline, carriage return).
+    /// Works with a byte slice and returns the number of bytes skipped.
+    /// Since all JSON whitespace chars are ASCII, bytes == chars.
+    #[inline]
+    fn skip_whitespace(bytes: &[u8]) -> usize {
+        flame_guard!("_json::skip_whitespace");
+        let mut count = 0;
+        for &b in bytes {
+            match b {
+                b' ' | b'\t' | b'\n' | b'\r' => count += 1,
+                _ => break,
+            }
+        }
+        count
+    }
+
+    /// Check if a byte slice starts with a given ASCII pattern.
+    #[inline]
+    fn starts_with_bytes(bytes: &[u8], pattern: &[u8]) -> bool {
+        bytes.len() >= pattern.len() && &bytes[..pattern.len()] == pattern
+    }
 
     #[pyattr(name = "make_scanner")]
     #[pyclass(name = "Scanner", traverse)]
@@ -68,58 +91,64 @@ mod _json {
     impl JsonScanner {
         fn parse(
             &self,
-            s: &str,
             pystr: PyStrRef,
-            idx: usize,
+            char_idx: usize,
+            byte_idx: usize,
             scan_once: PyObjectRef,
             vm: &VirtualMachine,
         ) -> PyResult<PyIterReturn> {
             flame_guard!("JsonScanner::parse");
-            let c = match s.chars().next() {
-                Some(c) => c,
+            let bytes = pystr.as_str().as_bytes();
+            let wtf8 = pystr.as_wtf8();
+
+            let first_byte = match bytes.get(byte_idx) {
+                Some(&b) => b,
                 None => {
                     return Ok(PyIterReturn::StopIteration(Some(
-                        vm.ctx.new_int(idx).into(),
+                        vm.ctx.new_int(char_idx).into(),
                     )));
                 }
             };
-            let next_idx = idx + c.len_utf8();
-            match c {
-                '"' => {
-                    return scanstring(pystr, next_idx, OptionalArg::Present(self.strict), vm)
-                        .map(|x| PyIterReturn::Return(x.to_pyobject(vm)));
+
+            match first_byte {
+                b'"' => {
+                    // Parse string - pass slice starting after the quote
+                    let (wtf8_result, chars_consumed, _bytes_consumed) =
+                        machinery::scanstring(&wtf8[byte_idx + 1..], char_idx + 1, self.strict)
+                            .map_err(|e| py_decode_error(e, pystr.clone(), vm))?;
+                    let end_char_idx = char_idx + 1 + chars_consumed;
+                    return Ok(PyIterReturn::Return(
+                        vm.new_tuple((wtf8_result, end_char_idx)).into(),
+                    ));
                 }
-                '{' => {
-                    // TODO: parse the object in rust
-                    let parse_obj = self.ctx.get_attr("parse_object", vm)?;
-                    let result = parse_obj.call(
-                        (
-                            (pystr, next_idx),
-                            self.strict,
-                            scan_once,
-                            self.object_hook.clone(),
-                            self.object_pairs_hook.clone(),
-                        ),
-                        vm,
-                    );
-                    return PyIterReturn::from_pyresult(result, vm);
+                b'{' => {
+                    // Parse object in Rust
+                    let mut memo = HashMap::new();
+                    return self
+                        .parse_object(pystr, char_idx + 1, byte_idx + 1, &scan_once, &mut memo, vm)
+                        .map(|(obj, end_char, _end_byte)| {
+                            PyIterReturn::Return(vm.new_tuple((obj, end_char)).into())
+                        });
                 }
-                '[' => {
-                    // TODO: parse the array in rust
-                    let parse_array = self.ctx.get_attr("parse_array", vm)?;
-                    return PyIterReturn::from_pyresult(
-                        parse_array.call(((pystr, next_idx), scan_once), vm),
-                        vm,
-                    );
+                b'[' => {
+                    // Parse array in Rust
+                    let mut memo = HashMap::new();
+                    return self
+                        .parse_array(pystr, char_idx + 1, byte_idx + 1, &scan_once, &mut memo, vm)
+                        .map(|(obj, end_char, _end_byte)| {
+                            PyIterReturn::Return(vm.new_tuple((obj, end_char)).into())
+                        });
                 }
                 _ => {}
             }
 
+            let s = &pystr.as_str()[byte_idx..];
+
             macro_rules! parse_const {
                 ($s:literal, $val:expr) => {
                     if s.starts_with($s) {
                         return Ok(PyIterReturn::Return(
-                            vm.new_tuple(($val, idx + $s.len())).into(),
+                            vm.new_tuple(($val, char_idx + $s.len())).into(),
                         ));
                     }
                 };
@@ -130,15 +159,20 @@ mod _json {
             parse_const!("false", false);
 
             if let Some((res, len)) = self.parse_number(s, vm) {
-                return Ok(PyIterReturn::Return(vm.new_tuple((res?, idx + len)).into()));
+                return Ok(PyIterReturn::Return(
+                    vm.new_tuple((res?, char_idx + len)).into(),
+                ));
             }
 
             macro_rules! parse_constant {
                 ($s:literal) => {
                     if s.starts_with($s) {
                         return Ok(PyIterReturn::Return(
-                            vm.new_tuple((self.parse_constant.call(($s,), vm)?, idx + $s.len()))
-                                .into(),
+                            vm.new_tuple((
+                                self.parse_constant.call(($s,), vm)?,
+                                char_idx + $s.len(),
+                            ))
+                            .into(),
                         ));
                     }
                 };
@@ -149,7 +183,7 @@ mod _json {
             parse_constant!("-Infinity");
 
             Ok(PyIterReturn::StopIteration(Some(
-                vm.ctx.new_int(idx).into(),
+                vm.ctx.new_int(char_idx).into(),
             )))
         }
 
@@ -189,28 +223,443 @@ mod _json {
             };
             Some((ret, buf.len()))
         }
+
+        /// Parse a JSON object starting after the opening '{'.
+        /// Returns (parsed_object, end_char_index, end_byte_index).
+        fn parse_object(
+            &self,
+            pystr: PyStrRef,
+            start_char_idx: usize,
+            start_byte_idx: usize,
+            scan_once: &PyObjectRef,
+            memo: &mut HashMap<String, PyStrRef>,
+            vm: &VirtualMachine,
+        ) -> PyResult<(PyObjectRef, usize, usize)> {
+            flame_guard!("JsonScanner::parse_object");
+
+            let bytes = pystr.as_str().as_bytes();
+            let wtf8 = pystr.as_wtf8();
+            let mut char_idx = start_char_idx;
+            let mut byte_idx = start_byte_idx;
+
+            // Skip initial whitespace
+            let ws = skip_whitespace(&bytes[byte_idx..]);
+            char_idx += ws;
+            byte_idx += ws;
+
+            // Check for empty object
+            match bytes.get(byte_idx) {
+                Some(b'}') => {
+                    return self.finalize_object(vec![], char_idx + 1, byte_idx + 1, vm);
+                }
+                Some(b'"') => {
+                    // Continue to parse first key
+                }
+                _ => {
+                    return Err(self.make_decode_error(
+                        "Expecting property name enclosed in double quotes",
+                        pystr,
+                        char_idx,
+                        vm,
+                    ));
+                }
+            }
+
+            let mut pairs: Vec<(PyObjectRef, PyObjectRef)> = Vec::new();
+
+            loop {
+                // We're now at '"', skip it
+                char_idx += 1;
+                byte_idx += 1;
+
+                // Parse key string using scanstring with byte slice
+                let (key_wtf8, chars_consumed, bytes_consumed) =
+                    machinery::scanstring(&wtf8[byte_idx..], char_idx, self.strict)
+                        .map_err(|e| py_decode_error(e, pystr.clone(), vm))?;
+
+                char_idx += chars_consumed;
+                byte_idx += bytes_consumed;
+
+                // Key memoization - reuse existing key strings
+                let key_str = key_wtf8.to_string();
+                let key: PyObjectRef = match memo.get(&key_str) {
+                    Some(cached) => cached.clone().into(),
+                    None => {
+                        let py_key = vm.ctx.new_str(key_str.clone());
+                        memo.insert(key_str, py_key.clone());
+                        py_key.into()
+                    }
+                };
+
+                // Skip whitespace after key
+                let ws = skip_whitespace(&bytes[byte_idx..]);
+                char_idx += ws;
+                byte_idx += ws;
+
+                // Expect ':' delimiter
+                match bytes.get(byte_idx) {
+                    Some(b':') => {
+                        char_idx += 1;
+                        byte_idx += 1;
+                    }
+                    _ => {
+                        return Err(self.make_decode_error(
+                            "Expecting ':' delimiter",
+                            pystr,
+                            char_idx,
+                            vm,
+                        ));
+                    }
+                }
+
+                // Skip whitespace after ':'
+                let ws = skip_whitespace(&bytes[byte_idx..]);
+                char_idx += ws;
+                byte_idx += ws;
+
+                // Parse value recursively
+                let (value, value_char_end, value_byte_end) =
+                    self.call_scan_once(scan_once, pystr.clone(), char_idx, byte_idx, memo, vm)?;
+
+                pairs.push((key, value));
+                char_idx = value_char_end;
+                byte_idx = value_byte_end;
+
+                // Skip whitespace after value
+                let ws = skip_whitespace(&bytes[byte_idx..]);
+                char_idx += ws;
+                byte_idx += ws;
+
+                // Check for ',' or '}'
+                match bytes.get(byte_idx) {
+                    Some(b'}') => {
+                        char_idx += 1;
+                        byte_idx += 1;
+                        break;
+                    }
+                    Some(b',') => {
+                        let comma_char_idx = char_idx;
+                        char_idx += 1;
+                        byte_idx += 1;
+
+                        // Skip whitespace after comma
+                        let ws = skip_whitespace(&bytes[byte_idx..]);
+                        char_idx += ws;
+                        byte_idx += ws;
+
+                        // Next must be '"'
+                        match bytes.get(byte_idx) {
+                            Some(b'"') => {
+                                // Continue to next key-value pair
+                            }
+                            Some(b'}') => {
+                                // Trailing comma before end of object
+                                return Err(self.make_decode_error(
+                                    "Illegal trailing comma before end of object",
+                                    pystr,
+                                    comma_char_idx,
+                                    vm,
+                                ));
+                            }
+                            _ => {
+                                return Err(self.make_decode_error(
+                                    "Expecting property name enclosed in double quotes",
+                                    pystr,
+                                    char_idx,
+                                    vm,
+                                ));
+                            }
+                        }
+                    }
+                    _ => {
+                        return Err(self.make_decode_error(
+                            "Expecting ',' delimiter",
+                            pystr,
+                            char_idx,
+                            vm,
+                        ));
+                    }
+                }
+            }
+
+            self.finalize_object(pairs, char_idx, byte_idx, vm)
+        }
+
+        /// Parse a JSON array starting after the opening '['.
+        /// Returns (parsed_array, end_char_index, end_byte_index).
+        fn parse_array(
+            &self,
+            pystr: PyStrRef,
+            start_char_idx: usize,
+            start_byte_idx: usize,
+            scan_once: &PyObjectRef,
+            memo: &mut HashMap<String, PyStrRef>,
+            vm: &VirtualMachine,
+        ) -> PyResult<(PyObjectRef, usize, usize)> {
+            flame_guard!("JsonScanner::parse_array");
+
+            let bytes = pystr.as_str().as_bytes();
+            let mut char_idx = start_char_idx;
+            let mut byte_idx = start_byte_idx;
+
+            // Skip initial whitespace
+            let ws = skip_whitespace(&bytes[byte_idx..]);
+            char_idx += ws;
+            byte_idx += ws;
+
+            // Check for empty array
+            if bytes.get(byte_idx) == Some(&b']') {
+                return Ok((vm.ctx.new_list(vec![]).into(), char_idx + 1, byte_idx + 1));
+            }
+
+            let mut values: Vec<PyObjectRef> = Vec::new();
+
+            loop {
+                // Parse value
+                let (value, value_char_end, value_byte_end) =
+                    self.call_scan_once(scan_once, pystr.clone(), char_idx, byte_idx, memo, vm)?;
+
+                values.push(value);
+                char_idx = value_char_end;
+                byte_idx = value_byte_end;
+
+                // Skip whitespace after value
+                let ws = skip_whitespace(&bytes[byte_idx..]);
+                char_idx += ws;
+                byte_idx += ws;
+
+                match bytes.get(byte_idx) {
+                    Some(b']') => {
+                        char_idx += 1;
+                        byte_idx += 1;
+                        break;
+                    }
+                    Some(b',') => {
+                        let comma_char_idx = char_idx;
+                        char_idx += 1;
+                        byte_idx += 1;
+
+                        // Skip whitespace after comma
+                        let ws = skip_whitespace(&bytes[byte_idx..]);
+                        char_idx += ws;
+                        byte_idx += ws;
+
+                        // Check for trailing comma
+                        if bytes.get(byte_idx) == Some(&b']') {
+                            return Err(self.make_decode_error(
+                                "Illegal trailing comma before end of array",
+                                pystr,
+                                comma_char_idx,
+                                vm,
+                            ));
+                        }
+                    }
+                    _ => {
+                        return Err(self.make_decode_error(
+                            "Expecting ',' delimiter",
+                            pystr,
+                            char_idx,
+                            vm,
+                        ));
+                    }
+                }
+            }
+
+            Ok((vm.ctx.new_list(values).into(), char_idx, byte_idx))
+        }
+
+        /// Finalize object construction with hooks.
+        fn finalize_object(
+            &self,
+            pairs: Vec<(PyObjectRef, PyObjectRef)>,
+            end_char_idx: usize,
+            end_byte_idx: usize,
+            vm: &VirtualMachine,
+        ) -> PyResult<(PyObjectRef, usize, usize)> {
+            let result = if let Some(ref pairs_hook) = self.object_pairs_hook {
+                // object_pairs_hook takes priority - pass list of tuples
+                let pairs_list: Vec<PyObjectRef> = pairs
+                    .into_iter()
+                    .map(|(k, v)| vm.new_tuple((k, v)).into())
+                    .collect();
+                pairs_hook.call((vm.ctx.new_list(pairs_list),), vm)?
+            } else {
+                // Build a dict from pairs
+                let dict = vm.ctx.new_dict();
+                for (key, value) in pairs {
+                    dict.set_item(&*key, value, vm)?;
+                }
+
+                // Apply object_hook if present
+                let dict_obj: PyObjectRef = dict.into();
+                if let Some(ref hook) = self.object_hook {
+                    hook.call((dict_obj,), vm)?
+                } else {
+                    dict_obj
+                }
+            };
+
+            Ok((result, end_char_idx, end_byte_idx))
+        }
+
+        /// Call scan_once and handle the result.
+        /// Returns (value, end_char_idx, end_byte_idx).
+        fn call_scan_once(
+            &self,
+            scan_once: &PyObjectRef,
+            pystr: PyStrRef,
+            char_idx: usize,
+            byte_idx: usize,
+            memo: &mut HashMap<String, PyStrRef>,
+            vm: &VirtualMachine,
+        ) -> PyResult<(PyObjectRef, usize, usize)> {
+            let s = pystr.as_str();
+            let bytes = s.as_bytes();
+            let wtf8 = pystr.as_wtf8();
+
+            let first_byte = match bytes.get(byte_idx) {
+                Some(&b) => b,
+                None => return Err(self.make_decode_error("Expecting value", pystr, char_idx, vm)),
+            };
+
+            match first_byte {
+                b'"' => {
+                    // String - pass slice starting after the quote
+                    let (wtf8_result, chars_consumed, bytes_consumed) =
+                        machinery::scanstring(&wtf8[byte_idx + 1..], char_idx + 1, self.strict)
+                            .map_err(|e| py_decode_error(e, pystr.clone(), vm))?;
+                    let py_str = vm.ctx.new_str(wtf8_result.to_string());
+                    Ok((
+                        py_str.into(),
+                        char_idx + 1 + chars_consumed,
+                        byte_idx + 1 + bytes_consumed,
+                    ))
+                }
+                b'{' => {
+                    // Object
+                    self.parse_object(pystr, char_idx + 1, byte_idx + 1, scan_once, memo, vm)
+                }
+                b'[' => {
+                    // Array
+                    self.parse_array(pystr, char_idx + 1, byte_idx + 1, scan_once, memo, vm)
+                }
+                b'n' if starts_with_bytes(&bytes[byte_idx..], b"null") => {
+                    // null
+                    Ok((vm.ctx.none(), char_idx + 4, byte_idx + 4))
+                }
+                b't' if starts_with_bytes(&bytes[byte_idx..], b"true") => {
+                    // true
+                    Ok((vm.ctx.new_bool(true).into(), char_idx + 4, byte_idx + 4))
+                }
+                b'f' if starts_with_bytes(&bytes[byte_idx..], b"false") => {
+                    // false
+                    Ok((vm.ctx.new_bool(false).into(), char_idx + 5, byte_idx + 5))
+                }
+                b'N' if starts_with_bytes(&bytes[byte_idx..], b"NaN") => {
+                    // NaN
+                    let result = self.parse_constant.call(("NaN",), vm)?;
+                    Ok((result, char_idx + 3, byte_idx + 3))
+                }
+                b'I' if starts_with_bytes(&bytes[byte_idx..], b"Infinity") => {
+                    // Infinity
+                    let result = self.parse_constant.call(("Infinity",), vm)?;
+                    Ok((result, char_idx + 8, byte_idx + 8))
+                }
+                b'-' => {
+                    // -Infinity or negative number
+                    if starts_with_bytes(&bytes[byte_idx..], b"-Infinity") {
+                        let result = self.parse_constant.call(("-Infinity",), vm)?;
+                        return Ok((result, char_idx + 9, byte_idx + 9));
+                    }
+                    // Negative number - numbers are ASCII so len == bytes
+                    if let Some((result, len)) = self.parse_number(&s[byte_idx..], vm) {
+                        return Ok((result?, char_idx + len, byte_idx + len));
+                    }
+                    Err(self.make_decode_error("Expecting value", pystr, char_idx, vm))
+                }
+                b'0'..=b'9' => {
+                    // Positive number - numbers are ASCII so len == bytes
+                    if let Some((result, len)) = self.parse_number(&s[byte_idx..], vm) {
+                        return Ok((result?, char_idx + len, byte_idx + len));
+                    }
+                    Err(self.make_decode_error("Expecting value", pystr, char_idx, vm))
+                }
+                _ => {
+                    // Fall back to scan_once for unrecognized input
+                    // Note: This path requires char_idx for Python compatibility
+                    let result = scan_once.call((pystr.clone(), char_idx as isize), vm);
+
+                    match result {
+                        Ok(tuple) => {
+                            use crate::vm::builtins::PyTupleRef;
+                            let tuple: PyTupleRef = tuple.try_into_value(vm)?;
+                            if tuple.len() != 2 {
+                                return Err(vm.new_value_error("scan_once must return 2-tuple"));
+                            }
+                            let value = tuple.as_slice()[0].clone();
+                            let end_char_idx: isize = tuple.as_slice()[1].try_to_value(vm)?;
+                            // For fallback, we need to calculate byte_idx from char_idx
+                            // This is expensive but fallback should be rare
+                            let end_byte_idx = s
+                                .char_indices()
+                                .nth(end_char_idx as usize)
+                                .map(|(i, _)| i)
+                                .unwrap_or(s.len());
+                            Ok((value, end_char_idx as usize, end_byte_idx))
+                        }
+                        Err(err) if err.fast_isinstance(vm.ctx.exceptions.stop_iteration) => {
+                            Err(self.make_decode_error("Expecting value", pystr, char_idx, vm))
+                        }
+                        Err(err) => Err(err),
+                    }
+                }
+            }
+        }
+
+        /// Create a decode error.
+        fn make_decode_error(
+            &self,
+            msg: &str,
+            s: PyStrRef,
+            pos: usize,
+            vm: &VirtualMachine,
+        ) -> PyBaseExceptionRef {
+            let err = machinery::DecodeError::new(msg, pos);
+            py_decode_error(err, s, vm)
+        }
     }
 
     impl Callable for JsonScanner {
         type Args = (PyStrRef, isize);
-        fn call(zelf: &Py<Self>, (pystr, idx): Self::Args, vm: &VirtualMachine) -> PyResult {
-            if idx < 0 {
+        fn call(zelf: &Py<Self>, (pystr, char_idx): Self::Args, vm: &VirtualMachine) -> PyResult {
+            if char_idx < 0 {
                 return Err(vm.new_value_error("idx cannot be negative"));
             }
-            let idx = idx as usize;
-            let mut chars = pystr.as_str().chars();
-            if idx > 0 && chars.nth(idx - 1).is_none() {
-                PyIterReturn::StopIteration(Some(vm.ctx.new_int(idx).into())).to_pyresult(vm)
+            let char_idx = char_idx as usize;
+            let s = pystr.as_str();
+
+            // Calculate byte index from char index (O(char_idx) but only at entry point)
+            let byte_idx = if char_idx == 0 {
+                0
             } else {
-                zelf.parse(
-                    chars.as_str(),
-                    pystr.clone(),
-                    idx,
-                    zelf.to_owned().into(),
-                    vm,
-                )
-                .and_then(|x| x.to_pyresult(vm))
-            }
+                match s.char_indices().nth(char_idx) {
+                    Some((byte_i, _)) => byte_i,
+                    None => {
+                        // char_idx is beyond the string length
+                        return PyIterReturn::StopIteration(Some(vm.ctx.new_int(char_idx).into()))
+                            .to_pyresult(vm);
+                    }
+                }
+            };
+
+            zelf.parse(
+                pystr.clone(),
+                char_idx,
+                byte_idx,
+                zelf.to_owned().into(),
+                vm,
+            )
+            .and_then(|x| x.to_pyresult(vm))
         }
     }
 
@@ -257,7 +706,28 @@ mod _json {
         vm: &VirtualMachine,
     ) -> PyResult<(Wtf8Buf, usize)> {
         flame_guard!("_json::scanstring");
-        machinery::scanstring(s.as_wtf8(), end, strict.unwrap_or(true))
-            .map_err(|e| py_decode_error(e, s, vm))
+        let wtf8 = s.as_wtf8();
+
+        // Convert char index `end` to byte index
+        let byte_idx = if end == 0 {
+            0
+        } else {
+            wtf8.code_point_indices()
+                .nth(end)
+                .map(|(i, _)| i)
+                .ok_or_else(|| {
+                    py_decode_error(
+                        machinery::DecodeError::new("Unterminated string starting at", end - 1),
+                        s.clone(),
+                        vm,
+                    )
+                })?
+        };
+
+        let (result, chars_consumed, _bytes_consumed) =
+            machinery::scanstring(&wtf8[byte_idx..], end, strict.unwrap_or(true))
+                .map_err(|e| py_decode_error(e, s, vm))?;
+
+        Ok((result, end + chars_consumed))
     }
 }
diff --git a/crates/stdlib/src/json/machinery.rs b/crates/stdlib/src/json/machinery.rs
index 57b8ae441f..f33a135ab2 100644
--- a/crates/stdlib/src/json/machinery.rs
+++ b/crates/stdlib/src/json/machinery.rs
@@ -30,6 +30,7 @@
 use std::io;
 
 use itertools::Itertools;
+use memchr::memchr2;
 use rustpython_common::wtf8::{CodePoint, Wtf8, Wtf8Buf};
 
 static ESCAPE_CHARS: [&str; 0x20] = [
@@ -108,7 +109,7 @@ pub struct DecodeError {
     pub pos: usize,
 }
 impl DecodeError {
-    fn new(msg: impl Into<String>, pos: usize) -> Self {
+    pub fn new(msg: impl Into<String>, pos: usize) -> Self {
        let msg = msg.into();
        Self { msg, pos }
    }
@@ -126,24 +127,63 @@ impl StrOrChar<'_> {
         }
     }
 }
+/// Scan a JSON string starting right after the opening quote.
+///
+/// # Arguments
+/// * `s` - The string slice starting at the first character after the opening `"`
+/// * `char_offset` - The character index where this slice starts (for error messages)
+/// * `strict` - Whether to reject control characters
+///
+/// # Returns
+/// * `Ok((result, chars_consumed, bytes_consumed))` - The decoded string and how much was consumed
+/// * `Err(DecodeError)` - If the string is malformed
 pub fn scanstring<'a>(
     s: &'a Wtf8,
-    end: usize,
+    char_offset: usize,
     strict: bool,
-) -> Result<(Wtf8Buf, usize), DecodeError> {
+) -> Result<(Wtf8Buf, usize, usize), DecodeError> {
+    flame_guard!("machinery::scanstring");
+    let unterminated_err = || DecodeError::new("Unterminated string starting at", char_offset - 1);
+
+    let bytes = s.as_bytes();
+
+    // Fast path: use memchr to find " or \ quickly
+    if let Some(pos) = memchr2(b'"', b'\\', bytes)
+        && bytes[pos] == b'"'
+    {
+        let content_bytes = &bytes[..pos];
+
+        // In strict mode, check for control characters (0x00-0x1F)
+        let has_control_char = strict && content_bytes.iter().any(|&b| b < 0x20);
+
+        if !has_control_char {
+            flame_guard!("machinery::scanstring::fast_path");
+            let result_slice = &s[..pos];
+            let char_count = result_slice.code_points().count();
+            let mut out = Wtf8Buf::with_capacity(pos);
+            out.push_wtf8(result_slice);
+            // +1 for the closing quote
+            return Ok((out, char_count + 1, pos + 1));
+        }
+    }
+
+    // Slow path: chunk-based parsing for strings with escapes or control chars
+    flame_guard!("machinery::scanstring::slow_path");
     let mut chunks: Vec<StrOrChar<'a>> = Vec::new();
     let mut output_len = 0usize;
     let mut push_chunk = |chunk: StrOrChar<'a>| {
         output_len += chunk.len();
         chunks.push(chunk);
     };
-    let unterminated_err = || DecodeError::new("Unterminated string starting at", end - 1);
-    let mut chars = s.code_point_indices().enumerate().skip(end).peekable();
-    let &(_, (mut chunk_start, _)) = chars.peek().ok_or_else(unterminated_err)?;
+
+    let mut chars = s.code_point_indices().enumerate().peekable();
+    let mut chunk_start: usize = 0;
+
     while let Some((char_i, (byte_i, c))) = chars.next() {
         match c.to_char_lossy() {
             '"' => {
                 push_chunk(StrOrChar::Str(&s[chunk_start..byte_i]));
+                flame_guard!("machinery::scanstring::assemble_chunks");
                 let mut out = Wtf8Buf::with_capacity(output_len);
                 for x in chunks {
                     match x {
@@ -151,11 +191,12 @@
                         StrOrChar::Char(c) => out.push(c),
                     }
                 }
-                return Ok((out, char_i + 1));
+                // +1 for the closing quote
+                return Ok((out, char_i + 1, byte_i + 1));
             }
             '\\' => {
                 push_chunk(StrOrChar::Str(&s[chunk_start..byte_i]));
-                let (_, (_, c)) = chars.next().ok_or_else(unterminated_err)?;
+                let (next_char_i, (_, c)) = chars.next().ok_or_else(unterminated_err)?;
                 let esc = match c.to_char_lossy() {
                     '"' => "\"",
                     '\\' => "\\",
@@ -166,20 +207,21 @@
                     'r' => "\r",
                     't' => "\t",
                     'u' => {
-                        let mut uni = decode_unicode(&mut chars, char_i)?;
+                        let mut uni = decode_unicode(&mut chars, char_offset + char_i)?;
                         chunk_start = byte_i + 6;
                         if let Some(lead) = uni.to_lead_surrogate() {
                             // uni is a surrogate -- try to find its pair
                             let mut chars2 = chars.clone();
-                            if let Some(((pos2, _), (_, _))) = chars2
+                            if let Some(((_, (byte_pos2, _)), (_, _))) = chars2
                                 .next_tuple()
                                 .filter(|((_, (_, c1)), (_, (_, c2)))| *c1 == '\\' && *c2 == 'u')
                             {
-                                let uni2 = decode_unicode(&mut chars2, pos2)?;
+                                let uni2 =
+                                    decode_unicode(&mut chars2, char_offset + next_char_i + 1)?;
                                 if let Some(trail) = uni2.to_trail_surrogate() {
                                     // ok, we found what we were looking for -- \uXXXX\uXXXX, both surrogates
                                     uni = lead.merge(trail).into();
-                                    chunk_start = pos2 + 6;
+                                    chunk_start = byte_pos2 + 6;
                                     chars = chars2;
                                 }
                             }
@@ -188,7 +230,10 @@
                         continue;
                     }
                     _ => {
-                        return Err(DecodeError::new(format!("Invalid \\escape: {c:?}"), char_i));
+                        return Err(DecodeError::new(
+                            format!("Invalid \\escape: {c:?}"),
+                            char_offset + char_i,
+                        ));
                     }
                 };
                 chunk_start = byte_i + 2;
@@ -197,7 +242,7 @@
             '\x00'..='\x1f' if strict => {
                 return Err(DecodeError::new(
                     format!("Invalid control character {c:?} at"),
-                    char_i,
+                    char_offset + char_i,
                 ));
             }
             _ => {}
@@ -211,12 +256,13 @@ fn decode_unicode<I>(it: &mut I, pos: usize) -> Result<CodePoint, DecodeError>
 where
     I: Iterator<Item = (usize, (usize, CodePoint))>,
 {
+    flame_guard!("machinery::decode_unicode");
     let err = || DecodeError::new("Invalid \\uXXXX escape", pos);
-    let mut uni = 0;
-    for x in (0..4).rev() {
+    let mut uni = 0u16;
+    for _ in 0..4 {
         let (_, (_, c)) = it.next().ok_or_else(err)?;
         let d = c.to_char().and_then(|c| c.to_digit(16)).ok_or_else(err)? as u16;
-        uni += d * 16u16.pow(x);
+        uni = (uni << 4) | d;
     }
     Ok(uni.into())
 }