agents-sh · grainier · Mar 28, 2026 · Mar 27, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -6,7 +6,7 @@ members = [
 resolver = "2"
 
 [workspace.package]
-version = "0.4.3"
+version = "0.4.4"
 edition = "2021"
 authors = ["TryParse Contributors"]
 license = "Apache-2.0"

diff --git a/tryparse/Cargo.toml b/tryparse/Cargo.toml
@@ -21,7 +21,7 @@ once_cell.workspace = true
 unicode-normalization.workspace = true
 
 # Optional: derive macro support
-tryparse-derive = { version = "0.4.3", path = "../tryparse-derive", optional = true }
+tryparse-derive = { version = "0.4.4", path = "../tryparse-derive", optional = true }
 
 [dev-dependencies]
 pretty_assertions.workspace = true

diff --git a/tryparse/src/parser/strategies/extractor.rs b/tryparse/src/parser/strategies/extractor.rs
@@ -113,6 +113,10 @@ impl HeuristicExtractor {
     }
 
     /// Finds balanced brace/bracket pairs.
+    ///
+    /// Boundaries are stored as byte offsets into `input`, not char indices,
+    /// so they can be used directly with `&input[start..end]` even when the
+    /// input contains multibyte UTF-8 characters.
     fn find_balanced_boundaries(
         &self,
         input: &str,
@@ -121,14 +125,22 @@ impl HeuristicExtractor {
         pattern: &'static str,
         boundaries: &mut Vec<(usize, usize, &'static str)>,
     ) {
-        let chars: Vec<char> = input.chars().collect();
+        // char_indices() yields (byte_offset, char) pairs.
+        let chars: Vec<(usize, char)> = input.char_indices().collect();
         let mut i = 0;
 
         while i < chars.len() {
-            if chars[i] == open {
+            if chars[i].1 == open {
                 // Found opening bracket/brace, find matching close
                 if let Some(end_idx) = self.find_matching_close(&chars, i, open, close) {
-                    boundaries.push((i, end_idx + 1, pattern));
+                    let byte_start = chars[i].0;
+                    // byte_end: start of the next char after end_idx, or the
+                    // end of the string if end_idx is the last char.
+                    let byte_end = chars
+                        .get(end_idx + 1)
+                        .map(|(offset, _)| *offset)
+                        .unwrap_or(input.len());
+                    boundaries.push((byte_start, byte_end, pattern));
                     i = end_idx + 1;
                 } else {
                     i += 1;
@@ -140,9 +152,13 @@ impl HeuristicExtractor {
     }
 
     /// Finds the matching closing bracket/brace.
+    ///
+    /// `chars` is a slice of `(byte_offset, char)` pairs (from
+    /// `str::char_indices`). Returns the *vec index* (not byte offset) of the
+    /// closing character, or `None` if the opening bracket is unbalanced.
     fn find_matching_close(
         &self,
-        chars: &[char],
+        chars: &[(usize, char)],
         start: usize,
         open: char,
         close: char,
@@ -151,7 +167,7 @@ impl HeuristicExtractor {
         let mut in_string = false;
         let mut escape_next = false;
 
-        for (idx, &ch) in chars.iter().enumerate().skip(start) {
+        for (idx, &(_, ch)) in chars.iter().enumerate().skip(start) {
             if escape_next {
                 escape_next = false;
                 continue;
@@ -301,4 +317,40 @@ mod tests {
         assert_eq!(candidates.len(), 1);
         assert_eq!(candidates[0].content, "[1, 2, 3, 4, 5]");
     }
+
+    // --- Non-ASCII / multibyte UTF-8 regression tests ---
+
+    #[test]
+    fn test_heuristic_extractor_non_ascii_value() {
+        let extractor = HeuristicExtractor::new();
+        // Thai characters (3 bytes each) inside the JSON value
+        let input = r#"{"message":"สวัสดี"}"#;
+
+        let candidates = extractor.extract(input).unwrap();
+        assert!(!candidates.is_empty());
+        assert_eq!(candidates[0].content, input);
+    }
+
+    #[test]
+    fn test_heuristic_extractor_emoji_in_value() {
+        let extractor = HeuristicExtractor::new();
+        // Emoji are 4 bytes each
+        let input = r#"Result: {"msg": "Hello 🎉"} done"#;
+
+        let candidates = extractor.extract(input).unwrap();
+        assert!(!candidates.is_empty());
+        assert_eq!(candidates[0].content, r#"{"msg": "Hello 🎉"}"#);
+    }
+
+    #[test]
+    fn test_heuristic_extractor_multibyte_prose_prefix() {
+        let extractor = HeuristicExtractor::new();
+        // Multibyte chars in the prose BEFORE the JSON mean the '{' char
+        // index != byte offset — the old code would slice at the wrong position.
+        let input = r#"สวัสดี: {"name": "Alice"}"#;
+
+        let candidates = extractor.extract(input).unwrap();
+        assert!(!candidates.is_empty());
+        assert_eq!(candidates[0].content, r#"{"name": "Alice"}"#);
+    }
 }
diff --git a/tryparse/src/parser/strategies/heuristic.rs b/tryparse/src/parser/strategies/heuristic.rs
@@ -89,6 +89,10 @@ impl HeuristicStrategy {
     }
 
     /// Finds balanced brace/bracket pairs.
+    ///
+    /// Boundaries are stored as byte offsets into `input`, not char indices,
+    /// so they can be used directly with `&input[start..end]` even when the
+    /// input contains multibyte UTF-8 characters.
     fn find_balanced_boundaries(
         &self,
         input: &str,
@@ -97,14 +101,22 @@ impl HeuristicStrategy {
         pattern: &'static str,
         boundaries: &mut Vec<(usize, usize, &'static str)>,
     ) {
-        let chars: Vec<char> = input.chars().collect();
+        // char_indices() yields (byte_offset, char) pairs.
+        let chars: Vec<(usize, char)> = input.char_indices().collect();
         let mut i = 0;
 
         while i < chars.len() {
-            if chars[i] == open {
+            if chars[i].1 == open {
                 // Found opening bracket/brace, find matching close
                 if let Some(end_idx) = self.find_matching_close(&chars, i, open, close) {
-                    boundaries.push((i, end_idx + 1, pattern));
+                    let byte_start = chars[i].0;
+                    // byte_end: start of the next char after end_idx, or the
+                    // end of the string if end_idx is the last char.
+                    let byte_end = chars
+                        .get(end_idx + 1)
+                        .map(|(offset, _)| *offset)
+                        .unwrap_or(input.len());
+                    boundaries.push((byte_start, byte_end, pattern));
                     i = end_idx + 1;
                 } else {
                     i += 1;
@@ -117,10 +129,12 @@ impl HeuristicStrategy {
 
     /// Finds the matching closing bracket/brace.
     ///
-    /// Returns the index of the closing character, or None if unbalanced.
+    /// `chars` is a slice of `(byte_offset, char)` pairs (from
+    /// `str::char_indices`). Returns the *vec index* (not byte offset) of the
+    /// closing character, or `None` if the opening bracket is unbalanced.
     fn find_matching_close(
         &self,
-        chars: &[char],
+        chars: &[(usize, char)],
         start: usize,
         open: char,
         close: char,
@@ -129,7 +143,7 @@ impl HeuristicStrategy {
         let mut in_string = false;
         let mut escape_next = false;
 
-        for (idx, &ch) in chars.iter().enumerate().skip(start) {
+        for (idx, &(_, ch)) in chars.iter().enumerate().skip(start) {
             if escape_next {
                 escape_next = false;
                 continue;
@@ -299,16 +313,16 @@ mod tests {
     #[test]
     fn test_find_matching_close() {
         let strategy = HeuristicStrategy::new();
-        let chars: Vec<char> = r#"{"name": "Alice"}"#.chars().collect();
+        let chars: Vec<(usize, char)> = r#"{"name": "Alice"}"#.char_indices().collect();
 
         let close_idx = strategy.find_matching_close(&chars, 0, '{', '}');
-        assert_eq!(close_idx, Some(16)); // Closing brace is at index 16
+        assert_eq!(close_idx, Some(16)); // Closing brace is at vec index 16
     }
 
     #[test]
     fn test_find_matching_close_with_nested() {
         let strategy = HeuristicStrategy::new();
-        let chars: Vec<char> = r#"{"a": {"b": 1}}"#.chars().collect();
+        let chars: Vec<(usize, char)> = r#"{"a": {"b": 1}}"#.char_indices().collect();
 
         let close_idx = strategy.find_matching_close(&chars, 0, '{', '}');
         assert_eq!(close_idx, Some(14));
@@ -322,4 +336,65 @@ mod tests {
         let result = strategy.parse(&huge_input).unwrap();
         assert!(result.is_empty()); // Should reject huge inputs
     }
+
+    // --- Non-ASCII / multibyte UTF-8 regression tests ---
+    // These previously panicked with "byte index N is not a char boundary" (or sliced the
+    // wrong span when N happened to land on one) because char indices were used as byte offsets.
+
+    #[test]
+    fn test_non_ascii_thai_in_value() {
+        let strategy = HeuristicStrategy::new();
+        // Thai characters are 3 bytes each in UTF-8
+        let input = r#"{"status":"complete","message":"สวัสดี"}"#;
+
+        let result = strategy.parse(input).unwrap();
+        assert!(!result.is_empty());
+        assert_eq!(result[0].value["message"], "สวัสดี");
+    }
+
+    #[test]
+    fn test_non_ascii_emoji_in_value() {
+        let strategy = HeuristicStrategy::new();
+        // Emoji are 4 bytes each in UTF-8
+        let input = r#"{"status":"complete","message":"Hello 🎉"}"#;
+
+        let result = strategy.parse(input).unwrap();
+        assert!(!result.is_empty());
+        assert_eq!(result[0].value["message"], "Hello 🎉");
+    }
+
+    #[test]
+    fn test_non_ascii_chinese_in_value() {
+        let strategy = HeuristicStrategy::new();
+        // CJK characters are 3 bytes each in UTF-8
+        let input = r#"Some text {"greeting": "你好世界"} more text"#;
+
+        let result = strategy.parse(input).unwrap();
+        assert!(!result.is_empty());
+        assert_eq!(result[0].value["greeting"], "你好世界");
+    }
+
+    #[test]
+    fn test_non_ascii_prose_before_json() {
+        let strategy = HeuristicStrategy::new();
+        // Multibyte chars in the surrounding prose (before the JSON object)
+        // cause the byte offset of '{' to diverge from its char index.
+        let input = r#"สวัสดี: {"name": "Alice", "age": 30}"#;
+
+        let result = strategy.parse(input).unwrap();
+        assert!(!result.is_empty());
+        assert_eq!(result[0].value, json!({"name": "Alice", "age": 30}));
+    }
+
+    #[test]
+    fn test_non_ascii_accented_characters() {
+        let strategy = HeuristicStrategy::new();
+        // Accented chars are 2 bytes each in UTF-8
+        let input = r#"Résumé: {"name": "Ångström", "city": "München"}"#;
+
+        let result = strategy.parse(input).unwrap();
+        assert!(!result.is_empty());
+        assert_eq!(result[0].value["name"], "Ångström");
+        assert_eq!(result[0].value["city"], "München");
+    }
 }