Fix UTF-8 character boundary panic in style.rs (Issue #865)

This commit fixes a crash that occurred when difftastic tried to slice
strings at byte positions that fell in the middle of multi-byte UTF-8
characters.

Root Cause:
The crash was caused by a mismatch between how byte positions are
calculated (in the line-numbers crate) and how lines are split when
applying styles. Specifically:

1. LinePositions::from() always adds 1 byte for newlines, even for
   the last line if it doesn't end with a newline
2. split_on_newlines() strips \r characters from CRLF line endings,
   which LinePositions does not account for
3. This causes byte indices to be off, potentially landing in the
   middle of multi-byte UTF-8 characters

Solution:
Added UTF-8 boundary validation to substring_by_byte() that:
- Checks if slice indices are within string bounds
- Verifies indices fall on valid UTF-8 character boundaries
- Adjusts to the nearest previous valid boundary if not
- Prevents panics from invalid string slicing

This is a defensive fix that handles the symptom while the root cause
(in the line-numbers crate) can be addressed separately.

Tests:
Added comprehensive test cases for UTF-8 boundary handling with
multi-byte characters including 2-byte (ê) and 3-byte (世界) chars.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
claude/debug-issue-865-crash-011CUd9ut241YRW738w3K84r
Claude 2025-10-30 09:46:13 +07:00
parent d615490493
commit 32da88b88e
No known key found for this signature in database
1 changed files with 67 additions and 1 deletions

@ -58,11 +58,42 @@ fn byte_offset_for_width(s: &str, width: usize, tab_width: usize) -> usize {
}
fn substring_by_byte(s: &str, start: usize, end: usize) -> &str {
// Ensure start and end are within bounds and on UTF-8 character boundaries.
// This is a defensive measure against byte offset calculation mismatches,
// particularly when handling CRLF line endings or files without trailing newlines.
// Clamp start to valid range and find the nearest valid UTF-8 boundary
let start = if start > s.len() {
s.len()
} else if !s.is_char_boundary(start) {
// Find the previous valid boundary
(0..start)
.rev()
.find(|&i| s.is_char_boundary(i))
.unwrap_or(0)
} else {
start
};
// Clamp end to valid range and find the nearest valid UTF-8 boundary
let end = if end > s.len() {
s.len()
} else if !s.is_char_boundary(end) {
// Find the previous valid boundary
(0..end)
.rev()
.find(|&i| s.is_char_boundary(i))
.unwrap_or(start)
} else {
end
};
&s[start..end]
}
fn substring_by_byte_replace_tabs(s: &str, start: usize, end: usize, tab_width: usize) -> String {
let s = s[start..end].to_string();
// Use substring_by_byte to ensure safe slicing with UTF-8 boundary checks
let s = substring_by_byte(s, start, end).to_string();
s.replace('\t', &" ".repeat(tab_width))
}
@ -672,4 +703,39 @@ mod tests {
);
assert_eq!(res, vec!["foobar", " "])
}
#[test]
fn test_substring_by_byte_utf8_boundary() {
// Test that substring_by_byte handles invalid UTF-8 boundaries gracefully
let s = "car ê"; // 'ê' is 2 bytes (bytes 4-5)
// Valid boundaries - should work normally
assert_eq!(substring_by_byte(s, 0, 3), "car");
assert_eq!(substring_by_byte(s, 4, 6), "ê");
// Invalid boundary in the middle of 'ê' - should adjust to previous valid boundary
// Trying to start at byte 5 (middle of 'ê') should adjust to byte 4
assert_eq!(substring_by_byte(s, 5, 6), "");
// Out of bounds - should clamp to string length
assert_eq!(substring_by_byte(s, 0, 100), "car ê");
assert_eq!(substring_by_byte(s, 10, 20), "");
}
#[test]
fn test_substring_by_byte_with_multibyte_chars() {
// Test with various multi-byte UTF-8 characters
let s = "Hello 世界"; // 世 and 界 are 3 bytes each
// Valid slicing
assert_eq!(substring_by_byte(s, 0, 5), "Hello");
assert_eq!(substring_by_byte(s, 6, 9), "");
assert_eq!(substring_by_byte(s, 9, 12), "");
// Invalid boundary in the middle of a 3-byte character
// Should adjust to the previous valid boundary
let result = substring_by_byte(s, 7, 10);
// Starting at byte 7 (middle of '世') should adjust
assert!(result.len() < 4); // Should not include partial characters
}
}