Attempt to detect and decode UTF-16 files too

Closes #345
add_libdifftastic
Wilfred Hughes 2022-08-28 15:37:09 +07:00
parent 8b5642ef51
commit b1b3756fa7
4 changed files with 84 additions and 34 deletions

@ -1,5 +1,10 @@
## 0.35 (unreleased)
### Parsing
Difftastic will now autodetect files in UTF-16-BE and
UTF-16-LE. Previously it required files to be UTF-8.
## 0.34 (released 27th August 2022)
### Build

@ -181,6 +181,9 @@ sample_files/toml_before.toml sample_files/toml_after.toml
sample_files/typing_before.ml sample_files/typing_after.ml
3941fd44b0bf744da834a0b3eda1ba76 -
sample_files/utf16_before.py sample_files/utf16_after.py
23ae372384bdddc7dd8745c22fab580d -
sample_files/whitespace_before.tsx sample_files/whitespace_after.tsx
c4151c5a44b11e04fd11c2594597ed33 -

@ -75,40 +75,81 @@ pub fn read_or_die(path: &Path) -> Vec<u8> {
}
}
/// Decode `bytes` as UTF-16, replacing invalid sequences with U+FFFD.
///
/// Byte order is chosen from a leading byte order mark: `FE FF` selects
/// big endian. Anything else, including a missing BOM, is decoded as
/// little endian. The BOM is only inspected, not removed, so it is
/// decoded along with the rest of the input. A trailing odd byte cannot
/// form a code unit and is ignored.
fn utf16_from_bytes_lossy(bytes: &[u8]) -> String {
    let is_big_endian = bytes.starts_with(&[0xfe, 0xff]);

    // Based on https://stackoverflow.com/a/57172592
    let code_units = bytes.chunks_exact(2).map(|pair| {
        let pair = [pair[0], pair[1]];
        if is_big_endian {
            u16::from_be_bytes(pair)
        } else {
            u16::from_le_bytes(pair)
        }
    });

    // Decode straight from the iterator instead of collecting into a
    // temporary `Vec<u16>` first. Unpaired surrogates become U+FFFD.
    std::char::decode_utf16(code_units)
        .map(|decoded| decoded.unwrap_or(std::char::REPLACEMENT_CHARACTER))
        .collect()
}
/// A best-effort classification of a file's raw bytes.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ProbableFileKind {
    /// The bytes look like text; holds the decoded contents.
    Text(String),
    /// The bytes look like a non-textual format.
    Binary,
}
/// Do these bytes look like a binary (non-textual) format?
pub fn is_probably_binary(bytes: &[u8]) -> bool {
/// Returns `ProbableFileKind::Text` with the decoded contents when the
/// bytes look like UTF-8 or UTF-16 text, otherwise `ProbableFileKind::Binary`.
pub fn guess_content(bytes: &[u8]) -> ProbableFileKind {
// Only consider the first 1,000 bytes, as tree_magic_mini
// considers the entire file, which is very slow on large files.
let mut bytes = bytes;
if bytes.len() > 1000 {
bytes = &bytes[..1000];
let mut magic_bytes = bytes;
if magic_bytes.len() > 1000 {
magic_bytes = &magic_bytes[..1000];
}
let mime = tree_magic_mini::from_u8(bytes);
let mime = tree_magic_mini::from_u8(magic_bytes);
match mime {
// Treat pdf as binary.
"application/pdf" => return true,
"application/pdf" => return ProbableFileKind::Binary,
// Treat all image content as binary.
v if v.starts_with("image/") => return true,
v if v.starts_with("image/") => return ProbableFileKind::Binary,
// Treat all audio content as binary.
v if v.starts_with("audio/") => return true,
v if v.starts_with("audio/") => return ProbableFileKind::Binary,
// Treat all video content as binary.
v if v.starts_with("video/") => return true,
v if v.starts_with("video/") => return ProbableFileKind::Binary,
// Treat all font content as binary.
v if v.starts_with("font/") => return true,
v if v.starts_with("font/") => return ProbableFileKind::Binary,
_ => {}
}
// If more than 20 of the first 1,000 characters are null bytes or
// invalid UTF-8, we assume it's binary.
let num_replaced = String::from_utf8_lossy(bytes)
.to_string()
// Decode all of `bytes`, not just `magic_bytes`: the decoded string
// is what gets returned to the caller.
let utf8_string = String::from_utf8_lossy(bytes).to_string();
let num_utf8_invalid = utf8_string
.chars()
.take(1000)
.filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
.count();
// NOTE(review): a short UTF-16 input (roughly 18 ASCII characters or
// fewer) appears to stay under this threshold, since each character
// contributes only one NUL. It would then be returned as UTF-8 text
// with embedded NULs instead of being decoded as UTF-16 -- confirm
// whether that matters.
if num_utf8_invalid <= 20 {
return ProbableFileKind::Text(utf8_string);
}
// Too many invalid characters for UTF-8, so try UTF-16 and apply the
// same threshold.
let utf16_string = utf16_from_bytes_lossy(bytes);
let num_utf16_invalid = utf16_string
.chars()
.take(1000)
.filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
.count();
if num_utf16_invalid <= 20 {
return ProbableFileKind::Text(utf16_string);
}
num_replaced > 20
// Neither decoding produced plausible text.
ProbableFileKind::Binary
}
/// All the files in `dir`, including subdirectories.
@ -180,13 +221,13 @@ mod tests {
use super::*;
#[test]
fn test_text_is_not_binary() {
fn test_plaintext_is_text() {
// A short ASCII string is valid UTF-8, so it must be classified as text.
let s = "hello world";
assert!(!is_probably_binary(s.as_bytes()));
assert!(matches!(guess_content(s.as_bytes()), ProbableFileKind::Text(_)));
}
#[test]
fn test_null_bytes_are_binary() {
// Input made up entirely of NUL bytes must be classified as binary.
let s = "\0".repeat(1000);
assert!(is_probably_binary(s.as_bytes()));
assert!(matches!(guess_content(s.as_bytes()), ProbableFileKind::Binary));
}
}

@ -36,7 +36,9 @@ use crate::parse::syntax;
use diff::changes::ChangeMap;
use diff::dijkstra::ExceededGraphLimit;
use display::context::opposite_positions;
use files::{is_probably_binary, read_files_or_die, read_or_die, relative_paths_in_either};
use files::{
guess_content, read_files_or_die, read_or_die, relative_paths_in_either, ProbableFileKind,
};
use log::info;
use mimalloc::MiMalloc;
use parse::guess_language::guess;
@ -214,26 +216,25 @@ fn diff_file_content(
byte_limit: usize,
language_override: Option<parse::guess_language::Language>,
) -> DiffResult {
if is_probably_binary(lhs_bytes) || is_probably_binary(rhs_bytes) {
return DiffResult {
lhs_display_path: lhs_display_path.into(),
rhs_display_path: rhs_display_path.into(),
language: None,
lhs_src: FileContent::Binary(lhs_bytes.to_vec()),
rhs_src: FileContent::Binary(rhs_bytes.to_vec()),
lhs_positions: vec![],
rhs_positions: vec![],
};
}
let (mut lhs_src, mut rhs_src) = match (guess_content(lhs_bytes), guess_content(rhs_bytes)) {
(ProbableFileKind::Binary, _) | (_, ProbableFileKind::Binary) => {
return DiffResult {
lhs_display_path: lhs_display_path.into(),
rhs_display_path: rhs_display_path.into(),
language: None,
lhs_src: FileContent::Binary(lhs_bytes.to_vec()),
rhs_src: FileContent::Binary(rhs_bytes.to_vec()),
lhs_positions: vec![],
rhs_positions: vec![],
};
}
(ProbableFileKind::Text(lhs_src), ProbableFileKind::Text(rhs_src)) => (lhs_src, rhs_src),
};
// TODO: don't replace tab characters inside string literals.
let tab_as_spaces = " ".repeat(tab_width);
let mut lhs_src = String::from_utf8_lossy(lhs_bytes)
.to_string()
.replace('\t', &tab_as_spaces);
let mut rhs_src = String::from_utf8_lossy(rhs_bytes)
.to_string()
.replace('\t', &tab_as_spaces);
lhs_src = lhs_src.replace('\t', &tab_as_spaces);
rhs_src = rhs_src.replace('\t', &tab_as_spaces);
// Ignore the trailing newline, if present.
// TODO: highlight if this has changes (#144).