Support Windows-1252 and make UTF-16 detection stricter

Fixes #797
pull/813/head
Wilfred Hughes 2025-01-04 15:48:39 +07:00
parent 5a06f3d70d
commit fadd0f22ef
7 changed files with 27 additions and 1 deletions

@ -2,6 +2,9 @@
### Parsing
File detection now supports Windows-1252 encoded text (an extension of
ISO-8859-1), and is stricter about UTF-16 detection.
Updated to the latest tree-sitter parser for Make and YAML.
## 0.62 (released 20th December 2024)

10
Cargo.lock generated

@ -247,6 +247,7 @@ dependencies = [
"cc",
"clap",
"crossterm",
"encoding_rs",
"glob",
"hashbrown",
"humansize",
@ -317,6 +318,15 @@ version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a"
[[package]]
name = "encoding_rs"
version = "0.8.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3"
dependencies = [
"cfg-if",
]
[[package]]
name = "env_logger"
version = "0.10.2"

@ -97,6 +97,7 @@ tree-sitter-lua = "0.2.0"
tree-sitter-xml = "0.7.0"
tree-sitter-make = "1.1.1"
tree-sitter-yaml = "0.7.0"
encoding_rs = "0.8.35"
[dev-dependencies]
# assert_cmd 2.0.10 requires predicates 3.

@ -298,6 +298,9 @@ ca98b4d14fc21e0f04cf24aeb3d2526c -
sample_files/whitespace_1.tsx sample_files/whitespace_2.tsx
ac8b1a89ac26333f2d4e9433b2ca3958 -
sample_files/windows_1251_1.txt sample_files/windows_2251_1.txt
d41d8cd98f00b204e9800998ecf8427e -
sample_files/xml_1.xml sample_files/xml_2.xml
e629cbd2e721fd249c7ce1626f17e953 -

@ -0,0 +1 @@
Muß können: löst muß daß Heißt löscht führen für muß ähnlich

@ -0,0 +1 @@
Muß können: löst muß daß Heißt löscht führen für muß ähmlich

@ -215,7 +215,7 @@ pub(crate) fn guess_content(bytes: &[u8]) -> ProbableFileKind {
.take(5000)
.filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
.count();
if num_utf16_invalid <= 5 {
if num_utf16_invalid <= 1 {
info!(
"Input file is mostly valid UTF-16 (invalid characters: {})",
num_utf16_invalid
@ -223,6 +223,13 @@ pub(crate) fn guess_content(bytes: &[u8]) -> ProbableFileKind {
return ProbableFileKind::Text(utf16_string);
}
// If the input bytes are valid Windows-1252 (an extension of
// ISO-8859-1 aka Latin 1), treat them as such.
let (latin1_str, _encoding, saw_malformed) = encoding_rs::WINDOWS_1252.decode(bytes);
if !saw_malformed {
return ProbableFileKind::Text(latin1_str.to_string());
}
ProbableFileKind::Binary
}