Consider null bytes in text file detection

pull/647/head^2
Wilfred Hughes 2024-03-06 23:07:59 +07:00
parent d3797ef4e2
commit c6da85759c
2 changed files with 19 additions and 2 deletions

@ -1,5 +1,10 @@
## 0.57 (unreleased)
### Parsing
Text file detection is now stricter, treating null bytes as a hint
that a file is binary.
### Diffing
Scheme now uses the same slider heuristics as other lisps, preferring

@ -197,7 +197,7 @@ pub(crate) fn guess_content(bytes: &[u8]) -> ProbableFileKind {
let num_utf8_invalid = utf8_string
.chars()
.take(5000)
.filter(|c| *c == std::char::REPLACEMENT_CHARACTER)
.filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
.count();
if num_utf8_invalid <= 10 {
info!(
@ -213,7 +213,7 @@ pub(crate) fn guess_content(bytes: &[u8]) -> ProbableFileKind {
let num_utf16_invalid = utf16_string
.chars()
.take(5000)
.filter(|c| *c == std::char::REPLACEMENT_CHARACTER)
.filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
.count();
if num_utf16_invalid <= 5 {
info!(
@ -327,6 +327,18 @@ mod tests {
assert_eq!(guess_content(&bytes), ProbableFileKind::Binary);
}
#[test]
fn test_dex_is_binary() {
    // Regression sample with many 0x00 bytes mixed in with non-zero bytes;
    // the detector is expected to classify it as binary, not text.
    // NOTE(review): judging by the test name, these are presumably header
    // bytes taken from an Android DEX file — confirm provenance.
    let sample: &[u8] = &[
        0x34, 0x8a, 0x4b, 0x8f, 0x77, 0xa4, 0x4e, 0xb1, 0x31, 0x2d, 0x5f, 0xfb, 0x10, 0x08,
        0xa8, 0x6b, 0x58, 0x06, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x78, 0x56, 0x34, 0x12,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xac, 0x05, 0x00, 0x00, 0x23, 0x00,
        0x00, 0x00, 0x70, 0x00, 0x00, 0x00,
    ];

    let detected = guess_content(sample);
    assert_eq!(detected, ProbableFileKind::Binary);
}
#[test]
fn test_png_bytes_are_binary() {
let bytes = vec![