Treat files with null bytes as binary

pull/281/head
Wilfred Hughes 2022-04-29 18:08:38 +07:00
parent d9ef270d27
commit 75a3624f7c
2 changed files with 14 additions and 3 deletions

@ -1,5 +1,10 @@
## 0.29 (unreleased)
### Parsing
Improved detection of binary files that have a large number of null
bytes.
## 0.28 (released 29th April 2022)
### Parsing

@ -69,14 +69,15 @@ pub fn read_or_die(path: &Path) -> Vec<u8> {
/// Do these bytes look like a binary (non-textual) format?
///
/// Lossily decodes the start of `bytes` as UTF-8 and inspects the first
/// 1,000 characters. Returns `true` when more than 20 of them are either
/// null characters or U+FFFD replacement characters (which the lossy
/// decoder substitutes for invalid UTF-8 sequences).
pub fn is_probably_binary(bytes: &[u8]) -> bool {
    // Number of leading characters to inspect.
    const MAX_CHARS: usize = 1000;
    // More than this many suspicious characters means "binary".
    const MAX_SUSPICIOUS: usize = 20;
    // Every decoded character (valid or replacement) consumes between
    // one and four input bytes, so the first `MAX_CHARS` characters are
    // fully determined by the first `MAX_CHARS * 4` bytes. Any character
    // truncated by this cut lies beyond the inspected window.
    const MAX_BYTES: usize = MAX_CHARS * 4;

    // Only decode the prefix we can possibly look at, rather than
    // lossily decoding (and potentially copying) the whole input.
    let prefix = &bytes[..bytes.len().min(MAX_BYTES)];

    // A single filter must accept both kinds of suspicious character;
    // chaining two filters would require each character to satisfy both
    // predicates, which silently drops the null-byte case.
    let num_suspicious = String::from_utf8_lossy(prefix)
        .chars()
        .take(MAX_CHARS)
        .filter(|c| *c == std::char::REPLACEMENT_CHARACTER || *c == '\0')
        .count();

    num_suspicious > MAX_SUSPICIOUS
}
@ -153,4 +154,9 @@ mod tests {
let s = "hello world";
assert!(!is_probably_binary(s.as_bytes()));
}
/// A buffer made up entirely of null bytes must be classified as binary.
#[test]
fn test_null_bytes_are_binary() {
    // 1,000 zero bytes: the same input as a string of 1,000 NUL characters.
    let nulls = vec![0u8; 1000];
    assert!(is_probably_binary(&nulls));
}
}