diff --git a/CHANGELOG.md b/CHANGELOG.md
index 347bd6560..83aedcd67 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,10 @@ Updated grammar for Java.
 
 Improved parsing of qualified constructors in Haskell.
 
+Difftastic is now stricter about valid UTF-8 and UTF-16, considering
+more of the file's bytes during filetype detection. This fixes cases
+where e.g. PDF files were sometimes incorrectly detected as UTF-8 text.
+
 ### Diffing
 
 Improved handling of delimiters ("nested sliders") in languages that
diff --git a/src/files.rs b/src/files.rs
index 71180b430..1cad43540 100644
--- a/src/files.rs
+++ b/src/files.rs
@@ -193,11 +193,14 @@ pub fn guess_content(bytes: &[u8]) -> ProbableFileKind {
     let utf8_string = String::from_utf8_lossy(bytes).to_string();
     let num_utf8_invalid = utf8_string
         .chars()
-        .take(1000)
+        .take(5000)
         .filter(|c| *c == std::char::REPLACEMENT_CHARACTER)
         .count();
     if num_utf8_invalid <= 10 {
-        info!("Input file is mostly valid UTF-8");
+        info!(
+            "Input file is mostly valid UTF-8 (invalid characters: {})",
+            num_utf8_invalid
+        );
         return ProbableFileKind::Text(utf8_string);
     }
 
@@ -206,11 +209,14 @@ pub fn guess_content(bytes: &[u8]) -> ProbableFileKind {
     let utf16_string = String::from_utf16_lossy(&u16_values);
     let num_utf16_invalid = utf16_string
         .chars()
-        .take(1000)
+        .take(5000)
         .filter(|c| *c == std::char::REPLACEMENT_CHARACTER)
         .count();
     if num_utf16_invalid <= 5 {
-        info!("Input file is mostly valid UTF-16");
+        info!(
+            "Input file is mostly valid UTF-16 (invalid characters: {})",
+            num_utf16_invalid
+        );
         return ProbableFileKind::Text(utf16_string);
     }