Set a length limit on lines when doing a word diff

See #653
2024-02-29 00:54:55 +07:00 · 2024-02-29 00:54:55 +07:00 · 53298e4240
parent 7e8f928926
commit 53298e4240
5 changed files with 34 additions and 4 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,11 @@ Updated JavaScript, TypeScript and QML parsers.

 Added support for Smali.

+### Diffing
+
+Fixed an issue with runaway memory usage on text files with very long
+lines.
+
 ### Display

 Fixed an issue where all files would show a permissions change when
--- a/sample_files/compare.expected
+++ b/sample_files/compare.expected
@@ -127,6 +127,9 @@ sample_files/julia_before.jl sample_files/julia_after.jl
 sample_files/load_before.js sample_files/load_after.js
 7ead0c677e1ccc5639ea2a8199a8175e  -

+sample_files/long_line_before.txt sample_files/long_line_after.txt
+850e2efa67152b79d5b5099f202d61bb  -
+
 sample_files/lua_before.lua sample_files/lua_after.lua
 9f5c85cd6806c724c84afa805da76bb7  -

--- a/sample_files/long_line_after.txt
+++ b/sample_files/long_line_after.txt
--- a/sample_files/long_line_before.txt
+++ b/sample_files/long_line_before.txt
--- a/src/line_parser.rs
+++ b/src/line_parser.rs
@@ -10,6 +10,8 @@ use crate::{
    parse::syntax::{AtomKind, MatchKind, MatchedPos, TokenKind},
 };

+const MAX_WORDS_IN_LINE: usize = 1000;
+
 fn split_lines_keep_newline(s: &str) -> Vec<&str> {
    lazy_static! {
        static ref NEWLINE_RE: Regex = Regex::new("\n").unwrap();
@@ -142,10 +144,28 @@ pub(crate) fn change_positions(lhs_src: &str, rhs_src: &str) -> Vec<MatchedPos>
                let lhs_part = lhs_lines.join("");
                let rhs_part = rhs_lines.join("");

-                for diff_res in myers_diff::slice_unique_by_hash(
-                    &split_words(&lhs_part),
-                    &split_words(&rhs_part),
-                ) {
+                let lhs_words = split_words(&lhs_part);
+                let rhs_words = split_words(&rhs_part);
+
+                // Myers Diff scales badly on large inputs, and
+                // word-level diffing is merely nice to have. If we
+                // have a very large number of words, don't diff
+                // individual words.
+                if lhs_words.len() > MAX_WORDS_IN_LINE || rhs_words.len() > MAX_WORDS_IN_LINE {
+                    let lhs_pos = lhs_lp.from_region(lhs_offset, lhs_offset + lhs_part.len());
+                    mps.push(MatchedPos {
+                        kind: MatchKind::NovelWord {
+                            highlight: TokenKind::Atom(AtomKind::Normal),
+                        },
+                        pos: lhs_pos[0],
+                    });
+
+                    lhs_offset += lhs_part.len();
+                    rhs_offset += rhs_part.len();
+                    continue;
+                }
+
+                for diff_res in myers_diff::slice_unique_by_hash(&lhs_words, &rhs_words) {
                    match diff_res {
                        myers_diff::DiffResult::Left(lhs_word) => {
                            let lhs_pos =