Give novel punctuation a lower edge cost

We'd rather see an unchanged variable name than an unchanged comma.

Fixes #366
pull/361/head
Wilfred Hughes 2022-09-09 09:43:33 +07:00
parent ed1e1d870e
commit fe5ef8757d
6 changed files with 157 additions and 26 deletions

@ -1,5 +1,10 @@
## 0.36 (unreleased)
### Diffing
Improved diff cost model to prefer finding unchanged variable names
over unchanged punctuation.
## 0.35 (released 2nd September 2022)
### Diffing

@ -0,0 +1,26 @@
// From lodash under a MIT license.
// https://github.com/lodash/lodash/blob/master/LICENSE
/**
* This function is like `arrayIncludes` except that it accepts a comparator.
*
* @private
* @param {Array} [array] The array to inspect.
* @param {*} target The value to search for.
* @param {Function} comparator The comparator invoked per element.
* @returns {boolean} Returns `true` if `target` is found, else `false`.
*/
function arrayIncludesWith(array, target, comparator) {
if (array == null) {
return false
}
for (const value of array) {
if (comparator(target, value)) {
return true
}
}
return false
}
export default arrayIncludesWith

@ -0,0 +1,25 @@
// From lodash under a MIT license.
// https://github.com/lodash/lodash/blob/master/LICENSE
/**
* This function is like `arrayIncludes` except that it accepts a comparator.
*
* @private
* @param {Array} [array] The array to inspect.
* @param {*} target The value to search for.
* @param {Function} comparator The comparator invoked per element.
* @returns {boolean} Returns `true` if `target` is found, else `false`.
*/
function arrayIncludesWith(array, value, comparator) {
let index = -1
const length = array == null ? 0 : array.length
while (++index < length) {
if (comparator(value, array[index])) {
return true
}
}
return false
}
export default arrayIncludesWith

@ -13,6 +13,9 @@ sample_files/chinese_before.po sample_files/chinese_after.po
sample_files/clojure_before.clj sample_files/clojure_after.clj
b916e224f289888252cd7597bab339e6 -
sample_files/comma_before.js sample_files/comma_after.js
6b06dbb3eb6cc44ce51d2e40695d823b -
sample_files/comments_before.rs sample_files/comments_after.rs
0b2756c60659993310f899b131cca84f -

@ -326,7 +326,10 @@ mod tests {
EnterUnchangedDelimiter {
depth_difference: 0
},
NovelAtomLHS { contiguous: false },
NovelAtomLHS {
contiguous: false,
probably_punctuation: false,
},
ExitDelimiterBoth,
]
);
@ -369,8 +372,14 @@ mod tests {
EnterUnchangedDelimiter {
depth_difference: 0
},
NovelAtomRHS { contiguous: false },
NovelAtomRHS { contiguous: false },
NovelAtomRHS {
contiguous: false,
probably_punctuation: false
},
NovelAtomRHS {
contiguous: false,
probably_punctuation: false
},
ExitDelimiterBoth,
]
);
@ -456,8 +465,14 @@ mod tests {
UnchangedNode {
depth_difference: 0
},
NovelAtomLHS { contiguous: false },
NovelAtomLHS { contiguous: true },
NovelAtomLHS {
contiguous: false,
probably_punctuation: false
},
NovelAtomLHS {
contiguous: true,
probably_punctuation: false
},
]
);
}
@ -492,7 +507,10 @@ mod tests {
actions,
vec![
EnterNovelDelimiterLHS { contiguous: false },
NovelAtomLHS { contiguous: true },
NovelAtomLHS {
contiguous: true,
probably_punctuation: false
},
ExitDelimiterLHS,
]
);
@ -531,9 +549,15 @@ mod tests {
actions,
vec![
EnterNovelDelimiterLHS { contiguous: false },
NovelAtomLHS { contiguous: true },
NovelAtomLHS {
contiguous: true,
probably_punctuation: false
},
ExitDelimiterLHS,
NovelAtomLHS { contiguous: true },
NovelAtomLHS {
contiguous: true,
probably_punctuation: true
},
]
);
}
@ -640,7 +664,10 @@ mod tests {
ReplacedComment {
levenshtein_pct: 95
},
NovelAtomLHS { contiguous: false }
NovelAtomLHS {
contiguous: false,
probably_punctuation: false
}
]
);
}

@ -278,20 +278,38 @@ impl<'a, 'b> Vertex<'a, 'b> {
/// See [`neighbours`] for all the edges available for a given `Vertex`.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub enum Edge {
UnchangedNode { depth_difference: u32 },
EnterUnchangedDelimiter { depth_difference: u32 },
ReplacedComment { levenshtein_pct: u8 },
NovelAtomLHS { contiguous: bool },
NovelAtomRHS { contiguous: bool },
UnchangedNode {
depth_difference: u32,
},
EnterUnchangedDelimiter {
depth_difference: u32,
},
ReplacedComment {
levenshtein_pct: u8,
},
NovelAtomLHS {
contiguous: bool,
probably_punctuation: bool,
},
NovelAtomRHS {
contiguous: bool,
probably_punctuation: bool,
},
// TODO: An EnterNovelDelimiterBoth edge might help performance
// rather doing LHS and RHS separately.
EnterNovelDelimiterLHS { contiguous: bool },
EnterNovelDelimiterRHS { contiguous: bool },
EnterNovelDelimiterLHS {
contiguous: bool,
},
EnterNovelDelimiterRHS {
contiguous: bool,
},
ExitDelimiterLHS,
ExitDelimiterRHS,
ExitDelimiterBoth,
}
const NOT_CONTIGUOUS_PENALTY: u64 = 50;
impl Edge {
pub fn cost(self) -> u64 {
match self {
@ -317,21 +335,38 @@ impl Edge {
ReplacedComment { levenshtein_pct } => 150 + u64::from(100 - levenshtein_pct),
// Otherwise, we've added/removed a node.
NovelAtomLHS { contiguous }
| NovelAtomRHS { contiguous }
| EnterNovelDelimiterLHS { contiguous }
| EnterNovelDelimiterRHS { contiguous } => {
if contiguous {
300
} else {
NovelAtomLHS {
contiguous,
probably_punctuation,
}
| NovelAtomRHS {
contiguous,
probably_punctuation,
} => {
let mut cost = 300;
if !contiguous {
cost += NOT_CONTIGUOUS_PENALTY;
}
// If it's only punctuation, decrease the cost
// slightly. It's better to have novel punctuation
// than novel variable names.
if probably_punctuation {
cost -= 10;
}
cost
}
EnterNovelDelimiterLHS { contiguous } | EnterNovelDelimiterRHS { contiguous } => {
let mut cost = 300;
if !contiguous {
// This needs to be more than 40 greater than the
// contiguous case. Otherwise, we end up choosing
// a badly positioned unchanged delimiter just
// because it has a better depth difference.
//
// TODO: write a test for this case.
350
cost += NOT_CONTIGUOUS_PENALTY;
}
cost
}
}
}
@ -374,6 +409,14 @@ fn allocate_if_new<'syn, 'b>(
}
}
/// Does this atom look like punctuation?
///
/// This check is deliberately conservative, becuase it's hard to
/// accurately recognise punctuation in a language-agnostic way.
fn looks_like_punctuation(content: &str) -> bool {
content == "," || content == ";"
}
/// Compute the neighbours of `v` if we haven't previously done so,
/// write them to the .neighbours cell inside `v`, and return them.
pub fn get_set_neighbours<'syn, 'b>(
@ -579,12 +622,13 @@ pub fn get_set_neighbours<'syn, 'b>(
if let Some(lhs_syntax) = &v.lhs_syntax {
match lhs_syntax {
// Step over this novel atom.
Syntax::Atom { .. } => {
Syntax::Atom { content, .. } => {
res.push((
NovelAtomLHS {
// TODO: should this apply if prev is a parent
// node rather than a sibling?
contiguous: lhs_syntax.prev_is_contiguous(),
probably_punctuation: looks_like_punctuation(content),
},
allocate_if_new(
Vertex {
@ -634,10 +678,11 @@ pub fn get_set_neighbours<'syn, 'b>(
if let Some(rhs_syntax) = &v.rhs_syntax {
match rhs_syntax {
// Step over this novel atom.
Syntax::Atom { .. } => {
Syntax::Atom { content, .. } => {
res.push((
NovelAtomRHS {
contiguous: rhs_syntax.prev_is_contiguous(),
probably_punctuation: looks_like_punctuation(content),
},
allocate_if_new(
Vertex {