Fix crash on multibyte characters

Previously parsing would proceed byte at a time, which would crash if
the source contained multibyte characters. Instead, try all the
regular expression patterns, and jump to the next nearest match.
pull/25/head
Wilfred Hughes 2021-07-18 22:34:52 +07:00
parent 841dba8789
commit 28d5e51911
2 changed files with 97 additions and 60 deletions

@@ -2,6 +2,8 @@
### Parsing
Fixed a crash on parsing non-ASCII source files.
Improved parsing for Rust punctuation.
Improved parsing for OCaml punctuation.

@@ -80,11 +80,7 @@ fn as_regex_vec(v: &Value) -> Vec<Regex> {
}
fn as_regex(s: &str) -> Regex {
let mut pattern = String::with_capacity(1 + s.len());
pattern.push('^');
pattern.push_str(s);
Regex::new(&pattern).unwrap()
Regex::new(&s).unwrap()
}
fn lang_from_value(name: &str, v: &Value) -> Language {
@@ -138,6 +134,13 @@ pub fn parse<'a>(arena: &'a Arena<Syntax<'a>>, s: &str, lang: &Language) -> Vec<
parse_from(arena, s, &nl_pos, lang, &mut ParseState::new())
}
/// Kind of lexeme matched at the current parse position.
/// `parse_from` tries every pattern kind and keeps the nearest
/// (earliest-starting) match, tagged with one of these variants so it
/// knows which syntax node to build.
enum LexKind {
/// Matched one of `lang.comment_patterns`.
Comment,
/// Matched one of `lang.atom_patterns` or `lang.string_patterns`
/// (string matches are also treated as atoms).
Atom,
/// Matched `lang.open_delimiter_pattern` — starts a nested list.
OpenDelimiter,
/// Matched `lang.close_delimiter_pattern` — ends the current list.
CloseDelimiter,
}
fn parse_from<'a>(
arena: &'a Arena<Syntax<'a>>,
s: &str,
@@ -147,50 +150,82 @@ fn parse_from<'a>(
) -> Vec<&'a Syntax<'a>> {
let mut result: Vec<&'a Syntax<'a>> = vec![];
'outer: while state.str_i < s.len() {
while state.str_i < s.len() {
let mut current_match: Option<(LexKind, regex::Match)> = None;
for pattern in &lang.comment_patterns {
if let Some(m) = pattern.find(&s[state.str_i..]) {
assert_eq!(m.start(), 0);
let atom = Syntax::new_comment(
arena,
nl_pos.from_offsets(state.str_i, state.str_i + m.end()),
m.as_str(),
);
result.push(atom);
state.str_i += m.end();
continue 'outer;
match current_match {
Some((_, prev_m)) if prev_m.start() <= m.start() => {}
_ => {
current_match = Some((LexKind::Comment, m));
}
}
}
}
for pattern in &lang.atom_patterns {
if let Some(m) = pattern.find(&s[state.str_i..]) {
assert_eq!(m.start(), 0);
let atom = Syntax::new_atom(
match current_match {
Some((_, prev_m)) if prev_m.start() <= m.start() => {}
_ => {
current_match = Some((LexKind::Atom, m));
}
}
}
}
// TODO: fix duplication with previous loop
for pattern in &lang.string_patterns {
if let Some(m) = pattern.find(&s[state.str_i..]) {
match current_match {
Some((_, prev_m)) if prev_m.start() <= m.start() => {}
_ => {
current_match = Some((LexKind::Atom, m));
}
}
}
}
if let Some(m) = lang.open_delimiter_pattern.find(&s[state.str_i..]) {
match current_match {
Some((_, prev_m)) if prev_m.start() <= m.start() => {}
_ => {
current_match = Some((LexKind::OpenDelimiter, m));
}
}
};
if let Some(m) = lang.close_delimiter_pattern.find(&s[state.str_i..]) {
match current_match {
Some((_, prev_m)) if prev_m.start() <= m.start() => {}
_ => {
current_match = Some((LexKind::CloseDelimiter, m));
}
}
};
match current_match {
Some((match_kind, m)) => match match_kind {
LexKind::Comment => {
let atom = Syntax::new_comment(
arena,
nl_pos.from_offsets(state.str_i, state.str_i + m.end()),
nl_pos.from_offsets(state.str_i + m.start(), state.str_i + m.end()),
m.as_str(),
);
result.push(atom);
state.str_i += m.end();
continue 'outer;
}
}
for pattern in &lang.string_patterns {
if let Some(m) = pattern.find(&s[state.str_i..]) {
assert_eq!(m.start(), 0);
LexKind::Atom => {
let atom = Syntax::new_atom(
arena,
nl_pos.from_offsets(state.str_i, state.str_i + m.end()),
nl_pos.from_offsets(state.str_i + m.start(), state.str_i + m.end()),
m.as_str(),
);
result.push(atom);
state.str_i += m.end();
continue 'outer;
}
}
if let Some(m) = lang.open_delimiter_pattern.find(&s[state.str_i..]) {
LexKind::OpenDelimiter => {
let start = state.str_i;
state.str_i += m.end();
@@ -200,7 +235,7 @@ fn parse_from<'a>(
nl_pos.from_offsets(state.str_i, state.str_i + 1),
));
let open_pos = nl_pos.from_offsets(start, start + m.end());
let open_pos = nl_pos.from_offsets(start + m.start(), start + m.end());
let items = Syntax::new_list(
arena,
m.as_str(),
@@ -210,18 +245,18 @@ fn parse_from<'a>(
close_pos,
);
result.push(items);
continue;
};
if let Some(m) = lang.close_delimiter_pattern.find(&s[state.str_i..]) {
}
LexKind::CloseDelimiter => {
state.close_brace = Some((
m.as_str().into(),
nl_pos.from_offsets(state.str_i, state.str_i + m.end()),
nl_pos.from_offsets(state.str_i + m.start(), state.str_i + m.end()),
));
state.str_i += m.end();
return result;
}
},
None => break,
};
state.str_i += 1;
}
result