Fix crash on multibyte characters

Previously parsing would proceed byte at a time, which would crash if
the source contained multibyte characters. Instead, try all the
regular expression patterns, and jump to the next nearest match.
pull/25/head
Wilfred Hughes 2021-07-18 22:34:52 +07:00
parent 841dba8789
commit 28d5e51911
2 changed files with 97 additions and 60 deletions

@ -2,6 +2,8 @@
### Parsing ### Parsing
Fixed a crash on parsing non-ASCII source files.
Improved parsing for Rust punctuation. Improved parsing for Rust punctuation.
Improved parsing for OCaml punctuation. Improved parsing for OCaml punctuation.

@ -80,11 +80,7 @@ fn as_regex_vec(v: &Value) -> Vec<Regex> {
} }
fn as_regex(s: &str) -> Regex { fn as_regex(s: &str) -> Regex {
let mut pattern = String::with_capacity(1 + s.len()); Regex::new(&s).unwrap()
pattern.push('^');
pattern.push_str(s);
Regex::new(&pattern).unwrap()
} }
fn lang_from_value(name: &str, v: &Value) -> Language { fn lang_from_value(name: &str, v: &Value) -> Language {
@ -138,6 +134,13 @@ pub fn parse<'a>(arena: &'a Arena<Syntax<'a>>, s: &str, lang: &Language) -> Vec<
parse_from(arena, s, &nl_pos, lang, &mut ParseState::new()) parse_from(arena, s, &nl_pos, lang, &mut ParseState::new())
} }
/// Kind of token matched while lexing, used to pick the nearest regex
/// match across all pattern sets and dispatch on it afterwards.
enum LexKind {
// Matched a comment pattern; turned into Syntax::new_comment.
Comment,
// Matched an atom or string pattern; turned into Syntax::new_atom.
// (Both atom_patterns and string_patterns map to this variant.)
Atom,
// Matched the language's open-delimiter pattern; starts a nested list.
OpenDelimiter,
// Matched the close-delimiter pattern; recorded in state.close_brace
// and ends the current parse_from call.
CloseDelimiter,
}
fn parse_from<'a>( fn parse_from<'a>(
arena: &'a Arena<Syntax<'a>>, arena: &'a Arena<Syntax<'a>>,
s: &str, s: &str,
@ -147,50 +150,82 @@ fn parse_from<'a>(
) -> Vec<&'a Syntax<'a>> { ) -> Vec<&'a Syntax<'a>> {
let mut result: Vec<&'a Syntax<'a>> = vec![]; let mut result: Vec<&'a Syntax<'a>> = vec![];
'outer: while state.str_i < s.len() { while state.str_i < s.len() {
let mut current_match: Option<(LexKind, regex::Match)> = None;
for pattern in &lang.comment_patterns { for pattern in &lang.comment_patterns {
if let Some(m) = pattern.find(&s[state.str_i..]) { if let Some(m) = pattern.find(&s[state.str_i..]) {
assert_eq!(m.start(), 0); match current_match {
let atom = Syntax::new_comment( Some((_, prev_m)) if prev_m.start() <= m.start() => {}
arena, _ => {
nl_pos.from_offsets(state.str_i, state.str_i + m.end()), current_match = Some((LexKind::Comment, m));
m.as_str(), }
); }
result.push(atom);
state.str_i += m.end();
continue 'outer;
} }
} }
for pattern in &lang.atom_patterns { for pattern in &lang.atom_patterns {
if let Some(m) = pattern.find(&s[state.str_i..]) { if let Some(m) = pattern.find(&s[state.str_i..]) {
assert_eq!(m.start(), 0); match current_match {
let atom = Syntax::new_atom( Some((_, prev_m)) if prev_m.start() <= m.start() => {}
_ => {
current_match = Some((LexKind::Atom, m));
}
}
}
}
// TODO: fix duplication with previous loop
for pattern in &lang.string_patterns {
if let Some(m) = pattern.find(&s[state.str_i..]) {
match current_match {
Some((_, prev_m)) if prev_m.start() <= m.start() => {}
_ => {
current_match = Some((LexKind::Atom, m));
}
}
}
}
if let Some(m) = lang.open_delimiter_pattern.find(&s[state.str_i..]) {
match current_match {
Some((_, prev_m)) if prev_m.start() <= m.start() => {}
_ => {
current_match = Some((LexKind::OpenDelimiter, m));
}
}
};
if let Some(m) = lang.close_delimiter_pattern.find(&s[state.str_i..]) {
match current_match {
Some((_, prev_m)) if prev_m.start() <= m.start() => {}
_ => {
current_match = Some((LexKind::CloseDelimiter, m));
}
}
};
match current_match {
Some((match_kind, m)) => match match_kind {
LexKind::Comment => {
let atom = Syntax::new_comment(
arena, arena,
nl_pos.from_offsets(state.str_i, state.str_i + m.end()), nl_pos.from_offsets(state.str_i + m.start(), state.str_i + m.end()),
m.as_str(), m.as_str(),
); );
result.push(atom); result.push(atom);
state.str_i += m.end(); state.str_i += m.end();
continue 'outer;
} }
} LexKind::Atom => {
for pattern in &lang.string_patterns {
if let Some(m) = pattern.find(&s[state.str_i..]) {
assert_eq!(m.start(), 0);
let atom = Syntax::new_atom( let atom = Syntax::new_atom(
arena, arena,
nl_pos.from_offsets(state.str_i, state.str_i + m.end()), nl_pos.from_offsets(state.str_i + m.start(), state.str_i + m.end()),
m.as_str(), m.as_str(),
); );
result.push(atom); result.push(atom);
state.str_i += m.end(); state.str_i += m.end();
continue 'outer;
} }
} LexKind::OpenDelimiter => {
if let Some(m) = lang.open_delimiter_pattern.find(&s[state.str_i..]) {
let start = state.str_i; let start = state.str_i;
state.str_i += m.end(); state.str_i += m.end();
@ -200,7 +235,7 @@ fn parse_from<'a>(
nl_pos.from_offsets(state.str_i, state.str_i + 1), nl_pos.from_offsets(state.str_i, state.str_i + 1),
)); ));
let open_pos = nl_pos.from_offsets(start, start + m.end()); let open_pos = nl_pos.from_offsets(start + m.start(), start + m.end());
let items = Syntax::new_list( let items = Syntax::new_list(
arena, arena,
m.as_str(), m.as_str(),
@ -210,18 +245,18 @@ fn parse_from<'a>(
close_pos, close_pos,
); );
result.push(items); result.push(items);
continue; }
}; LexKind::CloseDelimiter => {
if let Some(m) = lang.close_delimiter_pattern.find(&s[state.str_i..]) {
state.close_brace = Some(( state.close_brace = Some((
m.as_str().into(), m.as_str().into(),
nl_pos.from_offsets(state.str_i, state.str_i + m.end()), nl_pos.from_offsets(state.str_i + m.start(), state.str_i + m.end()),
)); ));
state.str_i += m.end(); state.str_i += m.end();
return result; return result;
}
},
None => break,
}; };
state.str_i += 1;
} }
result result