feat: move parsing escape_interpolations to the scanner

pull/559/head
Amaan Qureshi 2023-08-14 13:06:09 +07:00
parent 49b6f1c84a
commit e0cb646673
2 changed files with 34 additions and 14 deletions

@ -72,6 +72,7 @@ module.exports = grammar({
$._dedent, $._dedent,
$.string_start, $.string_start,
$._string_content, $._string_content,
$.escape_interpolation,
$.string_end, $.string_end,
// Mark comments as external tokens so that the external scanner is always // Mark comments as external tokens so that the external scanner is always
@ -966,7 +967,7 @@ module.exports = grammar({
string_content: $ => prec.right(repeat1( string_content: $ => prec.right(repeat1(
choice( choice(
$._escape_interpolation, $.escape_interpolation,
$.escape_sequence, $.escape_sequence,
$._not_escape_sequence, $._not_escape_sequence,
$._string_content, $._string_content,
@ -988,8 +989,6 @@ module.exports = grammar({
$.yield, $.yield,
), ),
_escape_interpolation: _ => token.immediate(choice('{{', '}}')),
escape_sequence: _ => token.immediate(prec(1, seq( escape_sequence: _ => token.immediate(prec(1, seq(
'\\', '\\',
choice( choice(

@ -44,6 +44,7 @@ enum TokenType {
DEDENT, DEDENT,
STRING_START, STRING_START,
STRING_CONTENT, STRING_CONTENT,
ESCAPE_INTERPOLATION,
STRING_END, STRING_END,
COMMENT, COMMENT,
CLOSE_PAREN, CLOSE_PAREN,
@ -171,13 +172,34 @@ bool tree_sitter_python_external_scanner_scan(void *payload, TSLexer *lexer,
valid_symbols[CLOSE_PAREN] || valid_symbols[CLOSE_PAREN] ||
valid_symbols[CLOSE_BRACKET]; valid_symbols[CLOSE_BRACKET];
bool advanced_once = false;
if (valid_symbols[ESCAPE_INTERPOLATION] && scanner->delimiters.len > 0 &&
(lexer->lookahead == '{' || lexer->lookahead == '}') &&
!error_recovery_mode) {
Delimiter delimiter = VEC_BACK(scanner->delimiters);
if (is_format(&delimiter)) {
lexer->mark_end(lexer);
bool is_left_brace = lexer->lookahead == '{';
advance(lexer);
advanced_once = true;
if ((lexer->lookahead == '{' && is_left_brace) ||
(lexer->lookahead == '}' && !is_left_brace)) {
advance(lexer);
lexer->mark_end(lexer);
lexer->result_symbol = ESCAPE_INTERPOLATION;
return true;
}
return false;
}
}
if (valid_symbols[STRING_CONTENT] && scanner->delimiters.len > 0 && if (valid_symbols[STRING_CONTENT] && scanner->delimiters.len > 0 &&
!error_recovery_mode) { !error_recovery_mode) {
Delimiter delimiter = VEC_BACK(scanner->delimiters); Delimiter delimiter = VEC_BACK(scanner->delimiters);
int32_t end_char = end_character(&delimiter); int32_t end_char = end_character(&delimiter);
bool has_content = false; bool has_content = advanced_once;
while (lexer->lookahead) { while (lexer->lookahead) {
if ((lexer->lookahead == '{' || lexer->lookahead == '}') && if ((advanced_once || lexer->lookahead == '{' || lexer->lookahead == '}') &&
is_format(&delimiter)) { is_format(&delimiter)) {
lexer->mark_end(lexer); lexer->mark_end(lexer);
lexer->result_symbol = STRING_CONTENT; lexer->result_symbol = STRING_CONTENT;
@ -186,23 +208,23 @@ bool tree_sitter_python_external_scanner_scan(void *payload, TSLexer *lexer,
if (lexer->lookahead == '\\') { if (lexer->lookahead == '\\') {
if (is_raw(&delimiter)) { if (is_raw(&delimiter)) {
// Step over the backslash. // Step over the backslash.
lexer->advance(lexer, false); advance(lexer);
// Step over any escaped quotes. // Step over any escaped quotes.
if (lexer->lookahead == end_character(&delimiter) || if (lexer->lookahead == end_character(&delimiter) ||
lexer->lookahead == '\\') { lexer->lookahead == '\\') {
lexer->advance(lexer, false); advance(lexer);
} }
continue; continue;
} }
if (is_bytes(&delimiter)) { if (is_bytes(&delimiter)) {
lexer->mark_end(lexer); lexer->mark_end(lexer);
lexer->advance(lexer, false); advance(lexer);
if (lexer->lookahead == 'N' || lexer->lookahead == 'u' || if (lexer->lookahead == 'N' || lexer->lookahead == 'u' ||
lexer->lookahead == 'U') { lexer->lookahead == 'U') {
// In bytes string, \N{...}, \uXXXX and \UXXXXXXXX are // In bytes string, \N{...}, \uXXXX and \UXXXXXXXX are
// not escape sequences // not escape sequences
// https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals // https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
lexer->advance(lexer, false); advance(lexer);
} else { } else {
lexer->result_symbol = STRING_CONTENT; lexer->result_symbol = STRING_CONTENT;
return has_content; return has_content;
@ -215,14 +237,14 @@ bool tree_sitter_python_external_scanner_scan(void *payload, TSLexer *lexer,
} else if (lexer->lookahead == end_char) { } else if (lexer->lookahead == end_char) {
if (is_triple(&delimiter)) { if (is_triple(&delimiter)) {
lexer->mark_end(lexer); lexer->mark_end(lexer);
lexer->advance(lexer, false); advance(lexer);
if (lexer->lookahead == end_char) { if (lexer->lookahead == end_char) {
lexer->advance(lexer, false); advance(lexer);
if (lexer->lookahead == end_char) { if (lexer->lookahead == end_char) {
if (has_content) { if (has_content) {
lexer->result_symbol = STRING_CONTENT; lexer->result_symbol = STRING_CONTENT;
} else { } else {
lexer->advance(lexer, false); advance(lexer);
lexer->mark_end(lexer); lexer->mark_end(lexer);
VEC_POP(scanner->delimiters); VEC_POP(scanner->delimiters);
lexer->result_symbol = STRING_END; lexer->result_symbol = STRING_END;
@ -240,7 +262,7 @@ bool tree_sitter_python_external_scanner_scan(void *payload, TSLexer *lexer,
if (has_content) { if (has_content) {
lexer->result_symbol = STRING_CONTENT; lexer->result_symbol = STRING_CONTENT;
} else { } else {
lexer->advance(lexer, false); advance(lexer);
VEC_POP(scanner->delimiters); VEC_POP(scanner->delimiters);
lexer->result_symbol = STRING_END; lexer->result_symbol = STRING_END;
} }
@ -431,7 +453,6 @@ unsigned tree_sitter_python_external_scanner_serialize(void *payload,
for (; iter < scanner->indents.len && for (; iter < scanner->indents.len &&
size < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; size < TREE_SITTER_SERIALIZATION_BUFFER_SIZE;
++iter) { ++iter) {
// yeah, it narrows the value but it's fine?
buffer[size++] = (char)scanner->indents.data[iter]; buffer[size++] = (char)scanner->indents.data[iter];
} }