// Source: difftastic/vendored_parsers/tree-sitter-elm/src/scanner.c
// (file-viewer metadata: 497 lines, 16 KiB, C)

#include "tree_sitter/parser.h"
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
/* Larger of two values. Each argument is evaluated twice, so avoid passing
 * expressions with side effects. */
#define MAX(a, b) ((a) > (b) ? (a) : (b))

/* Reallocate `vec.data` to hold `_cap` elements and record the new capacity.
 * Allocation failure is treated as fatal via assert(). The temporary lives in
 * its own scope so the macro can be used several times in one block. */
#define VEC_RESIZE(vec, _cap)                                                 \
    do {                                                                      \
        void *vec_resized_ =                                                  \
            realloc((vec).data, (_cap) * sizeof((vec).data[0]));              \
        assert(vec_resized_ != NULL);                                         \
        (vec).data = vec_resized_;                                            \
        (vec).cap = (_cap);                                                   \
    } while (0)

/* Ensure the capacity is at least `_cap`. This is deliberately a plain
 * if-block (not do/while) so that it stays valid when invoked without a
 * trailing semicolon. */
#define VEC_GROW(vec, _cap)                                                   \
    if ((vec).cap < (_cap)) {                                                 \
        VEC_RESIZE((vec), (_cap));                                            \
    }

/* Append `el`, doubling the capacity (minimum 16) when the vector is full. */
#define VEC_PUSH(vec, el)                                                     \
    do {                                                                      \
        if ((vec).cap == (vec).len) {                                         \
            VEC_RESIZE((vec), MAX(16, (vec).len * 2));                        \
        }                                                                     \
        (vec).data[(vec).len++] = (el);                                       \
    } while (0)

/* Drop the last element. Popping an empty vector is a no-op instead of
 * wrapping the unsigned length around. */
#define VEC_POP(vec)                                                          \
    do {                                                                      \
        if ((vec).len > 0) {                                                  \
            (vec).len--;                                                      \
        }                                                                     \
    } while (0)

/* Last element. The vector must not be empty. */
#define VEC_BACK(vec) ((vec).data[(vec).len - 1])

/* Release the storage and leave the vector in the empty state. */
#define VEC_FREE(vec)                                                         \
    do {                                                                      \
        free((vec).data); /* free(NULL) is a no-op */                         \
        (vec).data = NULL;                                                    \
        (vec).len = 0;                                                        \
        (vec).cap = 0;                                                        \
    } while (0)

/* Forget all elements but keep the allocated storage. */
#define VEC_CLEAR(vec) ((vec).len = 0)

/* Reverse the elements in place. Elements are assumed to be uint8_t. */
#define VEC_REVERSE(vec)                                                      \
    do {                                                                      \
        if ((vec).len > 1) {                                                  \
            for (size_t i = 0, j = (vec).len - 1; i < j; i++, j--) {          \
                uint8_t tmp = (vec).data[i];                                  \
                (vec).data[i] = (vec).data[j];                                \
                (vec).data[j] = tmp;                                          \
            }                                                                 \
        }                                                                     \
    } while (0)
/**
 * Kinds of external tokens this scanner can produce. The values index into
 * the `valid_symbols` array handed to the scanner.
 * NOTE(review): by tree-sitter convention the order must match the grammar's
 * `externals` list — confirm against grammar.js.
 */
enum TokenType {
    /* Queued when a new line starts at the indentation of the current
     * section; also produced at end of input. */
    VIRTUAL_END_DECL = 0,
    /* Opens a section and records the current column on the indent stack. */
    VIRTUAL_OPEN_SECTION = 1,
    /* Closes a section: on dedent, on `in`, or at end of input. */
    VIRTUAL_END_SECTION = 2,
    /* A '-' immediately followed by an ASCII letter or '('. */
    MINUS_WITHOUT_TRAILING_WHITESPACE = 3,
    /* Text of a GLSL block, up to the closing `|]`. */
    GLSL_CONTENT = 4,
    /* Text of a block comment, up to the closing `-}`. */
    BLOCK_COMMENT_CONTENT = 5,
};
// Growable array of bytes, manipulated through the VEC_* macros.
typedef struct {
// Number of elements currently stored.
uint32_t len;
// Number of elements the allocation behind `data` can hold.
uint32_t cap;
// Heap storage obtained from realloc(); NULL until the first resize.
uint8_t *data;
} vec;
// Persistent state of the external scanner.
typedef struct {
// Column recorded after the leading spaces that follow the most recent
// newline seen by scan().
uint32_t indent_length;
// Stack of columns, one pushed for every VIRTUAL_OPEN_SECTION emitted.
// Deserialization always seeds it with a base entry of 0.
vec indents;
// Tokens still to be emitted by following scan() calls:
// 0 stands for VIRTUAL_END_DECL, 1 for VIRTUAL_END_SECTION.
vec runback;
} Scanner;
/** Consume the lookahead character, passing `false` as the lexer's skip flag. */
static inline void advance(TSLexer *lexer) {
    const bool treat_as_whitespace = false;
    lexer->advance(lexer, treat_as_whitespace);
}
/** Consume the lookahead character, passing `true` as the lexer's skip flag. */
static inline void skip(TSLexer *lexer) {
    const bool treat_as_whitespace = true;
    lexer->advance(lexer, treat_as_whitespace);
}
// > You can detect error recovery in the external scanner by the fact that
// > _all_ tokens are considered valid at once.
// https://github.com/tree-sitter/tree-sitter/pull/1783#issuecomment-1181011411
//
// Returns true only when every external token kind is marked valid.
static bool in_error_recovery(const bool *valid_symbols) {
    for (int token = VIRTUAL_END_DECL; token <= BLOCK_COMMENT_CONTENT;
         token++) {
        if (!valid_symbols[token]) {
            return false;
        }
    }
    return true;
}
// True when the lookahead character is a space, carriage return or newline.
static bool is_elm_space(TSLexer *lexer) {
    switch (lexer->lookahead) {
        case ' ':
        case '\r':
        case '\n':
            return true;
        default:
            return false;
    }
}
// Checks whether the input continues with the keyword `in`, which ends a
// let section. Only attempted when VIRTUAL_END_SECTION is valid.
//
// Returns:
//   0  nothing was consumed (token not valid, or lookahead is not 'i')
//   1  'i' or "in" was skipped but it is not a standalone `in`
//   2  "in" was skipped and is followed by whitespace or end of input
static int checkForIn(TSLexer *lexer, const bool *valid_symbols) {
    enum { NOT_IN = 0, PARTIAL = 1, COMPLETE = 2 };

    if (!valid_symbols[VIRTUAL_END_SECTION] || lexer->lookahead != 'i') {
        return NOT_IN;
    }
    skip(lexer);

    if (lexer->lookahead != 'n') {
        return PARTIAL;
    }
    skip(lexer);

    bool keyword_ends_here = is_elm_space(lexer) || lexer->eof(lexer);
    return keyword_ends_here ? COMPLETE : PARTIAL;
}
// Consumes a (possibly nested) block comment starting at the lookahead.
//
// Marks the token end at the current position first. Returns false when the
// input does not start with "{-" (a lone '{' is still consumed). Otherwise
// it advances until the matching "-}" has been consumed, or until the
// lookahead is NUL, and returns true.
//
// Nesting is tracked with a counter rather than recursion, so deeply nested
// comments cannot exhaust the call stack.
static bool scan_block_comment(TSLexer *lexer) {
    lexer->mark_end(lexer);
    if (lexer->lookahead != '{') {
        return false;
    }
    advance(lexer);
    if (lexer->lookahead != '-') {
        return false;
    }
    advance(lexer);

    // Number of "{-" openers still waiting for their "-}".
    size_t depth = 1;

    while (true) {
        switch (lexer->lookahead) {
            case '{':
                // Possible nested opener: mark the end before it, as every
                // comment entry does.
                lexer->mark_end(lexer);
                advance(lexer);
                if (lexer->lookahead == '-') {
                    advance(lexer);
                    depth++;
                }
                break;
            case '-':
                advance(lexer);
                if (lexer->lookahead == '}') {
                    advance(lexer);
                    depth--;
                    if (depth == 0) {
                        return true;
                    }
                }
                break;
            case '\0':
                // Unterminated comment: report what was consumed.
                return true;
            default:
                advance(lexer);
        }
    }
}
// Advances until the lookahead is a newline or the input ends. The newline
// itself is not consumed.
static void advance_to_line_end(TSLexer *lexer) {
    while (lexer->lookahead != '\n' && !lexer->eof(lexer)) {
        advance(lexer);
    }
}
// Core of the external scanner.
//
// Order of work:
//   1. Bail out during error recovery.
//   2. Emit a token queued in `runback` by an earlier call, if it is valid.
//   3. Skip whitespace and line comments, remembering whether a newline was
//      crossed and the column after it.
//   4. Recognise `in`, VIRTUAL_OPEN_SECTION and BLOCK_COMMENT_CONTENT.
//   5. After a newline, compare the new indentation with the indent stack
//      and queue VIRTUAL_END_DECL (0) / VIRTUAL_END_SECTION (1) tokens.
//   6. Recognise GLSL_CONTENT.
//
// Returns true when `lexer->result_symbol` has been set to a token.
//
// The base entry of the indent stack is never popped here, and the stack is
// never read while empty.
static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) {
    if (in_error_recovery(valid_symbols)) {
        return false;
    }

    // First handle eventual runback tokens, we saved on a previous scan op
    if (scanner->runback.len > 0 && VEC_BACK(scanner->runback) == 0 &&
        valid_symbols[VIRTUAL_END_DECL]) {
        VEC_POP(scanner->runback);
        lexer->result_symbol = VIRTUAL_END_DECL;
        return true;
    }
    if (scanner->runback.len > 0 && VEC_BACK(scanner->runback) == 1 &&
        valid_symbols[VIRTUAL_END_SECTION]) {
        VEC_POP(scanner->runback);
        lexer->result_symbol = VIRTUAL_END_SECTION;
        return true;
    }
    VEC_CLEAR(scanner->runback);

    // Check if we have newlines and how much indentation
    bool has_newline = false;
    bool found_in = false;
    bool can_call_mark_end = true;
    lexer->mark_end(lexer);
    while (true) {
        if (lexer->lookahead == ' ' || lexer->lookahead == '\r') {
            skip(lexer);
        } else if (lexer->lookahead == '\n') {
            skip(lexer);
            has_newline = true;
            while (true) {
                if (lexer->lookahead == ' ') {
                    skip(lexer);
                } else {
                    scanner->indent_length = lexer->get_column(lexer);
                    break;
                }
            }
        } else if (!valid_symbols[BLOCK_COMMENT_CONTENT] &&
                   lexer->lookahead == '-') {
            advance(lexer);
            int32_t lookahead = lexer->lookahead;

            // Handle minus without a whitespace for negate
            if (valid_symbols[MINUS_WITHOUT_TRAILING_WHITESPACE] &&
                ((lookahead >= 'a' && lookahead <= 'z') ||
                 (lookahead >= 'A' && lookahead <= 'Z') || lookahead == '(')) {
                if (can_call_mark_end) {
                    lexer->result_symbol = MINUS_WITHOUT_TRAILING_WHITESPACE;
                    lexer->mark_end(lexer);
                    return true;
                }
                return false;
            }

            // Scan past line comments. As far as the special token
            // types we're scanning for here are concerned line comments
            // are like whitespace. There is nothing useful to be
            // learned from, say, their indentation. So we advance past
            // them here.
            //
            // The one thing we need to keep in mind is that we should
            // not call `lexer->mark_end(lexer)` after this point, or
            // the comment will be lost.
            if (lookahead == '-' && has_newline) {
                can_call_mark_end = false;
                advance(lexer);
                advance_to_line_end(lexer);
            } else {
                return false;
            }
        } else if (lexer->eof(lexer)) {
            if (valid_symbols[VIRTUAL_END_SECTION]) {
                lexer->result_symbol = VIRTUAL_END_SECTION;
                return true;
            }
            if (valid_symbols[VIRTUAL_END_DECL]) {
                lexer->result_symbol = VIRTUAL_END_DECL;
                return true;
            }
            break;
        } else {
            break;
        }
    }

    if (checkForIn(lexer, valid_symbols) == 2) {
        if (has_newline) {
            found_in = true;
        } else {
            lexer->result_symbol = VIRTUAL_END_SECTION;
            // Leave the base entry in place: serialization skips index 0
            // and deserialization always restores it.
            if (scanner->indents.len > 1) {
                VEC_POP(scanner->indents);
            }
            return true;
        }
    }

    // Open section if the grammar lets us but only push to indent stack if
    // we go further down in the stack
    if (valid_symbols[VIRTUAL_OPEN_SECTION] && !lexer->eof(lexer)) {
        // The column is stored as a single byte (vec elements are uint8_t).
        VEC_PUSH(scanner->indents, lexer->get_column(lexer));
        lexer->result_symbol = VIRTUAL_OPEN_SECTION;
        return true;
    }

    if (valid_symbols[BLOCK_COMMENT_CONTENT]) {
        if (!can_call_mark_end) {
            return false;
        }
        lexer->mark_end(lexer);
        while (true) {
            if (lexer->lookahead == '\0') {
                break;
            }
            if (lexer->lookahead != '{' && lexer->lookahead != '-') {
                advance(lexer);
            } else if (lexer->lookahead == '-') {
                lexer->mark_end(lexer);
                advance(lexer);
                if (lexer->lookahead == '}') {
                    break;
                }
            } else if (scan_block_comment(lexer)) {
                lexer->mark_end(lexer);
                advance(lexer);
                if (lexer->lookahead == '-') {
                    break;
                }
            }
        }
        lexer->result_symbol = BLOCK_COMMENT_CONTENT;
        return true;
    }

    if (has_newline) {
        // We had a newline now it's time to check if we need to add
        // multiple tokens to get back up to the right level
        VEC_CLEAR(scanner->runback);

        while (scanner->indents.len > 0 &&
               scanner->indent_length <= VEC_BACK(scanner->indents)) {
            if (scanner->indent_length == VEC_BACK(scanner->indents)) {
                if (found_in) {
                    VEC_PUSH(scanner->runback, 1);
                    found_in = false;
                    break;
                }
                // Don't insert VIRTUAL_END_DECL when there is a line
                // comment incoming
                if (lexer->lookahead == '-') {
                    skip(lexer);
                    if (lexer->lookahead == '-') {
                        break;
                    }
                }
                // Don't insert VIRTUAL_END_DECL when there is a block
                // comment incoming
                if (lexer->lookahead == '{') {
                    skip(lexer);
                    if (lexer->lookahead == '-') {
                        break;
                    }
                }
                VEC_PUSH(scanner->runback, 0);
                break;
            }
            if (scanner->indent_length < VEC_BACK(scanner->indents)) {
                VEC_POP(scanner->indents);
                VEC_PUSH(scanner->runback, 1);
                found_in = false;
            }
        }

        // Needed for some of the more weird cases where let is in the same
        // line as everything before the in in the next line
        if (found_in) {
            VEC_PUSH(scanner->runback, 1);
            found_in = false;
        }

        // Our list is the wrong way around, reverse it
        VEC_REVERSE(scanner->runback);

        // Handle the first runback token if we have them, if there are more
        // they will be handled on the next scan operation
        if (scanner->runback.len > 0 && VEC_BACK(scanner->runback) == 0 &&
            valid_symbols[VIRTUAL_END_DECL]) {
            VEC_POP(scanner->runback);
            lexer->result_symbol = VIRTUAL_END_DECL;
            return true;
        }
        if (scanner->runback.len > 0 && VEC_BACK(scanner->runback) == 1 &&
            valid_symbols[VIRTUAL_END_SECTION]) {
            VEC_POP(scanner->runback);
            lexer->result_symbol = VIRTUAL_END_SECTION;
            return true;
        }
        if (lexer->eof(lexer) && valid_symbols[VIRTUAL_END_SECTION]) {
            lexer->result_symbol = VIRTUAL_END_SECTION;
            return true;
        }
    }

    if (valid_symbols[GLSL_CONTENT]) {
        if (!can_call_mark_end) {
            return false;
        }
        lexer->result_symbol = GLSL_CONTENT;
        while (true) {
            switch (lexer->lookahead) {
                case '|':
                    lexer->mark_end(lexer);
                    advance(lexer);
                    if (lexer->lookahead == ']') {
                        advance(lexer);
                        return true;
                    }
                    break;
                case '\0':
                    lexer->mark_end(lexer);
                    return true;
                default:
                    advance(lexer);
            }
        }
    }

    return false;
}
// --------------------------------------------------------------------------------------------------------
// API
// --------------------------------------------------------------------------------------------------------
/**
 * Allocates the persistent state of the scanner that is passed into the
 * other API functions.
 *
 * The state is zero-initialised, so both vectors start empty. Returns NULL
 * if the allocation fails.
 */
void *tree_sitter_elm_external_scanner_create(void) {
    Scanner *scanner = calloc(1, sizeof *scanner);
    return scanner;
}
/**
 * Main logic entry point.
 * `payload` is the Scanner allocated by the create function; it is cast back
 * and handed to scan() together with the lexer and the valid-token table.
 */
bool tree_sitter_elm_external_scanner_scan(void *payload, TSLexer *lexer,
                                           const bool *valid_symbols) {
    return scan((Scanner *)payload, lexer, valid_symbols);
}
/**
 * Copy the current state into `buffer` for later reuse.
 *
 * Layout written:
 *   [runback count: 1 byte][runback entries]
 *   [sizeof(indent_length): 1 byte][indent_length raw bytes]
 *   [indent stack entries from index 1 onwards, 1 byte each]
 *
 * Index 0 of the indent stack is not stored because deserialization always
 * re-creates it. Returns the number of bytes written, or 0 when the state is
 * too large for the serialization buffer.
 */
unsigned tree_sitter_elm_external_scanner_serialize(void *payload,
                                                    char *buffer) {
    Scanner *scanner = (Scanner *)payload;
    size_t size = 0;

    if (3 + scanner->indents.len + scanner->runback.len >=
        TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
        return 0;
    }

    // The count is stored in one byte, so at most UINT8_MAX entries are kept.
    size_t runback_count = scanner->runback.len;
    if (runback_count > UINT8_MAX) {
        runback_count = UINT8_MAX;
    }
    buffer[size++] = (char)runback_count;
    if (runback_count > 0) {
        memcpy(&buffer[size], scanner->runback.data, runback_count);
    }
    size += runback_count;

    buffer[size++] = (char)sizeof(scanner->indent_length);
    memcpy(&buffer[size], &scanner->indent_length,
           sizeof(scanner->indent_length));
    size += sizeof(scanner->indent_length);

    // `<` rather than `!=`: with an empty stack the loop must not run at
    // all, otherwise it would read far past the end of `indents.data`.
    for (uint32_t iter = 1;
         iter < scanner->indents.len &&
         size < TREE_SITTER_SERIALIZATION_BUFFER_SIZE;
         ++iter) {
        buffer[size++] = (char)scanner->indents.data[iter];
    }

    return (unsigned)size;
}
/**
 * Load another parser state into the currently active state.
 * `payload` is the state of the previous parser execution, while `buffer` is
 * the saved state of a different position (e.g. when doing incremental
 * parsing).
 *
 * The runback queue is emptied and the indent stack is reset to a single
 * base entry of 0 before anything is read. With `length == 0`, or when the
 * buffer does not have the layout written by the serialize function, the
 * state is left in that reset form.
 */
void tree_sitter_elm_external_scanner_deserialize(void *payload,
                                                  const char *buffer,
                                                  unsigned length) {
    Scanner *scanner = (Scanner *)payload;

    VEC_CLEAR(scanner->runback);
    VEC_CLEAR(scanner->indents);
    VEC_PUSH(scanner->indents, 0);

    if (length == 0) {
        return;
    }

    // Validate both length-prefixed fields before copying anything, so a
    // truncated or corrupt buffer can neither be over-read nor overflow
    // `indent_length`.
    size_t runback_count = (unsigned char)buffer[0];
    size_t indent_header_at = 1 + runback_count;
    if (indent_header_at >= length) {
        return;
    }
    size_t indent_length_length = (unsigned char)buffer[indent_header_at];
    if (indent_length_length > sizeof(scanner->indent_length) ||
        indent_length_length > length - indent_header_at - 1) {
        return;
    }

    size_t size = 1;
    if (runback_count > 0) {
        VEC_GROW(scanner->runback, runback_count);
        memcpy(scanner->runback.data, &buffer[size], runback_count);
        scanner->runback.len = (uint32_t)runback_count;
        size += runback_count;
    }

    size++;  // step over the indent_length size byte read above
    if (indent_length_length > 0) {
        memcpy(&scanner->indent_length, &buffer[size], indent_length_length);
        size += indent_length_length;
    }

    // Everything that remains is the indent stack above the base entry.
    for (; size < length; size++) {
        VEC_PUSH(scanner->indents, (unsigned char)buffer[size]);
    }

    assert(size == length);
}
/**
* Destroy the state.
*/
void tree_sitter_elm_external_scanner_destroy(void *payload) {
Scanner *scanner = (Scanner *)payload;
VEC_FREE(scanner->indents);
VEC_FREE(scanner->runback);
free(scanner);
}