// difftastic/vendored_parsers/tree-sitter-f-sharp/src/scanner.cc
//
// 373 lines
// 10 KiB
// C++

#include <tree_sitter/parser.h>
#include <vector>
#include <utility>
#include <cwctype>
#include <cstring>
#include <cassert>
#include <stdio.h>
#include <string>
#include <algorithm>
namespace {
using std::vector;
using std::string;
// External token kinds this scanner can produce. The numeric value of each
// enumerator is used as an index into `valid_symbols` and is written to
// `lexer->result_symbol`.
// NOTE(review): presumably the order must match the `externals` list of the
// grammar -- confirm against grammar.js before reordering.
enum TokenType {
  VIRTUAL_OPEN_SECTION = 0,   // start of an indentation-delimited section
  VIRTUAL_END_SECTION = 1,    // end of a section (dedent, closing bracket, EOF)
  VIRTUAL_END_ALIGNED = 2,    // separator between items at the same indentation
  BLOCK_COMMENT_CONTENT = 3,  // body of a block comment
};
// Returns true when all three virtual layout tokens are flagged as valid at
// the same time. The scanner treats that combination as the parser being in
// error recovery and refuses to emit anything in that state.
bool in_error_recovery(const bool *valid_symbols) {
  if (!valid_symbols[VIRTUAL_OPEN_SECTION]) {
    return false;
  }
  if (!valid_symbols[VIRTUAL_END_SECTION]) {
    return false;
  }
  return valid_symbols[VIRTUAL_END_ALIGNED];
}
struct Scanner {
Scanner() { }
unsigned serialize(char *buffer) {
size_t i = 0;
size_t runback_count = runback.size();
if (runback_count > UINT8_MAX)
runback_count = UINT8_MAX;
buffer[i++] = runback_count;
if (runback_count > 0)
{
memcpy(&buffer[i], runback.data(), runback_count);
}
i += runback_count;
size_t indent_length_length = sizeof(indent_length);
buffer[i++] = indent_length_length;
if (indent_length_length > 0)
{
memcpy(&buffer[i], &indent_length, indent_length_length);
}
i += indent_length_length;
vector<uint32_t>::iterator
iter = indent_length_stack.begin() + 1,
end = indent_length_stack.end();
for (; iter != end && i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; ++iter)
{
buffer[i++] = *iter;
}
return i;
}
void deserialize(const char *buffer, unsigned length) {
runback.clear();
indent_length_stack.clear();
indent_length_stack.push_back(0);
if (length > 0)
{
size_t i = 0;
size_t runback_count = (uint8_t)buffer[i++];
runback.resize(runback_count);
if (runback_count > 0)
{
memcpy(runback.data(), &buffer[i], runback_count);
}
i += runback_count;
size_t indent_length_length = buffer[i++];
if (indent_length_length > 0)
{
memcpy(&indent_length, &buffer[i], indent_length_length);
}
i += indent_length_length;
for (; i < length; i++)
{
indent_length_stack.push_back(buffer[i]);
}
}
}
void advance(TSLexer *lexer) {
lexer->advance(lexer, false);
}
void skip(TSLexer *lexer) {
lexer->advance(lexer, true);
}
bool isWS(TSLexer *lexer) {
return lexer->lookahead == ' ' || lexer->lookahead == '\r' || lexer->lookahead == '\n';
}
bool scan_block_comment(TSLexer *lexer) {
lexer->mark_end(lexer);
if (lexer->lookahead != '(')
return false;
advance(lexer);
if (lexer->lookahead != '*')
return false;
advance(lexer);
while (true) {
switch (lexer->lookahead) {
case '(':
scan_block_comment(lexer);
break;
case '*':
advance(lexer);
if (lexer->lookahead == ')') {
advance(lexer);
return true;
}
break;
case '\0':
return true;
default:
advance(lexer);
}
}
}
void advance_to_line_end(TSLexer *lexer) {
while (true) {
if (lexer->lookahead == '\n') {
break;
} else if (lexer->eof(lexer)) {
break;
} else {
advance(lexer);
}
}
}
bool scan(TSLexer *lexer, const bool *valid_symbols) {
if (in_error_recovery(valid_symbols))
return false;
// First handle eventual runback tokens, we saved on a previous scan op
if (!runback.empty() && runback.back() == 0 && valid_symbols[VIRTUAL_END_ALIGNED])
{
runback.pop_back();
lexer->result_symbol = VIRTUAL_END_ALIGNED;
return true;
}
if (!runback.empty() && runback.back() == 1 && valid_symbols[VIRTUAL_END_SECTION])
{
runback.pop_back();
lexer->result_symbol = VIRTUAL_END_SECTION;
return true;
}
runback.clear();
// Check if we have newlines and how much indentation
bool has_newline = false;
bool can_call_mark_end = true;
lexer->mark_end(lexer);
while (true) {
if (lexer->lookahead == ' ') {
skip(lexer);
}
else if (lexer->lookahead == '\n') {
skip(lexer);
has_newline = true;
while (true)
{
if (lexer->lookahead == ' ')
{
skip(lexer);
}
else
{
indent_length = lexer->get_column(lexer);
break;
}
}
}
else if (lexer->lookahead == '\r') {
skip(lexer);
}
else if (valid_symbols[VIRTUAL_END_ALIGNED] && lexer->lookahead == ';') {
advance(lexer);
lexer->mark_end(lexer);
lexer->result_symbol = VIRTUAL_END_ALIGNED;
return true;
}
else if (valid_symbols[VIRTUAL_END_SECTION] && lexer->lookahead == ')') {
lexer->result_symbol = VIRTUAL_END_SECTION;
indent_length_stack.pop_back();
return true;
}
else if (valid_symbols[VIRTUAL_END_SECTION] && lexer->lookahead == ']') {
lexer->result_symbol = VIRTUAL_END_SECTION;
indent_length_stack.pop_back();
return true;
}
else if (valid_symbols[VIRTUAL_END_SECTION] && lexer->lookahead == '}') {
lexer->result_symbol = VIRTUAL_END_SECTION;
indent_length_stack.pop_back();
return true;
}
else if (valid_symbols[VIRTUAL_END_SECTION] && lexer->lookahead == '|') {
skip(lexer);
if (lexer->lookahead == '}' || lexer->lookahead == ']') {
lexer->result_symbol = VIRTUAL_END_SECTION;
indent_length_stack.pop_back();
return true;
}
}
else if (lexer->eof(lexer)) {
if (valid_symbols[VIRTUAL_END_SECTION])
{
lexer->result_symbol = VIRTUAL_END_SECTION;
return true;
}
if (valid_symbols[VIRTUAL_END_ALIGNED])
{
lexer->result_symbol = VIRTUAL_END_ALIGNED;
return true;
}
break;
}
else { break; }
}
bool closing = lexer->lookahead == ']' || lexer->lookahead == ')' || lexer->lookahead == '}';
// Open section if the grammar lets us but only push to indent stack if we go further down in the stack
if (valid_symbols[VIRTUAL_OPEN_SECTION] && !lexer->eof(lexer)) {
indent_length_stack.push_back(lexer->get_column(lexer));
if (closing) {
return false;
}
if (lexer->lookahead == '|') {
skip(lexer);
if (lexer->lookahead == '}' || lexer->lookahead == ']') {
return false;
}
}
lexer->result_symbol = VIRTUAL_OPEN_SECTION;
return true;
}
else if (valid_symbols[BLOCK_COMMENT_CONTENT]) {
if (!can_call_mark_end) { return false; }
lexer->mark_end(lexer);
while (true) {
if (lexer->lookahead == '\0') { break; }
if (lexer->lookahead != '(' && lexer->lookahead != '*') {
advance(lexer);
}
else if (lexer->lookahead == '*') {
lexer->mark_end(lexer);
advance(lexer);
if (lexer->lookahead == ')')
{
break;
}
}
else if (scan_block_comment(lexer))
{
lexer->mark_end(lexer);
advance(lexer);
if (lexer->lookahead == '*')
{
break;
}
}
}
lexer->result_symbol = BLOCK_COMMENT_CONTENT;
return true;
}
else if (has_newline) {
// We had a newline now it's time to check if we need to add multiple tokens to get back up to the right level
runback.clear();
while (indent_length <= indent_length_stack.back()) {
if (indent_length == indent_length_stack.back()) {
// Don't insert VIRTUAL_END_DECL when there is a line comment incoming
if (lexer->lookahead == '/') {
skip(lexer);
if (lexer->lookahead == '/') { break; }
}
// Don't insert VIRTUAL_END_DECL when there is a block comment incoming
if (lexer->lookahead == '(') {
skip(lexer);
if (lexer->lookahead == '*') { break; }
}
runback.push_back(0);
break;
}
else if (indent_length < indent_length_stack.back()) {
indent_length_stack.pop_back();
runback.push_back(1);
}
}
// Our list is the wrong way around, reverse it
std::reverse(runback.begin(), runback.end());
// Handle the first runback token if we have them, if there are more they will be handled on the next scan operation
if (!runback.empty() && runback.back() == 0 && valid_symbols[VIRTUAL_END_ALIGNED]) {
runback.pop_back();
lexer->result_symbol = VIRTUAL_END_ALIGNED;
return true;
}
if (!runback.empty() && runback.back() == 1 && valid_symbols[VIRTUAL_END_SECTION]) {
runback.pop_back();
lexer->result_symbol = VIRTUAL_END_SECTION;
return true;
}
else if (lexer->eof(lexer) && valid_symbols[VIRTUAL_END_SECTION]) {
lexer->result_symbol = VIRTUAL_END_SECTION;
return true;
}
}
return false;
}
uint32_t indent_length;
vector<uint32_t> indent_length_stack;
vector<uint8_t> runback;
};
} // namespace end
extern "C" {
void *tree_sitter_fsharp_external_scanner_create() {
return new Scanner();
}
bool tree_sitter_fsharp_external_scanner_scan(void *payload, TSLexer *lexer,
const bool *valid_symbols) {
Scanner *scanner = static_cast<Scanner *>(payload);
return scanner->scan(lexer, valid_symbols);
}
unsigned tree_sitter_fsharp_external_scanner_serialize(void *payload, char *buffer) {
Scanner *scanner = static_cast<Scanner *>(payload);
return scanner->serialize(buffer);
}
void tree_sitter_fsharp_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
Scanner *scanner = static_cast<Scanner *>(payload);
scanner->deserialize(buffer, length);
}
// Deletes the Scanner that `payload` points to.
void tree_sitter_fsharp_external_scanner_destroy(void *payload) {
  delete static_cast<Scanner *>(payload);
}
}