From 222053dfd832047b5d7517f30aec8c1efbe37a75 Mon Sep 17 00:00:00 2001 From: ganezdragon Date: Sat, 10 Feb 2024 15:20:46 +0530 Subject: [PATCH] converting scanner.cc to scanner.c --- .vscode/settings.json | 3 +- Package.swift | 2 +- src/grammar.json | 4 +- src/scanner.c | 1066 +++++++++++++++++++++++++++++++++++++++++ src/scanner.cc | 785 ------------------------------ 5 files changed, 1070 insertions(+), 790 deletions(-) create mode 100644 src/scanner.c delete mode 100644 src/scanner.cc diff --git a/.vscode/settings.json b/.vscode/settings.json index 9c8e20426..6fcdfc4c5 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -67,6 +67,7 @@ "type_traits": "cpp", "typeinfo": "cpp", "unordered_map": "cpp", - "utility": "cpp" + "utility": "cpp", + "__memory": "c" } } \ No newline at end of file diff --git a/Package.swift b/Package.swift index 72e92694e..d4c0049d1 100644 --- a/Package.swift +++ b/Package.swift @@ -27,7 +27,7 @@ let package = Package( ], sources: [ "src/parser.c", - "src/scanner.cc", + "src/scanner.c", ], publicHeadersPath: "bindings/swift", cSettings: [.headerSearchPath("src")]) diff --git a/src/grammar.json b/src/grammar.json index 91d74afde..91edb74e9 100644 --- a/src/grammar.json +++ b/src/grammar.json @@ -8702,9 +8702,7 @@ "name": "_pod_content" } ], - "inline": [ - "semi_colon" - ], + "inline": [], "supertypes": [] } diff --git a/src/scanner.c b/src/scanner.c new file mode 100644 index 000000000..306cdad06 --- /dev/null +++ b/src/scanner.c @@ -0,0 +1,1066 @@ +#include + +#include +#include +#include +#include +#include + +// #include +// #include +// #include +// #include +// #include + +// using std::vector; +// using std::memcpy; +// using std::regex; + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +// String related macros +#define STRING_RESIZE(vec, _cap) \ + void *tmp = realloc((vec).data, ((_cap) + 1) * sizeof((vec).data[0])); \ + assert(tmp != NULL); \ + (vec).data = tmp; \ + memset((vec).data + (vec).len, 0, \ + (((_cap) + 1) - (vec).len) * sizeof((vec).data[0])); \ + (vec).cap = (_cap); + +#define STRING_GROW(vec, _cap) \ + if ((vec).cap < (_cap)) \ + { \ + STRING_RESIZE((vec), (_cap)); \ + } + +#define STRING_PUSH(vec, el) \ + if ((vec).cap == (vec).len) \ + { \ + STRING_RESIZE((vec), MAX(16, (vec).len * 2)); \ + } \ + (vec).data[(vec).len++] = (el); + +#define STRING_FREE(vec) \ + if ((vec).data != NULL) \ + free((vec).data); \ + (vec).data = NULL; + +#define STRING_CLEAR(vec) \ + { \ + (vec).len = 0; \ + memset((vec).data, 0, (vec).cap * sizeof(char)); \ + } + +#define MAX_QUEUE_SIZE 1000 + +enum TokenType +{ + START_DELIMITER, + END_DELIMITER, + STRING_CONTENT, + STRING_SINGLE_QUOTED_CONTENT, + STRING_QQ_QUOTED_CONTENT, + STRING_DOUBLE_QUOTED_CONTENT, + START_DELIMITER_QW, + ELEMENT_IN_QW, + END_DELIMITER_QW, + START_DELIMITER_REGEX, + REGEX_PATTERN, + END_DELIMITER_REGEX, + START_DELIMITER_SEARCH_REPLACE, + SEARCH_REPLACE_CONTENT, + SEPARATOR_DELIMITER_SEARCH_REPLACE, + END_DELIMITER_SEARCH_REPLACE, + START_DELIMITER_TRANSLITERATION, + TRANSLITERATION_CONTENT, + SEPARATOR_DELIMITER_TRANSLITERATION, + END_DELIMITER_TRANSLITERATION, + IMAGINARY_HEREDOC_START, + HEREDOC_START_IDENTIFIER, + HEREDOC_CONTENT, + HEREDOC_END_IDENTIFIER, + POD_CONTENT, +}; + +typedef struct +{ + uint32_t cap; + uint32_t len; + char *data; +} String; + +static String string_new() +{ + return (String){.cap = 16, .len = 0, .data = calloc(1, sizeof(char) * 17)}; +} + +// START OF --- a array implementation of STRING queue in C +typedef struct +{ + int front, rear, size; + unsigned capacity; + String *array; +} Queue; + +// function to create a queue +// of given capacity. +// It initializes size of queue as 0 +static Queue *createQueue(unsigned capacity) +{ + Queue *queue = malloc(sizeof(Queue)); + + queue->capacity = capacity; + queue->front = queue->size = 0; + + // This is important, see the enqueue + queue->rear = capacity - 1; + queue->array = (String *)malloc(queue->capacity * sizeof(String)); + return queue; +} + +// Queue is full when size becomes +// equal to the capacity +static int isFull(Queue *queue) +{ + return (queue->size == queue->capacity); +} + +// Queue is empty when size is 0 +int isEmpty(Queue *queue) +{ + return (queue->size == 0); +} + +// Function to add an item to the queue. +// It changes rear and size +static void enqueue(Queue *queue, String item) +{ + if (isFull(queue)) + return; + queue->rear = (queue->rear + 1) % queue->capacity; + queue->array[queue->rear] = item; + queue->size = queue->size + 1; +} + +// Function to remove an item from queue. +// It changes front and size +static String dequeue(Queue *queue) +{ + // if (isEmpty(queue)) + // return NULL; + String item = queue->array[queue->front]; + queue->front = (queue->front + 1) % queue->capacity; + queue->size = queue->size - 1; + return item; +} + +// Function to get front of queue +static String front(Queue *queue) +{ + // if (isEmpty(queue)) + // return CHAR_MIN; + return queue->array[queue->front]; +} + +// Function to get rear of queue +static String rear(Queue *queue) +{ + // if (isEmpty(queue)) + // return CHAR_MIN; + return queue->array[queue->rear]; +} + +// END OF --- a array implementation of STRING queue in C + +// START OF --- a array implementation of Boolean queue in C +typedef struct +{ + int front, rear, size; + unsigned capacity; + bool *array; +} BoolQueue; + +// function to create a queue +// of given capacity. +// It initializes size of queue as 0 +static BoolQueue *createBoolQueue(unsigned capacity) +{ + BoolQueue *queue = malloc(sizeof(BoolQueue)); + + queue->capacity = capacity; + queue->front = queue->size = 0; + + // This is important, see the enqueue + queue->rear = capacity - 1; + queue->array = (bool *)malloc(queue->capacity * sizeof(bool)); + return queue; +} + +// BoolQueue is full when size becomes +// equal to the capacity +static int isBoolQueueFull(BoolQueue *queue) +{ + return (queue->size == queue->capacity); +} + +// BoolQueue is empty when size is 0 +int isBoolQueueEmpty(BoolQueue *queue) +{ + return (queue->size == 0); +} + +// Function to add an item to the queue. +// It changes rear and size +static void enqueueBoolQueue(BoolQueue *queue, bool item) +{ + if (isFull(queue)) + return; + queue->rear = (queue->rear + 1) % queue->capacity; + queue->array[queue->rear] = item; + queue->size = queue->size + 1; +} + +// Function to remove an item from queue. +// It changes front and size +static bool dequeueBoolQueue(BoolQueue *queue) +{ + if (isEmpty(queue)) + return NULL; + bool item = queue->array[queue->front]; + queue->front = (queue->front + 1) % queue->capacity; + queue->size = queue->size - 1; + return item; +} + +// Function to get front of queue +static bool frontBoolQueue(BoolQueue *queue) +{ + // if (isEmpty(queue)) + // return CHAR_MIN; + return queue->array[queue->front]; +} + +// Function to get rear of queue +static bool rearBoolQueue(BoolQueue *queue) +{ + // if (isEmpty(queue)) + // return CHAR_MIN; + return queue->array[queue->rear]; +} + +// END OF --- a array implementation of Boolean queue in C + +typedef struct +{ + bool started_heredoc; + bool started_heredoc_body; + Queue *heredoc_identifier_queue; + BoolQueue *heredoc_allows_interpolation; + BoolQueue *heredoc_allows_indent; +} Heredoc; + +static Heredoc heredoc_new() +{ + Heredoc heredoc = { + .started_heredoc = false, + .started_heredoc_body = false, + .heredoc_identifier_queue = createQueue(MAX_QUEUE_SIZE), + .heredoc_allows_interpolation = createQueue(MAX_QUEUE_SIZE), + .heredoc_allows_indent = createQueue(MAX_QUEUE_SIZE), + }; + return heredoc; +} + +typedef struct +{ + int32_t start_delimiter_char; + int32_t end_delimiter_char; + bool is_separator_delimiter_parsed; + bool is_delimiter_enclosing; // is the delimiter {}, <> and same character not //, !! + Heredoc heredoc; +} Scanner; + +static void advance(TSLexer *lexer) +{ + lexer->advance(lexer, false); +} + +static void skip(TSLexer *lexer) +{ + lexer->advance(lexer, true); +} + +int iswspace(wint_t wc); + +// runs over spaces like a champ +static void run_over_spaces(TSLexer *lexer) +{ + while (iswspace(lexer->lookahead)) + skip(lexer); +} + +// runs with the spaces using advance +static void run_with_spaces(TSLexer *lexer) +{ + while (iswspace(lexer->lookahead)) + advance(lexer); +} + +static int32_t get_end_delimiter(Scanner *scanner) +{ + return scanner->end_delimiter_char; +} + +static bool handle_interpolation(Scanner *scanner, TSLexer *lexer, enum TokenType surrounding_token) +{ + if (lexer->lookahead == '$') + { + + // allow $ to be last character in a regex + if (surrounding_token == SEARCH_REPLACE_CONTENT || surrounding_token == REGEX_PATTERN) + { + advance(lexer); + run_with_spaces(lexer); + if (lexer->lookahead == get_end_delimiter(scanner)) + { + lexer->result_symbol = surrounding_token; + lexer->mark_end(lexer); + return true; + } + } + return false; + } + + return false; +} + +static bool handle_escape_sequence(TSLexer *lexer, enum TokenType surrounding_token) +{ + // escape sequences, only basic support as of now + if (lexer->lookahead == '\\') + { + advance(lexer); + // also, self end delimiter will be treated as string + if ( + lexer->lookahead == 't' || lexer->lookahead == 'n' || lexer->lookahead == 'r' || lexer->lookahead == 'f' || lexer->lookahead == 'b' || lexer->lookahead == 'a' || lexer->lookahead == 'e') + { + // advance(lexer); + lexer->mark_end(lexer); + return false; + } + else + { + lexer->result_symbol = surrounding_token; + advance(lexer); + lexer->mark_end(lexer); + return true; + } + return false; + } + return false; +} + +static bool scan_nested_delimiters(Scanner *scanner, TSLexer *lexer, enum TokenType token_type) +{ + while (lexer->lookahead) + { + if (lexer->lookahead == get_end_delimiter(scanner)) + { + lexer->result_symbol = token_type; + advance(lexer); + lexer->mark_end(lexer); + return true; + } + else if (lexer->lookahead == scanner->start_delimiter_char) + { + lexer->result_symbol = token_type; + advance(lexer); + scan_nested_delimiters(scanner, lexer, token_type); + } + else if (lexer->lookahead == '\\') + { + advance(lexer); + advance(lexer); + } + else + { + advance(lexer); + } + } + lexer->mark_end(lexer); + return false; +} + +static bool parse_delimited_and_interpolated_content(Scanner *scanner, TSLexer *lexer, enum TokenType token_type, enum TokenType ending_delimiter) +{ + if (lexer->lookahead == get_end_delimiter(scanner)) + { + lexer->result_symbol = ending_delimiter; + advance(lexer); + lexer->mark_end(lexer); + return true; + } + else + { + // oh boy! the interpolation + if (lexer->lookahead == '$') + { + return handle_interpolation(scanner, lexer, token_type); + } + // escape sequences, only basic support as of now + if (lexer->lookahead == '\\') + { + return handle_escape_sequence(lexer, token_type); + } + + if (!lexer->lookahead) + { + lexer->mark_end(lexer); + return false; + } + + // handling nested delimiters qq { hello { from { the}}}; + if (lexer->lookahead == scanner->start_delimiter_char) + { + lexer->result_symbol = token_type; + advance(lexer); + return scan_nested_delimiters(scanner, lexer, token_type); + } + + lexer->result_symbol = token_type; + advance(lexer); + lexer->mark_end(lexer); + return true; + } + + // shouldn't reach here + return false; +} + +static void set_end_delimiter(Scanner *scanner, int32_t start_delimiter) +{ + // round, angle, square, curly + scanner->is_delimiter_enclosing = true; + if (start_delimiter == '(') + { + scanner->end_delimiter_char = ')'; + } + else if (start_delimiter == '<') + { + scanner->end_delimiter_char = '>'; + } + else if (start_delimiter == '[') + { + scanner->end_delimiter_char = ']'; + } + else if (start_delimiter == '{') + { + scanner->end_delimiter_char = '}'; + } + else + { + scanner->is_delimiter_enclosing = false; + scanner->end_delimiter_char = start_delimiter; + } +} + +static bool process_separator_delimiter(Scanner *scanner, TSLexer *lexer, enum TokenType separator_token, enum TokenType end_token) +{ + if (scanner->is_separator_delimiter_parsed) + { + lexer->result_symbol = end_token; + advance(lexer); + lexer->mark_end(lexer); + return true; + } + else + { + lexer->result_symbol = separator_token; + advance(lexer); + lexer->mark_end(lexer); + + // if delimiter is {}, (), <>, [] + if (scanner->is_delimiter_enclosing) + { + run_over_spaces(lexer); + + if (lexer->lookahead == scanner->start_delimiter_char) + { + lexer->result_symbol = separator_token; + advance(lexer); + lexer->mark_end(lexer); + + scanner->is_separator_delimiter_parsed = true; + + return true; + } + + return false; + } + else + { + scanner->is_separator_delimiter_parsed = true; + + return true; + } + + return false; + } +} + +// Give a token type, parses the start delimiter, +// and keeps track of it in memory. +static bool parse_start_delimiter(Scanner *scanner, TSLexer *lexer, enum TokenType token_type) +{ + run_over_spaces(lexer); + + scanner->start_delimiter_char = lexer->lookahead; + set_end_delimiter(scanner, scanner->start_delimiter_char); + + // for substitute and tr/y usecase + scanner->is_separator_delimiter_parsed = false; + + lexer->result_symbol = token_type; + advance(lexer); + lexer->mark_end(lexer); + + return true; +} + +/** + * Consume a "word" in POSIX parlance, and returns it unquoted. + * + * This is an approximate implementation that doesn't deal with any + * POSIX-mandated substitution, and assumes the default value for + * IFS. + */ +bool advance_word(Scanner *scanner, TSLexer *lexer, String *unquoted_word, bool *allows_interpolation) +{ + bool empty = true; + bool has_space_before = false; + *allows_interpolation = true; + + // <<~EOF + if (lexer->lookahead == '~') + { + enqueueBoolQueue(scanner->heredoc.heredoc_allows_indent, true); + advance(lexer); + } + else + { + enqueueBoolQueue(scanner->heredoc.heredoc_allows_indent, false); + } + + // <<\EOF, <<~\EOF + if (lexer->lookahead == '\\') + { + *allows_interpolation = false; + advance(lexer); + } + + // run over the spaces + if (iswspace(lexer->lookahead)) + { + run_over_spaces(lexer); + has_space_before = true; + } + + int32_t quote = 0; + if ( + lexer->lookahead == '\'' || lexer->lookahead == '"' || lexer->lookahead == '`') + { + *allows_interpolation = (lexer->lookahead == '\'') ? false : true; + quote = lexer->lookahead; + advance(lexer); + } + else if (has_space_before) + { + return false; + } + + regex_t regex; + // compile the regex expression + regcomp(®ex, "[a-zA-Z0-9]", 0); + while ( + lexer->lookahead && !regexec(®ex, (char*)lexer->lookahead, 0, NULL, 0) + // && std::regex_match(std::string(1, static_cast(lexer->lookahead)), identifier_regex) + && !(quote ? lexer->lookahead == quote : iswspace(lexer->lookahead))) + { + // TODO: check this below condition + if (lexer->lookahead == '\\') + { + advance(lexer); + if (!lexer->lookahead) + return false; + } + empty = false; + STRING_PUSH(*unquoted_word, lexer->lookahead); + advance(lexer); + } + + // free regex memory + regfree(®ex); + + if (quote && lexer->lookahead == quote) + { + advance(lexer); + } + + return !empty; +} + +bool exit_if_heredoc_end_delimiter(Scanner *scanner, TSLexer *lexer) +{ + String word = string_new(); + // lexer->result_symbol = HEREDOC_END_IDENTIFIER; + while (!iswspace(lexer->lookahead)) + { + // printf("string here - %c", lexer->lookahead); + STRING_PUSH(word, lexer->lookahead); + advance(lexer); + + if (!lexer->lookahead) + { + break; + } + } + + if (word.data == front(scanner->heredoc.heredoc_identifier_queue).data) + { + // if (1) { + lexer->result_symbol = HEREDOC_END_IDENTIFIER; + lexer->mark_end(lexer); + + // unset stuffs + scanner->heredoc.started_heredoc = false; + scanner->heredoc.started_heredoc_body = false; + dequeue(scanner->heredoc.heredoc_identifier_queue); + dequeue(scanner->heredoc.heredoc_allows_interpolation); + return true; + } + else + { + lexer->result_symbol = HEREDOC_CONTENT; + return true; + } +} + +static inline bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) +{ + // on ERROR, external scanner is called with all valid_symbols to be true. + // so for our usecase we need this logic. + // ref - https://github.com/tree-sitter/tree-sitter/issues/1128 + if ( + valid_symbols[START_DELIMITER] && valid_symbols[END_DELIMITER] && valid_symbols[STRING_CONTENT] && valid_symbols[STRING_SINGLE_QUOTED_CONTENT] && valid_symbols[STRING_QQ_QUOTED_CONTENT] && valid_symbols[STRING_DOUBLE_QUOTED_CONTENT] && valid_symbols[START_DELIMITER_QW] && valid_symbols[END_DELIMITER_QW] && valid_symbols[START_DELIMITER_REGEX] && valid_symbols[REGEX_PATTERN] && valid_symbols[END_DELIMITER_REGEX] && valid_symbols[START_DELIMITER_SEARCH_REPLACE] && valid_symbols[SEARCH_REPLACE_CONTENT] && valid_symbols[SEPARATOR_DELIMITER_SEARCH_REPLACE] && valid_symbols[END_DELIMITER_SEARCH_REPLACE] && valid_symbols[START_DELIMITER_TRANSLITERATION] && valid_symbols[TRANSLITERATION_CONTENT] && valid_symbols[SEPARATOR_DELIMITER_TRANSLITERATION] && valid_symbols[END_DELIMITER_TRANSLITERATION] && valid_symbols[IMAGINARY_HEREDOC_START] && valid_symbols[HEREDOC_START_IDENTIFIER] && valid_symbols[HEREDOC_CONTENT] && valid_symbols[HEREDOC_END_IDENTIFIER] && valid_symbols[POD_CONTENT]) + { + return false; + } + + if (valid_symbols[STRING_SINGLE_QUOTED_CONTENT]) + { + + // end when you reach the final single quote ' + if (lexer->lookahead == '\'') + { + lexer->mark_end(lexer); + advance(lexer); + return false; + } + // check for escaped single quote \' + else if (lexer->lookahead == '\\') + { + lexer->result_symbol = STRING_SINGLE_QUOTED_CONTENT; + advance(lexer); + + if (lexer->lookahead == '\'') + { + advance(lexer); + } + lexer->mark_end(lexer); + return true; + } + + // some exit conditions + if (!lexer->lookahead) + { + lexer->mark_end(lexer); + return false; + } + + lexer->result_symbol = STRING_SINGLE_QUOTED_CONTENT; + advance(lexer); + lexer->mark_end(lexer); + + return true; + } + + // TODO: handle qqqSTRINGq; - this should throw error + if (valid_symbols[START_DELIMITER]) + { + return parse_start_delimiter(scanner, lexer, START_DELIMITER); + } + + if (valid_symbols[STRING_QQ_QUOTED_CONTENT]) + { + return parse_delimited_and_interpolated_content(scanner, lexer, STRING_QQ_QUOTED_CONTENT, END_DELIMITER); + } + + if (valid_symbols[STRING_DOUBLE_QUOTED_CONTENT]) + { + if (lexer->lookahead == '"') + { + lexer->mark_end(lexer); + advance(lexer); + return false; + } + + // oh boy! the interpolation + if (lexer->lookahead == '$') + { + return handle_interpolation(scanner, lexer, STRING_DOUBLE_QUOTED_CONTENT); + } + // escape sequences, only basic support as of now + if (lexer->lookahead == '\\') + { + return handle_escape_sequence(lexer, STRING_DOUBLE_QUOTED_CONTENT); + } + + // some exit conditions + if (!lexer->lookahead) + { + lexer->mark_end(lexer); + return false; + } + + lexer->result_symbol = STRING_DOUBLE_QUOTED_CONTENT; + advance(lexer); + lexer->mark_end(lexer); + return true; + } + + if (valid_symbols[START_DELIMITER_QW]) + { + return parse_start_delimiter(scanner, lexer, START_DELIMITER_QW); + } + + if (valid_symbols[ELEMENT_IN_QW]) + { + run_over_spaces(lexer); + + if (lexer->lookahead == get_end_delimiter(scanner)) + { + lexer->result_symbol = END_DELIMITER_QW; + advance(lexer); + lexer->mark_end(lexer); + return true; + } + + // exit condition + if (!lexer->lookahead) + { + lexer->mark_end(lexer); + return false; + } + + while ( + lexer->lookahead // exit condition + && lexer->lookahead != ' ' && lexer->lookahead != '\t' && lexer->lookahead != '\r' && lexer->lookahead != '\n' && lexer->lookahead != get_end_delimiter(scanner)) + { + lexer->result_symbol = ELEMENT_IN_QW; + advance(lexer); + } + + lexer->mark_end(lexer); + return true; + } + + if (valid_symbols[START_DELIMITER_REGEX]) + { + return parse_start_delimiter(scanner, lexer, START_DELIMITER_REGEX); + } + if (valid_symbols[REGEX_PATTERN]) + { + return parse_delimited_and_interpolated_content(scanner, lexer, REGEX_PATTERN, END_DELIMITER_REGEX); + } + + if (valid_symbols[START_DELIMITER_SEARCH_REPLACE]) + { + return parse_start_delimiter(scanner, lexer, START_DELIMITER_SEARCH_REPLACE); + } + + if (valid_symbols[SEARCH_REPLACE_CONTENT]) + { + if (lexer->lookahead == get_end_delimiter(scanner)) + { + return process_separator_delimiter(scanner, lexer, SEPARATOR_DELIMITER_SEARCH_REPLACE, END_DELIMITER_SEARCH_REPLACE); + } + else + { + // oh boy! the interpolation + if (lexer->lookahead == '$') + { + return handle_interpolation(scanner, lexer, SEARCH_REPLACE_CONTENT); + } + // escape sequences, only basic support as of now + if (lexer->lookahead == '\\') + { + return handle_escape_sequence(lexer, SEARCH_REPLACE_CONTENT); + } + + // some exit conditions + if (!lexer->lookahead) + { + lexer->mark_end(lexer); + return false; + } + + // handling nested delimiters qq { hello { from { the}}}; + if (lexer->lookahead == scanner->start_delimiter_char) + { + lexer->result_symbol = SEARCH_REPLACE_CONTENT; + advance(lexer); + return scan_nested_delimiters(scanner, lexer, SEARCH_REPLACE_CONTENT); + } + + lexer->result_symbol = SEARCH_REPLACE_CONTENT; + advance(lexer); + return true; + } + } + + if (valid_symbols[START_DELIMITER_TRANSLITERATION]) + { + return parse_start_delimiter(scanner, lexer, START_DELIMITER_TRANSLITERATION); + } + if (valid_symbols[TRANSLITERATION_CONTENT]) + { + if (lexer->lookahead == get_end_delimiter(scanner)) + { + return process_separator_delimiter(scanner, lexer, SEPARATOR_DELIMITER_TRANSLITERATION, END_DELIMITER_TRANSLITERATION); + } + + // exit condition + if (!lexer->lookahead) + { + lexer->mark_end(lexer); + return false; + } + + // escape sequence + if (lexer->lookahead == '\\') + { + lexer->result_symbol = TRANSLITERATION_CONTENT; + advance(lexer); + // self end delimiter + if (lexer->lookahead == get_end_delimiter(scanner)) + { + advance(lexer); + } + + lexer->mark_end(lexer); + return true; + } + + // handling nested delimiters qq { hello { from { the}}}; + if (lexer->lookahead == scanner->start_delimiter_char) + { + lexer->result_symbol = TRANSLITERATION_CONTENT; + advance(lexer); + return scan_nested_delimiters(scanner, lexer, TRANSLITERATION_CONTENT); + } + + lexer->result_symbol = TRANSLITERATION_CONTENT; + advance(lexer); + lexer->mark_end(lexer); + return true; + } + + if (valid_symbols[HEREDOC_START_IDENTIFIER]) + { + lexer->result_symbol = HEREDOC_START_IDENTIFIER; + + String delimiter = string_new(); + bool *allows_interpolation; + bool found_delimiter = advance_word(scanner, lexer, &delimiter, allows_interpolation); + if (found_delimiter) + { + enqueue(scanner->heredoc.heredoc_identifier_queue, delimiter); + enqueueBoolQueue(scanner->heredoc.heredoc_allows_interpolation, allows_interpolation); + + scanner->heredoc.started_heredoc = true; + } + + return found_delimiter; + } + + if ( + (valid_symbols[HEREDOC_CONTENT] || valid_symbols[IMAGINARY_HEREDOC_START]) && !isBoolQueueEmpty(scanner->heredoc.heredoc_identifier_queue)) + { + // another exit condition + if (!lexer->lookahead && !scanner->heredoc.started_heredoc_body) + { + return false; + } + + if (lexer->lookahead == '\n' && !scanner->heredoc.started_heredoc_body) + { + scanner->heredoc.started_heredoc_body = true; + + lexer->result_symbol = IMAGINARY_HEREDOC_START; + lexer->mark_end(lexer); + return true; + } + + if (scanner->heredoc.started_heredoc_body) + { + switch (lexer->lookahead) + { + case '\\': + { + if (frontBoolQueue(scanner->heredoc.heredoc_allows_interpolation)) + { + return handle_escape_sequence(lexer, HEREDOC_CONTENT); + } + } + + case '$': + { + if (frontBoolQueue(scanner->heredoc.heredoc_allows_interpolation)) + { + return false; + } + } + + case '\n': + { + skip(lexer); + lexer->mark_end(lexer); + // TODO: validate all possible intended heredocs properly + if (frontBoolQueue(scanner->heredoc.heredoc_allows_indent)) + { + while (iswspace(lexer->lookahead)) + { + advance(lexer); + } + } + return exit_if_heredoc_end_delimiter(scanner, lexer); + } + + default: + { + // exit condition + if (!lexer->lookahead) + { + scanner->heredoc.started_heredoc_body = false; + lexer->mark_end(lexer); + return false; + } + lexer->result_symbol = HEREDOC_CONTENT; + advance(lexer); + return true; + } + } + } + else + { + return false; + } + } + + if (valid_symbols[POD_CONTENT]) + { + + while (lexer->lookahead) + { + lexer->result_symbol = POD_CONTENT; + + // if it is =cut that marks the end of pod content + if (lexer->lookahead == '=') + { + lexer->advance(lexer, false); + if (lexer->lookahead == 'c') + { + lexer->advance(lexer, false); + if (lexer->lookahead == 'u') + { + lexer->advance(lexer, false); + if (lexer->lookahead == 't') + { + lexer->advance(lexer, false); + lexer->mark_end(lexer); + return true; + } + } + } + } + else + { + lexer->advance(lexer, false); + } + } + + // or if it end of the file also, mark the end of pod content + lexer->mark_end(lexer); + return true; + } + + return false; +} + +static unsigned serialize(Scanner *scanner, char *buffer) +{ + uint32_t size = 0; + + return size; +} + +static void deserialize(Scanner *scanner, const char *buffer, unsigned length) +{ +} + +void *tree_sitter_perl_external_scanner_create() +{ + Scanner *scanner = malloc(sizeof(Scanner)); + scanner->heredoc = heredoc_new(); + return scanner; +} + +unsigned tree_sitter_perl_external_scanner_serialize( + void *payload, + char *buffer) +{ + Scanner *scanner = (Scanner *)payload; + return serialize(scanner, buffer); +} + +void tree_sitter_perl_external_scanner_deserialize( + void *payload, + const char *buffer, + unsigned length) +{ + Scanner *scanner = (Scanner *)payload; + deserialize(scanner, buffer, length); // TODO: need to deserialize heredoc +} + +bool tree_sitter_perl_external_scanner_scan( + void *payload, + TSLexer *lexer, + const bool *valid_symbols) +{ + Scanner *scanner = (Scanner *)payload; + return scan(scanner, lexer, valid_symbols); +} + +void tree_sitter_perl_external_scanner_destroy(void *payload) +{ + Scanner *scanner = (Scanner *)payload; + // for (size_t i = 0; i < scanner->heredocs.len; i++) { + // Heredoc *heredoc = &scanner->heredocs.data[i]; + // STRING_FREE(heredoc->current_leading_word); + // STRING_FREE(heredoc->delimiter); + // } + // VEC_FREE(scanner->heredocs); + free(scanner); +} diff --git a/src/scanner.cc b/src/scanner.cc deleted file mode 100644 index 77f8427d5..000000000 --- a/src/scanner.cc +++ /dev/null @@ -1,785 +0,0 @@ -#include -#include -#include -#include -#include -#include - -namespace { - - using std::vector; - using std::memcpy; - using std::regex; - - enum TokenType { - START_DELIMITER, - END_DELIMITER, - STRING_CONTENT, - STRING_SINGLE_QUOTED_CONTENT, - STRING_QQ_QUOTED_CONTENT, - STRING_DOUBLE_QUOTED_CONTENT, - START_DELIMITER_QW, - ELEMENT_IN_QW, - END_DELIMITER_QW, - START_DELIMITER_REGEX, - REGEX_PATTERN, - END_DELIMITER_REGEX, - START_DELIMITER_SEARCH_REPLACE, - SEARCH_REPLACE_CONTENT, - SEPARATOR_DELIMITER_SEARCH_REPLACE, - END_DELIMITER_SEARCH_REPLACE, - START_DELIMITER_TRANSLITERATION, - TRANSLITERATION_CONTENT, - SEPARATOR_DELIMITER_TRANSLITERATION, - END_DELIMITER_TRANSLITERATION, - IMAGINARY_HEREDOC_START, - HEREDOC_START_IDENTIFIER, - HEREDOC_CONTENT, - HEREDOC_END_IDENTIFIER, - POD_CONTENT, - }; - - struct Delimiter { - - int32_t get_end_delimiter() { - return end_delimiter; - } - - int32_t end_delimiter; - }; - - struct Scanner { - Scanner() { - // assert(sizeof(Delimiter) == sizeof(char)); - deserialize(NULL, 0); - } - - unsigned serialize(char *buffer) { - size_t no_of_bytes = 0; - - // size_t delimiter_count = delimiter_stack.size(); - // if (delimiter_count > UINT8_MAX) delimiter_count = UINT8_MAX; - // buffer[no_of_bytes++] = delimiter_count; - - // if (delimiter_count > 0) { - // memcpy(&buffer[no_of_bytes], delimiter_stack.data(), delimiter_count); - // } - // no_of_bytes += delimiter_count; - - return no_of_bytes; - } - - void deserialize(const char *buffer, unsigned length) { - // delimiter_stack.clear(); - - // if (length > 0) { - // size_t no_of_bytes = 0; - - // size_t delimiter_count = (uint8_t)buffer[no_of_bytes++]; - // delimiter_stack.resize(delimiter_count); - // if (delimiter_count > 0) { - // memcpy(delimiter_stack.data(), &buffer[no_of_bytes], delimiter_count); - // } - // no_of_bytes += delimiter_count; - // } - } - - bool scan(TSLexer *lexer, const bool *valid_symbols) { - // on ERROR, external scanner is called with all valid_symbols to be true. - // so for our usecase we need this logic. - // ref - https://github.com/tree-sitter/tree-sitter/issues/1128 - if ( - valid_symbols[START_DELIMITER] - && valid_symbols[END_DELIMITER] - && valid_symbols[STRING_CONTENT] - && valid_symbols[STRING_SINGLE_QUOTED_CONTENT] - && valid_symbols[STRING_QQ_QUOTED_CONTENT] - && valid_symbols[STRING_DOUBLE_QUOTED_CONTENT] - && valid_symbols[START_DELIMITER_QW] - && valid_symbols[END_DELIMITER_QW] - && valid_symbols[START_DELIMITER_REGEX] - && valid_symbols[REGEX_PATTERN] - && valid_symbols[END_DELIMITER_REGEX] - && valid_symbols[START_DELIMITER_SEARCH_REPLACE] - && valid_symbols[SEARCH_REPLACE_CONTENT] - && valid_symbols[SEPARATOR_DELIMITER_SEARCH_REPLACE] - && valid_symbols[END_DELIMITER_SEARCH_REPLACE] - && valid_symbols[START_DELIMITER_TRANSLITERATION] - && valid_symbols[TRANSLITERATION_CONTENT] - && valid_symbols[SEPARATOR_DELIMITER_TRANSLITERATION] - && valid_symbols[END_DELIMITER_TRANSLITERATION] - && valid_symbols[IMAGINARY_HEREDOC_START] - && valid_symbols[HEREDOC_START_IDENTIFIER] - && valid_symbols[HEREDOC_CONTENT] - && valid_symbols[HEREDOC_END_IDENTIFIER] - && valid_symbols[POD_CONTENT] - ) { - return false; - } - - if (valid_symbols[STRING_SINGLE_QUOTED_CONTENT]) { - - // end when you reach the final single quote ' - if (lexer->lookahead == '\'') { - lexer->mark_end(lexer); - advance(lexer); - return false; - } - // check for escaped single quote \' - else if (lexer->lookahead == '\\') { - lexer->result_symbol = STRING_SINGLE_QUOTED_CONTENT; - advance(lexer); - - if (lexer->lookahead == '\'') { - advance(lexer); - } - lexer->mark_end(lexer); - return true; - } - - // some exit conditions - if (!lexer->lookahead) { - lexer->mark_end(lexer); - return false; - } - - lexer->result_symbol = STRING_SINGLE_QUOTED_CONTENT; - advance(lexer); - lexer->mark_end(lexer); - - return true; - } - - // TODO: handle qqqSTRINGq; - this should throw error - if (valid_symbols[START_DELIMITER]) { - return parse_start_delimiter(lexer, START_DELIMITER); - } - - if (valid_symbols[STRING_QQ_QUOTED_CONTENT]) { - return parse_delimited_and_interpolated_content(lexer, STRING_QQ_QUOTED_CONTENT, END_DELIMITER); - } - - if (valid_symbols[STRING_DOUBLE_QUOTED_CONTENT]) { - if (lexer->lookahead == '"') { - lexer->mark_end(lexer); - advance(lexer); - return false; - } - - // oh boy! the interpolation - if (lexer->lookahead == '$') { - return handle_interpolation(lexer, STRING_DOUBLE_QUOTED_CONTENT); - } - // escape sequences, only basic support as of now - if (lexer->lookahead == '\\') { - return handle_escape_sequence(lexer, STRING_DOUBLE_QUOTED_CONTENT); - } - - // some exit conditions - if (!lexer->lookahead) { - lexer->mark_end(lexer); - return false; - } - - lexer->result_symbol = STRING_DOUBLE_QUOTED_CONTENT; - advance(lexer); - lexer->mark_end(lexer); - return true; - } - - if (valid_symbols[START_DELIMITER_QW]) { - return parse_start_delimiter(lexer, START_DELIMITER_QW); - } - - if (valid_symbols[ELEMENT_IN_QW]) { - run_over_spaces(lexer); - - if (lexer->lookahead == get_end_delimiter()) { - lexer->result_symbol = END_DELIMITER_QW; - advance(lexer); - lexer->mark_end(lexer); - return true; - } - - // exit condition - if (!lexer->lookahead) { - lexer->mark_end(lexer); - return false; - } - - while ( - lexer->lookahead // exit condition - && lexer->lookahead != ' ' - && lexer->lookahead != '\t' - && lexer->lookahead != '\r' - && lexer->lookahead != '\n' - && lexer->lookahead != get_end_delimiter() - ) { - lexer->result_symbol = ELEMENT_IN_QW; - advance(lexer); - } - - lexer->mark_end(lexer); - return true; - } - - if (valid_symbols[START_DELIMITER_REGEX]) { - return parse_start_delimiter(lexer, START_DELIMITER_REGEX); - } - if (valid_symbols[REGEX_PATTERN]) { - return parse_delimited_and_interpolated_content(lexer, REGEX_PATTERN, END_DELIMITER_REGEX); - } - - if (valid_symbols[START_DELIMITER_SEARCH_REPLACE]) { - return parse_start_delimiter(lexer, START_DELIMITER_SEARCH_REPLACE); - } - - if (valid_symbols[SEARCH_REPLACE_CONTENT]) { - if (lexer->lookahead == get_end_delimiter()) { - return process_separator_delimiter(lexer, SEPARATOR_DELIMITER_SEARCH_REPLACE, END_DELIMITER_SEARCH_REPLACE); - } - else { - // oh boy! the interpolation - if (lexer->lookahead == '$') { - return handle_interpolation(lexer, SEARCH_REPLACE_CONTENT); - } - // escape sequences, only basic support as of now - if (lexer->lookahead == '\\') { - return handle_escape_sequence(lexer, SEARCH_REPLACE_CONTENT); - } - - // some exit conditions - if (!lexer->lookahead) { - lexer->mark_end(lexer); - return false; - } - - // handling nested delimiters qq { hello { from { the}}}; - if (lexer->lookahead == start_delimiter_char) { - lexer->result_symbol = SEARCH_REPLACE_CONTENT; - advance(lexer); - return scan_nested_delimiters(lexer, SEARCH_REPLACE_CONTENT); - } - - lexer->result_symbol = SEARCH_REPLACE_CONTENT; - advance(lexer); - return true; - } - } - - if (valid_symbols[START_DELIMITER_TRANSLITERATION]) { - return parse_start_delimiter(lexer, START_DELIMITER_TRANSLITERATION); - } - if (valid_symbols[TRANSLITERATION_CONTENT]) { - if (lexer->lookahead == get_end_delimiter()) { - return process_separator_delimiter(lexer, SEPARATOR_DELIMITER_TRANSLITERATION, END_DELIMITER_TRANSLITERATION); - } - - // exit condition - if (!lexer->lookahead) { - lexer->mark_end(lexer); - return false; - } - - // escape sequence - if (lexer->lookahead == '\\') { - lexer->result_symbol = TRANSLITERATION_CONTENT; - advance(lexer); - // self end delimiter - if (lexer->lookahead == get_end_delimiter()) { - advance(lexer); - } - - lexer->mark_end(lexer); - return true; - } - - // handling nested delimiters qq { hello { from { the}}}; - if (lexer->lookahead == start_delimiter_char) { - lexer->result_symbol = TRANSLITERATION_CONTENT; - advance(lexer); - return scan_nested_delimiters(lexer, TRANSLITERATION_CONTENT); - } - - lexer->result_symbol = TRANSLITERATION_CONTENT; - advance(lexer); - lexer->mark_end(lexer); - return true; - } - - if (valid_symbols[HEREDOC_START_IDENTIFIER]) { - lexer->result_symbol = HEREDOC_START_IDENTIFIER; - - std::string delimiter; - bool allows_interpolation; - bool found_delimiter = advance_word(lexer, delimiter, allows_interpolation); - if (found_delimiter) { - heredoc_identifier_queue.push(delimiter); - heredoc_allows_interpolation.push(allows_interpolation); - - started_heredoc = true; - } - - return found_delimiter; - } - - if ( - (valid_symbols[HEREDOC_CONTENT] || valid_symbols[IMAGINARY_HEREDOC_START]) - && !heredoc_identifier_queue.empty() - ) { - // another exit condition - if (!lexer->lookahead && !started_heredoc_body) { - return false; - } - - if (lexer->lookahead == '\n' && !started_heredoc_body) { - started_heredoc_body = true; - - lexer->result_symbol = IMAGINARY_HEREDOC_START; - lexer->mark_end(lexer); - return true; - } - - if (started_heredoc_body) { - switch (lexer->lookahead) { - case '\\': { - if (heredoc_allows_interpolation.front()) { - return handle_escape_sequence(lexer, HEREDOC_CONTENT); - } - } - - case '$': { - if (heredoc_allows_interpolation.front()) { - return false; - } - } - - case '\n': { - skip(lexer); - lexer->mark_end(lexer); - // TODO: validate all possible intended heredocs properly - if (heredoc_allows_indent.front()) { - while (iswspace(lexer->lookahead)) { - advance(lexer); - } - } - return exit_if_heredoc_end_delimiter(lexer); - } - - default: { - // exit condition - if (!lexer->lookahead) { - started_heredoc_body = false; - lexer->mark_end(lexer); - return false; - } - lexer->result_symbol = HEREDOC_CONTENT; - advance(lexer); - return true; - } - } - } - else { - return false; - } - } - - if (valid_symbols[POD_CONTENT]) { - - while (lexer->lookahead) { - lexer->result_symbol = POD_CONTENT; - - // if it is =cut that marks the end of pod content - if (lexer->lookahead == '=') { - lexer->advance(lexer, false); - if (lexer->lookahead == 'c') { - lexer->advance(lexer, false); - if (lexer->lookahead == 'u') { - lexer->advance(lexer, false); - if (lexer->lookahead == 't') { - lexer->advance(lexer, false); - lexer->mark_end(lexer); - return true; - } - } - } - } - else { - lexer->advance(lexer, false); - } - } - - // or if it end of the file also, mark the end of pod content - lexer->mark_end(lexer); - return true; - } - - return false; - } - - bool parse_delimited_and_interpolated_content(TSLexer *lexer, TokenType token_type, TokenType ending_delimiter) { - if (lexer->lookahead == get_end_delimiter()) { - lexer->result_symbol = ending_delimiter; - advance(lexer); - lexer->mark_end(lexer); - return true; - } - else { - // oh boy! the interpolation - if (lexer->lookahead == '$') { - return handle_interpolation(lexer, token_type); - } - // escape sequences, only basic support as of now - if (lexer->lookahead == '\\') { - return handle_escape_sequence(lexer, token_type); - } - - if (!lexer->lookahead) { - lexer->mark_end(lexer); - return false; - } - - // handling nested delimiters qq { hello { from { the}}}; - if (lexer->lookahead == start_delimiter_char) { - lexer->result_symbol = token_type; - advance(lexer); - return scan_nested_delimiters(lexer, token_type); - } - - lexer->result_symbol = token_type; - advance(lexer); - lexer->mark_end(lexer); - return true; - } - - // shouldn't reach here - return false; - } - - bool scan_nested_delimiters(TSLexer *lexer, TokenType token_type) { - while(lexer->lookahead) { - if (lexer->lookahead == get_end_delimiter()) { - lexer->result_symbol = token_type; - advance(lexer); - lexer->mark_end(lexer); - return true; - } - else if (lexer->lookahead == start_delimiter_char) { - lexer->result_symbol = token_type; - advance(lexer); - scan_nested_delimiters(lexer, token_type); - } - else if (lexer->lookahead == '\\') { - advance(lexer); - advance(lexer); - } - else { - advance(lexer); - } - } - lexer->mark_end(lexer); - return false; - } - - void advance(TSLexer *lexer) { - lexer->advance(lexer, false); - } - - void skip(TSLexer *lexer) { - lexer->advance(lexer, true); - } - - void set_end_delimiter(int32_t start_delimiter) { - // round, angle, square, curly - is_delimiter_enclosing = true; - if (start_delimiter == '(') { - end_delimiter_char = ')'; - } - else if (start_delimiter == '<') { - end_delimiter_char = '>'; - } - else if (start_delimiter == '[') { - end_delimiter_char = ']'; - } - else if (start_delimiter == '{') { - end_delimiter_char = '}'; - } - else { - is_delimiter_enclosing = false; - end_delimiter_char = start_delimiter; - } - } - - bool process_separator_delimiter(TSLexer *lexer, TokenType separator_token, TokenType end_token) { - if (is_separator_delimiter_parsed) { - lexer->result_symbol = end_token; - advance(lexer); - lexer->mark_end(lexer); - return true; - } - else { - lexer->result_symbol = separator_token; - advance(lexer); - lexer->mark_end(lexer); - - // if delimiter is {}, (), <>, [] - if (is_delimiter_enclosing) { - run_over_spaces(lexer); - - if (lexer->lookahead == start_delimiter_char) { - lexer->result_symbol = separator_token; - advance(lexer); - lexer->mark_end(lexer); - - is_separator_delimiter_parsed = true; - - return true; - } - - return false; - } - else { - is_separator_delimiter_parsed = true; - - return true; - } - - return false; - } - } - - int32_t get_end_delimiter() { - return end_delimiter_char; - } - - // Give a token type, parses the start delimiter, - // and keeps track of it in memory. - bool parse_start_delimiter(TSLexer *lexer, TokenType token_type) { - run_over_spaces(lexer); - - start_delimiter_char = lexer->lookahead; - set_end_delimiter(start_delimiter_char); - - // for substitute and tr/y usecase - is_separator_delimiter_parsed = false; - - lexer->result_symbol = token_type; - advance(lexer); - lexer->mark_end(lexer); - - return true; - } - - // runs over spaces like a champ - void run_over_spaces(TSLexer *lexer) { - while (iswspace(lexer->lookahead)) skip(lexer); - } - - // runs with the spaces using advance - void run_with_spaces(TSLexer *lexer) { - while (iswspace(lexer->lookahead)) advance(lexer); - } - - bool handle_interpolation(TSLexer *lexer, TokenType surrounding_token) { - if (lexer->lookahead == '$') { - - // allow $ to be last character in a regex - if (surrounding_token == SEARCH_REPLACE_CONTENT || surrounding_token == REGEX_PATTERN) { - advance(lexer); - run_with_spaces(lexer); - if (lexer->lookahead == get_end_delimiter()) { - lexer->result_symbol = surrounding_token; - lexer->mark_end(lexer); - return true; - } - } - return false; - } - - return false; - } - - bool handle_escape_sequence(TSLexer *lexer, TokenType surrounding_token) { - // escape sequences, only basic support as of now - if (lexer->lookahead == '\\') { - advance(lexer); - // also, self end delimiter will be treated as string - if ( - lexer->lookahead == 't' || lexer->lookahead == 'n' || lexer->lookahead == 'r' || lexer->lookahead == 'f' || lexer->lookahead == 'b' || lexer->lookahead == 'a' || lexer->lookahead == 'e' - ) { - // advance(lexer); - lexer->mark_end(lexer); - return false; - } - else { - lexer->result_symbol = surrounding_token; - advance(lexer); - lexer->mark_end(lexer); - return true; - } - return false; - } - return false; - } - - bool handle_nested_delimiters() { - return true; - } - - /** - * Consume a "word" in POSIX parlance, and returns it unquoted. - * - * This is an approximate implementation that doesn't deal with any - * POSIX-mandated substitution, and assumes the default value for - * IFS. - */ - bool advance_word(TSLexer *lexer, std::string& unquoted_word, bool& allows_interpolation) { - bool empty = true; - bool has_space_before = false; - allows_interpolation = true; - - // <<~EOF - if (lexer->lookahead == '~') { - heredoc_allows_indent.push(true); - advance(lexer); - } - else { - heredoc_allows_indent.push(false); - } - - // <<\EOF, <<~\EOF - if (lexer->lookahead == '\\') { - allows_interpolation = false; - advance(lexer); - } - - - // run over the spaces - if (iswspace(lexer->lookahead)) { - run_over_spaces(lexer); - has_space_before = true; - } - - int32_t quote = 0; - if ( - lexer->lookahead == '\'' - || lexer->lookahead == '"' - || lexer->lookahead == '`' - ) { - allows_interpolation = (lexer->lookahead == '\'') ? false : true; - quote = lexer->lookahead; - advance(lexer); - } - else if (has_space_before) { - return false; - } - - regex identifier_regex("[a-zA-Z0-9]"); - while ( - lexer->lookahead - && std::regex_match(std::string(1, static_cast(lexer->lookahead)), identifier_regex) - && ! (quote ? lexer->lookahead == quote : iswspace(lexer->lookahead)) - ) { - // TODO: check this below condition - if (lexer->lookahead == '\\') { - advance(lexer); - if (! lexer->lookahead) return false; - } - empty = false; - unquoted_word += lexer->lookahead; - advance(lexer); - } - - if (quote && lexer->lookahead == quote) { - advance(lexer); - } - - return ! empty; - } - - bool exit_if_heredoc_end_delimiter(TSLexer *lexer) { - std::string word; - // lexer->result_symbol = HEREDOC_END_IDENTIFIER; - while (!iswspace(lexer->lookahead)) { - // printf("string here - %c", lexer->lookahead); - word += lexer->lookahead; - advance(lexer); - - if (!lexer->lookahead) { - break; - } - } - - if (word == heredoc_identifier_queue.front()) { - // if (1) { - lexer->result_symbol = HEREDOC_END_IDENTIFIER; - lexer->mark_end(lexer); - - // unset stuffs - started_heredoc = false; - started_heredoc_body = false; - heredoc_identifier_queue.pop(); - heredoc_allows_interpolation.pop(); - return true; - } - else { - lexer->result_symbol = HEREDOC_CONTENT; - return true; - } - } - - int32_t start_delimiter_char; - int32_t end_delimiter_char; - bool is_separator_delimiter_parsed; - bool is_delimiter_enclosing; // is the delimiter {}, <> and same character not //, !! - int delimiter_cout = 0; - bool reached; - - // heredoc - bool started_heredoc = false; - bool started_heredoc_body = false; - std::queue heredoc_identifier_queue; - std::queue heredoc_allows_interpolation; - std::queue heredoc_allows_indent; - - }; - -} - -extern "C" { - void * tree_sitter_perl_external_scanner_create() { - return new Scanner(); - } - - void tree_sitter_perl_external_scanner_destroy(void *payload) { - Scanner *scanner = static_cast(payload); - delete scanner; - } - - unsigned tree_sitter_perl_external_scanner_serialize( - void *payload, - char *buffer - ) { - Scanner *scanner = static_cast(payload); - return scanner->serialize(buffer); - } - - void tree_sitter_perl_external_scanner_deserialize( - void *payload, - const char *buffer, - unsigned length - ) { - Scanner *scanner = static_cast(payload); - scanner->deserialize(buffer, length); - } - - bool tree_sitter_perl_external_scanner_scan( - void *payload, - TSLexer *lexer, - const bool *valid_symbols - ) { - - Scanner *scanner = static_cast(payload); - return scanner->scan(lexer, valid_symbols); - } -}