Introduce external parser for dollar quoted strings

PostgreSQL supports strings of the following format $TAG$mycontent$TAG$
https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING

These strings are often used to define a function body.
pull/315/head
Maksim Novikov 2022-06-11 21:11:31 +07:00
parent 2ec2fedbb3
commit c3d3b38557
No known key found for this signature in database
GPG Key ID: 2F1C320B72DA6C55
12 changed files with 11215 additions and 10986 deletions

@ -9,7 +9,7 @@
"sources": [
"bindings/node/binding.cc",
"src/parser.c",
# If your language uses an external scanner, add it here.
"src/scanner.cc"
],
"cflags_c": [
"-std=c99",

@ -10,22 +10,12 @@ fn main() {
let parser_path = src_dir.join("parser.c");
c_config.file(&parser_path);
// If your language uses an external scanner written in C,
// then include this block of code:
/*
let scanner_path = src_dir.join("scanner.c");
c_config.file(&scanner_path);
println!("cargo:rerun-if-changed={}", scanner_path.to_str().unwrap());
*/
c_config.compile("parser");
println!("cargo:rerun-if-changed={}", parser_path.to_str().unwrap());
// If your language uses an external scanner written in C++,
// then include this block of code:
/*
let mut cpp_config = cc::Build::new();
cpp_config.cpp(true);
cpp_config.include(&src_dir);
@ -36,5 +26,4 @@ fn main() {
cpp_config.file(&scanner_path);
cpp_config.compile("scanner");
println!("cargo:rerun-if-changed={}", scanner_path.to_str().unwrap());
*/
}

@ -65,6 +65,11 @@ function createCaseInsensitiveRegex(word) {
module.exports = grammar({
name: "sql",
extras: $ => [$.comment, /[\s\f\uFEFF\u2060\u200B]|\\\r?\n/],
externals: $ => [
$._dollar_quoted_string_tag,
$._dollar_quoted_string_content,
$._dollar_quoted_string_end_tag,
],
rules: {
source_file: $ => repeat($._statement),
@ -545,8 +550,12 @@ module.exports = grammar({
type: $ => seq($._identifier, optional(seq("(", $.number, ")"))),
string: $ =>
choice(
seq("'", field("content", /[^']*/), "'"),
seq("$$", field("content", /(\$?[^$]+)+/), "$$"), // FIXME empty string test, maybe read a bit more into c comments answer
seq("'", field("content", alias(/[^']*/, $.content)), "'"),
seq(
$._dollar_quoted_string_tag,
field("content", alias($._dollar_quoted_string_content, $.content)),
$._dollar_quoted_string_end_tag,
),
),
field_access: $ => seq($.identifier, "->>", $.string),
ordered_expression: $ =>

41
src/grammar.json vendored

@ -4638,8 +4638,13 @@
"type": "FIELD",
"name": "content",
"content": {
"type": "PATTERN",
"value": "[^']*"
"type": "ALIAS",
"content": {
"type": "PATTERN",
"value": "[^']*"
},
"named": true,
"value": "content"
}
},
{
@ -4652,20 +4657,25 @@
"type": "SEQ",
"members": [
{
"type": "STRING",
"value": "$$"
"type": "SYMBOL",
"name": "_dollar_quoted_string_tag"
},
{
"type": "FIELD",
"name": "content",
"content": {
"type": "PATTERN",
"value": "(\\$?[^$]+)+"
"type": "ALIAS",
"content": {
"type": "SYMBOL",
"name": "_dollar_quoted_string_content"
},
"named": true,
"value": "content"
}
},
{
"type": "STRING",
"value": "$$"
"type": "SYMBOL",
"name": "_dollar_quoted_string_end_tag"
}
]
}
@ -5309,7 +5319,20 @@
],
"conflicts": [],
"precedences": [],
"externals": [],
"externals": [
{
"type": "SYMBOL",
"name": "_dollar_quoted_string_tag"
},
{
"type": "SYMBOL",
"name": "_dollar_quoted_string_content"
},
{
"type": "SYMBOL",
"name": "_dollar_quoted_string_end_tag"
}
],
"inline": [],
"supertypes": []
}

@ -3830,7 +3830,18 @@
{
"type": "string",
"named": true,
"fields": {}
"fields": {
"content": {
"multiple": false,
"required": true,
"types": [
{
"type": "content",
"named": true
}
]
}
}
},
{
"type": "table_column",
@ -4626,10 +4637,6 @@
"type": "$",
"named": false
},
{
"type": "$$",
"named": false
},
{
"type": "%",
"named": false
@ -5242,6 +5249,10 @@
"type": "comment",
"named": true
},
{
"type": "content",
"named": true
},
{
"type": "|",
"named": false

21786
src/parser.c vendored

File diff suppressed because it is too large Load Diff

159
src/scanner.cc vendored

@ -0,0 +1,159 @@
#include <tree_sitter/parser.h>
#include <string>
#include <cwctype>
namespace {
using std::string;
// Tokens produced by this scanner. The enumerators are used as indices into
// the `valid_symbols` array passed to `Scanner::scan` and as values for
// `lexer->result_symbol`, and they are listed in the same order as the
// `externals` entries in grammar.js (tag, content, end tag).
enum TokenType {
// Opening delimiter of a dollar-quoted string, e.g. `$$` or `$TAG$`.
DOLLAR_QUOTED_STRING_TAG,
// Text between the opening and closing delimiters.
DOLLAR_QUOTED_STRING_CONTENT,
// Closing delimiter; it has to repeat the opening delimiter exactly.
DOLLAR_QUOTED_STRING_END_TAG,
};
struct Scanner {
string dollar_quoted_string_tag;
string current_leading_word;
bool dollar_quoted_string_started;
void skip(TSLexer *lexer) {
lexer->advance(lexer, true);
}
void advance(TSLexer *lexer) {
lexer->advance(lexer, false);
}
unsigned serialize(char *buffer) {
if (dollar_quoted_string_tag.length() + 1>= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) return 0;
buffer[0] = dollar_quoted_string_started;
dollar_quoted_string_tag.copy(&buffer[1], dollar_quoted_string_tag.length());
return dollar_quoted_string_tag.length() + 1;
}
void deserialize(const char *buffer, unsigned length) {
if (length == 0) {
dollar_quoted_string_started = false;
dollar_quoted_string_tag.clear();
} else {
dollar_quoted_string_started = buffer[0];
dollar_quoted_string_tag.assign(&buffer[1], &buffer[length]);
}
}
bool scan_dollar_quoted_string_content(TSLexer *lexer) {
unsigned long int pos = 0;
lexer->result_symbol = DOLLAR_QUOTED_STRING_CONTENT;
lexer->mark_end(lexer);
for (;;) {
if (lexer->lookahead == '\0') {
return false;
} else if (lexer->lookahead == dollar_quoted_string_tag[pos]) {
if (pos == dollar_quoted_string_tag.length() - 1) {
return true;
} else if (pos == 0) {
lexer->result_symbol = DOLLAR_QUOTED_STRING_CONTENT;
lexer->mark_end(lexer);
}
pos++;
advance(lexer);
} else if (pos != 0) {
pos = 0;
} else {
advance(lexer);
}
}
}
bool scan_dollar_quoted_string_tag(TSLexer *lexer) {
while (iswspace(lexer->lookahead)) skip(lexer);
dollar_quoted_string_tag.clear();
if (lexer->lookahead == '$') {
dollar_quoted_string_tag += lexer->lookahead;
advance(lexer);
} else {
return false;
}
while (iswalpha(lexer->lookahead)) {
dollar_quoted_string_tag += lexer->lookahead;
advance(lexer);
}
if (lexer->lookahead == '$') {
dollar_quoted_string_tag += lexer->lookahead;
advance(lexer);
dollar_quoted_string_started = true;
return true;
}
return false;
}
bool scan_dollar_quoted_string_end_tag(TSLexer *lexer) {
current_leading_word.clear();
while (
lexer->lookahead != '\0' &&
current_leading_word.length() < dollar_quoted_string_tag.length()
) {
current_leading_word += lexer->lookahead;
advance(lexer);
}
return current_leading_word == dollar_quoted_string_tag;
}
bool scan(TSLexer *lexer, const bool *valid_symbols) {
if (valid_symbols[DOLLAR_QUOTED_STRING_TAG] && !dollar_quoted_string_started) {
return scan_dollar_quoted_string_tag(lexer);
}
if (valid_symbols[DOLLAR_QUOTED_STRING_CONTENT] && dollar_quoted_string_started) {
return scan_dollar_quoted_string_content(lexer);
}
if (valid_symbols[DOLLAR_QUOTED_STRING_END_TAG] && dollar_quoted_string_started) {
if (scan_dollar_quoted_string_end_tag(lexer)) {
dollar_quoted_string_started = false;
lexer->result_symbol = DOLLAR_QUOTED_STRING_END_TAG;
return true;
};
}
return false;
}
};
}
extern "C" {
void *tree_sitter_sql_external_scanner_create() {
return new Scanner();
}
bool tree_sitter_sql_external_scanner_scan(void *payload, TSLexer *lexer,
const bool *valid_symbols) {
Scanner *scanner = static_cast<Scanner *>(payload);
return scanner->scan(lexer, valid_symbols);
}
unsigned tree_sitter_sql_external_scanner_serialize(void *payload, char *state) {
Scanner *scanner = static_cast<Scanner *>(payload);
return scanner->serialize(state);
}
void tree_sitter_sql_external_scanner_deserialize(void *payload, const char *state, unsigned length) {
Scanner *scanner = static_cast<Scanner *>(payload);
scanner->deserialize(state, length);
}
void tree_sitter_sql_external_scanner_destroy(void *payload) {
Scanner *scanner = static_cast<Scanner *>(payload);
delete scanner;
}
}

@ -36,7 +36,8 @@ CREATE TABLE IF NOT EXISTS `tblsample` (
(null_constraint
(NULL))
(column_default
(string)))
(string
(content))))
(table_column
(identifier)
(type
@ -45,7 +46,8 @@ CREATE TABLE IF NOT EXISTS `tblsample` (
(null_constraint
(NULL))
(column_default
(string)))
(string
(content))))
(table_column
(identifier)
(type
@ -68,7 +70,8 @@ CREATE TABLE IF NOT EXISTS `tblsample` (
(null_constraint
(NULL))
(column_default
(string)))
(string
(content))))
(table_column
(identifier)
(type
@ -77,7 +80,8 @@ CREATE TABLE IF NOT EXISTS `tblsample` (
(null_constraint
(NULL))
(column_default
(string)))
(string
(content))))
(table_column
(identifier)
(type
@ -86,7 +90,8 @@ CREATE TABLE IF NOT EXISTS `tblsample` (
(null_constraint
(NULL))
(column_default
(string)))
(string
(content))))
(primary_key
(identifier)))))
@ -129,7 +134,8 @@ CREATE TABLE IF NOT EXISTS tblsample
(null_constraint
(NULL))
(column_default
(string)))
(string
(content))))
(table_column
(identifier)
(type
@ -138,7 +144,8 @@ CREATE TABLE IF NOT EXISTS tblsample
(null_constraint
(NULL))
(column_default
(string)))
(string
(content))))
(table_column
(identifier)
(type
@ -161,7 +168,8 @@ CREATE TABLE IF NOT EXISTS tblsample
(null_constraint
(NULL))
(column_default
(string)))
(string
(content))))
(table_column
(identifier)
(type
@ -170,7 +178,8 @@ CREATE TABLE IF NOT EXISTS tblsample
(null_constraint
(NULL))
(column_default
(string)))
(string
(content))))
(table_column
(identifier)
(type
@ -179,6 +188,7 @@ CREATE TABLE IF NOT EXISTS tblsample
(null_constraint
(NULL))
(column_default
(string)))
(string
(content))))
(primary_key
(identifier)))))

@ -13,7 +13,8 @@ INSERT INTO table2 SELECT * FROM generate_series(1, 100, 1);
(values_clause
(values_clause_body
(number)
(string))))
(string
(content)))))
(insert_statement
(identifier)
(select_statement

@ -28,7 +28,8 @@ LEFT OUTER JOIN t7 ON t.a = t7.a;
(identifier))
(alias
(interval_expression
(string))
(string
(content)))
(identifier))))
(from_clause
(alias

@ -14,7 +14,8 @@ SELECT interval '1 minute';
(select_clause
(select_clause_body
(interval_expression
(string))))))
(string
(content)))))))
================================================================================
SELECT statement with distinct expression
@ -202,7 +203,8 @@ SELECT '';
(select_statement
(select_clause
(select_clause_body
(string)))))
(string
(content))))))
================================================================================
SELECT statement with dollar quoted string
@ -216,7 +218,8 @@ SELECT $$hey$$;
(select_statement
(select_clause
(select_clause_body
(string)))))
(string
(content))))))
================================================================================
SELECT statement with dollar quoted brackets inside
@ -230,7 +233,8 @@ SELECT $$(a + b)$$;
(select_statement
(select_clause
(select_clause_body
(string)))))
(string
(content))))))
================================================================================
SELECT statement with type cast
@ -245,7 +249,8 @@ SELECT ''::JSONB;
(select_clause
(select_clause_body
(type_cast
(string)
(string
(content))
(type
(identifier)))))))
@ -528,7 +533,8 @@ SELECT 'aaaa'
(select_statement
(select_clause
(select_clause_body
(string)))))
(string
(content))))))
================================================================================
SELECT weird_string
@ -542,7 +548,8 @@ SELECT '%{a.b}'
(select_statement
(select_clause
(select_clause_body
(string)))))
(string
(content))))))
================================================================================
SELECT field_access
@ -558,7 +565,8 @@ SELECT foo->>'bar'
(select_clause_body
(field_access
(identifier)
(string))))))
(string
(content)))))))
================================================================================
SELECT in clause
@ -791,7 +799,8 @@ CREATE INDEX foo_idx ON table1 (col2, (lower(col1->>'attr')));
(identifier)
(field_access
(identifier)
(string))))))
(string
(content)))))))
================================================================================
CREATE INDEX using
@ -925,7 +934,8 @@ CREATE TABLE public.my_table (
(function_call
(identifier))
(interval_expression
(string))))))))
(string
(content)))))))))
================================================================================
CREATE TABLE with not null constraint
@ -983,7 +993,8 @@ CREATE TABLE my_table (col1 INT DEFAULT 'hey')
(type
(identifier))
(column_default
(string))))))
(string
(content)))))))
================================================================================
CREATE TABLE with primary key constraint
@ -1416,7 +1427,8 @@ CREATE TABLE foo(
(identifier)
(identifier))
(interval_expression
(string)))))))
(string
(content))))))))
================================================================================
CREATE TABLE with foreign key constraint with columns specified
@ -1457,6 +1469,9 @@ CREATE FUNCTION
CREATE FUNCTION add(integer, integer) RETURNS integer
AS 'select $1 + $2;'
LANGUAGE SQL;
CREATE FUNCTION add(integer, integer) RETURNS integer
AS $BODY$select $1 + $2;$BODY$
LANGUAGE SQL;
--------------------------------------------------------------------------------
@ -1473,7 +1488,23 @@ CREATE FUNCTION add(integer, integer) RETURNS integer
(type
(identifier))
(function_body
(string))
(string
(content)))
(language))
(create_function_statement
(identifier)
(create_function_parameters
(create_function_parameter
(type
(identifier)))
(create_function_parameter
(type
(identifier))))
(type
(identifier))
(function_body
(string
(content)))
(language)))
================================================================================
@ -1499,7 +1530,8 @@ CREATE OR REPLACE FUNCTION add(integer, integer) RETURNS integer
(type
(identifier))
(function_body
(string))
(string
(content)))
(language)))
================================================================================
@ -1522,7 +1554,8 @@ CREATE FUNCTION foo(int) RETURNS integer
(type
(identifier))
(function_body
(string))
(string
(content)))
(language)
(null_hint)
(optimizer_hint)
@ -1558,7 +1591,8 @@ CREATE FUNCTION add(IN int, OUT int, INOUT int, VARIADIC int) RETURNS int
(type
(identifier))
(function_body
(string))
(string
(content)))
(language)))
================================================================================
@ -1582,7 +1616,8 @@ CREATE FUNCTION add(text) RETURNS SETOF int
(type
(identifier)))
(function_body
(string))
(string
(content)))
(language)))
================================================================================
@ -1609,7 +1644,8 @@ CREATE FUNCTION add(text) RETURNS SETOF int NOT NULL
(null_constraint
(NULL))))
(function_body
(string))
(string
(content)))
(language)))
================================================================================
@ -1631,7 +1667,8 @@ CREATE FUNCTION add(integer) RETURNS integer LANGUAGE SQL AS 'select $1';
(identifier))
(language)
(function_body
(string))))
(string
(content)))))
================================================================================
CREATE FUNCTION with optimizer hint
@ -1653,7 +1690,8 @@ CREATE FUNCTION add(integer) RETURNS integer STABLE LANGUAGE SQL AS 'select $1';
(optimizer_hint)
(language)
(function_body
(string))))
(string
(content)))))
================================================================================
CREATE FUNCTION with constrained args
@ -1678,7 +1716,8 @@ CREATE FUNCTION add(integer NOT NULL) RETURNS integer STABLE LANGUAGE SQL AS 'se
(optimizer_hint)
(language)
(function_body
(string))))
(string
(content)))))
================================================================================
CREATE FUNCTION lowercase
@ -1699,7 +1738,8 @@ create function add(integer) returns integer language sql as 'select $1';
(identifier))
(language)
(function_body
(string))))
(string
(content)))))
================================================================================
CREATE FUNCTION highlight function body
@ -1720,7 +1760,8 @@ create function add(integer) returns integer language sql as 'select a, b';
(identifier))
(language)
(function_body
(string))))
(string
(content)))))
================================================================================
CREATE SCHEMA schema
@ -1767,16 +1808,20 @@ SET SESSION search_path = 'test';
(identifier))
(set_statement
(identifier)
(string))
(string
(content)))
(set_statement
(identifier)
(string))
(string
(content)))
(set_statement
(identifier)
(string))
(string
(content)))
(set_statement
(identifier)
(string)))
(string
(content))))
================================================================================
SELECT asterisk expressions
@ -2008,7 +2053,8 @@ ALTER TABLE foo SET (autovacuum_enabled = false);
(function_call
(identifier)
(type_cast
(string)
(string
(content))
(type
(identifier))))))))
(alter_statement
@ -2120,3 +2166,26 @@ SELECT a.`test 1`;
(dotted_name
(identifier)
(identifier))))))
================================================================================
Dollar quoted strings
================================================================================
SELECT $$a$$, $a$baz$a$, $a$$$$a$, $a$b$$a$, $TAG$afasf $$TAG$;
--------------------------------------------------------------------------------
(source_file
(select_statement
(select_clause
(select_clause_body
(string
(content))
(string
(content))
(string
(content))
(string
(content))
(string
(content))))))

@ -60,3 +60,10 @@ GrOUP
-- <- keyword
By a, b
-- <- keyword
SELECT $$a$$, $a$baz$a$, $a$$$$a$, $a$b$$a$;
-- <- keyword
-- ^ string
-- ^ string
-- ^ string
-- ^ string