difftastic/vendored_parsers/tree-sitter-clojure/grammar.js

529 lines
13 KiB
JavaScript

// One aim is to parse what is correct (in the sense of officially
// supported) while also being looser and accepting additional
// things. This is more or less in line with advice from the
// tree-sitter folks.
// Builds one RegExp out of several pattern fragments.
//
// The fragments are concatenated in the order given, with nothing
// inserted between them, so a long character class can be spelled as a
// series of short string literals.  The resulting RegExp has no flags.
function regex(...patts) {
  const source = patts.join("");
  return new RegExp(source);
}
// A single whitespace character: whatever java.lang.Character.isWhitespace
// accepts, AND the comma.
//
//   Space Separator (Zs), but NOT U+00A0, U+2007 or U+202F:
//     U+0020, U+1680, U+2000, U+2001, U+2002, U+2003, U+2004, U+2005,
//     U+2006, U+2008, U+2009, U+200A, U+205F, U+3000
//   Line Separator (Zl):        U+2028
//   Paragraph Separator (Zp):   U+2029
//   Horizontal Tabulation:      \t
//   Line Feed:                  \n
//   Vertical Tabulation:        U+000B
//   Form Feed:                  \f
//   Carriage Return:            \r
//   File Separator:             U+001C
//   Group Separator:            U+001D
//   Record Separator:           U+001E
//   Unit Separator:             U+001F
const WHITESPACE_CHAR = regex(
  "[",
  // form feed, line feed, carriage return, tab, comma, space
  "\\f\\n\\r\\t, ",
  // vertical tab, then the file/group/record/unit separators
  "\\u000B\\u001C\\u001D\\u001E\\u001F",
  // line separator, paragraph separator, first non-ASCII space separator
  "\\u2028\\u2029\\u1680",
  // remaining space separators
  "\\u2000\\u2001\\u2002\\u2003\\u2004\\u2005\\u2006\\u2008",
  "\\u2009\\u200a\\u205f\\u3000",
  "]"
);
// A maximal run (one or more) of whitespace characters, lexed as one token.
const WHITESPACE = token(repeat1(WHITESPACE_CHAR));

// A comment is introduced by ";" or "#!" and extends over the rest of the
// line; a newline directly after it, if present, is part of the token.
const COMMENT = token(regex("(;|#!).*\n?"));
// Single-character classes reused by the number and character tokens below.
const DIGIT = regex("[0-9]");
const ALPHANUMERIC = regex("[0-9a-zA-Z]");
const HEX_DIGIT = regex("[0-9a-fA-F]");
const OCTAL_DIGIT = regex("[0-7]");
// "0", then x or X, then at least one hex digit, then an optional "N".
const HEX_NUMBER = seq(
  "0",
  regex("[xX]"),
  repeat1(HEX_DIGIT),
  optional("N")
);

// "0", then at least one octal digit, then an optional "N".
const OCTAL_NUMBER = seq(
  "0",
  repeat1(OCTAL_DIGIT),
  optional("N")
);

// <digits> r|R <alphanumerics>
// XXX: the number before r/R is not constrained
// XXX: the portion after r/R is not constrained
const RADIX_NUMBER = seq(
  repeat1(DIGIT),
  regex("[rR]"),
  repeat1(ALPHANUMERIC)
);

// <digits> "/" <digits>
const RATIO = seq(
  repeat1(DIGIT),
  "/",
  repeat1(DIGIT)
);

// "." followed by zero or more digits
const FRACTION_PART = seq(".", repeat(DIGIT));

// e or E, an optional sign, then at least one digit
const EXPONENT_PART = seq(
  regex("[eE]"),
  optional(regex("[+-]")),
  repeat1(DIGIT)
);

// Digits with an optional fraction, an optional exponent and an optional
// trailing "M".
const DOUBLE = seq(
  repeat1(DIGIT),
  optional(FRACTION_PART),
  optional(EXPONENT_PART),
  optional("M")
);

// Digits with an optional trailing "M" or "N".
const INTEGER = seq(
  repeat1(DIGIT),
  optional(regex("[MN]"))
);

// Any of the numeric forms above, optionally preceded by a sign.  The whole
// thing is one token with lexical precedence 10.
const NUMBER = token(
  prec(
    10,
    seq(
      optional(regex("[+-]")),
      choice(
        HEX_NUMBER,
        OCTAL_NUMBER,
        RADIX_NUMBER,
        RATIO,
        DOUBLE,
        INTEGER
      )
    )
  )
);
// The literal nil.
const NIL = token("nil");

// The literals false and true.
const BOOLEAN = token(choice("false", "true"));
// First character of a keyword part: anything that is NOT one of the
// characters listed in this negated class.
const KEYWORD_HEAD = regex(
  "[^",
  // form feed, line feed, carriage return, tab, space
  "\\f\\n\\r\\t ",
  // collection delimiters
  "()",
  "\\[\\]",
  "{}",
  // double quote
  '"',
  "@~^;`",
  // backslash
  "\\\\",
  // comma, colon, slash
  ",:/",
  // the remaining characters are the same ones WHITESPACE_CHAR lists
  "\\u000B\\u001C\\u001D\\u001E\\u001F",
  "\\u2028\\u2029\\u1680",
  "\\u2000\\u2001\\u2002\\u2003\\u2004\\u2005\\u2006\\u2008",
  "\\u2009\\u200a\\u205f\\u3000",
  "]"
);

// Any later character of a keyword part: a head character, ":" or "'".
const KEYWORD_BODY = choice(regex("[:']"), KEYWORD_HEAD);

// The part after the namespace delimiter: one or more characters, each a
// head character or one of ":", "'", "/".
const KEYWORD_NAMESPACED_BODY = token(
  repeat1(choice(regex("[:'/]"), KEYWORD_HEAD))
);

// Keyword text without its leading ":" / "::" marker.
const KEYWORD_NO_SIGIL = token(
  seq(KEYWORD_HEAD, repeat(KEYWORD_BODY))
);

// The two possible keyword markers.
const KEYWORD_MARK = token(":");
const AUTO_RESOLVE_MARK = token("::");
// A double-quoted string, lexed as one token:
//   - an opening double quote
//   - any number of characters that are neither a double quote nor a backslash
//   - any number of groups made of a backslash, one character matched by
//     ".", and then more plain characters
//   - a closing double quote
const STRING_PLAIN_CHAR = regex('[^"\\\\]');
const STRING_ESCAPED_RUN = seq(
  "\\",
  regex("."),
  repeat(STRING_PLAIN_CHAR)
);
const STRING = token(
  seq(
    '"',
    repeat(STRING_PLAIN_CHAR),
    repeat(STRING_ESCAPED_RUN),
    '"'
  )
);
// "o" followed by one, two or three digits (longest alternative listed
// first).
// XXX: better to match \o378 as a single item
const OCTAL_CHAR = seq(
  "o",
  choice(
    seq(DIGIT, DIGIT, DIGIT),
    seq(DIGIT, DIGIT),
    seq(DIGIT)
  )
);

// The character names accepted after a backslash.
const NAMED_CHAR = choice(
  "backspace",
  "formfeed",
  "newline",
  "return",
  "space",
  "tab"
);

// "u" followed by exactly four hex digits.
// XXX: LispReader.java only allows values outside of
//      (c >= '\uD800' && c <= '\uDFFF'), but that is not enforced here
const UNICODE = seq("u", HEX_DIGIT, HEX_DIGIT, HEX_DIGIT, HEX_DIGIT);

// Fallback: any one character, newline included.
// XXX: not quite sure what this is supposed to be...
//        return Character.valueOf(token.charAt(0)); -- LispReader.java
//      java char is 16 bits...what can tree-sitter manage?
//
// XXX: null is supposed to be usable but putting \x00 below
//      does not seem to work
const ANY_CHAR = regex(".|\n");

// A character literal: a backslash followed by one of the forms above.
const CHARACTER = token(
  seq(
    "\\",
    choice(OCTAL_CHAR, NAMED_CHAR, UNICODE, ANY_CHAR)
  )
);
// First character of a symbol: anything that is NOT one of the characters
// listed in this negated class.
const SYMBOL_HEAD = regex(
  "[^",
  // form feed, line feed, carriage return, tab, space
  "\\f\\n\\r\\t ",
  // namespace delimiter
  "/",
  // collection delimiters
  "()\\[\\]{}",
  // double quote
  '"',
  "@~^;`",
  // backslash
  "\\\\",
  // comma, colon, hash, single quote, digits
  ",:#'0-9",
  // the remaining characters are the same ones WHITESPACE_CHAR lists
  "\\u000B\\u001C\\u001D\\u001E\\u001F",
  "\\u2028\\u2029\\u1680",
  "\\u2000\\u2001\\u2002\\u2003\\u2004\\u2005\\u2006\\u2008",
  "\\u2009\\u200a\\u205f\\u3000",
  "]"
);

// Separates a namespace from a name.
const NS_DELIMITER = token("/");

// Any later character of a symbol: a head character, or one of ":", "#",
// "'" or a digit.
const SYMBOL_BODY = choice(SYMBOL_HEAD, regex("[:#'0-9]"));

// The name part of a qualified symbol: one or more characters, each a head
// character or one of "/", ":", "#", "'" or a digit.
const SYMBOL_NAMESPACED_NAME = token(
  repeat1(choice(SYMBOL_HEAD, regex("[/:#'0-9]")))
);

// A head character followed by any number of body characters.
//
// XXX: no attempt is made to enforce certain complex things, e.g.
//
//        Symbols beginning or ending with ':' are reserved by Clojure.
//        A symbol can contain one or more non-repeating ':'s
const SYMBOL = token(
  seq(SYMBOL_HEAD, repeat(SYMBOL_BODY))
);
module.exports = grammar({
name: 'clojure',
extras: $ =>
[],
conflicts: $ =>
[],
inline: $ =>
[$._kwd_leading_slash,
$._kwd_just_slash,
$._kwd_qualified,
$._kwd_unqualified,
$._kwd_marker,
$._sym_qualified,
$._sym_unqualified],
rules: {
// THIS MUST BE FIRST -- even though this doesn't look like it matters
source: $ =>
repeat(choice($._form,
$._gap)),
_gap: $ =>
choice($._ws,
$.comment,
$.dis_expr),
_ws: $ =>
WHITESPACE,
comment: $ =>
COMMENT,
dis_expr: $ =>
seq(field('marker', "#_"),
repeat($._gap),
field('value', $._form)),
_form: $ =>
choice($.num_lit, // atom-ish
$.kwd_lit,
$.str_lit,
$.char_lit,
$.nil_lit,
$.bool_lit,
$.sym_lit,
// basic collection-ish
$.list_lit,
$.map_lit,
$.vec_lit,
// dispatch reader macros
$.set_lit,
$.anon_fn_lit,
$.regex_lit,
$.read_cond_lit,
$.splicing_read_cond_lit,
$.ns_map_lit,
$.var_quoting_lit,
$.sym_val_lit,
$.evaling_lit,
$.tagged_or_ctor_lit,
// some other reader macros
$.derefing_lit,
$.quoting_lit,
$.syn_quoting_lit,
$.unquote_splicing_lit,
$.unquoting_lit),
num_lit: $ =>
NUMBER,
kwd_lit: $ =>
choice($._kwd_leading_slash,
$._kwd_just_slash,
$._kwd_qualified,
$._kwd_unqualified),
// (namespace :/usr/bin/env) ; => ""
// (name :/usr/bin/env) ; => "usr/bin/env"
_kwd_leading_slash: $ =>
seq(field('marker', $._kwd_marker),
field('delimiter', NS_DELIMITER),
field('name', alias(KEYWORD_NAMESPACED_BODY, $.kwd_name))),
// (namespace :/) ;=> nil
// (name :/) ;=> "/"
_kwd_just_slash: $ =>
seq(field('marker', $._kwd_marker),
field('name', alias(NS_DELIMITER, $.kwd_name))),
_kwd_qualified: $ =>
prec(2, seq(field('marker', $._kwd_marker),
field('namespace', alias(KEYWORD_NO_SIGIL, $.kwd_ns)),
field('delimiter', NS_DELIMITER),
field('name', alias(KEYWORD_NAMESPACED_BODY, $.kwd_name)))),
_kwd_unqualified: $ =>
prec(1, seq(field('marker', $._kwd_marker),
field('name', alias(KEYWORD_NO_SIGIL, $.kwd_name)))),
_kwd_marker: $ =>
choice(KEYWORD_MARK, AUTO_RESOLVE_MARK),
str_lit: $ =>
STRING,
char_lit: $ =>
CHARACTER,
nil_lit: $ =>
NIL,
bool_lit: $ =>
BOOLEAN,
sym_lit: $ =>
seq(repeat($._metadata_lit),
choice($._sym_qualified, $._sym_unqualified)),
_sym_qualified: $ =>
prec(1, seq(field("namespace", alias(SYMBOL, $.sym_ns)),
field("delimiter", NS_DELIMITER),
field("name", alias(SYMBOL_NAMESPACED_NAME, $.sym_name)))),
_sym_unqualified: $ =>
field('name', alias(choice(NS_DELIMITER, // division symbol
SYMBOL),
$.sym_name)),
_metadata_lit: $ =>
seq(choice(field("meta", $.meta_lit),
field("old_meta", $.old_meta_lit)),
optional(repeat($._gap))),
meta_lit: $ =>
seq(field('marker', "^"),
repeat($._gap),
field('value', $._form)),
old_meta_lit: $ =>
seq(field('marker', "#^"),
repeat($._gap),
field('value', $._form)),
list_lit: $ =>
seq(repeat($._metadata_lit),
$._bare_list_lit),
_bare_list_lit: $ =>
seq(field('open', "("),
repeat(choice(field('value', $._form),
$._gap)),
field('close', ")")),
map_lit: $ =>
seq(repeat($._metadata_lit),
$._bare_map_lit),
_bare_map_lit: $ =>
seq(field('open', "{"),
repeat(choice(field('value', $._form),
$._gap)),
field('close', "}")),
vec_lit: $ =>
seq(repeat($._metadata_lit),
$._bare_vec_lit),
_bare_vec_lit: $ =>
seq(field('open', "["),
repeat(choice(field('value', $._form),
$._gap)),
field('close', "]")),
set_lit: $ =>
seq(repeat($._metadata_lit),
$._bare_set_lit),
_bare_set_lit: $ =>
seq(field('marker', "#"),
field('open', "{"),
repeat(choice(field('value', $._form),
$._gap)),
field('close', "}")),
anon_fn_lit: $ =>
seq(repeat($._metadata_lit),
field('marker', "#"),
$._bare_list_lit),
regex_lit: $ =>
seq(field('marker', "#"),
STRING),
read_cond_lit: $ =>
seq(repeat($._metadata_lit),
field('marker', "#?"),
// whitespace possible, but neither comment nor discard
repeat($._ws),
$._bare_list_lit),
splicing_read_cond_lit: $ =>
// XXX: metadata here doesn't seem to make sense, but the repl
// will accept: [^:x #?@(:clj [[:a]] :cljr [[:b]])]
seq(repeat($._metadata_lit),
field('marker', "#?@"),
// whitespace possible, but neither comment nor discard
repeat($._ws),
$._bare_list_lit),
auto_res_mark: $ =>
AUTO_RESOLVE_MARK,
ns_map_lit: $ =>
seq(repeat($._metadata_lit),
field('marker', "#"),
field('prefix', choice($.auto_res_mark,
$.kwd_lit)),
repeat($._gap),
$._bare_map_lit),
var_quoting_lit: $ =>
seq(repeat($._metadata_lit),
field('marker', "#'"),
repeat($._gap),
// XXX: symbol, reader conditional, and tagged literal can work
// any other things?
field('value', $._form)),
sym_val_lit: $ =>
seq(field('marker', "##"),
repeat($._gap),
field('value', $._form)),
evaling_lit: $ =>
seq(repeat($._metadata_lit), // ^:x #=(vector 1)
field('marker', "#="),
repeat($._gap),
field('value', choice($.list_lit,
$.read_cond_lit,
// #= ^:a java.lang.String
$.sym_lit))),
// #uuid "00000000-0000-0000-0000-000000000000"
// #user.Fun[1 2]
// #user.Fun{:a 1 :b 2}
tagged_or_ctor_lit: $ =>
seq(repeat($._metadata_lit),
field('marker', "#"),
// # uuid "00000000-0000-0000-0000-000000000000"
// # #_ 1 uuid "00000000-0000-0000-0000-000000000000"
// etc.
repeat($._gap),
// # ^:a uuid "00000000-0000-0000-0000-000000000000"
field('tag', $.sym_lit),
repeat($._gap),
field('value', $._form)),
derefing_lit: $ =>
seq(repeat($._metadata_lit),
field('marker', "@"),
repeat($._gap),
field('value', $._form)),
quoting_lit: $ =>
seq(repeat($._metadata_lit),
field('marker', "'"),
repeat($._gap),
field('value', $._form)),
syn_quoting_lit: $ =>
seq(repeat($._metadata_lit),
field('marker', "`"),
repeat($._gap),
field('value', $._form)),
unquote_splicing_lit: $ =>
// XXX: metadata here doesn't seem to make sense, but the repl
// will accept: `(^:x ~@[:a :b :c])
seq(repeat($._metadata_lit),
field('marker', "~@"),
repeat($._gap),
field('value', $._form)),
unquoting_lit: $ =>
seq(repeat($._metadata_lit),
field('marker', "~"),
repeat($._gap),
field('value', $._form)),
}
});
// Local Variables:
// mode: js-mode
// js-indent-align-list-continuation: t
// js-indent-level: 2
// End: