Prefer element reference over method invocation (#156)

* Prefer element reference over method invocation Ruby presents two ~syntactic~ sugarings that cannot be distinguished unambiguously from syntax alone: First, array elements can be referenced using a bracketed argument after any amount of white space, so: x.[](0) x[0] x [0] are all equivalent. Second, methods may be invoked with omitted parentheses, so: f(y) f y are equivalent. The ambiguity can be seen when the function argument is a literal array: f [0] At this point, there is no syntactic information that can distinguish between element reference and procedural invocation. This can be seen by running this program in irb: irb(main):001:0> x = [0, 1, 2] => [0, 1, 2] irb(main):002:0> x.[](0) => 0 irb(main):003:0> x [0] => 0 irb(main):004:0> def y(z) irb(main):005:1> z irb(main):006:1> end => :y irb(main):007:0> y([0]) => [0] irb(main):008:0> y [0] => [0] Previously, tree-sitter-ruby handled this ambiguity by presenting both `x [0]` and `y [0]` as procedural invocation. However, this causes a parse error as described in tree-sitter/tree-sitter-ruby#146, when parsing d.find { |x| a(x) } [b] == c Here I add an optional, lower-precedence interpretation of `x [0]` as an element reference. Due to the construction of the grammar in this project, this unfortunately causes problems when attempting to parse constructs like: fun [0] do something end as the parser will eagerly consume `fun [0]` as the left-hand side of a binary expression. To deal with this case, I explicitly add this construct to the `call` production rule. Unfortunately I had to resort to the GLR parser in order to resolve the ambiguity between these two rules. Finally, note that the tree obtained from the construct z [0] == 0 is context-sensitive in Ruby. If `z` is an array type, it is interpreted as `binary ( reference ( identifier, integer ), integer )`. If `z` is a method, it is interpreted as `call ( identifier, binary ( array (integer), integer ) )`. 
Since tree-sitter assumes the parsed language is context-free, there's no good way for us to resolve this ambiguity. This commit prefers the second, method-invocation, interpretation, which appears to be more common within the test corpus. * Use external scanner logic to distinguish between arrays & subscripts When an opening square bracket appears immediately after a callable expression like "a" or "a.b", we must decide between two possible interpretations of the bracket: 1. It could be part of an element reference, as in `a[0] = true`. 2. Or it could be an array literal, passed as an argument, as in `puts [1, 2, 3]`. If there is no preceding whitespace, the bracket should *always* be treated as part of an element reference. This matches MRI's behavior. If there *is* preceding whitespace, MRI makes its decision in a context-sensitive way, based on whether the preceding expression is a local variable or a method name. This parser is not context-sensitive, so we instead will interpret the bracket as part of an array literal whenever that is syntactically valid, and interpret it as part of an element reference otherwise. The external scanner can use the validity of other expression tokens like `string` to infer whether an array literal would be valid. Co-authored-by: Max Brunsfeld <maxbrunsfeld@gmail.com>
2021-02-04 16:29:15 +07:00 · 2021-02-04 16:29:15 +07:00 · 1fa06a9ea8
parent ebf6b3dd32
commit 1fa06a9ea8
5 changed files with 194268 additions and 191225 deletions
--- a/grammar.js
+++ b/grammar.js
@ -62,7 +62,8 @@ module.exports = grammar({
    $._singleton_class_left_angle_left_langle,
    $.hash_key_symbol,
    $._hash_splat_star_star,
-    $._binary_star_star
+    $._binary_star_star,
+    $._element_reference_bracket,
  ],

  extras: $ => [
@ -469,7 +470,7 @@ module.exports = grammar({

    element_reference: $ => prec.left(1, seq(
      field('object', $._primary),
-      token.immediate('['),
+      alias($._element_reference_bracket, '['),
      optional($._argument_list_with_trailing_comma),
      ']'
    )),
--- a/src/grammar.json
+++ b/src/grammar.json
@ -2346,11 +2346,13 @@
            }
          },
          {
-            "type": "IMMEDIATE_TOKEN",
+            "type": "ALIAS",
            "content": {
-              "type": "STRING",
-              "value": "["
-            }
+              "type": "SYMBOL",
+              "name": "_element_reference_bracket"
+            },
+            "named": false,
+            "value": "["
          },
          {
            "type": "CHOICE",
@ -6078,6 +6080,10 @@
    {
      "type": "SYMBOL",
      "name": "_binary_star_star"
+    },
+    {
+      "type": "SYMBOL",
+      "name": "_element_reference_bracket"
    }
  ],
  "inline": [],
--- a/src/parser.c
+++ b/src/parser.c
--- a/src/scanner.cc
+++ b/src/scanner.cc
@ -38,6 +38,7 @@ enum TokenType {
  HASH_KEY_SYMBOL,
  HASH_SPLAT_STAR_STAR,
  BINARY_STAR_STAR,
+  ELEMENT_REFERENCE_BRACKET,

  NONE
 };
@ -321,7 +322,6 @@ struct Scanner {
  }

  bool scan_symbol_identifier(TSLexer *lexer) {
-
    if (lexer->lookahead == '@') {
      advance(lexer);
      if (lexer->lookahead == '@') {
@ -876,6 +876,20 @@ struct Scanner {
        }
        break;

+      case '[':
+        // Treat a square bracket as an element reference if either:
+        // * the bracket is not preceded by any whitespace
+        // * an arbitrary expression is not valid at the current position.
+        if (valid_symbols[ELEMENT_REFERENCE_BRACKET] && (
+          !has_leading_whitespace ||
+          !valid_symbols[STRING_START]
+        )) {
+          advance(lexer);
+          lexer->result_symbol = ELEMENT_REFERENCE_BRACKET;
+          return true;
+        }
+        break;
+
      default:
        break;
    }
--- a/test/corpus/expressions.txt
+++ b/test/corpus/expressions.txt
@ -1263,9 +1263,9 @@ end
      (destructured_parameter (identifier) (identifier) (splat_parameter (identifier)) (destructured_parameter (identifier) (identifier)))
      (splat_parameter (identifier))))))

-===============================
-method call with array arguments
-===============================
+==================================================
+element reference and method with array arguments
+==================================================

 foo []
 foo [1]
@ -1274,9 +1274,60 @@ foo[1]
 ---

 (program
-  (call (identifier) (argument_list (array)))
-  (call (identifier) (argument_list (array (integer))))
-  (element_reference (identifier) (integer)))
+  (call method: (identifier) arguments: (argument_list (array)))
+  (call method: (identifier) arguments: (argument_list (array (integer))))
+  (element_reference object: (identifier) (integer)))
+
+=====================================
+element reference on call expression
+=====================================
+
+d(a) [0]
+d.find { |x| x > 1 } [0]
+d.find { |x| x > 1 } [0] == 0
+
+---
+
+(program
+  (element_reference (call (identifier) (argument_list (identifier))) (integer))
+  (element_reference
+    (call
+      (identifier)
+      (identifier)
+      (block (block_parameters (identifier)) (binary (identifier) (integer))))
+    (integer))
+  (binary
+    (element_reference
+      (call
+        (identifier)
+        (identifier)
+        (block (block_parameters (identifier)) (binary (identifier) (integer))))
+      (integer))
+    (integer)))
+
+======================================
+call with array and block
+======================================
+
+fun [0] { |x| x }
+
+fun [0] do
+  puts 1
+end
+
+---
+
+(program
+  (call
+    (identifier)
+    (argument_list (array (integer)))
+    (block (block_parameters (identifier)) (identifier)))
+  (call
+    (identifier)
+    (argument_list (array (integer)))
+    (do_block
+      (call (identifier) (argument_list (integer))))))
+

 ==============
 empty lambda expression