#include enum TokenType { STRING_FRAGMENT, INDENTED_STRING_FRAGMENT, PATH_START, PATH_FRAGMENT, }; static void advance(TSLexer *lexer) { lexer->advance(lexer, false); } static void skip(TSLexer *lexer) { lexer->advance(lexer, true); } // Here we only parse literal fragment inside a string. // Delimiter, interpolation and escape sequence are handled by the parser and we simply stop at them. // // The implementation is inspired by tree-sitter-javascript: // https://github.com/tree-sitter/tree-sitter-javascript/blob/fdeb68ac8d2bd5a78b943528bb68ceda3aade2eb/src/scanner.c#L19 static bool scan_string_fragment(TSLexer *lexer) { lexer->result_symbol = STRING_FRAGMENT; for (bool has_content = false;; has_content = true) { lexer->mark_end(lexer); switch (lexer->lookahead) { case '"': case '\\': return has_content; case '$': advance(lexer); if (lexer->lookahead == '{') { return has_content; } else if (lexer->lookahead != '"' && lexer->lookahead != '\\') { // Any char following '$' other than '"', '\\' and '{' (which was handled above) // should be consumed as additional string content. // This means `$${` doesn't start an interpolation, but `$$${` does. advance(lexer); } break; // Simply give up on EOF or '\0'. case '\0': return false; default: advance(lexer); } } } // See comments of scan_string_fragment. static bool scan_indented_string_fragment(TSLexer *lexer) { lexer->result_symbol = INDENTED_STRING_FRAGMENT; for (bool has_content = false;; has_content = true) { lexer->mark_end(lexer); switch (lexer->lookahead) { case '$': advance(lexer); if (lexer->lookahead == '{') { return has_content; } else if (lexer->lookahead != '\'') { // Any char following '$' other than '\'' and '{' (which was handled above) // should be consumed as additional string content. // This means `$${` doesn't start an interpolation, but `$$${` does. advance(lexer); } break; case '\'': advance(lexer); if (lexer->lookahead == '\'') { // Two single quotes always stop current string fragment. // It can be either an end delimiter '', or escape sequences ''', ''$, ''\ return has_content; } break; // Simply give up on EOF or '\0'. case '\0': return false; default: advance(lexer); } } } static bool is_path_char(char c) { return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '-' || c == '+' || c == '_' || c == '.' || c == '/'; } static bool scan_path_start(TSLexer *lexer) { lexer->result_symbol = PATH_START; bool have_sep = false; bool have_after_sep = false; char c = lexer->lookahead; // unlike string_fragments which which are preceded by initial token (i.e. '"') // and thus will have all leading external whitespace consumed, // we have no such luxury with the path_start token. // // so we must skip over any leading whitespace here. while (c == ' ' || c == '\n' || c == '\r' || c == '\t') { skip(lexer); c = lexer->lookahead; } while (true) { lexer->mark_end(lexer); c = lexer->lookahead; if (c == '/') { have_sep = true; } else if (is_path_char(c)) { if (have_sep) { have_after_sep = true; } } else if (c == '$') { // starting a interpolation, // so we have a valid token as long as we've seen a separator. // example: a/${x} return have_sep; } else { // we have a valid token if we've consumed anything after a separator. // example: a/b return have_after_sep; } advance(lexer); } } static bool scan_path_fragment(TSLexer *lexer) { lexer->result_symbol = PATH_FRAGMENT; for (bool has_content = false;; has_content = true) { lexer->mark_end(lexer); if (!is_path_char(lexer->lookahead)) { return has_content; } advance(lexer); } } void *tree_sitter_nix_external_scanner_create() { return NULL; } bool tree_sitter_nix_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) { // This never happens in valid grammar. Only during error recovery, everything becomes valid. // See: https://github.com/tree-sitter/tree-sitter/issues/1259 // // We should not consume any content as string fragment during error recovery, or we'll break // more valid grammar below. // The test 'attrset typing field following string' covers this. if (valid_symbols[STRING_FRAGMENT] && valid_symbols[INDENTED_STRING_FRAGMENT] && valid_symbols[PATH_START] && valid_symbols[PATH_FRAGMENT] ) { return false; } else if (valid_symbols[STRING_FRAGMENT]) { return scan_string_fragment(lexer); } else if (valid_symbols[INDENTED_STRING_FRAGMENT]) { return scan_indented_string_fragment(lexer); } else if (valid_symbols[PATH_FRAGMENT] && is_path_char(lexer->lookahead)) { // path_fragments should be scanned as immediate tokens, with no preceding extras. // so we assert that the very first token is a path character, // and otherwise we fall through to the case below. // example: // a/b${c} d/e${f} // ^--- note that scanning for the path_fragment will start here. // this *should* be parsed as a function application. // so we want to fall through to the path_start case below, // which will skip the whitespace and correctly scan the following path_start. // // also, we want this above path_start, because wherever there's ambiguity we want to parse another fragment // instead of starting a new path. // example: // a/b${c}d/e${f} // if we swap the precedence, we'd effectively parse the above as the following function application: // (a/b${c}) (d/e${f}) return scan_path_fragment(lexer); } else if (valid_symbols[PATH_START]) { return scan_path_start(lexer); } return false; } unsigned tree_sitter_nix_external_scanner_serialize(void *payload, char *buffer) { return 0; } void tree_sitter_nix_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) { } void tree_sitter_nix_external_scanner_destroy(void *payload) { }