#include "tree_sitter/parser.h" #include #include #include #include #define MAX(a, b) ((a) > (b) ? (a) : (b)) #define VEC_RESIZE(vec, _cap) \ void *tmp = realloc((vec).data, (_cap) * sizeof((vec).data[0])); \ assert(tmp != NULL); \ (vec).data = tmp; \ assert((vec).data != NULL); \ (vec).cap = (_cap); #define VEC_GROW(vec, _cap) \ if ((vec).cap < (_cap)) { \ VEC_RESIZE((vec), (_cap)); \ } #define VEC_PUSH(vec, el) \ if ((vec).cap == (vec).len) { \ VEC_RESIZE((vec), MAX(16, (vec).len * 2)); \ } \ (vec).data[(vec).len++] = (el); #define VEC_POP(vec) (vec).len--; #define VEC_BACK(vec) ((vec).data[(vec).len - 1]) #define VEC_FREE(vec) \ { \ if ((vec).data != NULL) \ free((vec).data); \ } #define VEC_CLEAR(vec) (vec).len = 0; #define VEC_REVERSE(vec) \ do { \ if ((vec).len > 1) { \ for (size_t i = 0, j = (vec).len - 1; i < j; i++, j--) { \ uint8_t tmp = (vec).data[i]; \ (vec).data[i] = (vec).data[j]; \ (vec).data[j] = tmp; \ } \ } \ } while (0) enum TokenType { VIRTUAL_END_DECL, VIRTUAL_OPEN_SECTION, VIRTUAL_END_SECTION, MINUS_WITHOUT_TRAILING_WHITESPACE, GLSL_CONTENT, BLOCK_COMMENT_CONTENT, }; typedef struct { uint32_t len; uint32_t cap; uint8_t *data; } vec; typedef struct { uint32_t indent_length; vec indents; vec runback; } Scanner; static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); } static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); } // > You can detect error recovery in the external scanner by the fact that // > _all_ tokens are considered valid at once. // https://github.com/tree-sitter/tree-sitter/pull/1783#issuecomment-1181011411 static bool in_error_recovery(const bool *valid_symbols) { return (valid_symbols[VIRTUAL_END_DECL] && valid_symbols[VIRTUAL_OPEN_SECTION] && valid_symbols[VIRTUAL_END_SECTION] && valid_symbols[MINUS_WITHOUT_TRAILING_WHITESPACE] && valid_symbols[GLSL_CONTENT] && valid_symbols[BLOCK_COMMENT_CONTENT]); } static bool is_elm_space(TSLexer *lexer) { return lexer->lookahead == ' ' || lexer->lookahead == '\r' || lexer->lookahead == '\n'; } static int checkForIn(TSLexer *lexer, const bool *valid_symbols) { // Are we at the end of a let (in) declaration if (valid_symbols[VIRTUAL_END_SECTION] && lexer->lookahead == 'i') { skip(lexer); if (lexer->lookahead == 'n') { skip(lexer); if (is_elm_space(lexer) || lexer->eof(lexer)) { return 2; // Success } return 1; // Partial } return 1; // Partial } return 0; } static bool scan_block_comment(TSLexer *lexer) { lexer->mark_end(lexer); if (lexer->lookahead != '{') { return false; } advance(lexer); if (lexer->lookahead != '-') { return false; } advance(lexer); while (true) { switch (lexer->lookahead) { case '{': scan_block_comment(lexer); break; case '-': advance(lexer); if (lexer->lookahead == '}') { advance(lexer); return true; } break; case '\0': return true; default: advance(lexer); } } } static void advance_to_line_end(TSLexer *lexer) { while (true) { if (lexer->lookahead == '\n' || lexer->eof(lexer)) { break; } advance(lexer); } } static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) { if (in_error_recovery(valid_symbols)) { return false; } // First handle eventual runback tokens, we saved on a previous scan op if (scanner->runback.len > 0 && VEC_BACK(scanner->runback) == 0 && valid_symbols[VIRTUAL_END_DECL]) { VEC_POP(scanner->runback); lexer->result_symbol = VIRTUAL_END_DECL; return true; } if (scanner->runback.len > 0 && VEC_BACK(scanner->runback) == 1 && valid_symbols[VIRTUAL_END_SECTION]) { VEC_POP(scanner->runback); lexer->result_symbol = VIRTUAL_END_SECTION; return true; } VEC_CLEAR(scanner->runback); // Check if we have newlines and how much indentation bool has_newline = false; bool found_in = false; bool can_call_mark_end = true; lexer->mark_end(lexer); while (true) { if (lexer->lookahead == ' ' || lexer->lookahead == '\r') { skip(lexer); } else if (lexer->lookahead == '\n') { skip(lexer); has_newline = true; while (true) { if (lexer->lookahead == ' ') { skip(lexer); } else { scanner->indent_length = lexer->get_column(lexer); break; } } } else if (!valid_symbols[BLOCK_COMMENT_CONTENT] && lexer->lookahead == '-') { advance(lexer); int32_t lookahead = lexer->lookahead; // Handle minus without a whitespace for negate if (valid_symbols[MINUS_WITHOUT_TRAILING_WHITESPACE] && ((lookahead >= 'a' && lookahead <= 'z') || (lookahead >= 'A' && lookahead <= 'Z') || lookahead == '(')) { if (can_call_mark_end) { lexer->result_symbol = MINUS_WITHOUT_TRAILING_WHITESPACE; lexer->mark_end(lexer); return true; } return false; } // Scan past line comments. As far as the special token // types we're scanning for here are concerned line comments // are like whitespace. There is nothing useful to be // learned from, say, their indentation. So we advance past // them here. // // The one thing we need to keep in mind is that we should // not call `lexer->mark_end(lexer)` after this point, or // the comment will be lost. if (lookahead == '-' && has_newline) { can_call_mark_end = false; advance(lexer); advance_to_line_end(lexer); } else if (valid_symbols[BLOCK_COMMENT_CONTENT] && lexer->lookahead == '}') { lexer->result_symbol = BLOCK_COMMENT_CONTENT; return true; } else { return false; } } else if (lexer->eof(lexer)) { if (valid_symbols[VIRTUAL_END_SECTION]) { lexer->result_symbol = VIRTUAL_END_SECTION; return true; } if (valid_symbols[VIRTUAL_END_DECL]) { lexer->result_symbol = VIRTUAL_END_DECL; return true; } break; } else { break; } } if (checkForIn(lexer, valid_symbols) == 2) { if (has_newline) { found_in = true; } else { lexer->result_symbol = VIRTUAL_END_SECTION; VEC_POP(scanner->indents); return true; } } // Open section if the grammar lets us but only push to indent stack if // we go further down in the stack if (valid_symbols[VIRTUAL_OPEN_SECTION] && !lexer->eof(lexer)) { VEC_PUSH(scanner->indents, lexer->get_column(lexer)); lexer->result_symbol = VIRTUAL_OPEN_SECTION; return true; } if (valid_symbols[BLOCK_COMMENT_CONTENT]) { if (!can_call_mark_end) { return false; } lexer->mark_end(lexer); while (true) { if (lexer->lookahead == '\0') { break; } if (lexer->lookahead != '{' && lexer->lookahead != '-') { advance(lexer); } else if (lexer->lookahead == '-') { lexer->mark_end(lexer); advance(lexer); if (lexer->lookahead == '}') { break; } } else if (scan_block_comment(lexer)) { lexer->mark_end(lexer); advance(lexer); if (lexer->lookahead == '-') { break; } } } lexer->result_symbol = BLOCK_COMMENT_CONTENT; return true; } if (has_newline) { // We had a newline now it's time to check if we need to add // multiple tokens to get back up to the right level VEC_CLEAR(scanner->runback); while (scanner->indent_length <= VEC_BACK(scanner->indents)) { if (scanner->indent_length == VEC_BACK(scanner->indents)) { if (found_in) { VEC_PUSH(scanner->runback, 1); found_in = false; break; } // Don't insert VIRTUAL_END_DECL when there is a line // comment incoming if (lexer->lookahead == '-') { skip(lexer); if (lexer->lookahead == '-') { break; } } // Don't insert VIRTUAL_END_DECL when there is a block // comment incoming if (lexer->lookahead == '{') { skip(lexer); if (lexer->lookahead == '-') { break; } } VEC_PUSH(scanner->runback, 0); break; } if (scanner->indent_length < VEC_BACK(scanner->indents)) { VEC_POP(scanner->indents); VEC_PUSH(scanner->runback, 1); found_in = false; } } // Needed for some of the more weird cases where let is in the same // line as everything before the in in the next line if (found_in) { VEC_PUSH(scanner->runback, 1); found_in = false; } // Our list is the wrong way around, reverse it VEC_REVERSE(scanner->runback); // Handle the first runback token if we have them, if there are more // they will be handled on the next scan operation if (scanner->runback.len > 0 && VEC_BACK(scanner->runback) == 0 && valid_symbols[VIRTUAL_END_DECL]) { VEC_POP(scanner->runback); lexer->result_symbol = VIRTUAL_END_DECL; return true; } if (scanner->runback.len > 0 && VEC_BACK(scanner->runback) == 1 && valid_symbols[VIRTUAL_END_SECTION]) { VEC_POP(scanner->runback); lexer->result_symbol = VIRTUAL_END_SECTION; return true; } if (lexer->eof(lexer) && valid_symbols[VIRTUAL_END_SECTION]) { lexer->result_symbol = VIRTUAL_END_SECTION; return true; } } if (valid_symbols[GLSL_CONTENT]) { if (!can_call_mark_end) { return false; } lexer->result_symbol = GLSL_CONTENT; while (true) { switch (lexer->lookahead) { case '|': lexer->mark_end(lexer); advance(lexer); if (lexer->lookahead == ']') { advance(lexer); return true; } break; case '\0': lexer->mark_end(lexer); return true; default: advance(lexer); } } } return false; } // -------------------------------------------------------------------------------------------------------- // API // -------------------------------------------------------------------------------------------------------- /** * This function allocates the persistent state of the parser that is passed * into the other API functions. */ void *tree_sitter_elm_external_scanner_create() { Scanner *scanner = (Scanner *)calloc(1, sizeof(Scanner)); return scanner; } /** * Main logic entry point. * Since the state is a singular vector, it can just be cast and used directly. */ bool tree_sitter_elm_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) { Scanner *scanner = (Scanner *)payload; return scan(scanner, lexer, valid_symbols); } /** * Copy the current state to another location for later reuse. * This is normally more complex, but since this parser's state constists solely * of a vector of integers, it can just be copied. */ unsigned tree_sitter_elm_external_scanner_serialize(void *payload, char *buffer) { Scanner *scanner = (Scanner *)payload; size_t size = 0; if (3 + scanner->indents.len + scanner->runback.len >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) { return 0; } size_t runback_count = scanner->runback.len; if (runback_count > UINT8_MAX) { runback_count = UINT8_MAX; } buffer[size++] = (char)runback_count; if (runback_count > 0) { memcpy(&buffer[size], scanner->runback.data, runback_count); } size += runback_count; size_t indent_length_length = sizeof(scanner->indent_length); buffer[size++] = (char)indent_length_length; if (indent_length_length > 0) { memcpy(&buffer[size], &scanner->indent_length, indent_length_length); } size += indent_length_length; int iter = 1; for (; iter != scanner->indents.len && size < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; ++iter) { buffer[size++] = (char)scanner->indents.data[iter]; } return size; } /** * Load another parser state into the currently active state. * `payload` is the state of the previous parser execution, while `buffer` is * the saved state of a different position (e.g. when doing incremental * parsing). */ void tree_sitter_elm_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) { Scanner *scanner = (Scanner *)payload; VEC_CLEAR(scanner->runback); VEC_CLEAR(scanner->indents); VEC_PUSH(scanner->indents, 0); if (length == 0) { return; } size_t size = 0; size_t runback_count = (unsigned char)buffer[size++]; VEC_GROW(scanner->runback, runback_count) if (runback_count > 0) { memcpy(scanner->runback.data, &buffer[size], runback_count); scanner->runback.len = runback_count; size += runback_count; } size_t indent_length_length = (unsigned char)buffer[size++]; if (indent_length_length > 0) { memcpy(&scanner->indent_length, &buffer[size], indent_length_length); size += indent_length_length; } for (; size < length; size++) { VEC_PUSH(scanner->indents, (unsigned char)buffer[size]); } assert(size == length); } /** * Destroy the state. */ void tree_sitter_elm_external_scanner_destroy(void *payload) { Scanner *scanner = (Scanner *)payload; VEC_FREE(scanner->indents); VEC_FREE(scanner->runback); free(scanner); }