#include #include #include #include #include #include namespace { using std::vector; using std::iswspace; using std::memcpy; enum TokenType { NEWLINE, INDENT, DEDENT, STRING_START, STRING_CONTENT, STRING_END, }; struct Delimiter { enum { SingleQuote = 1 << 0, DoubleQuote = 1 << 1, BackQuote = 1 << 2, Raw = 1 << 3, Format = 1 << 4, Triple = 1 << 5, Bytes = 1 << 6, }; Delimiter() : flags(0) {} bool is_format() const { return flags & Format; } bool is_raw() const { return flags & Raw; } bool is_triple() const { return flags & Triple; } bool is_bytes() const { return flags & Bytes; } int32_t end_character() const { if (flags & SingleQuote) return '\''; if (flags & DoubleQuote) return '"'; if (flags & BackQuote) return '`'; return 0; } void set_format() { flags |= Format; } void set_raw() { flags |= Raw; } void set_triple() { flags |= Triple; } void set_bytes() { flags |= Bytes; } void set_end_character(int32_t character) { switch (character) { case '\'': flags |= SingleQuote; break; case '"': flags |= DoubleQuote; break; case '`': flags |= BackQuote; break; default: assert(false); } } char flags; }; struct Scanner { Scanner() { assert(sizeof(Delimiter) == sizeof(char)); deserialize(NULL, 0); } unsigned serialize(char *buffer) { size_t i = 0; size_t delimiter_count = delimiter_stack.size(); if (delimiter_count > UINT8_MAX) delimiter_count = UINT8_MAX; buffer[i++] = delimiter_count; if (delimiter_count > 0) { memcpy(&buffer[i], delimiter_stack.data(), delimiter_count); } i += delimiter_count; vector::iterator iter = indent_length_stack.begin() + 1, end = indent_length_stack.end(); for (; iter != end && i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; ++iter) { buffer[i++] = *iter; } return i; } void deserialize(const char *buffer, unsigned length) { delimiter_stack.clear(); indent_length_stack.clear(); indent_length_stack.push_back(0); if (length > 0) { size_t i = 0; size_t delimiter_count = (uint8_t)buffer[i++]; delimiter_stack.resize(delimiter_count); if (delimiter_count > 0) { memcpy(delimiter_stack.data(), &buffer[i], delimiter_count); } i += delimiter_count; for (; i < length; i++) { indent_length_stack.push_back(buffer[i]); } } } void advance(TSLexer *lexer) { lexer->advance(lexer, false); } void skip(TSLexer *lexer) { lexer->advance(lexer, true); } bool scan(TSLexer *lexer, const bool *valid_symbols) { if (valid_symbols[STRING_CONTENT] && !valid_symbols[INDENT] && !delimiter_stack.empty()) { Delimiter delimiter = delimiter_stack.back(); int32_t end_character = delimiter.end_character(); bool has_content = false; while (lexer->lookahead) { if ((lexer->lookahead == '{' || lexer->lookahead == '}') && delimiter.is_format()) { lexer->mark_end(lexer); lexer->result_symbol = STRING_CONTENT; return has_content; } else if (lexer->lookahead == '\\') { if (delimiter.is_raw()) { lexer->advance(lexer, false); } else if (delimiter.is_bytes()) { lexer->mark_end(lexer); lexer->advance(lexer, false); if (lexer->lookahead == 'N' || lexer->lookahead == 'u' || lexer->lookahead == 'U') { // In bytes string, \N{...}, \uXXXX and \UXXXXXXXX are not escape sequences // https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals lexer->advance(lexer, false); } else { lexer->result_symbol = STRING_CONTENT; return has_content; } } else { lexer->mark_end(lexer); lexer->result_symbol = STRING_CONTENT; return has_content; } } else if (lexer->lookahead == end_character) { if (delimiter.is_triple()) { lexer->mark_end(lexer); lexer->advance(lexer, false); if (lexer->lookahead == end_character) { lexer->advance(lexer, false); if (lexer->lookahead == end_character) { if (has_content) { lexer->result_symbol = STRING_CONTENT; } else { lexer->advance(lexer, false); lexer->mark_end(lexer); delimiter_stack.pop_back(); lexer->result_symbol = STRING_END; } return true; } else { lexer->mark_end(lexer); lexer->result_symbol = STRING_CONTENT; return true; } } else { lexer->mark_end(lexer); lexer->result_symbol = STRING_CONTENT; return true; } } else { if (has_content) { lexer->result_symbol = STRING_CONTENT; } else { lexer->advance(lexer, false); delimiter_stack.pop_back(); lexer->result_symbol = STRING_END; } lexer->mark_end(lexer); return true; } } else if (lexer->lookahead == '\n' && has_content && !delimiter.is_triple()) { return false; } advance(lexer); has_content = true; } } lexer->mark_end(lexer); bool found_end_of_line = false; uint32_t indent_length = 0; int32_t first_comment_indent_length = -1; for (;;) { if (lexer->lookahead == '\n') { found_end_of_line = true; indent_length = 0; skip(lexer); } else if (lexer->lookahead == ' ') { indent_length++; skip(lexer); } else if (lexer->lookahead == '\r') { indent_length = 0; skip(lexer); } else if (lexer->lookahead == '\t') { indent_length += 8; skip(lexer); } else if (lexer->lookahead == '#') { if (first_comment_indent_length == -1) { first_comment_indent_length = (int32_t)indent_length; } while (lexer->lookahead && lexer->lookahead != '\n') { skip(lexer); } skip(lexer); indent_length = 0; } else if (lexer->lookahead == '\\') { skip(lexer); if (lexer->lookahead == '\r') { skip(lexer); } if (lexer->lookahead == '\n') { skip(lexer); } else { return false; } } else if (lexer->lookahead == '\f') { indent_length = 0; skip(lexer); } else if (lexer->lookahead == 0) { indent_length = 0; found_end_of_line = true; break; } else { break; } } if (found_end_of_line) { if (!indent_length_stack.empty()) { uint16_t current_indent_length = indent_length_stack.back(); if ( valid_symbols[INDENT] && indent_length > current_indent_length ) { indent_length_stack.push_back(indent_length); lexer->result_symbol = INDENT; return true; } if ( valid_symbols[DEDENT] && indent_length < current_indent_length && // Wait to create a dedent token until we've consumed any comments // whose indentation matches the current block. first_comment_indent_length < (int32_t)current_indent_length ) { indent_length_stack.pop_back(); lexer->result_symbol = DEDENT; return true; } } if (valid_symbols[NEWLINE]) { lexer->result_symbol = NEWLINE; return true; } } if (first_comment_indent_length == -1 && valid_symbols[STRING_START]) { Delimiter delimiter; bool has_flags = false; while (lexer->lookahead) { if (lexer->lookahead == 'f' || lexer->lookahead == 'F') { delimiter.set_format(); } else if (lexer->lookahead == 'r' || lexer->lookahead == 'R') { delimiter.set_raw(); } else if (lexer->lookahead == 'b' || lexer->lookahead == 'B') { delimiter.set_bytes(); } else if (lexer->lookahead != 'u' && lexer->lookahead != 'U') { break; } has_flags = true; advance(lexer); } if (lexer->lookahead == '`') { delimiter.set_end_character('`'); advance(lexer); lexer->mark_end(lexer); } else if (lexer->lookahead == '\'') { delimiter.set_end_character('\''); advance(lexer); lexer->mark_end(lexer); if (lexer->lookahead == '\'') { advance(lexer); if (lexer->lookahead == '\'') { advance(lexer); lexer->mark_end(lexer); delimiter.set_triple(); } } } else if (lexer->lookahead == '"') { delimiter.set_end_character('"'); advance(lexer); lexer->mark_end(lexer); if (lexer->lookahead == '"') { advance(lexer); if (lexer->lookahead == '"') { advance(lexer); lexer->mark_end(lexer); delimiter.set_triple(); } } } if (delimiter.end_character()) { delimiter_stack.push_back(delimiter); lexer->result_symbol = STRING_START; return true; } else if (has_flags) { return false; } } return false; } vector indent_length_stack; vector delimiter_stack; }; } extern "C" { void *tree_sitter_python_external_scanner_create() { return new Scanner(); } bool tree_sitter_python_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) { Scanner *scanner = static_cast(payload); return scanner->scan(lexer, valid_symbols); } unsigned tree_sitter_python_external_scanner_serialize(void *payload, char *buffer) { Scanner *scanner = static_cast(payload); return scanner->serialize(buffer); } void tree_sitter_python_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) { Scanner *scanner = static_cast(payload); scanner->deserialize(buffer, length); } void tree_sitter_python_external_scanner_destroy(void *payload) { Scanner *scanner = static_cast(payload); delete scanner; } }