/*
Copyright (c) 2013. The YARA Authors. All Rights Reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its contributors
may be used to endorse or promote products derived from this software without
specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Lexical analyzer for regular expressions */

%{

/* Disable warnings for unused functions in this file.

As we redefine YY_FATAL_ERROR macro to use our own function re_yyfatal, the
yy_fatal_error function generated by Flex is not actually used, causing a
compiler warning. Flex doesn't offer any options to remove the yy_fatal_error
function. When they include something like %option noyy_fatal_error as they do
with noyywrap then we can remove this pragma.
*/

#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wunused-function"
#endif

// NOTE(review): the header names of the original #include directives were not
// visible when this file was reviewed. The list below was rebuilt from the
// identifiers this file uses (standard headers) and from the usual layout of
// the project (yara/... headers) -- confirm it against the build before
// merging.
#include <assert.h>
#include <ctype.h>
#include <limits.h>
#include <setjmp.h>
#include <stdlib.h>
#include <string.h>

#include <yara/error.h>
#include <yara/globals.h>
#include <yara/limits.h>
#include <yara/mem.h>
#include <yara/re.h>
#include <yara/re_lexer.h>
#include <yara/strutils.h>
#include <yara/threading.h>
#include <yara/utils.h>

#ifdef _WIN32
#define snprintf _snprintf
#endif

// Bitmap with 1 bit for each of the 256 possible byte values. The bit is set
// to 1 if the corresponding character is alphanumeric or an underscore, and to
// 0 otherwise.
static uint8_t word_chars[] = {
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
    0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };

// Bitmap with 1 bit for each of the 256 possible byte values. The bit is set
// to 1 if the corresponding character is considered a space. Space characters
// are the blank itself plus horizontal and vertical tabs, carriage return,
// new line and form feed (\t, \v, \r, \n, \f).
static uint8_t space_chars[] = {
    0x00, 0x3E, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };

int escaped_char_value(
    char* text,
    uint8_t* value,
    bool strict_escape);

int read_escaped_char(
    yyscan_t yyscanner,
    uint8_t* escaped_char,
    bool strict_escape);

// Converts the decimal digits at the start of text into an int. Unlike atoi,
// whose behavior is undefined when the number does not fit in an int, this
// function saturates at INT_MAX, so an oversized bound is always rejected by
// the range checks made by the callers. Returns 0 if text doesn't start with a
// number (e.g. the missing lower bound in "{,5}").
static int parse_repeat_bound(
    const char* text)
{
  long value = strtol(text, NULL, 10);

  if (value > INT_MAX)
    return INT_MAX;

  if (value < INT_MIN)
    return INT_MIN;

  return (int) value;
}

%}

%option reentrant bison-bridge
%option noyywrap
%option nounistd
%option nounput
%option never-interactive
%option yylineno
%option prefix="re_yy"
%option outfile="lex.yy.c"

%option verbose
%option warn

%x char_class

digit         [0-9]
hex_digit     [0-9a-fA-F]

%%

\{{digit}*[ ]*,[ ]*{digit}*\} {

  // Examples: {3,8} {3, 8}, {3 ,8} {3 , 8} {0,5} {,5} {7,}

  int hi_bound;
  int lo_bound = parse_repeat_bound(yytext + 1);

  char* comma = strchr(yytext, ',');
  char* hi_bound_ptr = comma + 1;

  // Skip spaces after the comma, if any.
  while (*hi_bound_ptr == ' ') hi_bound_ptr++;

  // A missing upper bound (e.g. {7,}) means "as many as allowed".
  if (*hi_bound_ptr == '}')
    hi_bound = RE_MAX_RANGE;
  else
    hi_bound = parse_repeat_bound(hi_bound_ptr);

  if (hi_bound > RE_MAX_RANGE)
  {
    yyerror(yyscanner, lex_env, "repeat interval too large");
    yyterminate();
  }

  if (hi_bound < lo_bound || hi_bound < 0 || lo_bound < 0)
  {
    yyerror(yyscanner, lex_env, "bad repeat interval");
    yyterminate();
  }

  // Both bounds travel in a single integer: upper bound in the high 16 bits,
  // lower bound in the low 16 bits.
  yylval->range = (hi_bound << 16) | lo_bound;

  return _RANGE_;
}


\{{digit}+\} {

  // Example: {10}

  // Numbers too large to fit in an integer are saturated to INT_MAX by
  // parse_repeat_bound, so they are rejected by the check below.
  int value = parse_repeat_bound(yytext + 1);

  if (value > RE_MAX_RANGE || value < 0)
  {
    yyerror(yyscanner, lex_env, "repeat interval too large");
    yyterminate();
  }

  yylval->range = (value << 16) | value;

  return _RANGE_;
}


\[\^ {

  // Start of a negated character class. Example: [^abcd]

  BEGIN(char_class);
  memset(LEX_ENV->re_class.bitmap, 0, 32);
  LEX_ENV->re_class.negated = true;
}


\[\^\] {

  // Start of character negated class containing a ].
  // Example: [^]abc] this must be interpreted as a class
  // not matching ], a, b, nor c

  BEGIN(char_class);
  memset(LEX_ENV->re_class.bitmap, 0, 32);
  LEX_ENV->re_class.negated = true;
  LEX_ENV->re_class.bitmap[']' / 8] |= 1 << ']' % 8;
}


\[\] {

  // Start of character class containing a ].
  // Example: []abc] this must be interpreted as a class
  // matching ], a, b, or c.

  BEGIN(char_class);
  memset(LEX_ENV->re_class.bitmap, 0, 32);
  LEX_ENV->re_class.negated = false;
  LEX_ENV->re_class.bitmap[']' / 8] |= 1 << ']' % 8;
}


\[ {

  // Start of character class. Example: [abcd]

  BEGIN(char_class);
  memset(LEX_ENV->re_class.bitmap, 0, 32);
  LEX_ENV->re_class.negated = false;
}


[^\\\[\(\)\|\$\.\^\+\*\?] {

  // Any non-special character is passed as a CHAR token to the scanner.

  yylval->integer = yytext[0];
  return _CHAR_;
}


\\w {
  return _WORD_CHAR_;
}


\\W {
  return _NON_WORD_CHAR_;
}


\\s {
  return _SPACE_;
}


\\S {
  return _NON_SPACE_;
}


\\d {
  return _DIGIT_;
}


\\D {
  return _NON_DIGIT_;
}


\\b {
  return _WORD_BOUNDARY_;
}


\\B {
  return _NON_WORD_BOUNDARY_;
}


\\{digit}+ {

  yyerror(yyscanner, lex_env, "backreferences are not allowed");
  yyterminate();
}


\\ {

  // A backslash not handled by any of the rules above. The characters that
  // follow it are consumed directly from the input by read_escaped_char.

  uint8_t c;
  int return_code;

  return_code = read_escaped_char(yyscanner, &c, LEX_ENV->strict_escape);

  if (return_code == VALID_ESCAPE_SEQUENCE)
  {
    yylval->integer = c;
    return _CHAR_;
  }
  else if (return_code == UNKNOWN_ESCAPE_SEQUENCE)
  {
    // Unknown sequences only produce a warning, the escaped character is
    // still returned as a literal.
    yywarning(yyscanner, lex_env, "unknown escape sequence");
    yylval->integer = c;
    return _CHAR_;
  }
  else
  {
    yyerror(yyscanner, lex_env, "illegal escape sequence");
    yyterminate();
  }
}


<char_class>\] {

  // End of character class. The class accumulated in the lexer environment
  // is copied into a newly allocated RE_CLASS handed over to the parser.

  yylval->re_class = (RE_CLASS*) yr_malloc(sizeof(RE_CLASS));

  if (yylval->re_class == NULL)
  {
    // Record the allocation failure before stopping; yyerror does not
    // overwrite an error that has already been set.
    LEX_ENV->last_error = ERROR_INSUFFICIENT_MEMORY;
    yyterminate();
  }

  memcpy(yylval->re_class->bitmap, LEX_ENV->re_class.bitmap, 32);

  yylval->re_class->negated = LEX_ENV->re_class.negated;

  BEGIN(INITIAL);
  return _CLASS_;
}


<char_class>(\\x{hex_digit}{2}|\\.|[^]\\])-[^]] {

  // A range inside a character class. The regexp is...
  //
  //   ( \x{hex_digit}{2}    Hex digit (i.e: \x01) ...
  //   | \.                  ...or any escaped character (i.e. \\, \-) ...
  //   | [^]\]               ...or any character except ] and \ ...
  //   )
  //   -                     ... followed by -
  //   [^]]                  ... followed by any character except ]
  //
  // Some examples:
  //
  //  [abc0-9]
  //      ^-^ matching range 0-9
  //
  //  [a-za-]
  //   ^-^- matching range a-z
  //
  //  [\.-a]
  //   ^--^- matching range \.-a
  //

  uint16_t c;
  uint8_t start = yytext[0];
  uint8_t end = yytext[2];

  if (start == '\\')
  {
    if (!escaped_char_value(yytext, &start, LEX_ENV->strict_escape))
    {
      yyerror(yyscanner, lex_env, "illegal escape sequence");
      yyterminate();
    }

    // The position of the range's end depends on how long the escape
    // sequence for its start is: \xNN-E or \C-E.
    if (yytext[1] == 'x')
      end = yytext[5];
    else
      end = yytext[3];
  }

  if (end == '\\')
  {
    // The pattern matched only the backslash of the end's escape sequence,
    // the rest of it is consumed directly from the input.
    if (!read_escaped_char(yyscanner, &end, LEX_ENV->strict_escape))
    {
      yyerror(yyscanner, lex_env, "illegal escape sequence");
      yyterminate();
    }
  }

  if (end < start)
  {
    yyerror(yyscanner, lex_env, "bad character range");
    yyterminate();
  }

  // c is wider than 8 bits so that the loop terminates when end is 255.
  for (c = start; c <= end; c++)
  {
    LEX_ENV->re_class.bitmap[c / 8] |= 1 << c % 8;
  }
}


<char_class>\\w {

  for (int i = 0; i < 32; i++)
    LEX_ENV->re_class.bitmap[i] |= word_chars[i];
}


<char_class>\\W {

  for (int i = 0; i < 32; i++)
    LEX_ENV->re_class.bitmap[i] |= ~word_chars[i];
}


<char_class>\\s {

  for (int i = 0; i < 32; i++)
    LEX_ENV->re_class.bitmap[i] |= space_chars[i];
}


<char_class>\\S {

  for (int i = 0; i < 32; i++)
    LEX_ENV->re_class.bitmap[i] |= ~space_chars[i];
}


<char_class>\\d {

  for (char c = '0'; c <= '9'; c++)
    LEX_ENV->re_class.bitmap[c / 8] |= 1 << c % 8;
}


<char_class>\\D {

  for (int i = 0; i < 32; i++)
  {
    // digits 0-7 are in the byte at index 6 of the vector, let that byte
    // alone
    if (i == 6)
      continue;

    // digits 8 and 9 are the lowest two bits of the byte at index 7 of the
    // vector, let those bits alone.
    if (i == 7)
      LEX_ENV->re_class.bitmap[i] |= 0xFC;
    else
      LEX_ENV->re_class.bitmap[i] = 0xFF;
  }
}


<char_class>\\ {

  // Same as the escape sequence rule outside of character classes, but the
  // resulting character is added to the class instead of being returned.

  uint8_t c;
  int return_code;

  return_code = read_escaped_char(yyscanner, &c, LEX_ENV->strict_escape);

  if (return_code == VALID_ESCAPE_SEQUENCE)
  {
    LEX_ENV->re_class.bitmap[c / 8] |= 1 << c % 8;
  }
  else if (return_code == UNKNOWN_ESCAPE_SEQUENCE)
  {
    yywarning(yyscanner, lex_env, "unknown escape sequence");
    LEX_ENV->re_class.bitmap[c / 8] |= 1 << c % 8;
  }
  else
  {
    yyerror(yyscanner, lex_env, "illegal escape sequence");
    yyterminate();
  }
}


<char_class>. {

  if (yytext[0] >= 32 && yytext[0] < 127)
  {
    // A character class (i.e: [0-9a-f]) is represented by a 256-bits vector,
    // here we set to 1 the vector's bit corresponding to the input character.

    LEX_ENV->re_class.bitmap[yytext[0] / 8] |= 1 << yytext[0] % 8;
  }
  else
  {
    yyerror(yyscanner, lex_env, "non-ascii character");
    yyterminate();
  }
}


<char_class><<EOF>> {

  // End of regexp reached while scanning a character class.

  yyerror(yyscanner, lex_env, "missing terminating ] for character class");
  yyterminate();
}


. {

  // Printable characters not matched by any other rule are returned as
  // themselves, the token is the character's own value.

  if (yytext[0] >= 32 && yytext[0] < 127)
  {
    return yytext[0];
  }
  else
  {
    yyerror(yyscanner, lex_env, "non-ascii character");
    yyterminate();
  }
}


<<EOF>> {

  yyterminate();
}

%%

//
// Computes the value of the character represented by the escape sequence at
// the start of text.
//
// Args:
//    text: Escape sequence. text[0] must be a backslash. text[2] and text[3]
//          are read only when text[1] is 'x'.
//    value: Receives the value of the escaped character.
//    strict_escape: If true, sequences that are not explicitly recognized are
//                   reported as UNKNOWN_ESCAPE_SEQUENCE.
//
// Returns:
//    VALID_ESCAPE_SEQUENCE for recognized sequences, and for unrecognized
//    ones when strict_escape is false. UNKNOWN_ESCAPE_SEQUENCE for
//    unrecognized sequences when strict_escape is true; *value is set to the
//    character after the backslash in that case too. 0 if \x is not followed
//    by two hexadecimal digits, in which case *value is left untouched.
//
int escaped_char_value(
    char* text,
    uint8_t* value,
    bool strict_escape)
{
  char hex[3];

  assert(text[0] == '\\');

  switch(text[1])
  {
  case 'x':
    // The casts are required because passing a negative value to isxdigit
    // is undefined behavior, and plain char can be signed.
    if (!isxdigit((unsigned char) text[2]) ||
        !isxdigit((unsigned char) text[3]))
      return 0;

    hex[0] = text[2];
    hex[1] = text[3];
    hex[2] = '\0';

    // Two hexadecimal digits always fit in a byte.
    *value = (uint8_t) strtoul(hex, NULL, 16);
    break;

  case 'n':
    *value = '\n';
    break;

  case 't':
    *value = '\t';
    break;

  case 'r':
    *value = '\r';
    break;

  case 'f':
    *value = '\f';
    break;

  case 'a':
    *value = '\a';
    break;

  // Support metacharacters in escape sequences
  case '\\':
  case '^':
  case '$':
  case '.':
  case '|':
  case '(':
  case ')':
  case '[':
  case ']':
  // Support other special characters that are used in rules and need to be
  // escaped
  case '*':
  case '+':
  case '?':
  case '"':
  case '\'':
  case '-':
  case '{':
  case '}':
  case '#':
  case ':':
  case '_':
  case '=':
  case '/':
  case '!':
  case ',':
  case '@':
  case '<':
  case '>':
  case '~':
  case '&':
  case '%':
    *value = text[1];
    break;

  default:
    // Any other character stands for itself, but the sequence is reported
    // as unknown when strict checking was requested.
    *value = text[1];

    if (strict_escape)
      return UNKNOWN_ESCAPE_SEQUENCE;

    return VALID_ESCAPE_SEQUENCE;
  }

  return VALID_ESCAPE_SEQUENCE;
}


#ifdef __cplusplus
#define RE_YY_INPUT yyinput
#else
#define RE_YY_INPUT input
#endif


//
// Reads from the scanner's input the characters that follow a backslash
// already consumed by the lexer, and computes the value of the escaped
// character with escaped_char_value. One character is read, or three if the
// first one is 'x'.
//
// Args:
//    yyscanner: Scanner from which the characters are read.
//    escaped_char: Receives the value of the escaped character.
//    strict_escape: Passed through to escaped_char_value.
//
// Returns:
//    0 if the input ends before the sequence is complete, the result of
//    escaped_char_value otherwise.
//
int read_escaped_char(
    yyscan_t yyscanner,
    uint8_t* escaped_char,
    bool strict_escape)
{
  char text[4] = {'\\', 0, 0, 0};

  // The result of RE_YY_INPUT is kept in an int until it has been compared
  // with EOF. Once converted to char, EOF can't be told apart from the byte
  // 0xFF if char is signed, and is never detected if char is unsigned.
  int c = RE_YY_INPUT(yyscanner);

  if (c == EOF || c == 0)
    return 0;

  text[1] = (char) c;

  if (c == 'x')
  {
    // \x must be followed by two more characters.
    for (int i = 2; i < 4; i++)
    {
      c = RE_YY_INPUT(yyscanner);

      if (c == EOF || c == 0)
        return 0;

      text[i] = (char) c;
    }
  }

  return escaped_char_value(text, escaped_char, strict_escape);
}


//
// yyfatal (actually named re_yyfatal because of the '%option prefix="re_yy"'
// directive) is called when a fatal error occurs in the parser. When this
// happens we are deep inside the parsing logic generated by flex/bison and
// the only way to exit gracefully from there is using setjmp/longjmp.
//
// The jump target is the jmp_buf stored in yr_yyfatal_trampoline_tls by
// yr_parse_re_string. error_message is not used.
//
void yyfatal(
    yyscan_t yyscanner,
    const char *error_message)
{
  jmp_buf* recovery_trampoline = (jmp_buf*) yr_thread_storage_get_value(
      &yr_yyfatal_trampoline_tls);

  longjmp(*recovery_trampoline, 1);
}


//
// Records an error in the lexer environment: last_error is set to
// ERROR_INVALID_REGULAR_EXPRESSION and error_message is copied to
// last_error_message. A previous warning (ERROR_UNKNOWN_ESCAPE_SEQUENCE) is
// replaced by the error, but a previous error is never replaced.
//
void yyerror(
    yyscan_t yyscanner,
    RE_LEX_ENVIRONMENT* lex_env,
    const char *error_message)
{
  // if lex_env->last_error was set to some error code before
  // don't overwrite it, we are interested in the first error, not in
  // subsequent errors like "syntax error, unexpected $end" caused by
  // early parser termination.

  if (lex_env->last_error == ERROR_SUCCESS ||
      lex_env->last_error == ERROR_UNKNOWN_ESCAPE_SEQUENCE)
  {
    lex_env->last_error = ERROR_INVALID_REGULAR_EXPRESSION;

    strlcpy(
        lex_env->last_error_message,
        error_message,
        sizeof(lex_env->last_error_message));
  }
}


//
// Records a warning in the lexer environment: last_error is set to
// ERROR_UNKNOWN_ESCAPE_SEQUENCE and error_message is copied to
// last_error_message, but only if nothing was recorded before. Only the first
// warning is kept, and warnings never replace errors.
//
void yywarning(
    yyscan_t yyscanner,
    RE_LEX_ENVIRONMENT* lex_env,
    const char *error_message)
{
  // Do not overwrite Errors
  // print out warning only if there is not any other error beforehand
  if (lex_env->last_error == ERROR_SUCCESS)
  {
    lex_env->last_error = ERROR_UNKNOWN_ESCAPE_SEQUENCE;

    strlcpy(
        lex_env->last_error_message,
        error_message,
        sizeof(lex_env->last_error_message));
  }
}


//
// Parses the regular expression in re_string and builds its AST.
//
// Args:
//    re_string: Null-terminated string with the regular expression.
//    re_ast: Receives the AST. It is set to NULL if the scanner can't be
//            initialized or the regular expression is invalid.
//    error: Its message field receives the error or warning message when the
//           result is the code recorded by the lexer/parser.
//    flags: If RE_PARSER_FLAG_ENABLE_STRICT_ESCAPE_SEQUENCES is set, unknown
//           escape sequences are reported.
//
// Returns:
//    ERROR_SUCCESS
//    ERROR_INTERNAL_FATAL_ERROR if yyfatal was called while parsing.
//    ERROR_INSUFFICIENT_MEMORY if the scanner can't be initialized.
//    ERROR_UNKNOWN_ESCAPE_SEQUENCE if the only problem found was an unknown
//    escape sequence. This works as a warning, the AST is still returned.
//    Any other code recorded in the lexer environment while parsing.
//    NOTE(review): failures of yr_re_ast_create are presumably propagated by
//    FAIL_ON_ERROR, which is defined elsewhere -- confirm.
//
int yr_parse_re_string(
  const char* re_string,
  RE_AST** re_ast,
  RE_ERROR* error,
  int flags)
{
  yyscan_t yyscanner;
  jmp_buf recovery_trampoline;
  RE_LEX_ENVIRONMENT lex_env;

  lex_env.last_error = ERROR_SUCCESS;

  if (flags & RE_PARSER_FLAG_ENABLE_STRICT_ESCAPE_SEQUENCES)
    lex_env.strict_escape = true;
  else
    lex_env.strict_escape = false;

  lex_env.last_error_message[0] = '\0';

  // Make the trampoline reachable from yyfatal.
  yr_thread_storage_set_value(
      &yr_yyfatal_trampoline_tls,
      &recovery_trampoline);

  // setjmp returns a non-zero value only when we are returning to this
  // point via a call to longjmp to the recovery trampoline.
  //
  // NOTE(review): when that happens this function returns without destroying
  // the AST and the scanner created below.
  if (setjmp(recovery_trampoline) != 0)
    return ERROR_INTERNAL_FATAL_ERROR;

  FAIL_ON_ERROR(yr_re_ast_create(re_ast));

  if (yylex_init(&yyscanner) != 0)
  {
    yr_re_ast_destroy(*re_ast);
    *re_ast = NULL;
    return ERROR_INSUFFICIENT_MEMORY;
  }

  yyset_extra(*re_ast, yyscanner);
  yy_scan_string(re_string, yyscanner);
  yyparse(yyscanner, &lex_env);
  yylex_destroy(yyscanner);

  if (lex_env.last_error != ERROR_SUCCESS)
  {
    // An unknown escape sequence is just a warning, the AST is still valid
    // and must be returned to the caller.
    if (lex_env.last_error != ERROR_UNKNOWN_ESCAPE_SEQUENCE)
    {
      yr_re_ast_destroy(*re_ast);
      *re_ast = NULL;
    }

    strlcpy(
        error->message,
        lex_env.last_error_message,
        sizeof(error->message));

    return lex_env.last_error;
  }

  return ERROR_SUCCESS;
}