/*
Copyright (c) 2013. The YARA Authors. All Rights Reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation and/or
other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its contributors
may be used to endorse or promote products derived from this software without
specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Lexical analyzer for regular expressions */

%{

/* Disable warnings for unused functions in this file.

As we redefine YY_FATAL_ERROR macro to use our own function re_yyfatal, the
yy_fatal_error function generated by Flex is not actually used, causing a
compiler warning. Flex doesn't offer any options to remove the yy_fatal_error
function. When they include something like %option noyy_fatal_error as they do
with noyywrap then we can remove this pragma.
*/

#ifdef __GNUC__
#pragma GCC diagnostic ignored "-Wunused-function"
#endif

// NOTE(review): the header names on the #include lines were missing from the
// text under review. The list below was reconstructed from the symbols this
// file uses and must be confirmed against the project tree before merging.

#include <assert.h>
#include <setjmp.h>

#include <yara/globals.h>
#include <yara/utils.h>
#include <yara/error.h>
#include <yara/limits.h>
#include <yara/mem.h>
#include <yara/re.h>
#include <yara/re_lexer.h>
#include <yara/threading.h>
#include <yara/strutils.h>


#ifdef _WIN32
#define snprintf _snprintf
#endif


// Bitmap with 1 bit for each of the 256 possible byte values. The bit is set
// to 1 if the corresponding character is a word character (ASCII letters,
// decimal digits and the underscore) or 0 if otherwise.

static const uint8_t word_chars[] = {
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
    0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };


// Bitmap with 1 bit for each of the 256 possible byte values. The bit is set
// to 1 if the corresponding character is considered a space. Space characters
// are the space itself plus horizontal and vertical tabs, carriage return,
// new line and form feed (\t, \v, \r, \n, \f).

static const uint8_t space_chars[] = {
    0x00, 0x3E, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };


int escaped_char_value(
    char* text,
    uint8_t* value);

int read_escaped_char(
    yyscan_t yyscanner,
    uint8_t* escaped_char);


// Parses the run of decimal digits at the start of text (possibly empty, in
// which case the result is 0). The result is never negative. Once the value
// exceeds INT16_MAX no more digits are accumulated, so arbitrarily long digit
// runs cannot overflow; callers only need to compare the result against
// INT16_MAX to detect out-of-range input.

static int parse_repeat_bound(
    const char* text)
{
  int value = 0;

  for (; *text >= '0' && *text <= '9'; text++)
  {
    if (value <= INT16_MAX)
      value = value * 10 + (*text - '0');
  }

  return value;
}

%}

%option reentrant bison-bridge
%option noyywrap
%option nounistd
%option nounput
%option never-interactive
%option yylineno
%option prefix="re_yy"

%option outfile="lex.yy.c"

%option verbose
%option warn

%x char_class

digit         [0-9]
hex_digit     [0-9a-fA-F]

%%

\{{digit}*,{digit}*\} {

  // Examples: {3,8} {0,5} {,5} {7,}

  int hi_bound;
  int lo_bound = parse_repeat_bound(yytext + 1);

  // The pattern guarantees that a comma is present.
  char* comma = strchr(yytext, ',');

  if (comma[1] == '}')
    // if comma is followed by the closing curly bracket
    // (example: {2,}) set high bound value to maximum.
    hi_bound = INT16_MAX;
  else
    hi_bound = parse_repeat_bound(comma + 1);

  if (hi_bound > INT16_MAX)
  {
    yyerror(yyscanner, lex_env, "repeat interval too large");
    yyterminate();
  }

  // Bounds are never negative here. An interval is rejected when it is
  // inverted or when it is {0,0}; hi_bound == 0 together with
  // hi_bound >= lo_bound implies lo_bound == 0.
  if (hi_bound < lo_bound || hi_bound == 0)
  {
    yyerror(yyscanner, lex_env, "bad repeat interval");
    yyterminate();
  }

  // Both bounds fit in 15 bits, pack them into a single integer.
  yylval->range = (hi_bound << 16) | lo_bound;

  return _RANGE_;
}


\{{digit}+\} {

  // Example: {10}

  int value = parse_repeat_bound(yytext + 1);

  if (value > INT16_MAX)
  {
    yyerror(yyscanner, lex_env, "repeat interval too large");
    yyterminate();
  }

  if (value == 0)
  {
    yyerror(yyscanner, lex_env, "bad repeat interval");
    yyterminate();
  }

  yylval->range = (value << 16) | value;

  return _RANGE_;
}


\[\^ {

  // Start of a negated character class. Example: [^abcd]

  BEGIN(char_class);
  memset(LEX_ENV->re_class.bitmap, 0, 32);
  LEX_ENV->re_class.negated = true;
}


\[\^\] {

  // Start of character negated class containing a ].
  // Example: [^]abc] this must be interpreted as a class
  // not matching ], a, b, nor c

  BEGIN(char_class);
  memset(LEX_ENV->re_class.bitmap, 0, 32);
  LEX_ENV->re_class.negated = true;
  LEX_ENV->re_class.bitmap[']' / 8] |= 1 << (']' % 8);
}


\[\] {

  // Start of character class containing a ].
  // Example: []abc] this must be interpreted as a class
  // matching ], a, b, or c.

  BEGIN(char_class);
  memset(LEX_ENV->re_class.bitmap, 0, 32);
  LEX_ENV->re_class.negated = false;
  LEX_ENV->re_class.bitmap[']' / 8] |= 1 << (']' % 8);
}


\[ {

  // Start of character class. Example: [abcd]

  BEGIN(char_class);
  memset(LEX_ENV->re_class.bitmap, 0, 32);
  LEX_ENV->re_class.negated = false;
}


[^\\\[\(\)\|\$\.\^\+\*\?] {

  // Any non-special character is passed as a CHAR token to the scanner.

  yylval->integer = yytext[0];
  return _CHAR_;
}


\\w {
  return _WORD_CHAR_;
}


\\W {
  return _NON_WORD_CHAR_;
}


\\s {
  return _SPACE_;
}


\\S {
  return _NON_SPACE_;
}


\\d {
  return _DIGIT_;
}


\\D {
  return _NON_DIGIT_;
}


\\b {
  return _WORD_BOUNDARY_;
}


\\B {
  return _NON_WORD_BOUNDARY_;
}


\\{digit}+ {

  yyerror(yyscanner, lex_env, "backreferences are not allowed");
  yyterminate();
}


\\ {

  uint8_t c;

  if (read_escaped_char(yyscanner, &c))
  {
    yylval->integer = c;
    return _CHAR_;
  }
  else
  {
    yyerror(yyscanner, lex_env, "illegal escape sequence");
    yyterminate();
  }
}


<char_class>\] {

  // End of character class.

  yylval->re_class = (RE_CLASS*) yr_malloc(sizeof(RE_CLASS));

  // Do not copy into a failed allocation. The failure is reported through
  // yyerror, which is the error channel used by every other rule here.
  if (yylval->re_class == NULL)
  {
    yyerror(yyscanner, lex_env, "out of memory");
    yyterminate();
  }

  memcpy(yylval->re_class->bitmap, LEX_ENV->re_class.bitmap, 32);

  yylval->re_class->negated = LEX_ENV->re_class.negated;

  BEGIN(INITIAL);
  return _CLASS_;
}


<char_class>(\\x{hex_digit}{2}|\\.|[^\\])\-[^]] {

  // A range inside a character class.
  //  [abc0-9]
  //      ^- matching here

  uint16_t c;
  uint8_t start = yytext[0];
  uint8_t end = yytext[2];

  if (start == '\\')
  {
    if (!escaped_char_value(yytext, &start))
    {
      yyerror(yyscanner, lex_env, "illegal escape sequence");
      yyterminate();
    }

    // The range start was an escape sequence, so the end character sits
    // further into the match: after \xNN- or after \c-
    if (yytext[1] == 'x')
      end = yytext[5];
    else
      end = yytext[3];
  }

  if (end == '\\')
  {
    // The end of the range is itself escaped; the characters after the
    // backslash have not been consumed yet, read them from the input.
    if (!read_escaped_char(yyscanner, &end))
    {
      yyerror(yyscanner, lex_env, "illegal escape sequence");
      yyterminate();
    }
  }

  if (end < start)
  {
    yyerror(yyscanner, lex_env, "bad character range");
    yyterminate();
  }

  // c is wider than 8 bits so the loop terminates even when end is 255.
  for (c = start; c <= end; c++)
  {
    LEX_ENV->re_class.bitmap[c / 8] |= 1 << (c % 8);
  }
}


<char_class>\\w {

  int i;

  for (i = 0; i < 32; i++)
    LEX_ENV->re_class.bitmap[i] |= word_chars[i];
}


<char_class>\\W {

  int i;

  for (i = 0; i < 32; i++)
    LEX_ENV->re_class.bitmap[i] |= ~word_chars[i];
}


<char_class>\\s {

  int i;

  for (i = 0; i < 32; i++)
    LEX_ENV->re_class.bitmap[i] |= space_chars[i];
}


<char_class>\\S {

  int i;

  for (i = 0; i < 32; i++)
    LEX_ENV->re_class.bitmap[i] |= ~space_chars[i];
}


<char_class>\\d {

  char c;

  for (c = '0'; c <= '9'; c++)
    LEX_ENV->re_class.bitmap[c / 8] |= 1 << (c % 8);
}


<char_class>\\D {

  int i;

  for (i = 0; i < 32; i++)
  {
    // digits 0-7 occupy all eight bits of the byte at index 6 of the
    // vector, let that byte alone
    if (i == 6)
      continue;

    // digits 8 and 9 are the lowest two bits of the byte at index 7 of the
    // vector, let those bits alone.
    if (i == 7)
      LEX_ENV->re_class.bitmap[i] |= 0xFC;
    else
      LEX_ENV->re_class.bitmap[i] = 0xFF;
  }
}


<char_class>\\ {

  uint8_t c;

  if (read_escaped_char(yyscanner, &c))
  {
    LEX_ENV->re_class.bitmap[c / 8] |= 1 << (c % 8);
  }
  else
  {
    yyerror(yyscanner, lex_env, "illegal escape sequence");
    yyterminate();
  }
}


<char_class>. {

  if (yytext[0] >= 32 && yytext[0] < 127)
  {
    // A character class (i.e: [0-9a-f]) is represented by a 256-bits vector,
    // here we set to 1 the vector's bit corresponding to the input character.

    LEX_ENV->re_class.bitmap[yytext[0] / 8] |= 1 << (yytext[0] % 8);
  }
  else
  {
    yyerror(yyscanner, lex_env, "non-ascii character");
    yyterminate();
  }
}


<char_class><<EOF>> {

  // End of regexp reached while scanning a character class.

  yyerror(yyscanner, lex_env, "missing terminating ] for character class");
  yyterminate();
}


. {

  if (yytext[0] >= 32 && yytext[0] < 127)
  {
    return yytext[0];
  }
  else
  {
    yyerror(yyscanner, lex_env, "non-ascii character");
    yyterminate();
  }
}


<<EOF>> {

  yyterminate();
}

%%

//
// escaped_char_value
//
// Translates the escape sequence at the start of text into the byte it
// stands for.
//
// Args:
//    char* text      - Escape sequence; text[0] must be a backslash. For the
//                      \xNN form text[2] and text[3] are read as well.
//    uint8_t* value  - Receives the resulting byte on success.
//
// Returns:
//    1 on success, 0 if the sequence is \x not followed by two hex digits.
//    \n, \t, \r, \f and \a map to their control characters; any other
//    escaped character stands for itself.
//

int escaped_char_value(
    char* text,
    uint8_t* value)
{
  unsigned int hex_value = 0;
  char hex[3];

  assert(text[0] == '\\');

  switch (text[1])
  {
    case 'x':
      // The <ctype.h> functions are only defined for values representable
      // as unsigned char (or EOF), so plain char must be converted first.
      if (!isxdigit((unsigned char) text[2]) ||
          !isxdigit((unsigned char) text[3]))
      {
        return 0;
      }

      hex[0] = text[2];
      hex[1] = text[3];
      hex[2] = '\0';

      // Cannot fail: both characters were just verified to be hex digits.
      sscanf(hex, "%x", &hex_value);
      *value = (uint8_t) hex_value;
      break;

    case 'n':
      *value = '\n';
      break;

    case 't':
      *value = '\t';
      break;

    case 'r':
      *value = '\r';
      break;

    case 'f':
      *value = '\f';
      break;

    case 'a':
      *value = '\a';
      break;

    default:
      *value = text[1];
  }

  return 1;
}


#ifdef __cplusplus
#define RE_YY_INPUT yyinput
#else
#define RE_YY_INPUT input
#endif


//
// read_escaped_char
//
// Reads from the scanner input the characters that follow an already
// consumed backslash and translates the resulting escape sequence.
//
// Args:
//    yyscan_t yyscanner     - Scanner to read from.
//    uint8_t* escaped_char  - Receives the resulting byte on success.
//
// Returns:
//    1 on success, 0 if the input ends (EOF or a zero byte) before the
//    sequence is complete or if the sequence is not valid.
//

int read_escaped_char(
    yyscan_t yyscanner,
    uint8_t* escaped_char)
{
  char text[4] = {0, 0, 0, 0};
  int c;

  text[0] = '\\';

  // The value returned by the scanner is kept in an int until it has been
  // compared with EOF. Narrowing it to char first would make byte 0xFF
  // indistinguishable from EOF where char is signed, and would make EOF
  // undetectable where char is unsigned.

  c = RE_YY_INPUT(yyscanner);

  if (c == EOF || c == 0)
    return 0;

  text[1] = (char) c;

  if (c == 'x')
  {
    c = RE_YY_INPUT(yyscanner);

    if (c == EOF || c == 0)
      return 0;

    text[2] = (char) c;

    c = RE_YY_INPUT(yyscanner);

    if (c == EOF || c == 0)
      return 0;

    text[3] = (char) c;
  }

  return escaped_char_value(text, escaped_char);
}


//
// yyfatal
//
// Handler for fatal scanner errors. Jumps back to the recovery point stored
// in thread storage by yr_parse_re_string; it does not return. The error
// message is not used.
//

void yyfatal(
    yyscan_t yyscanner,
    const char *error_message)
{
  jmp_buf* recovery_state = (jmp_buf*) yr_thread_storage_get_value(
      &yr_recovery_state_key);

  longjmp(*recovery_state, 1);
}


//
// yyerror
//
// Records a parsing error in the lexer environment.
//

void yyerror(
    yyscan_t yyscanner,
    RE_LEX_ENVIRONMENT* lex_env,
    const char *error_message)
{
  // if lex_env->last_error was set to some error code before
  // don't overwrite it, we are interested in the first error, not in
  // subsequent errors like "syntax error, unexpected $end" caused by
  // early parser termination.

  if (lex_env->last_error == ERROR_SUCCESS)
  {
    lex_env->last_error = ERROR_INVALID_REGULAR_EXPRESSION;

    strlcpy(
        lex_env->last_error_message,
        error_message,
        sizeof(lex_env->last_error_message));
  }
}


//
// yr_parse_re_string
//
// Parses a regular expression and builds its AST.
//
// Args:
//    const char* re_string  - Regular expression to parse.
//    RE_AST** re_ast        - Receives the new AST on success. It is set to
//                             NULL when an error is reported after the AST
//                             was created.
//    RE_ERROR* error        - Its message field receives the first error
//                             message recorded while parsing.
//
// Returns:
//    ERROR_SUCCESS, the error recorded while parsing, the error returned by
//    yr_re_ast_create, or ERROR_INTERNAL_FATAL_ERROR if the scanner could not
//    be created or raised a fatal error.
//

int yr_parse_re_string(
    const char* re_string,
    RE_AST** re_ast,
    RE_ERROR* error)
{
  yyscan_t yyscanner;
  jmp_buf recovery_state;
  RE_LEX_ENVIRONMENT lex_env;

  lex_env.last_error = ERROR_SUCCESS;
  lex_env.last_error_message[0] = '\0';

  // yyfatal retrieves this pointer to jump back here on fatal errors.
  yr_thread_storage_set_value(&yr_recovery_state_key, &recovery_state);

  // NOTE(review): when this branch is taken neither the AST nor the scanner
  // is released here; confirm whether callers are expected to handle that.
  if (setjmp(recovery_state) != 0)
    return ERROR_INTERNAL_FATAL_ERROR;

  FAIL_ON_ERROR(yr_re_ast_create(re_ast));

  // yylex_init returns non-zero when the scanner could not be allocated, in
  // which case yyscanner must not be used.
  if (yylex_init(&yyscanner) != 0)
  {
    yr_re_ast_destroy(*re_ast);
    *re_ast = NULL;

    return ERROR_INTERNAL_FATAL_ERROR;
  }

  yyset_extra(*re_ast, yyscanner);
  yy_scan_string(re_string, yyscanner);
  yyparse(yyscanner, &lex_env);
  yylex_destroy(yyscanner);

  if (lex_env.last_error != ERROR_SUCCESS)
  {
    yr_re_ast_destroy(*re_ast);
    *re_ast = NULL;

    strlcpy(
        error->message,
        lex_env.last_error_message,
        sizeof(error->message));

    return lex_env.last_error;
  }

  return ERROR_SUCCESS;
}