/* Copyright 2018, UCAR/Unidata and OPeNDAP, Inc.
   See the COPYRIGHT file for more information. */

/*
 * Hand-written lexer for DAP2 text (DDS, DAS and constraint expressions).
 * daplex() is the token source; the remaining functions create, configure
 * and destroy the lexer state.
 */

#include "config.h"
#ifdef HAVE_STRINGS_H
#include <strings.h>
#endif
#include "oc.h"
#include "dapparselex.h"
#include "dapy.h"

#undef URLCVT /* NEVER turn this on */

/* Do we %xx decode all or part of a DAP Identifier: see dapdecode() */
#define DECODE_PARTIAL

#define DAP2ENCODE
#ifdef DAP2ENCODE
#define KEEPSLASH
#endif

/* Forward */
static void dumptoken(DAPlexstate* lexstate);
static void dapaddyytext(DAPlexstate* lex, int c);
#ifndef DAP2ENCODE
static int tohex(int c);
#endif

/****************************************************/

#ifdef INFORMATIONAL
/* Set of all ascii printable characters */
static const char ascii[] = " !\"#$%&'()*+,-./:;<=>?@[]\\^_`|{}~";

/* Define the set of legal nonalphanum characters as specified in the DAP2 spec. */
static const char* daplegal ="_!~*'-\"";
#endif

/* Single-character tokens; each is returned as its own character code */
static const char* ddsworddelims =
  "{}[]:;=,";

/* Define the characters that may start a word (...chars1) and the
   characters that may continue a word (...charsn). */
/* Note: for some reason I added # and removed !~'" what was I thinking? */
static const char* ddswordchars1 =
  "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
  "-+_/%\\.*!~'\"";
static const char* ddswordcharsn =
  "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
  "-+_/%\\.*!~'\"";

/* This includes sharp and colon for historical reasons */
static const char* daswordcharsn =
  "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
  "-+_/%\\.*#:!~'\"";

/* Need to remove '.' to allow for fqns */
static const char* cewordchars1 =
  "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
  "-+_/%\\*!~'\"";
static const char* cewordcharsn =
  "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
  "-+_/%\\*!~'\"";

/* Current sets of legal characters */
/*
static char* wordchars1 = NULL;
static char* wordcharsn = NULL;
static char* worddelims = NULL;
*/

/* Keyword spellings (matched case-insensitively by daplex).
   keywords[i] corresponds to keytokens[i]; keep the two tables aligned. */
static const char* keywords[] = {
"alias",
"array",
"attributes",
"byte",
"dataset",
"error",
"float32",
"float64",
"grid",
"int16",
"int32",
"maps",
"sequence",
"string",
"structure",
"uint16",
"uint32",
"url",
"code",
"message",
"program_type",
"program",
NULL /* mark end of the keywords list */
};

static const int keytokens[] = {
SCAN_ALIAS,
SCAN_ARRAY,
SCAN_ATTR,
SCAN_BYTE,
SCAN_DATASET,
SCAN_ERROR,
SCAN_FLOAT32,
SCAN_FLOAT64,
SCAN_GRID,
SCAN_INT16,
SCAN_INT32,
SCAN_MAPS,
SCAN_SEQUENCE,
SCAN_STRING,
SCAN_STRUCTURE,
SCAN_UINT16,
SCAN_UINT32,
SCAN_URL,
SCAN_CODE,
SCAN_MESSAGE,
SCAN_PTYPE,
SCAN_PROG
};

/**************************************************/

/*
 * Return the next token from the input held in state->lexstate.
 *
 * lvalp: if non-NULL, receives a copy of the token text, or NULL when the
 *        token has no text (delimiters, end of input). The copy is pushed
 *        onto lexstate->reclaim and released by daplexcleanup(); the caller
 *        must not free it.
 * state: parser state; only state->lexstate is used here (and state itself
 *        is handed to daperror in the compiled-out !DAP2ENCODE branch).
 *
 * Returns: a delimiter's character code, WORD_STRING, WORD_WORD, one of the
 * SCAN_* keyword tokens, or 0 once the terminating NUL is reached.
 *
 * Invariant on return: lexstate->next never points past the terminating
 * NUL, so calling daplex again after end of input keeps returning 0.
 */
int
daplex(YYSTYPE* lvalp, DAPparsestate* state)
{
    DAPlexstate* lexstate = state->lexstate;
    int token;
    int c;
    unsigned int i;
    char* p;
    char* tmp;
    YYSTYPE lval = NULL;

    token = 0;
    ncbytesclear(lexstate->yytext);
    /* invariant: p always points to current char; the loop increment steps
       past the last character consumed by the body */
    for(p=lexstate->next;token==0&&(c=*p);p++) {
        if(c == '\n') {
            lexstate->lineno++;
        } else if(c <= ' ' || c == '\177') {
            /* whitespace: ignore */
        } else if(c == '#') {
            /* single line comment: stop on the last comment character so
               that the loop increment lands on the newline (which is then
               counted in lineno above) or on the terminating NUL (which
               ends the scan without running off the buffer) */
            while(p[1] != '\0' && p[1] != '\n') p++;
        } else if(strchr(lexstate->worddelims,c) != NULL) {
            /* don't put in lexstate->yytext to avoid memory leak */
            token = c;
        } else if(c == '"') {
            int more = 1;
            /* We have a string token; will be reported as WORD_STRING */
            while(more && (c=*(++p))) {
                if(c == '"') {
                    more = 0;
                    continue;
                }
#ifdef DAP2ENCODE
                if(c == '\\') {
                    /* Resolve spec ambiguity about handling of \c:
                       1. !KEEPSLASH: convert \c to c for any character c
                       2. KEEPSLASH: convert \c to \c for any character c;
                          that is, keep the backslash.
                       It is clear that the problem being addressed was \".
                       But it is unclear what to do about \n: convert to
                       Ascii LF or leave as \n.
                       This code will leave as \n and assume higher levels
                       of code will address the issue.
                    */
#ifdef KEEPSLASH
                    dapaddyytext(lexstate,c);
#endif
                    c=*(++p);
                    if(c == '\0') more = 0;
                }
#else /*Non-standard*/
                /* NOTE(review): compiled out, since DAP2ENCODE is always
                   defined at the top of this file */
                switch (c) {
                case '\\':
                    c=*(++p);
                    switch (c) {
                    case 'r': c = '\r'; break;
                    case 'n': c = '\n'; break;
                    case 'f': c = '\f'; break;
                    case 't': c = '\t'; break;
                    case 'x': {
                        int d1,d2;
                        c = '?';
                        ++p;
                        d1 = tohex(*p++);
                        if(d1 < 0) {
                            daperror(state,"Illegal \\xDD in TOKEN_STRING");
                        } else {
                            d2 = tohex(*p++);
                            if(d2 < 0) {
                                daperror(state,"Illegal \\xDD in TOKEN_STRING");
                            } else {
                                c=(((unsigned int)d1)<<4) | (unsigned int)d2;
                            }
                        }
                    } break;
                    default: break;
                    }
                    break;
                default: break;
                }
#endif /*!DAP2ENCODE*/
                if(more) dapaddyytext(lexstate,c);
            }
            /* Unterminated string: p is on the terminating NUL. Back up one
               so the loop increment leaves p on the NUL, not beyond it. */
            if(*p == '\0') p--;
            token=WORD_STRING;
        } else if(strchr(lexstate->wordchars1,c) != NULL) {
            int isdatamark = 0;
            /* we have a WORD_WORD */
            dapaddyytext(lexstate,c);
            while((c=*(++p))) {
#ifdef URLCVT
                if(c == '%' && p[1] != 0 && p[2] != 0
                            && strchr(hexdigits,p[1]) != NULL
                            && strchr(hexdigits,p[2]) != NULL) {
                    int d1,d2;
                    d1 = tohex(p[1]);
                    d2 = tohex(p[2]);
                    if(d1 >= 0 || d2 >= 0) {
                        c=(((unsigned int)d1)<<4) | (unsigned int)d2;
                        p+=2;
                    }
                } else {
                    if(strchr(lexstate->wordcharsn,c) == NULL) {p--; break;}
                }
                dapaddyytext(lexstate,c);
#else
                if(strchr(lexstate->wordcharsn,c) == NULL) {p--; break;}
                dapaddyytext(lexstate,c);
#endif
            }
            /* Word ran up to the terminating NUL: back up one so the loop
               increment leaves p on the NUL, not beyond it. */
            if(c == '\0') p--;
            /* Special check for Data: */
            /* NOTE(review): at this point p rests on the last character that
               was added to yytext, so "*p == ':'" and "yytext == Data" look
               mutually exclusive and this branch appears unreachable.
               Left as-is; confirm the intended behavior before changing. */
            tmp = ncbytescontents(lexstate->yytext);
            if(strcmp(tmp,"Data")==0 && *p == ':') {
                dapaddyytext(lexstate,*p); p++;
                if(p[0] == '\n') {
                    token = SCAN_DATA;
                    isdatamark = 1;
                    p++;
                } else if(p[0] == '\r' && p[1] == '\n') {
                    token = SCAN_DATA;
                    isdatamark = 1;
                    p+=2;
                }
            }
            if(!isdatamark) {
                /* check for keyword */
                token=WORD_WORD; /* assume */
                for(i=0;;i++) {
                    if(keywords[i] == NULL) break;
                    if(strcasecmp(keywords[i],tmp)==0) {
                        token=keytokens[i];
                        break;
                    }
                }
            }
        } else { /* illegal */
        }
    }
    lexstate->next = p;
    /* NOTE(review): strncpy does not NUL-terminate when the token is
       MAX_TOKEN_LENGTH or longer; this is only safe if lasttokentext has at
       least MAX_TOKEN_LENGTH+1 bytes and its last byte stays zero. Confirm
       against the declaration in dapparselex.h. */
    strncpy(lexstate->lasttokentext,ncbytescontents(lexstate->yytext),MAX_TOKEN_LENGTH);
    lexstate->lasttoken = token;
    if(ocdebug >= 2)
        dumptoken(lexstate);

    /*Put return value onto Bison stack*/

    if(ncbyteslength(lexstate->yytext) == 0)
        lval = NULL;
    else {
        lval = ncbytesdup(lexstate->yytext);
        nclistpush(lexstate->reclaim,(void*)lval);
    }
    if(lvalp) *lvalp = lval;
    return token;      /* Return the type of the token.  */
}

/* Append one character to the text of the token being accumulated. */
static void
dapaddyytext(DAPlexstate* lex, int c)
{
    ncbytesappend(lex->yytext,c);
}

#ifndef DAP2ENCODE
/* Convert a hexadecimal digit character to its value (0..15);
   return -1 if c is not a hexadecimal digit. */
static int
tohex(int c)
{
    if(c >= 'a' && c <= 'f') return (c - 'a') + 0xa;
    if(c >= 'A' && c <= 'F') return (c - 'A') + 0xa;
    if(c >= '0' && c <= '9') return (c - '0');
    return -1;
}
#endif

/* Debug aid: print the current token text to stderr. */
static void
dumptoken(DAPlexstate* lexstate)
{
    fprintf(stderr,"TOKEN = |%s|\n",ncbytescontents(lexstate->yytext));
}

/*
 * Select the character classes the lexer uses.
 * kind 0 installs the dds* tables, kind 1 the dds* tables with the das*
 * continuation set (which additionally allows '#' and ':'), and kind 2 the
 * ce* tables (which exclude '.'). Any other kind leaves the state unchanged.
 */
void
dapsetwordchars(DAPlexstate* lexstate, int kind)
{
    switch (kind) {
    case 0:
        lexstate->worddelims = ddsworddelims;
        lexstate->wordchars1 = ddswordchars1;
        lexstate->wordcharsn = ddswordcharsn;
        break;
    case 1:
        lexstate->worddelims = ddsworddelims;
        lexstate->wordchars1 = ddswordchars1;
        lexstate->wordcharsn = daswordcharsn;
        break;
    case 2:
        lexstate->worddelims = ddsworddelims;
        lexstate->wordchars1 = cewordchars1;
        lexstate->wordcharsn = cewordcharsn;
        break;
    default:
        break;
    }
}

/*
 * Allocate and initialize a lexer state over a private copy of input.
 *
 * input:     NUL-terminated text to scan; NULL is treated as empty text.
 * lexstatep: receives the new state, or NULL if the state or the copy of
 *            the input could not be allocated. If lexstatep itself is NULL
 *            nothing is built.
 *
 * The state must be released with daplexcleanup().
 */
void
daplexinit(char* input, DAPlexstate** lexstatep)
{
    DAPlexstate* lexstate;
    if(lexstatep == NULL) return; /* no point in building it */
    *lexstatep = NULL;
    lexstate = (DAPlexstate*)malloc(sizeof(DAPlexstate));
    if(lexstate == NULL) return;
    memset((void*)lexstate,0,sizeof(DAPlexstate));
    /* strdup(NULL) is undefined, so substitute an empty string */
    lexstate->input = strdup(input != NULL ? input : "");
    if(lexstate->input == NULL) {
        /* daplex would dereference a NULL cursor; report failure instead */
        free(lexstate);
        return;
    }
    lexstate->next = lexstate->input;
    lexstate->yytext = ncbytesnew();
    lexstate->reclaim = nclistnew();
    dapsetwordchars(lexstate,0); /* Assume DDS */
    *lexstatep = lexstate;
}

/*
 * Release a lexer state created by daplexinit, including the input copy,
 * the token buffer and every string recorded on the reclaim list, then set
 * *lexstatep to NULL. A NULL lexstatep or a NULL *lexstatep is a no-op.
 */
void
daplexcleanup(DAPlexstate** lexstatep)
{
    DAPlexstate* lexstate;
    if(lexstatep == NULL) return;
    lexstate = *lexstatep;
    if(lexstate == NULL) return;
    if(lexstate->input != NULL) ocfree(lexstate->input);
    if(lexstate->reclaim != NULL) {
        while(nclistlength(lexstate->reclaim) > 0) {
            char* word = (char*)nclistpop(lexstate->reclaim);
            free(word); /* free(NULL) is a no-op */
        }
        nclistfree(lexstate->reclaim);
    }
    ncbytesfree(lexstate->yytext);
    free(lexstate);
    *lexstatep = NULL;
}

/* Dap identifiers will come to us with some
   characters escaped using the URL notation of
   %HH. The assumption here is that any character
   that is encoded is left encoded, except as follows:
   1. if the encoded character is in fact a legal DAP2 character
      (alphanum+"_!~*'-\"") then it is decoded, otherwise not.
   Note that decodeset below also lists '@' in addition to that set.
*/
#ifdef DECODE_PARTIAL
static const char* decodeset = /* Specify which characters are decoded */
  "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_!~*'-\"@";
#endif

/*
 * Return a %HH-decoded copy of name. With DECODE_PARTIAL defined (as it is
 * in this file) the work is delegated to ncuridecodepartial with decodeset;
 * otherwise ncuridecode is used.
 * The result is pushed onto lexstate->reclaim and is released by
 * daplexcleanup(); the caller must not free it.
 */
char*
dapdecode(DAPlexstate* lexstate, char* name)
{
    char* decoded = NULL;
#ifdef DECODE_PARTIAL
    decoded = ncuridecodepartial(name,decodeset); /* Decode selected */
#else
    decoded = ncuridecode(name); /* Decode everything */
#endif
    nclistpush(lexstate->reclaim,(void*)decoded);
    return decoded;
}