// Copyright (C) 2004-2021 Artifex Software, Inc. // // This file is part of MuPDF. // // MuPDF is free software: you can redistribute it and/or modify it under the // terms of the GNU Affero General Public License as published by the Free // Software Foundation, either version 3 of the License, or (at your option) // any later version. // // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more // details. // // You should have received a copy of the GNU Affero General Public License // along with MuPDF. If not, see // // Alternative licensing terms are available from the licensor. // For commercial licensing, see or contact // Artifex Software, Inc., 1305 Grant Avenue - Suite 200, Novato, // CA 94945, U.S.A., +1(415)492-9861, for further information. #include "mupdf/fitz.h" #include "mupdf/pdf.h" #include #define IS_NUMBER \ '+':case'-':case'.':case'0':case'1':case'2':case'3':\ case'4':case'5':case'6':case'7':case'8':case'9' #define IS_WHITE \ '\x00':case'\x09':case'\x0a':case'\x0c':case'\x0d':case'\x20' #define IS_HEX \ '0':case'1':case'2':case'3':case'4':case'5':case'6':\ case'7':case'8':case'9':case'A':case'B':case'C':\ case'D':case'E':case'F':case'a':case'b':case'c':\ case'd':case'e':case'f' #define IS_DELIM \ '(':case')':case'<':case'>':case'[':case']':case'{':\ case'}':case'/':case'%' #define RANGE_0_9 \ '0':case'1':case'2':case'3':case'4':case'5':\ case'6':case'7':case'8':case'9' #define RANGE_a_f \ 'a':case'b':case'c':case'd':case'e':case'f' #define RANGE_A_F \ 'A':case'B':case'C':case'D':case'E':case'F' #define RANGE_0_7 \ '0':case'1':case'2':case'3':case'4':case'5':case'6':case'7' /* #define DUMP_LEXER_STREAM */ #ifdef DUMP_LEXER_STREAM static inline int lex_byte(fz_context *ctx, fz_stream *stm) { int c = fz_read_byte(ctx, stm); if (c == EOF) fz_write_printf(ctx, fz_stdout(ctx), ""); else if (c >= 32 && c < 128) fz_write_printf(ctx, fz_stdout(ctx), "%c", c); else fz_write_printf(ctx, fz_stdout(ctx), "<%02x>", c); return c; } #else #define lex_byte(C,S) fz_read_byte(C,S) #endif static inline int iswhite(int ch) { return ch == '\000' || ch == '\011' || ch == '\012' || ch == '\014' || ch == '\015' || ch == '\040'; } static inline int fz_isprint(int ch) { return ch >= ' ' && ch <= '~'; } static inline int unhex(int ch) { if (ch >= '0' && ch <= '9') return ch - '0'; if (ch >= 'A' && ch <= 'F') return ch - 'A' + 0xA; if (ch >= 'a' && ch <= 'f') return ch - 'a' + 0xA; return 0; } static void lex_white(fz_context *ctx, fz_stream *f) { int c; do { c = lex_byte(ctx, f); } while ((c <= 32) && (iswhite(c))); if (c != EOF) fz_unread_byte(ctx, f); } static void lex_comment(fz_context *ctx, fz_stream *f) { int c; do { c = lex_byte(ctx, f); } while ((c != '\012') && (c != '\015') && (c != EOF)); } /* Fast(ish) but inaccurate strtof, with Adobe overflow handling. */ static float acrobat_compatible_atof(char *s) { int neg = 0; int i = 0; while (*s == '-') { neg = 1; ++s; } while (*s == '+') { ++s; } while (*s >= '0' && *s <= '9') { /* We deliberately ignore overflow here. * Tests show that Acrobat handles * overflows in exactly the same way we do: * 123450000000000000000678 is read as 678. */ i = i * 10 + (*s - '0'); ++s; } if (*s == '.') { float v = i; float n = 0; float d = 1; ++s; while (*s >= '0' && *s <= '9') { n = 10 * n + (*s - '0'); d = 10 * d; ++s; } v += n / d; return neg ? -v : v; } else { return neg ? -i : i; } } /* Fast but inaccurate atoi. */ static int fast_atoi(char *s) { int neg = 0; int i = 0; while (*s == '-') { neg = 1; ++s; } while (*s == '+') { ++s; } while (*s >= '0' && *s <= '9') { /* We deliberately ignore overflow here. */ i = i * 10 + (*s - '0'); ++s; } return neg ? -i : i; } static int lex_number(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf, int c) { char *s = buf->scratch; char *e = buf->scratch + buf->size - 1; /* leave space for zero terminator */ char *isreal = (c == '.' ? s : NULL); int neg = (c == '-'); int isbad = 0; *s++ = c; c = lex_byte(ctx, f); /* skip extra '-' signs at start of number */ if (neg) { while (c == '-') c = lex_byte(ctx, f); } while (s < e) { switch (c) { case IS_WHITE: case IS_DELIM: fz_unread_byte(ctx, f); goto end; case EOF: goto end; case '.': if (isreal) isbad = 1; isreal = s; *s++ = c; break; case '-': /* Bug 703248: Some PDFs (particularly those * generated by google docs) apparently have * numbers like 0.000000000000-5684342 in them. * We'll stop our interpretation at the -, but * keep reading to skip over the trailing * digits so they aren't parsed later. */ *s++ = '\0'; break; case RANGE_0_9: *s++ = c; break; default: isbad = 1; *s++ = c; break; } c = lex_byte(ctx, f); } end: *s = '\0'; if (isbad) return PDF_TOK_KEYWORD; if (isreal) { /* We'd like to use the fastest possible atof * routine, but we'd rather match acrobats * handling of broken numbers. As such, we * spot common broken cases and call an * acrobat compatible routine where required. */ if (neg > 1 || isreal - buf->scratch >= 10) buf->f = acrobat_compatible_atof(buf->scratch); else buf->f = fz_atof(buf->scratch); return PDF_TOK_REAL; } else { buf->i = fast_atoi(buf->scratch); return PDF_TOK_INT; } } static void lex_name(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb) { char *s = lb->scratch; char *e = s + fz_minz(127, lb->size); int c; while (1) { if (s == e) { if (e - lb->scratch < 127) { s += pdf_lexbuf_grow(ctx, lb); e = lb->scratch + fz_minz(127, lb->size); } else { /* truncate names that are too long */ fz_warn(ctx, "name is too long"); *s = 0; lb->len = s - lb->scratch; s = NULL; } } c = lex_byte(ctx, f); switch (c) { case IS_WHITE: case IS_DELIM: fz_unread_byte(ctx, f); goto end; case EOF: goto end; case '#': { int hex[2]; int i; for (i = 0; i < 2; i++) { c = fz_peek_byte(ctx, f); switch (c) { case RANGE_0_9: if (i == 1 && c == '0' && hex[0] == 0) goto illegal; hex[i] = lex_byte(ctx, f) - '0'; break; case RANGE_a_f: hex[i] = lex_byte(ctx, f) - 'a' + 10; break; case RANGE_A_F: hex[i] = lex_byte(ctx, f) - 'A' + 10; break; default: case EOF: goto illegal; } } if (s) *s++ = (hex[0] << 4) + hex[1]; break; illegal: if (i == 1) fz_unread_byte(ctx, f); if (s) *s++ = '#'; continue; } default: if (s) *s++ = c; break; } } end: if (s) { *s = '\0'; lb->len = s - lb->scratch; } } static int lex_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb) { char *s = lb->scratch; char *e = s + lb->size; int bal = 1; int oct; int c; while (1) { if (s == e) { s += pdf_lexbuf_grow(ctx, lb); e = lb->scratch + lb->size; } c = lex_byte(ctx, f); switch (c) { case EOF: return PDF_TOK_ERROR; case '(': bal++; *s++ = c; break; case ')': bal --; if (bal == 0) goto end; *s++ = c; break; case '\\': c = lex_byte(ctx, f); switch (c) { case EOF: return PDF_TOK_ERROR; case 'n': *s++ = '\n'; break; case 'r': *s++ = '\r'; break; case 't': *s++ = '\t'; break; case 'b': *s++ = '\b'; break; case 'f': *s++ = '\f'; break; case '(': *s++ = '('; break; case ')': *s++ = ')'; break; case '\\': *s++ = '\\'; break; case RANGE_0_7: oct = c - '0'; c = lex_byte(ctx, f); if (c >= '0' && c <= '7') { oct = oct * 8 + (c - '0'); c = lex_byte(ctx, f); if (c >= '0' && c <= '7') oct = oct * 8 + (c - '0'); else if (c != EOF) fz_unread_byte(ctx, f); } else if (c != EOF) fz_unread_byte(ctx, f); *s++ = oct; break; case '\n': break; case '\r': c = lex_byte(ctx, f); if ((c != '\n') && (c != EOF)) fz_unread_byte(ctx, f); break; default: *s++ = c; } break; default: *s++ = c; break; } } end: lb->len = s - lb->scratch; return PDF_TOK_STRING; } static int lex_hex_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb) { char *s = lb->scratch; char *e = s + lb->size; int a = 0, x = 0; int c; while (1) { if (s == e) { s += pdf_lexbuf_grow(ctx, lb); e = lb->scratch + lb->size; } c = lex_byte(ctx, f); switch (c) { case IS_WHITE: break; default: fz_warn(ctx, "invalid character in hex string"); /* fall through */ case IS_HEX: if (x) { *s++ = a * 16 + unhex(c); x = !x; } else { a = unhex(c); x = !x; } break; case '>': if (x) { *s++ = a * 16; /* pad truncated string with '0' */ } goto end; case EOF: return PDF_TOK_ERROR; } } end: lb->len = s - lb->scratch; return PDF_TOK_STRING; } static pdf_token pdf_token_from_keyword(char *key) { switch (*key) { case 'R': if (!strcmp(key, "R")) return PDF_TOK_R; break; case 't': if (!strcmp(key, "true")) return PDF_TOK_TRUE; if (!strcmp(key, "trailer")) return PDF_TOK_TRAILER; break; case 'f': if (!strcmp(key, "false")) return PDF_TOK_FALSE; break; case 'n': if (!strcmp(key, "null")) return PDF_TOK_NULL; if (!strcmp(key, "newobj")) return PDF_TOK_NEWOBJ; break; case 'o': if (!strcmp(key, "obj")) return PDF_TOK_OBJ; break; case 'e': if (!strcmp(key, "endobj")) return PDF_TOK_ENDOBJ; if (!strcmp(key, "endstream")) return PDF_TOK_ENDSTREAM; break; case 's': if (!strcmp(key, "stream")) return PDF_TOK_STREAM; if (!strcmp(key, "startxref")) return PDF_TOK_STARTXREF; break; case 'x': if (!strcmp(key, "xref")) return PDF_TOK_XREF; break; } while (*key) { if (!fz_isprint(*key)) return PDF_TOK_ERROR; ++key; } return PDF_TOK_KEYWORD; } void pdf_lexbuf_init(fz_context *ctx, pdf_lexbuf *lb, int size) { lb->size = lb->base_size = size; lb->len = 0; lb->scratch = &lb->buffer[0]; } void pdf_lexbuf_fin(fz_context *ctx, pdf_lexbuf *lb) { if (lb && lb->size != lb->base_size) fz_free(ctx, lb->scratch); } ptrdiff_t pdf_lexbuf_grow(fz_context *ctx, pdf_lexbuf *lb) { char *old = lb->scratch; size_t newsize = lb->size * 2; if (lb->size == lb->base_size) { lb->scratch = Memento_label(fz_malloc(ctx, newsize), "pdf_lexbuf"); memcpy(lb->scratch, lb->buffer, lb->size); } else { lb->scratch = fz_realloc(ctx, lb->scratch, newsize); } lb->size = newsize; return lb->scratch - old; } pdf_token pdf_lex(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf) { while (1) { int c = lex_byte(ctx, f); switch (c) { case EOF: return PDF_TOK_EOF; case IS_WHITE: lex_white(ctx, f); break; case '%': lex_comment(ctx, f); break; case '/': lex_name(ctx, f, buf); return PDF_TOK_NAME; case '(': return lex_string(ctx, f, buf); case ')': return PDF_TOK_ERROR; case '<': c = lex_byte(ctx, f); if (c == '<') return PDF_TOK_OPEN_DICT; if (c != EOF) fz_unread_byte(ctx, f); return lex_hex_string(ctx, f, buf); case '>': c = lex_byte(ctx, f); if (c == '>') return PDF_TOK_CLOSE_DICT; if (c != EOF) fz_unread_byte(ctx, f); return PDF_TOK_ERROR; case '[': return PDF_TOK_OPEN_ARRAY; case ']': return PDF_TOK_CLOSE_ARRAY; case '{': return PDF_TOK_OPEN_BRACE; case '}': return PDF_TOK_CLOSE_BRACE; case IS_NUMBER: return lex_number(ctx, f, buf, c); default: /* isregular: !isdelim && !iswhite && c != EOF */ fz_unread_byte(ctx, f); lex_name(ctx, f, buf); return pdf_token_from_keyword(buf->scratch); } } } pdf_token pdf_lex_no_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf) { while (1) { int c = lex_byte(ctx, f); switch (c) { case EOF: return PDF_TOK_EOF; case IS_WHITE: lex_white(ctx, f); break; case '%': lex_comment(ctx, f); break; case '/': lex_name(ctx, f, buf); return PDF_TOK_NAME; case '(': return PDF_TOK_ERROR; /* no strings allowed */ case ')': return PDF_TOK_ERROR; /* no strings allowed */ case '<': c = lex_byte(ctx, f); if (c == '<') return PDF_TOK_OPEN_DICT; if (c != EOF) fz_unread_byte(ctx, f); return PDF_TOK_ERROR; /* no strings allowed */ case '>': c = lex_byte(ctx, f); if (c == '>') return PDF_TOK_CLOSE_DICT; if (c != EOF) fz_unread_byte(ctx, f); return PDF_TOK_ERROR; case '[': return PDF_TOK_OPEN_ARRAY; case ']': return PDF_TOK_CLOSE_ARRAY; case '{': return PDF_TOK_OPEN_BRACE; case '}': return PDF_TOK_CLOSE_BRACE; case IS_NUMBER: return lex_number(ctx, f, buf, c); default: /* isregular: !isdelim && !iswhite && c != EOF */ fz_unread_byte(ctx, f); lex_name(ctx, f, buf); return pdf_token_from_keyword(buf->scratch); } } } void pdf_append_token(fz_context *ctx, fz_buffer *fzbuf, int tok, pdf_lexbuf *buf) { switch (tok) { case PDF_TOK_NAME: fz_append_printf(ctx, fzbuf, "/%s", buf->scratch); break; case PDF_TOK_STRING: if (buf->len >= buf->size) pdf_lexbuf_grow(ctx, buf); buf->scratch[buf->len] = 0; fz_append_pdf_string(ctx, fzbuf, buf->scratch); break; case PDF_TOK_OPEN_DICT: fz_append_string(ctx, fzbuf, "<<"); break; case PDF_TOK_CLOSE_DICT: fz_append_string(ctx, fzbuf, ">>"); break; case PDF_TOK_OPEN_ARRAY: fz_append_byte(ctx, fzbuf, '['); break; case PDF_TOK_CLOSE_ARRAY: fz_append_byte(ctx, fzbuf, ']'); break; case PDF_TOK_OPEN_BRACE: fz_append_byte(ctx, fzbuf, '{'); break; case PDF_TOK_CLOSE_BRACE: fz_append_byte(ctx, fzbuf, '}'); break; case PDF_TOK_INT: fz_append_printf(ctx, fzbuf, "%ld", buf->i); break; case PDF_TOK_REAL: fz_append_printf(ctx, fzbuf, "%g", buf->f); break; default: fz_append_data(ctx, fzbuf, buf->scratch, buf->len); break; } }