// Copyright (C) 2004-2023 Artifex Software, Inc. // // This file is part of MuPDF. // // MuPDF is free software: you can redistribute it and/or modify it under the // terms of the GNU Affero General Public License as published by the Free // Software Foundation, either version 3 of the License, or (at your option) // any later version. // // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more // details. // // You should have received a copy of the GNU Affero General Public License // along with MuPDF. If not, see // // Alternative licensing terms are available from the licensor. // For commercial licensing, see or contact // Artifex Software, Inc., 1305 Grant Avenue - Suite 200, Novato, // CA 94945, U.S.A., +1(415)492-9861, for further information. #include "mupdf/fitz.h" #include "mupdf/ucdn.h" #include "html-imp.h" #include #include #include enum { T, R, B, L }; #define DEFAULT_DIR FZ_BIDI_LTR static const char *html_default_css = "@page{margin:3em 2em}" "a{color:#06C;text-decoration:underline}" "address{display:block;font-style:italic}" "b{font-weight:bold}" "bdo{direction:rtl;unicode-bidi:bidi-override}" "blockquote{display:block;margin:1em 40px}" "body{display:block;margin:1em}" "cite{font-style:italic}" "code{font-family:monospace}" "dd{display:block;margin:0 0 0 40px}" "del{text-decoration:line-through}" "div{display:block}" "dl{display:block;margin:1em 0}" "dt{display:block}" "em{font-style:italic}" "h1{display:block;font-size:2em;font-weight:bold;margin:0.67em 0;page-break-after:avoid}" "h2{display:block;font-size:1.5em;font-weight:bold;margin:0.83em 0;page-break-after:avoid}" "h3{display:block;font-size:1.17em;font-weight:bold;margin:1em 0;page-break-after:avoid}" "h4{display:block;font-size:1em;font-weight:bold;margin:1.33em 0;page-break-after:avoid}" "h5{display:block;font-size:0.83em;font-weight:bold;margin:1.67em 0;page-break-after:avoid}" "h6{display:block;font-size:0.67em;font-weight:bold;margin:2.33em 0;page-break-after:avoid}" "head{display:none}" "hr{border-style:solid;border-width:1px;display:block;margin-bottom:0.5em;margin-top:0.5em;text-align:center}" "html{display:block}" "i{font-style:italic}" "ins{text-decoration:underline}" "kbd{font-family:monospace}" "li{display:list-item}" "menu{display:block;list-style-type:disc;margin:1em 0;padding:0 0 0 30pt}" "ol{display:block;list-style-type:decimal;margin:1em 0;padding:0 0 0 30pt}" "p{display:block;margin:1em 0}" "pre{display:block;font-family:monospace;margin:1em 0;white-space:pre}" "samp{font-family:monospace}" "script{display:none}" "small{font-size:0.83em}" "strong{font-weight:bold}" "style{display:none}" "sub{font-size:0.83em;vertical-align:sub}" "sup{font-size:0.83em;vertical-align:super}" "table{display:table;border-spacing:2px}" "tbody{display:table-row-group}" "td{display:table-cell;padding:1px;background-color:inherit}" "tfoot{display:table-footer-group}" "th{display:table-cell;font-weight:bold;padding:1px;text-align:center;background-color:inherit}" "thead{display:table-header-group}" "tr{display:table-row}" "ul{display:block;list-style-type:disc;margin:1em 0;padding:0 0 0 30pt}" "ul ul{list-style-type:circle}" "ul ul ul{list-style-type:square}" "var{font-style:italic}" "colgroup{display:table-column-group}" "col{display:table-column}" "caption{display:block;text-align:center}" ; static const char *mobi_default_css = "pagebreak{display:block;page-break-before:always}" "dl,ol,ul{margin:0}" "p{margin:0}" "blockquote{margin:0 40px}" "center{display:block;text-align:center}" "big{font-size:1.17em}" "strike{text-decoration:line-through}" ; static const char *fb2_default_css = "@page{margin:3em 2em}" "FictionBook{display:block;margin:1em}" "stylesheet,binary{display:none}" "description>*{display:none}" "description>title-info{display:block}" "description>title-info>*{display:none}" "description>title-info>coverpage{display:block;page-break-before:always;page-break-after:always}" "body,section,title,subtitle,p,cite,epigraph,text-author,date,poem,stanza,v,empty-line{display:block}" "image{display:block}" "p>image{display:inline}" "table{display:table}" "tr{display:table-row}" "th,td{display:table-cell}" "a{color:#06C;text-decoration:underline}" "a[type=note]{font-size:small;vertical-align:super}" "code{white-space:pre;font-family:monospace}" "emphasis{font-style:italic}" "strikethrough{text-decoration:line-through}" "strong{font-weight:bold}" "sub{font-size:small;vertical-align:sub}" "sup{font-size:small;vertical-align:super}" "image{margin:1em 0;text-align:center}" "cite,poem{margin:1em 2em}" "subtitle,epigraph,stanza{margin:1em 0}" "title>p{text-align:center;font-size:x-large}" "subtitle{text-align:center;font-size:large}" "p{margin-top:1em;text-align:justify}" "empty-line{padding-top:1em}" "p+p{margin-top:0;text-indent:1.5em}" "empty-line+p{margin-top:0}" "section>title{page-break-before:always}" ; struct genstate { fz_pool *pool; fz_html_font_set *set; fz_archive *zip; fz_tree *images; fz_xml_doc *xml; int is_fb2; const char *base_uri; fz_css *css; int at_bol; fz_html_box *emit_white; int last_brk_cls; int list_counter; int section_depth; fz_bidi_direction markup_dir; fz_text_language markup_lang; char *href; fz_css_style_splay *styles; }; static int iswhite(int c) { return c == ' ' || c == '\t' || c == '\r' || c == '\n'; } static int is_all_white(const char *s) { while (*s) { if (!iswhite(*s)) return 0; ++s; } return 1; } /* TODO: pool allocator for flow nodes */ /* TODO: store text by pointing to a giant buffer */ static void fz_drop_html_flow(fz_context *ctx, fz_html_flow *flow) { while (flow) { fz_html_flow *next = flow->next; if (flow->type == FLOW_IMAGE) fz_drop_image(ctx, flow->content.image); flow = next; } } static fz_html_flow *add_flow(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, int type, int extras) { size_t size = (type == FLOW_IMAGE ? sizeof(fz_html_flow) : offsetof(fz_html_flow, content) + extras); fz_html_flow *flow; /* Shouldn't happen, but bug 705324. */ if (top == NULL || top->type != BOX_FLOW) return NULL; flow = fz_pool_alloc(ctx, pool, size); flow->type = type; flow->expand = 0; flow->bidi_level = 0; flow->markup_lang = 0; flow->breaks_line = 0; flow->box = inline_box; (*top->s.build.flow_tail) = flow; top->s.build.flow_tail = &flow->next; return flow; } static void add_flow_space(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) { fz_html_flow *flow = add_flow(ctx, pool, top, inline_box, FLOW_SPACE, 0); if (flow) flow->expand = 1; } static void add_flow_break(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) { (void)add_flow(ctx, pool, top, inline_box, FLOW_BREAK, 0); } static void add_flow_sbreak(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) { (void)add_flow(ctx, pool, top, inline_box, FLOW_SBREAK, 0); } static void add_flow_shyphen(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) { (void)add_flow(ctx, pool, top, inline_box, FLOW_SHYPHEN, 0); } static void add_flow_word(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, const char *a, const char *b, int lang) { fz_html_flow *flow = add_flow(ctx, pool, top, inline_box, FLOW_WORD, b - a + 1); if (flow == NULL) return; memcpy(flow->content.text, a, b - a); flow->content.text[b - a] = 0; flow->markup_lang = lang; } static void add_flow_image(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, fz_image *img) { fz_html_flow *flow = add_flow(ctx, pool, top, inline_box, FLOW_IMAGE, 0); if (flow) flow->content.image = fz_keep_image(ctx, img); } static void add_flow_anchor(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) { (void)add_flow(ctx, pool, top, inline_box, FLOW_ANCHOR, 0); } fz_html_flow *fz_html_split_flow(fz_context *ctx, fz_pool *pool, fz_html_flow *flow, size_t offset) { fz_html_flow *new_flow; char *text; size_t len; assert(flow->type == FLOW_WORD); if (offset == 0) return flow; text = flow->content.text; while (*text && offset) { int rune; text += fz_chartorune(&rune, text); offset--; } len = strlen(text); new_flow = fz_pool_alloc(ctx, pool, offsetof(fz_html_flow, content) + len+1); memcpy(new_flow, flow, offsetof(fz_html_flow, content)); new_flow->next = flow->next; flow->next = new_flow; strcpy(new_flow->content.text, text); *text = 0; return new_flow; } static void flush_space(fz_context *ctx, fz_html_box *flow, int lang, struct genstate *g) { static const char *space = " "; fz_pool *pool = g->pool; if (g->emit_white) { int bsp = g->emit_white->style->white_space & WS_ALLOW_BREAK_SPACE; if (!g->at_bol) { if (bsp) add_flow_space(ctx, pool, flow, g->emit_white); else add_flow_word(ctx, pool, flow, g->emit_white, space, space+1, lang); } g->emit_white = 0; } } /* pair-wise lookup table for UAX#14 linebreaks */ static const char *pairbrk[29] = { /* -OCCQGNESIPPNAHIIHBBBZCWHHJJJR- */ /* -PLPULSXYSROULLDNYAB2WMJ23LVTI- */ "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^", /* OP open punctuation */ "_^^%%^^^^%%_____%%__^^^______", /* CL close punctuation */ "_^^%%^^^^%%%%%__%%__^^^______", /* CP close parenthesis */ "^^^%%%^^^%%%%%%%%%%%^^^%%%%%%", /* QU quotation */ "%^^%%%^^^%%%%%%%%%%%^^^%%%%%%", /* GL non-breaking glue */ "_^^%%%^^^_______%%__^^^______", /* NS nonstarters */ "_^^%%%^^^______%%%__^^^______", /* EX exclamation/interrogation */ "_^^%%%^^^__%_%__%%__^^^______", /* SY symbols allowing break after */ "_^^%%%^^^__%%%__%%__^^^______", /* IS infix numeric separator */ "%^^%%%^^^__%%%%_%%__^^^%%%%%_", /* PR prefix numeric */ "%^^%%%^^^__%%%__%%__^^^______", /* PO postfix numeric */ "%^^%%%^^^%%%%%_%%%__^^^______", /* NU numeric */ "%^^%%%^^^__%%%_%%%__^^^______", /* AL ordinary alphabetic and symbol characters */ "%^^%%%^^^__%%%_%%%__^^^______", /* HL hebrew letter */ "_^^%%%^^^_%____%%%__^^^______", /* ID ideographic */ "_^^%%%^^^______%%%__^^^______", /* IN inseparable characters */ "_^^%_%^^^__%____%%__^^^______", /* HY hyphens */ "_^^%_%^^^_______%%__^^^______", /* BA break after */ "%^^%%%^^^%%%%%%%%%%%^^^%%%%%%", /* BB break before */ "_^^%%%^^^_______%%_^^^^______", /* B2 break opportunity before and after */ "____________________^________", /* ZW zero width space */ "%^^%%%^^^__%%%_%%%__^^^______", /* CM combining mark */ "%^^%%%^^^%%%%%%%%%%%^^^%%%%%%", /* WJ word joiner */ "_^^%%%^^^_%____%%%__^^^___%%_", /* H2 hangul leading/vowel syllable */ "_^^%%%^^^_%____%%%__^^^____%_", /* H3 hangul leading/vowel/trailing syllable */ "_^^%%%^^^_%____%%%__^^^%%%%__", /* JL hangul leading jamo */ "_^^%%%^^^_%____%%%__^^^___%%_", /* JV hangul vowel jamo */ "_^^%%%^^^_%____%%%__^^^____%_", /* JT hangul trailing jamo */ "_^^%%%^^^_______%%__^^^_____%", /* RI regional indicator */ }; static fz_html_box * find_flow_encloser(fz_context *ctx, fz_html_box *flow) { /* This code was written to assume that there will always be a * flow box enclosing callers of this. Bug 705324 shows that * this isn't always the case. In the absence of a reproducer * file, all I can do is try to patch around the issue so that * we won't crash. */ while (flow->type != BOX_FLOW) { if (flow->up == NULL) { fz_warn(ctx, "Flow encloser not found. Please report this file!"); break; } flow = flow->up; } return flow; } static void generate_text(fz_context *ctx, fz_html_box *box, const char *text, int lang, struct genstate *g) { fz_html_box *flow; fz_pool *pool = g->pool; int collapse = box->style->white_space & WS_COLLAPSE; int bsp = box->style->white_space & WS_ALLOW_BREAK_SPACE; int bnl = box->style->white_space & WS_FORCE_BREAK_NEWLINE; static const char *space = " "; flow = find_flow_encloser(ctx, box); if (flow == NULL) return; while (*text) { if (bnl && (*text == '\n' || *text == '\r')) { if (text[0] == '\r' && text[1] == '\n') text += 2; else text += 1; add_flow_break(ctx, pool, flow, box); g->at_bol = 1; } else if (iswhite(*text)) { if (collapse) { if (bnl) while (*text == ' ' || *text == '\t') ++text; else while (iswhite(*text)) ++text; g->emit_white = box; } else { // TODO: tabs if (bsp) add_flow_space(ctx, pool, flow, box); else add_flow_word(ctx, pool, flow, box, space, space+1, lang); ++text; } g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; /* don't add sbreaks after a space */ } else { const char *prev, *mark = text; int c; flush_space(ctx, flow, lang, g); if (g->at_bol) g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; while (*text && !iswhite(*text)) { prev = text; text += fz_chartorune(&c, text); if (c == 0xAD) /* soft hyphen */ { if (mark != prev) add_flow_word(ctx, pool, flow, box, mark, prev, lang); add_flow_shyphen(ctx, pool, flow, box); mark = text; g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; /* don't add sbreaks after a soft hyphen */ } else if (bsp) /* allow soft breaks */ { int this_brk_cls = ucdn_get_resolved_linebreak_class(c); if (this_brk_cls < UCDN_LINEBREAK_CLASS_RI) { int brk = pairbrk[g->last_brk_cls][this_brk_cls]; /* we handle spaces elsewhere, so ignore these classes */ if (brk == '@') brk = '^'; if (brk == '#') brk = '^'; if (brk == '%') brk = '^'; if (brk == '_') { if (mark != prev) add_flow_word(ctx, pool, flow, box, mark, prev, lang); add_flow_sbreak(ctx, pool, flow, box); mark = prev; } g->last_brk_cls = this_brk_cls; } } } if (mark != text) add_flow_word(ctx, pool, flow, box, mark, text, lang); g->at_bol = 0; } } } static fz_image *load_html_image(fz_context *ctx, fz_archive *zip, const char *base_uri, const char *src) { char path[2048]; fz_image *img = NULL; fz_buffer *buf = NULL; fz_var(img); fz_var(buf); fz_try(ctx) { if (!strncmp(src, "data:image/jpeg;base64,", 23)) buf = fz_new_buffer_from_base64(ctx, src+23, 0); else if (!strncmp(src, "data:image/png;base64,", 22)) buf = fz_new_buffer_from_base64(ctx, src+22, 0); else if (!strncmp(src, "data:image/gif;base64,", 22)) buf = fz_new_buffer_from_base64(ctx, src+22, 0); else { fz_strlcpy(path, base_uri, sizeof path); fz_strlcat(path, "/", sizeof path); fz_strlcat(path, src, sizeof path); fz_urldecode(path); buf = fz_read_archive_entry(ctx, zip, path); } #if FZ_ENABLE_SVG if (strstr(src, ".svg")) img = fz_new_image_from_svg(ctx, buf, base_uri, zip); else #endif img = fz_new_image_from_buffer(ctx, buf); } fz_always(ctx) fz_drop_buffer(ctx, buf); fz_catch(ctx) fz_warn(ctx, "html: cannot load image src='%s'", src); return img; } static fz_image *load_svg_image(fz_context *ctx, fz_archive *zip, const char *base_uri, fz_xml_doc *xmldoc, fz_xml *node) { fz_image *img = NULL; #if FZ_ENABLE_SVG fz_try(ctx) img = fz_new_image_from_svg_xml(ctx, xmldoc, node, base_uri, zip); fz_catch(ctx) fz_warn(ctx, "html: cannot load embedded svg document"); #endif return img; } static void generate_image(fz_context *ctx, fz_html_box *box, fz_image *img, struct genstate *g) { fz_html_box *flow; fz_pool *pool = g->pool; flow = find_flow_encloser(ctx, box); flush_space(ctx, flow, 0, g); if (!img) { const char *alt = "[image]"; add_flow_word(ctx, pool, flow, box, alt, alt + 7, 0); } else { fz_try(ctx) { add_flow_sbreak(ctx, pool, flow, box); add_flow_image(ctx, pool, flow, box, img); add_flow_sbreak(ctx, pool, flow, box); } fz_always(ctx) { fz_drop_image(ctx, img); } fz_catch(ctx) fz_rethrow(ctx); } g->at_bol = 0; } static void fz_drop_html_box(fz_context *ctx, fz_html_box *box) { while (box) { fz_html_box *next = box->next; if (box->type == BOX_FLOW) fz_drop_html_flow(ctx, box->u.flow.head); fz_drop_html_box(ctx, box->down); box = next; } } static void fz_drop_html_imp(fz_context *ctx, fz_storable *stor) { fz_html *html = (fz_html *)stor; fz_drop_html_box(ctx, html->tree.root); fz_drop_pool(ctx, html->tree.pool); } static void fz_drop_story_imp(fz_context *ctx, fz_storable *stor) { fz_story *story = (fz_story *)stor; fz_free(ctx, story->user_css); fz_drop_html_font_set(ctx, story->font_set); fz_drop_xml(ctx, story->dom); fz_drop_html_box(ctx, story->tree.root); fz_drop_buffer(ctx, story->warnings); fz_drop_archive(ctx, story->zip); /* The pool must be the last thing dropped. */ fz_drop_pool(ctx, story->tree.pool); } /* Drop a structure derived from an html_tree. The exact things * freed here will depend upon the drop function with which it * was created. */ static void fz_drop_html_tree(fz_context *ctx, fz_html_tree *tree) { fz_defer_reap_start(ctx); fz_drop_storable(ctx, &tree->storable); fz_defer_reap_end(ctx); } void fz_drop_html(fz_context *ctx, fz_html *html) { fz_drop_html_tree(ctx, &html->tree); } void fz_drop_story(fz_context *ctx, fz_story *story) { if (!story) return; fz_drop_html_tree(ctx, &story->tree); } fz_html *fz_keep_html(fz_context *ctx, fz_html *html) { return fz_keep_storable(ctx, &html->tree.storable); } static fz_html_box *new_box(fz_context *ctx, struct genstate *g, fz_xml *node, int type, fz_css_style *style) { fz_html_box *box; const char *tag = fz_xml_tag(node); const char *id = fz_xml_att(node, "id"); const char *href; if (type == BOX_INLINE) box = fz_pool_alloc(ctx, g->pool, offsetof(fz_html_box, u)); else if (type == BOX_FLOW) box = fz_pool_alloc(ctx, g->pool, offsetof(fz_html_box, u) + sizeof(box->u.flow)); else box = fz_pool_alloc(ctx, g->pool, offsetof(fz_html_box, u) + sizeof(box->u.block)); box->type = type; box->is_first_flow = 0; box->markup_dir = g->markup_dir; box->structure = 0; box->list_item = 0; box->style = fz_css_enlist(ctx, style, &g->styles, g->pool); #ifndef NDEBUG if (tag) box->tag = fz_pool_strdup(ctx, g->pool, tag); else box->tag = "#anon"; #endif if (id) box->id = fz_pool_strdup(ctx, g->pool, id); if (tag && tag[0]=='a' && tag[1]==0) { // Support deprecated anchor syntax with id in "name" instead of "id" attribute. if (!id) { const char *name = fz_xml_att(node, "name"); if (name) box->id = fz_pool_strdup(ctx, g->pool, name); } if (g->is_fb2) { href = fz_xml_att(node, "l:href"); if (!href) href = fz_xml_att(node, "xlink:href"); } else { href = fz_xml_att(node, "href"); } if (href) g->href = fz_pool_strdup(ctx, g->pool, href); } if (g->href) box->href = g->href; if (type == BOX_FLOW) { box->u.flow.head = NULL; box->s.build.flow_tail = &box->u.flow.head; } return box; } static void append_box(fz_context *ctx, fz_html_box *parent, fz_html_box *child) { child->up = parent; if (!parent->down) parent->down = child; if (parent->s.build.last_child) parent->s.build.last_child->next = child; parent->s.build.last_child = child; } static fz_html_box *find_block_context(fz_context *ctx, fz_html_box *box) { while (box->type != BOX_BLOCK && box->type != BOX_TABLE_CELL) box = box->up; return box; } static fz_html_box *find_table_row_context(fz_context *ctx, fz_html_box *box) { fz_html_box *look = box; while (look && look->type != BOX_TABLE) look = look->up; if (look) return look; fz_warn(ctx, "table-row not inside table element"); return box; } static fz_html_box *find_table_cell_context(fz_context *ctx, fz_html_box *box) { fz_html_box *look = box; while (look && look->type != BOX_TABLE_ROW) look = look->up; if (look) return look; fz_warn(ctx, "table-cell not inside table-row element"); return box; } static fz_html_box *find_inline_context(fz_context *ctx, struct genstate *g, fz_html_box *box) { fz_css_style style; fz_html_box *flow_box; if (box->type == BOX_FLOW || box->type == BOX_INLINE) return box; // We have an inline element that is not in an existing flow/inline context. // Find the closest block level box to insert content into. while (box->type != BOX_BLOCK && box->type != BOX_TABLE_CELL) box = box->up; // Concatenate onto the last open flow box if we have one. if (box->s.build.last_child && box->s.build.last_child->type == BOX_FLOW) return box->s.build.last_child; // No flow box found, create and insert one! // TODO: null style instead of default for flow box? fz_default_css_style(ctx, &style); flow_box = new_box(ctx, g, NULL, BOX_FLOW, &style); flow_box->is_first_flow = !box->down; g->at_bol = 1; append_box(ctx, box, flow_box); return flow_box; } static void gen2_children(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *root_node, fz_css_match *root_match); static void gen2_text(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node) { fz_html_box *anon_box; fz_css_style style; const char *text; int collapse; text = fz_xml_text(node); collapse = root_box->style->white_space & WS_COLLAPSE; if (collapse && is_all_white(text)) { g->emit_white = root_box; } else { if (root_box->type != BOX_INLINE) { /* Create anonymous inline box, with the same style as the top block box. */ style = *root_box->style; // Make sure not to recursively multiply font sizes style.font_size.value = 1; style.font_size.unit = N_SCALE; root_box = find_inline_context(ctx, g, root_box); anon_box = new_box(ctx, g, NULL, BOX_INLINE, &style); append_box(ctx, root_box, anon_box); root_box = anon_box; } generate_text(ctx, root_box, text, g->markup_lang, g); } } static fz_html_box *gen2_inline(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style) { fz_html_box *this_box; fz_html_box *flow_box; root_box = find_inline_context(ctx, g, root_box); this_box = new_box(ctx, g, node, BOX_INLINE, style); append_box(ctx, root_box, this_box); if (this_box->id) { flow_box = find_flow_encloser(ctx, this_box); add_flow_anchor(ctx, g->pool, flow_box, this_box); } return this_box; } static void gen2_break(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node) { fz_html_box *this_box; fz_html_box *flow_box; if (root_box->type != BOX_INLINE) { /* Create inline box to hold the
tag, with the same style as containing block. */ /* Make sure not to recursively multiply font sizes. */ fz_css_style style = *root_box->style; style.font_size.value = 1; style.font_size.unit = N_SCALE; this_box = new_box(ctx, g, node, BOX_INLINE, &style); append_box(ctx, find_inline_context(ctx, g, root_box), this_box); } else { this_box = root_box; } flow_box = find_flow_encloser(ctx, this_box); add_flow_break(ctx, g->pool, flow_box, this_box); g->at_bol = 1; } static fz_html_box *gen2_block(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style) { fz_html_box *this_box; root_box = find_block_context(ctx, root_box); this_box = new_box(ctx, g, node, BOX_BLOCK, style); append_box(ctx, root_box, this_box); return this_box; } static fz_html_box *gen2_table(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style) { fz_html_box *this_box; root_box = find_block_context(ctx, root_box); this_box = new_box(ctx, g, node, BOX_TABLE, style); append_box(ctx, root_box, this_box); return this_box; } static fz_html_box *gen2_table_row(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style) { fz_html_box *this_box; root_box = find_table_row_context(ctx, root_box); this_box = new_box(ctx, g, node, BOX_TABLE_ROW, style); append_box(ctx, root_box, this_box); return this_box; } static fz_html_box *gen2_table_cell(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_style *style) { fz_html_box *this_box; root_box = find_table_cell_context(ctx, root_box); this_box = new_box(ctx, g, node, BOX_TABLE_CELL, style); append_box(ctx, root_box, this_box); return this_box; } static void gen2_image_common(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_image *img, int display, fz_css_style *style) { fz_html_box *img_block_box; fz_html_box *img_inline_box; if (display == DIS_INLINE || display == DIS_INLINE_BLOCK) { root_box = find_inline_context(ctx, g, root_box); img_inline_box = new_box(ctx, g, node, BOX_INLINE, style); append_box(ctx, root_box, img_inline_box); generate_image(ctx, img_inline_box, img, g); } else { root_box = find_block_context(ctx, root_box); img_block_box = new_box(ctx, g, node, BOX_BLOCK, style); append_box(ctx, root_box, img_block_box); root_box = find_inline_context(ctx, g, img_block_box); img_inline_box = new_box(ctx, g, NULL, BOX_INLINE, style); append_box(ctx, root_box, img_inline_box); generate_image(ctx, img_inline_box, img, g); } } static void gen2_image_html(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, int display, fz_css_style *style) { const char *src = fz_xml_att(node, "src"); if (src) { fz_css_style local_style = *style; fz_image *img; int w, h; const char *w_att = fz_xml_att(node, "width"); const char *h_att = fz_xml_att(node, "height"); if (w_att && (w = fz_atoi(w_att)) > 0) { local_style.width.value = w; local_style.width.unit = strchr(w_att, '%') ? N_PERCENT : N_LENGTH; } if (h_att && (h = fz_atoi(h_att)) > 0) { local_style.height.value = h; local_style.height.unit = strchr(h_att, '%') ? N_PERCENT : N_LENGTH; } img = load_html_image(ctx, g->zip, g->base_uri, src); gen2_image_common(ctx, g, root_box, node, img, display, &local_style); } } static void gen2_image_fb2(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, int display, fz_css_style *style) { const char *src = fz_xml_att(node, "l:href"); if (!src) src = fz_xml_att(node, "xlink:href"); if (src && src[0] == '#') { fz_image *img = fz_tree_lookup(ctx, g->images, src+1); gen2_image_common(ctx, g, root_box, node, fz_keep_image(ctx, img), display, style); } } static void gen2_image_svg(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, int display, fz_css_style *style) { fz_image *img = load_svg_image(ctx, g->zip, g->base_uri, g->xml, node); gen2_image_common(ctx, g, root_box, node, img, display, style); } static int structure_from_tag(const char *tag, struct genstate *g) { if (tag == NULL) return FZ_HTML_STRUCT_UNKNOWN; if (!strcmp(tag, "title") || !strcmp(tag, "subtitle")) { if (!g->is_fb2) return FZ_HTML_STRUCT_UNKNOWN; return g->section_depth ? (FZ_HTML_STRUCT_H1 - 1 + fz_mini(g->section_depth, 6)) : FZ_HTML_STRUCT_UNKNOWN; } else if (!strcmp(tag, "body")) return FZ_HTML_STRUCT_BODY; else if (!strcmp(tag, "div")) return FZ_HTML_STRUCT_DIV; else if (!strcmp(tag, "span")) return FZ_HTML_STRUCT_SPAN; else if (!strcmp(tag, "blockquote")) return FZ_HTML_STRUCT_BLOCKQUOTE; else if (!strcmp(tag, "p")) return FZ_HTML_STRUCT_P; else if (!strcmp(tag, "h1")) return FZ_HTML_STRUCT_H1; else if (!strcmp(tag, "h2")) return FZ_HTML_STRUCT_H2; else if (!strcmp(tag, "h3")) return FZ_HTML_STRUCT_H3; else if (!strcmp(tag, "h4")) return FZ_HTML_STRUCT_H4; else if (!strcmp(tag, "h5")) return FZ_HTML_STRUCT_H5; else if (!strcmp(tag, "h6")) return FZ_HTML_STRUCT_H6; else if (!strcmp(tag, "dl") || !strcmp(tag, "ul") || !strcmp(tag, "ol")) return FZ_HTML_STRUCT_L; else if (!strcmp(tag, "li") || !strcmp(tag, "dd") || !strcmp(tag, "dt")) return FZ_HTML_STRUCT_LI; else if (!strcmp(tag, "table")) return FZ_HTML_STRUCT_TABLE; else if (!strcmp(tag, "tr")) return FZ_HTML_STRUCT_TR; else if (!strcmp(tag, "th")) return FZ_HTML_STRUCT_TH; else if (!strcmp(tag, "td")) return FZ_HTML_STRUCT_TD; else if (!strcmp(tag, "thead")) return FZ_HTML_STRUCT_THEAD; else if (!strcmp(tag, "tbody")) return FZ_HTML_STRUCT_TBODY; else if (!strcmp(tag, "tfoot")) return FZ_HTML_STRUCT_TFOOT; return FZ_HTML_STRUCT_UNKNOWN; } static void gen2_tag(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *node, fz_css_match *match, int display, fz_css_style *style) { fz_html_box *this_box; const char *tag; const char *lang_att; const char *dir_att; int save_markup_dir = g->markup_dir; int save_markup_lang = g->markup_lang; char *save_href = g->href; if (display == DIS_NONE) return; tag = fz_xml_tag(node); dir_att = fz_xml_att(node, "dir"); if (dir_att) { if (!strcmp(dir_att, "auto")) g->markup_dir = FZ_BIDI_NEUTRAL; else if (!strcmp(dir_att, "rtl")) g->markup_dir = FZ_BIDI_RTL; else if (!strcmp(dir_att, "ltr")) g->markup_dir = FZ_BIDI_LTR; else g->markup_dir = DEFAULT_DIR; } lang_att = fz_xml_att(node, "lang"); if (lang_att) g->markup_lang = fz_text_language_from_string(lang_att); switch (display) { case DIS_INLINE_BLOCK: // TODO handle inline block as a flow node this_box = gen2_block(ctx, g, root_box, node, style); break; case DIS_BLOCK: this_box = gen2_block(ctx, g, root_box, node, style); this_box->structure = structure_from_tag(tag, g); break; case DIS_LIST_ITEM: this_box = gen2_block(ctx, g, root_box, node, style); this_box->list_item = ++g->list_counter; break; case DIS_TABLE: this_box = gen2_table(ctx, g, root_box, node, style); break; case DIS_TABLE_GROUP: // no box for table-row-group elements this_box = root_box; break; case DIS_TABLE_ROW: this_box = gen2_table_row(ctx, g, root_box, node, style); break; case DIS_TABLE_CELL: this_box = gen2_table_cell(ctx, g, root_box, node, style); break; case DIS_INLINE: default: this_box = gen2_inline(ctx, g, root_box, node, style); break; } if (!strcmp(tag, "ol")) { int save_list_counter = g->list_counter; g->list_counter = 0; gen2_children(ctx, g, this_box, node, match); g->list_counter = save_list_counter; } else if (!strcmp(tag, "section")) { int save_section_depth = g->section_depth; g->section_depth++; gen2_children(ctx, g, this_box, node, match); g->section_depth = save_section_depth; } else { gen2_children(ctx, g, this_box, node, match); } g->markup_dir = save_markup_dir; g->markup_lang = save_markup_lang; g->href = save_href; } static void gen2_children(fz_context *ctx, struct genstate *g, fz_html_box *root_box, fz_xml *root_node, fz_css_match *root_match) { fz_xml *node; const char *tag; fz_css_match match; fz_css_style style; int display; for (node = fz_xml_down(root_node); node; node = fz_xml_next(node)) { tag = fz_xml_tag(node); if (tag) { fz_match_css(ctx, &match, root_match, g->css, node); fz_apply_css_style(ctx, g->set, &style, &match); display = fz_get_css_match_display(&match); if (tag[0]=='b' && tag[1]=='r' && tag[2]==0) { gen2_break(ctx, g, root_box, node); } else if (tag[0]=='i' && tag[1]=='m' && tag[2]=='g' && tag[3]==0) { gen2_image_html(ctx, g, root_box, node, display, &style); } else if (g->is_fb2 && tag[0]=='i' && tag[1]=='m' && tag[2]=='a' && tag[3]=='g' && tag[4]=='e' && tag[5]==0) { gen2_image_fb2(ctx, g, root_box, node, display, &style); } else if (tag[0]=='s' && tag[1]=='v' && tag[2]=='g' && tag[3]==0) { gen2_image_svg(ctx, g, root_box, node, display, &style); } else { gen2_tag(ctx, g, root_box, node, &match, display, &style); } } else { gen2_text(ctx, g, root_box, node); } } } static char *concat_text(fz_context *ctx, fz_xml *root) { fz_xml *node; size_t i = 0, n = 1; char *s; for (node = fz_xml_down(root); node; node = fz_xml_next(node)) { const char *text = fz_xml_text(node); n += text ? strlen(text) : 0; } s = Memento_label(fz_malloc(ctx, n), "concat_html"); for (node = fz_xml_down(root); node; node = fz_xml_next(node)) { const char *text = fz_xml_text(node); if (text) { n = strlen(text); memcpy(s+i, text, n); i += n; } } s[i] = 0; return s; } static void html_load_css_link(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root, const char *href) { char path[2048]; char css_base_uri[2048]; fz_buffer *buf; fz_var(buf); fz_strlcpy(path, base_uri, sizeof path); fz_strlcat(path, "/", sizeof path); fz_strlcat(path, href, sizeof path); fz_urldecode(path); fz_cleanname(path); fz_dirname(css_base_uri, path, sizeof css_base_uri); buf = NULL; fz_try(ctx) { buf = fz_read_archive_entry(ctx, zip, path); fz_parse_css(ctx, css, fz_string_from_buffer(ctx, buf), path); fz_add_css_font_faces(ctx, set, zip, css_base_uri, css); } fz_always(ctx) fz_drop_buffer(ctx, buf); fz_catch(ctx) fz_warn(ctx, "ignoring stylesheet %s", path); } static void html_load_css(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root) { fz_xml *html, *head, *node; html = fz_xml_find(root, "html"); head = fz_xml_find_down(html, "head"); for (node = fz_xml_down(head); node; node = fz_xml_next(node)) { if (fz_xml_is_tag(node, "link")) { char *rel = fz_xml_att(node, "rel"); if (rel && !fz_strcasecmp(rel, "stylesheet")) { char *type = fz_xml_att(node, "type"); if ((type && !strcmp(type, "text/css")) || !type) { char *href = fz_xml_att(node, "href"); if (href) { html_load_css_link(ctx, set, zip, base_uri, css, root, href); } } } } else if (fz_xml_is_tag(node, "style")) { char *s = concat_text(ctx, node); fz_try(ctx) { fz_parse_css(ctx, css, s, "