/* tectonic/xetex-io.c: low-level input/output functions tied to the XeTeX engine Copyright 2016-2019 The Tectonic Project Licensed under the MIT License. */ #include "xetex-core.h" #include "teckit-c-Engine.h" #include "xetex-swap.h" #include "xetex-xetexd.h" #include #include char *name_of_input_file = NULL; // Tectonic: This buffer is used for SyncTeX, which needs to emit absolute // filesystem paths -- which are difficult to derive in our virtualized I/O // system. The most backwards-compatible way to expose this information to the // engine was to add the `ttstub_get_last_input_abspath()` API used below. char abspath_of_input_file[1024] = ""; rust_input_handle_t tt_xetex_open_input (int filefmt) { rust_input_handle_t handle; if (filefmt == TTBC_FILE_FORMAT_TECTONIC_PRIMARY) { handle = ttstub_input_open_primary (); } else if (name_of_file[0] == '|') { // Tectonic TODO: issue #859. In mainline XeTeX, a pipe symbol indicates // piped input from an external command via `popen()`. Now that we have // shell-escape support we could also support this, but we don't have a // real implementation yet. For now, issue a warning. print_nl_cstr("Warning: "); diagnostic_begin_capture_warning_here(); print_cstr("piped inputs from external commands are not implemented in Tectonic"); capture_to_diagnostic(NULL); return NULL; } else { handle = ttstub_input_open (name_of_file, (ttbc_file_format) filefmt, 0); } if (handle == NULL) return NULL; if (ttstub_get_last_input_abspath(abspath_of_input_file, sizeof(abspath_of_input_file)) < 1) { abspath_of_input_file[0] = '\0'; } name_length = strlen(name_of_file); free(name_of_input_file); name_of_input_file = xstrdup(name_of_file); return handle; } /* tables/values used in UTF-8 interpretation - code is based on ConvertUTF.[ch] sample code published by the Unicode consortium */ const uint32_t offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; const uint8_t bytesFromUTF8[256] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 }; const uint8_t firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; void set_input_file_encoding(UFILE* f, int32_t mode, int32_t encodingData) { if ((f->encodingMode == ICUMAPPING) && (f->conversionData != NULL)) ucnv_close((UConverter*)(f->conversionData)); f->conversionData = 0; switch (mode) { case UTF8: case UTF16BE: case UTF16LE: case RAW: f->encodingMode = mode; break; case ICUMAPPING: { char* name = gettexstring(encodingData); UErrorCode err = U_ZERO_ERROR; UConverter* cnv = ucnv_open(name, &err); if (cnv == NULL) { begin_diagnostic(); print_nl('E'); print_c_string("rror "); print_int(err); print_c_string(" creating Unicode converter for `"); print_c_string(name); print_c_string("'; reading as raw bytes"); end_diagnostic(1); f->encodingMode = RAW; } else { f->encodingMode = ICUMAPPING; f->conversionData = cnv; } free(name); } break; } } int u_open_in(UFILE **f, int32_t filefmt, const char *fopen_mode, int32_t mode, int32_t encodingData) { rust_input_handle_t handle; int B1, B2; handle = tt_xetex_open_input (filefmt); if (handle == NULL) return 0; *f = xmalloc(sizeof(UFILE)); (*f)->encodingMode = 0; (*f)->conversionData = 0; (*f)->savedChar = -1; (*f)->skipNextLF = 0; (*f)->handle = handle; if (mode == AUTO) { /* sniff encoding form */ B1 = ttstub_input_getc ((*f)->handle); B2 = ttstub_input_getc ((*f)->handle); if (B1 == 0xfe && B2 == 0xff) mode = UTF16BE; else if (B2 == 0xfe && B1 == 0xff) mode = UTF16LE; else if (B1 == 0 && B2 != 0) { mode = UTF16BE; ttstub_input_seek ((*f)->handle, 0, SEEK_SET); } else if (B2 == 0 && B1 != 0) { mode = UTF16LE; ttstub_input_seek ((*f)->handle, 0, SEEK_SET); } else if (B1 == 0xEF && B2 == 0xBB) { int B3 = ttstub_input_getc((*f)->handle); if (B3 == 0xBF) mode = UTF8; } if (mode == AUTO) { ttstub_input_seek ((*f)->handle, 0, SEEK_SET); mode = UTF8; } } set_input_file_encoding(*f, mode, encodingData); return 1; } static void buffer_overflow(void) { _tt_abort("unable to read an entire line (buf_size=%u)", (unsigned) buf_size); } static void conversion_error(int errcode) { begin_diagnostic(); print_nl('U'); print_c_string("nicode conversion failed (ICU error code = "); print_int(errcode); print_c_string(") discarding any remaining text"); end_diagnostic(1); } static void apply_normalization(uint32_t* buf, int len, int norm) { static TECkit_Converter normalizers[2] = { NULL, NULL }; TECkit_Status status; UInt32 inUsed, outUsed; TECkit_Converter *normPtr = &normalizers[norm - 1]; if (*normPtr == NULL) { status = TECkit_CreateConverter(NULL, 0, 1, NATIVE_UTF32, NATIVE_UTF32 | (norm == 1 ? kForm_NFC : kForm_NFD), &*normPtr); if (status != kStatus_NoError) _tt_abort ("failed to create normalizer: error code = %d", (int)status); } status = TECkit_ConvertBuffer(*normPtr, (Byte*)buf, len * sizeof(UInt32), &inUsed, (Byte*)&buffer[first], sizeof(*buffer) * (buf_size - first), &outUsed, 1); TECkit_ResetConverter(*normPtr); if (status != kStatus_NoError) buffer_overflow(); last = first + outUsed / sizeof(*buffer); } int input_line(UFILE* f) { static char* byteBuffer = NULL; static uint32_t *utf32Buf = NULL; int i, tmpLen; int norm = get_input_normalization_state(); if (f->handle == NULL) /* NULL 'handle' means this: */ _tt_abort ("reads from synthetic \"terminal\" file #0 should never happen"); last = first; if (f->encodingMode == ICUMAPPING) { uint32_t bytesRead = 0; UConverter* cnv; int outLen; UErrorCode errorCode = U_ZERO_ERROR; if (byteBuffer == NULL) byteBuffer = xmalloc(buf_size + 1); /* Recognize either LF or CR as a line terminator; skip initial LF if prev line ended with CR. */ i = ttstub_input_getc (f->handle); if (f->skipNextLF) { f->skipNextLF = 0; if (i == '\n') i = ttstub_input_getc (f->handle); } if (i != EOF && i != '\n' && i != '\r') byteBuffer[bytesRead++] = i; if (i != EOF && i != '\n' && i != '\r') while (bytesRead < buf_size && (i = ttstub_input_getc(f->handle)) != EOF && i != '\n' && i != '\r') byteBuffer[bytesRead++] = i; if (i == EOF && errno != EINTR && bytesRead == 0) return false; if (i != EOF && i != '\n' && i != '\r') buffer_overflow(); /* now apply the mapping to turn external bytes into Unicode characters in buffer */ cnv = (UConverter*)(f->conversionData); switch (norm) { case 1: // NFC case 2: // NFD if (utf32Buf == NULL) utf32Buf = xcalloc(buf_size, sizeof(uint32_t)); tmpLen = ucnv_toAlgorithmic(UCNV_UTF32_NativeEndian, cnv, (char*)utf32Buf, buf_size * sizeof(*utf32Buf), byteBuffer, bytesRead, &errorCode); if (errorCode != 0) { conversion_error((int)errorCode); return false; } apply_normalization(utf32Buf, tmpLen / sizeof(*utf32Buf), norm); // sets 'last' correctly break; default: // none outLen = ucnv_toAlgorithmic(UCNV_UTF32_NativeEndian, cnv, (char*)&buffer[first], sizeof(*buffer) * (buf_size - first), byteBuffer, bytesRead, &errorCode); if (errorCode != 0) { conversion_error((int)errorCode); return false; } outLen /= sizeof(*buffer); last = first + outLen; break; } } else { /* Recognize either LF or CR as a line terminator; skip initial LF if prev line ended with CR. */ i = get_uni_c(f); if (f->skipNextLF) { f->skipNextLF = 0; if (i == '\n') i = get_uni_c(f); } switch (norm) { case 1: // NFC case 2: // NFD // read Unicode chars into utf32Buf as UTF32 if (utf32Buf == NULL) utf32Buf = xcalloc(buf_size, sizeof(uint32_t)); tmpLen = 0; if (i != EOF && i != '\n' && i != '\r') utf32Buf[tmpLen++] = i; if (i != EOF && i != '\n' && i != '\r') while (tmpLen < buf_size && (i = get_uni_c(f)) != EOF && i != '\n' && i != '\r') utf32Buf[tmpLen++] = i; if (i == EOF && errno != EINTR && tmpLen == 0) return false; /* We didn't get the whole line because our buffer was too small. */ if (i != EOF && i != '\n' && i != '\r') buffer_overflow(); apply_normalization(utf32Buf, tmpLen, norm); break; default: // none if (last < buf_size && i != EOF && i != '\n' && i != '\r') buffer[last++] = i; if (i != EOF && i != '\n' && i != '\r') while (last < buf_size && (i = get_uni_c(f)) != EOF && i != '\n' && i != '\r') buffer[last++] = i; if (i == EOF && errno != EINTR && last == first) return false; /* We didn't get the whole line because our buffer was too small. */ if (i != EOF && i != '\n' && i != '\r') buffer_overflow(); break; } } /* If line ended with CR, remember to skip following LF. */ if (i == '\r') f->skipNextLF = 1; buffer[last] = ' '; if (last >= max_buf_stack) max_buf_stack = last; /* Trim trailing space or EOL characters. */ #define IS_SPC_OR_EOL(c) ((c) == ' ' || (c) == '\r' || (c) == '\n') while (last > first && IS_SPC_OR_EOL(buffer[last - 1])) --last; return true; } void u_close(UFILE* f) { if (f == NULL || f->handle == NULL) /* NULL handle is stdin/terminal file. Shouldn't happen but meh. */ return; ttstub_input_close (f->handle); if (f->encodingMode == ICUMAPPING && f->conversionData != NULL) ucnv_close ((UConverter*) f->conversionData); free (f); } int get_uni_c(UFILE* f) { int rval; int c; if (f->savedChar != -1) { rval = f->savedChar; f->savedChar = -1; return rval; } switch (f->encodingMode) { case UTF8: c = rval = ttstub_input_getc(f->handle); if (rval != EOF) { uint16_t extraBytes = bytesFromUTF8[rval]; switch (extraBytes) { /* note: code falls through cases! */ case 3: c = ttstub_input_getc(f->handle); if (c < 0x80 || c >= 0xC0) goto bad_utf8; rval <<= 6; rval += c; case 2: c = ttstub_input_getc(f->handle); if (c < 0x80 || c >= 0xC0) goto bad_utf8; rval <<= 6; rval += c; case 1: c = ttstub_input_getc(f->handle); if (c < 0x80 || c >= 0xC0) goto bad_utf8; rval <<= 6; rval += c; case 0: break; bad_utf8: if (c != EOF) ttstub_input_ungetc(f->handle, c); case 5: case 4: bad_utf8_warning(); return 0xFFFD; /* return without adjusting by offsetsFromUTF8 */ }; rval -= offsetsFromUTF8[extraBytes]; if (rval < 0 || rval > 0x10ffff) { bad_utf8_warning(); return 0xfffd; } } break; case UTF16BE: rval = ttstub_input_getc(f->handle); if (rval != EOF) { rval <<= 8; rval += ttstub_input_getc(f->handle); if (rval >= 0xd800 && rval <= 0xdbff) { int lo = ttstub_input_getc(f->handle); lo <<= 8; lo += ttstub_input_getc(f->handle); if (lo >= 0xdc00 && lo <= 0xdfff) rval = 0x10000 + (rval - 0xd800) * 0x400 + (lo - 0xdc00); else { rval = 0xfffd; f->savedChar = lo; } } else if (rval >= 0xdc00 && rval <= 0xdfff) rval = 0xfffd; } break; case UTF16LE: rval = ttstub_input_getc(f->handle); if (rval != EOF) { rval += (ttstub_input_getc(f->handle) << 8); if (rval >= 0xd800 && rval <= 0xdbff) { int lo = ttstub_input_getc(f->handle); lo += (ttstub_input_getc(f->handle) << 8); if (lo >= 0xdc00 && lo <= 0xdfff) rval = 0x10000 + (rval - 0xd800) * 0x400 + (lo - 0xdc00); else { rval = 0xfffd; f->savedChar = lo; } } else if (rval >= 0xdc00 && rval <= 0xdfff) rval = 0xfffd; } break; case RAW: rval = ttstub_input_getc(f->handle); break; default: _tt_abort("internal error; file input mode=%d", f->encodingMode); } return rval; } void make_utf16_name(void) { unsigned char* s = (unsigned char *) name_of_file; uint32_t rval; uint16_t* t; static int name16len = 0; if (name16len <= name_length) { free(name_of_file16); name16len = name_length + 10; name_of_file16 = xcalloc(name16len, sizeof(uint16_t)); } t = name_of_file16; while (s < (unsigned char *) name_of_file + name_length) { uint16_t extraBytes; rval = *(s++); extraBytes = bytesFromUTF8[rval]; switch (extraBytes) { /* note: code falls through cases! */ case 5: rval <<= 6; if (*s) rval += *(s++); case 4: rval <<= 6; if (*s) rval += *(s++); case 3: rval <<= 6; if (*s) rval += *(s++); case 2: rval <<= 6; if (*s) rval += *(s++); case 1: rval <<= 6; if (*s) rval += *(s++); case 0: ; }; rval -= offsetsFromUTF8[extraBytes]; if (rval > 0xffff) { rval -= 0x10000; *(t++) = 0xd800 + rval / 0x0400; *(t++) = 0xdc00 + rval % 0x0400; } else { *(t++) = rval; } } name_length16 = t - name_of_file16; } void open_or_close_in(void) { unsigned char c, n; int32_t k; c = cur_chr; scan_four_bit_int(); n = cur_val; if (read_open[n] != CLOSED) { u_close(read_file[n]); read_open[n] = CLOSED; } if (c != 0) { scan_optional_equals(); scan_file_name(); pack_file_name(cur_name, cur_area, cur_ext); if (u_open_in(&read_file[n], TTBC_FILE_FORMAT_TEX, "rb", INTPAR(xetex_default_input_mode), INTPAR(xetex_default_input_encoding))) { make_utf16_name(); name_in_progress = true; begin_name(); stop_at_space = false; k = 0; while ((k < name_length16) && (more_name(name_of_file16[k]))) k++; stop_at_space = true; end_name(); name_in_progress = false; read_open[n] = JUST_OPEN; } } }