// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * * Copyright (C) 1999-2014, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: unistr_cnv.cpp * encoding: UTF-8 * tab size: 8 (not used) * indentation:2 * * created on: 2004aug19 * created by: Markus W. Scherer * * Character conversion functions moved here from unistr.cpp */ #include "unicode/utypes.h" #if !UCONFIG_NO_CONVERSION #include "unicode/putil.h" #include "cstring.h" #include "cmemory.h" #include "unicode/ustring.h" #include "unicode/unistr.h" #include "unicode/ucnv.h" #include "ucnv_imp.h" #include "putilimp.h" #include "ustr_cnv.h" #include "ustr_imp.h" U_NAMESPACE_BEGIN //======================================== // Constructors //======================================== #if !U_CHARSET_IS_UTF8 UnicodeString::UnicodeString(const char *codepageData) { fUnion.fFields.fLengthAndFlags = kShortString; if(codepageData != 0) { doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0); } } UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength) { fUnion.fFields.fLengthAndFlags = kShortString; if(codepageData != 0) { doCodepageCreate(codepageData, dataLength, 0); } } // else see unistr.cpp #endif UnicodeString::UnicodeString(const char *codepageData, const char *codepage) { fUnion.fFields.fLengthAndFlags = kShortString; if(codepageData != 0) { doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage); } } UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength, const char *codepage) { fUnion.fFields.fLengthAndFlags = kShortString; if(codepageData != 0) { doCodepageCreate(codepageData, dataLength, codepage); } } UnicodeString::UnicodeString(const char *src, int32_t srcLength, UConverter *cnv, UErrorCode &errorCode) { fUnion.fFields.fLengthAndFlags = kShortString; if(U_SUCCESS(errorCode)) { // check arguments if(src==nullptr) { // treat as an empty string, do nothing more } else if(srcLength<-1) { errorCode=U_ILLEGAL_ARGUMENT_ERROR; } else { // get input length if(srcLength==-1) { srcLength=(int32_t)uprv_strlen(src); } if(srcLength>0) { if(cnv!=0) { // use the provided converter ucnv_resetToUnicode(cnv); doCodepageCreate(src, srcLength, cnv, errorCode); } else { // use the default converter cnv=u_getDefaultConverter(&errorCode); doCodepageCreate(src, srcLength, cnv, errorCode); u_releaseDefaultConverter(cnv); } } } if(U_FAILURE(errorCode)) { setToBogus(); } } } //======================================== // Codeset conversion //======================================== #if !U_CHARSET_IS_UTF8 int32_t UnicodeString::extract(int32_t start, int32_t length, char *target, uint32_t dstSize) const { return extract(start, length, target, dstSize, 0); } // else see unistr.cpp #endif int32_t UnicodeString::extract(int32_t start, int32_t length, char *target, uint32_t dstSize, const char *codepage) const { // if the arguments are illegal, then do nothing if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) { return 0; } // pin the indices to legal values pinIndices(start, length); // We need to cast dstSize to int32_t for all subsequent code. // I don't know why the API was defined with uint32_t but we are stuck with it. // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize // as a limit in some functions, it may wrap around and yield a pointer // that compares less-than target. int32_t capacity; if(dstSize < 0x7fffffff) { // Assume that the capacity is real and a limit pointer won't wrap around. capacity = (int32_t)dstSize; } else { // Pin the capacity so that a limit pointer does not wrap around. char *targetLimit = (char *)U_MAX_PTR(target); // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff // greater than target and does not wrap around the top of the address space. capacity = (int32_t)(targetLimit - target); } // create the converter UConverter *converter; UErrorCode status = U_ZERO_ERROR; // just write the NUL if the string length is 0 if(length == 0) { return u_terminateChars(target, capacity, 0, &status); } // if the codepage is the default, use our cache // if it is an empty string, then use the "invariant character" conversion if (codepage == 0) { const char *defaultName = ucnv_getDefaultName(); if(UCNV_FAST_IS_UTF8(defaultName)) { return toUTF8(start, length, target, capacity); } converter = u_getDefaultConverter(&status); } else if (*codepage == 0) { // use the "invariant characters" conversion int32_t destLength; if(length <= capacity) { destLength = length; } else { destLength = capacity; } u_UCharsToChars(getArrayStart() + start, target, destLength); return u_terminateChars(target, capacity, length, &status); } else { converter = ucnv_open(codepage, &status); } length = doExtract(start, length, target, capacity, converter, status); // close the converter if (codepage == 0) { u_releaseDefaultConverter(converter); } else { ucnv_close(converter); } return length; } int32_t UnicodeString::extract(char *dest, int32_t destCapacity, UConverter *cnv, UErrorCode &errorCode) const { if(U_FAILURE(errorCode)) { return 0; } if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) { errorCode=U_ILLEGAL_ARGUMENT_ERROR; return 0; } // nothing to do? if(isEmpty()) { return u_terminateChars(dest, destCapacity, 0, &errorCode); } // get the converter UBool isDefaultConverter; if(cnv==0) { isDefaultConverter=true; cnv=u_getDefaultConverter(&errorCode); if(U_FAILURE(errorCode)) { return 0; } } else { isDefaultConverter=false; ucnv_resetFromUnicode(cnv); } // convert int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode); // release the converter if(isDefaultConverter) { u_releaseDefaultConverter(cnv); } return len; } int32_t UnicodeString::doExtract(int32_t start, int32_t length, char *dest, int32_t destCapacity, UConverter *cnv, UErrorCode &errorCode) const { if(U_FAILURE(errorCode)) { if(destCapacity!=0) { *dest=0; } return 0; } const char16_t *src=getArrayStart()+start, *srcLimit=src+length; char *originalDest=dest; const char *destLimit; if(destCapacity==0) { destLimit=dest=0; } else if(destCapacity==-1) { // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used. destLimit=(char*)U_MAX_PTR(dest); // for NUL-termination, translate into highest int32_t destCapacity=0x7fffffff; } else { destLimit=dest+destCapacity; } // perform the conversion ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, true, &errorCode); length=(int32_t)(dest-originalDest); // if an overflow occurs, then get the preflighting length if(errorCode==U_BUFFER_OVERFLOW_ERROR) { char buffer[1024]; destLimit=buffer+sizeof(buffer); do { dest=buffer; errorCode=U_ZERO_ERROR; ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, true, &errorCode); length+=(int32_t)(dest-buffer); } while(errorCode==U_BUFFER_OVERFLOW_ERROR); } return u_terminateChars(originalDest, destCapacity, length, &errorCode); } void UnicodeString::doCodepageCreate(const char *codepageData, int32_t dataLength, const char *codepage) { // if there's nothing to convert, do nothing if(codepageData == 0 || dataLength == 0 || dataLength < -1) { return; } if(dataLength == -1) { dataLength = (int32_t)uprv_strlen(codepageData); } UErrorCode status = U_ZERO_ERROR; // create the converter // if the codepage is the default, use our cache // if it is an empty string, then use the "invariant character" conversion UConverter *converter; if (codepage == 0) { const char *defaultName = ucnv_getDefaultName(); if(UCNV_FAST_IS_UTF8(defaultName)) { setToUTF8(StringPiece(codepageData, dataLength)); return; } converter = u_getDefaultConverter(&status); } else if(*codepage == 0) { // use the "invariant characters" conversion if(cloneArrayIfNeeded(dataLength, dataLength, false)) { u_charsToUChars(codepageData, getArrayStart(), dataLength); setLength(dataLength); } else { setToBogus(); } return; } else { converter = ucnv_open(codepage, &status); } // if we failed, set the appropriate flags and return if(U_FAILURE(status)) { setToBogus(); return; } // perform the conversion doCodepageCreate(codepageData, dataLength, converter, status); if(U_FAILURE(status)) { setToBogus(); } // close the converter if(codepage == 0) { u_releaseDefaultConverter(converter); } else { ucnv_close(converter); } } void UnicodeString::doCodepageCreate(const char *codepageData, int32_t dataLength, UConverter *converter, UErrorCode &status) { if(U_FAILURE(status)) { return; } // set up the conversion parameters const char *mySource = codepageData; const char *mySourceEnd = mySource + dataLength; char16_t *array, *myTarget; // estimate the size needed: int32_t arraySize; if(dataLength <= US_STACKBUF_SIZE) { // try to use the stack buffer arraySize = US_STACKBUF_SIZE; } else { // 1.25 char16_t's per source byte should cover most cases arraySize = dataLength + (dataLength >> 2); } // we do not care about the current contents UBool doCopyArray = false; for(;;) { if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) { setToBogus(); break; } // perform the conversion array = getArrayStart(); myTarget = array + length(); ucnv_toUnicode(converter, &myTarget, array + getCapacity(), &mySource, mySourceEnd, 0, true, &status); // update the conversion parameters setLength((int32_t)(myTarget - array)); // allocate more space and copy data, if needed if(status == U_BUFFER_OVERFLOW_ERROR) { // reset the error code status = U_ZERO_ERROR; // keep the previous conversion results doCopyArray = true; // estimate the new size needed, larger than before // try 2 char16_t's per remaining source byte arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource)); } else { break; } } } U_NAMESPACE_END #endif