Answer the question
In order to leave comments, you need to log in
How to decode UTF-16 in C++?
I need to write a UTF-16 decoder in C++ or C. There is code in Pascal ( https://ru.wikipedia.org/wiki/UTF-16 ), but I can’t understand it because I don’t know that language well.
Code in Pascal:
// Reads one UTF-16 encoded character via ReadWord().
// On success, returns values
// in the ranges $0000..$D7FF and $E000..$10FFFF.
Function ReadUTF16Char: UInt32
Var Leading: Word // Leading (first) word.
Var Trailing: Word // Trailing (second) word.
Leading = ReadWord();
// A word outside $D800..$DFFF is not a surrogate: it is the code point itself.
If (Leading < $D800) Or (Leading > $DFFF) Then
Return WordToUInt32(Leading)
// A word in $DC00..$DFFF is not accepted as the first word of a sequence.
Else If (Leading >= $DC00) Then
Error("Недопустимая кодовая последовательность.")
Else
Var Code: UInt32
// The low 10 bits of the leading word become bits 10..19 of the result.
Code = WordToUInt32(Leading And $3FF) Shl 10
Trailing = ReadWord()
// The second word must lie in $DC00..$DFFF.
If ((Trailing < $DC00) Or (Trailing > $DFFF)) Then
Error("Недопустимая кодовая последовательность.")
Else
// The low 10 bits of the trailing word become bits 0..9; then the $10000 offset is added.
Code = Code Or WordToUInt32(Trailing And $3FF)
Return (Code + $10000)
End If
End If
End Function
Answer the question
In order to leave comments, you need to log in
It's not Pascal, it's typed BASIC.
Here is my working code: each call reads one code point and advances the first pointer by the number of words it consumed. You can adapt it to whatever conventions you need — for example, reading the data from a file or some other source instead of from memory.
// Numeric boundaries for Unicode code points and for the UTF-8 / UTF-16
// encoded-length classes.
//
// In the SURROGATE_* names, LO and HI refer to the numerically lower
// (0xD800..0xDBFF) and upper (0xDC00..0xDFFF) halves of the surrogate block.
enum {
    // Whole surrogate block.
    SURROGATE_MIN = 0xD800,
    SURROGATE_MAX = 0xDFFF,

    // Lower and upper halves of the surrogate block.
    SURROGATE_LO_MIN = 0xD800,
    SURROGATE_HI_MIN = 0xDC00,
    SURROGATE_LO_MAX = 0xDBFF,
    SURROGATE_HI_MAX = 0xDFFF,

    // Largest valid code point.
    UNICODE_MAX = 0x10FFFF,

    // Code-point ranges by UTF-8 encoded length (1 to 4 bytes).
    U8_1BYTE_MAX = 0x7F,
    U8_2BYTE_MIN = U8_1BYTE_MAX + 1,
    U8_2BYTE_MAX = 0x7FF,
    U8_3BYTE_MIN = U8_2BYTE_MAX + 1,
    U8_3BYTE_MAX = 0xFFFF,
    U8_4BYTE_MIN = U8_3BYTE_MAX + 1,
    U8_4BYTE_MAX = 0x10FFFF,

    // Code-point ranges by UTF-16 encoded length (1 or 2 words).
    U16_1WORD_MAX = 0xFFFF,
    U16_2WORD_MIN = U16_1WORD_MAX + 1,
    U16_2WORD_MAX = 0x10FFFF,
};
// Wide-character literal for U+FEFF (the byte order mark).
#define CHAR_BOM L'\uFEFF'
// Sentinel returned by str::getCp when the input range is empty.
// Lies above UNICODE_MAX, so it cannot collide with a decoded code point.
#define UNICODE_NONE (0xFFFFFFFFUL)
// Sentinel returned by str::getCp for an invalid surrogate sequence.
// Also lies above UNICODE_MAX.
#define UNICODE_BAD (0xFFFFFFFEUL)
// Decodes one code point from the UTF-16 words in [aCurr, aEnd).
//
// aCurr is advanced past every word that was consumed.
//
// Returns:
//   UNICODE_NONE  - the range is empty; aCurr is not moved.
//   UNICODE_BAD   - a surrogate word that does not form a valid pair. The
//                   first word is consumed; a following word that is not a
//                   valid second half is left in place for the next call.
//   otherwise     - the decoded code point.
unsigned long str::getCp(const uint16_t*& aCurr, const uint16_t* aEnd)
{
    if (aCurr == aEnd)
        return UNICODE_NONE;

    const unsigned long first = *aCurr;
    ++aCurr;

    // Anything outside the surrogate block is already a complete code point.
    if (first < SURROGATE_MIN || first > SURROGATE_MAX)
        return first;

    // A word from the upper half of the surrogate block cannot open a pair.
    if (first >= SURROGATE_HI_MIN)
        return UNICODE_BAD;

    // From here on `first` is a leading surrogate and needs a partner.
    if (aCurr == aEnd)
        return UNICODE_BAD;

    const unsigned long second = *aCurr;
    const bool isTrailing =
        second >= SURROGATE_HI_MIN && second <= SURROGATE_HI_MAX;
    if (!isTrailing)
        return UNICODE_BAD;

    ++aCurr;

    // Each surrogate carries 10 payload bits; the pair encodes cp - 0x10000.
    const unsigned long upperBits = (first & 0x3FF) << 10;
    const unsigned long lowerBits = second & 0x3FF;
    return (upperBits | lowerBits) + 0x10000;
}
Didn't find what you were looking for?
Ask your question
731 491 924 answers to any question