Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Why does multibyte character to char32_t conversion use UTF-8 as the multibyte encoding instead of the locale-specific one?

I have been trying to convert Chinese character input from the Windows command prompt, which is in Big5, to UTF-8 by first converting the received input to char32_t (UTF-32) and then converting that to UTF-8. I've been calling the function mbrtoc32 from <uchar.h> to do this job, but it keeps failing with "Encoding error".

These are the observations I have made so far:

  • Converting the sequence (Big5) to a wchar_t representation by mbstowcs is successful.
  • mbrtoc32 takes the multibyte sequence as UTF-8, though the locale is not. (Set to "", returns "Chinese (Traditional)_Hong Kong SAR.950" on my machine)

Below is the code I wrote to try to debug the problem, so far without success. It first converts the Chinese character "香" (U+9999) into its multibyte representation, then tries to convert the Big5 encoding of "香" (0xADBB) into wchar_t and char32_t. However, converting from multibyte (Big5) to char32_t fails with an encoding error. (In contrast, passing the UTF-8 sequence of "香" to mbrtoc32 does return 0x9999 successfully.)

#include <errno.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <uchar.h>

/* Conversion state for the restartable conversion calls below. Objects with
 * static storage duration are zero-initialized, which is the initial state. */
mbstate_t state;

/*
 * Demonstration program for the question:
 *   1. encode U+9999 with c32rtomb and dump the resulting bytes;
 *   2. decode the Big5 bytes 0xAD 0xBB with mbtowc (works in a Big5 locale);
 *   3. decode the same bytes with mbrtoc32 (fails where mbrtoc32 assumes UTF-8).
 *
 * Returns 0 on success, the errno value of the failed mbrtoc32 call, or
 * EXIT_FAILURE for allocation / c32rtomb failures.
 */
int main(void){
    enum { BUF_SIZE = 32 };

    setlocale(LC_CTYPE, "");
    const char *locale_name = setlocale(LC_CTYPE, NULL);
    printf("Your locale is: %s\n", locale_name != NULL ? locale_name : "(unknown)");

    char32_t chi_c = 0x9999;
    printf("Character U+9999 is 香\n");

    /* calloc zero-fills, so both buffers are always NUL-terminated. */
    char *mbc = calloc(BUF_SIZE, sizeof *mbc);
    char *src_mbs = calloc(BUF_SIZE, sizeof *src_mbs);
    if (mbc == NULL || src_mbs == NULL){
        perror("Failed to allocate memory");
        free(mbc);
        free(src_mbs);
        return EXIT_FAILURE;
    }

    size_t mb_len = c32rtomb(mbc, chi_c, &state);
    if (mb_len == (size_t)-1){
        /* Without this check the loop below would run (size_t)-1 times. */
        perror("c32rtomb");
        free(mbc);
        free(src_mbs);
        return EXIT_FAILURE;
    }
    printf("The multibyte representation of U+9999 is:\n");
    // 0xE9A699, UTF-8
    for (size_t i = 0; i < mb_len; i++){
        /* Go through unsigned char so bytes >= 0x80 are not sign-extended. */
        printf("%#2x\t", (unsigned)(unsigned char)mbc[i]);
    }

    // "香" in Big5 encoding
    src_mbs[0] = (char)0xad;
    src_mbs[1] = (char)0xbb;

    wchar_t res_wc = 0;
    if (mbtowc(&res_wc, src_mbs, BUF_SIZE) < 0){ // Success in a Big5 locale, res_wc == 0x9999
        fputs("\nmbtowc could not decode the Big5 sequence\n", stderr);
    }

    char32_t res_c32 = 0;
    mb_len = mbrtoc32(&res_c32, src_mbs, (size_t)3, &state);
    // Returns (size_t)-1, encoding error
    if (mb_len == (size_t)-1){
        int saved_errno = errno; /* perror may overwrite errno */
        perror("Encoding error");
        free(mbc);
        free(src_mbs);
        return saved_errno;
    }
    else {
        /* Print the char32_t result (not the wchar_t one from mbtowc). */
        printf("\nThe 32-bit character representation of U+9999 is:\n%#x", (unsigned)res_c32);
    }

    free(mbc);
    free(src_mbs);
    return 0;
}

I've also read the documentation on cppreference.com, which says:

In any case, the multibyte character encoding used by this function is specified by the currently active C locale.

I expect mbrtoc32 to behave like mbtowc, which is converting the character from the locale-specific encoding to UTF-32 (in this case Big5 to UTF-32).

Is there any way to use mbrtoc32 to convert the multibyte character into char32_t without getting the "Encoding error"?

P.S.: I'm using Mingw-64 on Windows 10, compiled with gcc.

like image 546
FearlessSniper Avatar asked Nov 21 '25 21:11

FearlessSniper


1 Answers

I've found the problem. The Mingw-w64 I'm using expects all multibyte strings passed to mbrtoc32 and c32rtomb to be in UTF-8 encoding.

Code for mbrtoc32:

/*
 * Decode one character from the byte sequence s into the UTF-32 value *pc32.
 *
 * The input is always interpreted as UTF-8: the current C locale is not
 * consulted and the conversion state ps is unused.
 *
 * Returns:
 *   0           if the first byte is NUL (0 is stored in *pc32);
 *   1..4        the number of bytes consumed for a decoded character;
 *   (size_t)-2  if n is too small to hold the sequence announced by the lead
 *               byte (including n == 0);
 *   (size_t)-1  with errno = EILSEQ if the first byte is not a valid UTF-8
 *               lead byte (this is what a Big5 lead byte such as 0xAD hits).
 *
 * Note: continuation bytes are masked but not validated.
 */
size_t mbrtoc32 (char32_t *__restrict__ pc32,
         const char *__restrict__ s,
         size_t n,
         mbstate_t *__restrict__ __UNUSED_PARAM(ps))
{
    /* Nothing may be inspected when no bytes are available. */
    if (n == 0)
    {
        return (size_t)-2;
    }

    if (*s == 0)
    {
        *pc32 = 0;
        return 0;
    }

    /* ASCII character - high bit unset */
    if ((*s & 0x80) == 0)
    {
        *pc32 = (char32_t)*s;
        return 1;
    }

    /* Multibyte chars */
    if ((*s & 0xE0) == 0xC0) /* 110xxxxx needs 2 bytes */
    {
        if (n < 2)
        {
            return (size_t)-2;
        }

        *pc32 = ((char32_t)(s[0] & 31) << 6) | (char32_t)(s[1] & 63);
        return 2;
    }
    else if ((*s & 0xF0) == 0xE0) /* 1110xxxx needs 3 bytes */
    {
        if (n < 3)
        {
            return (size_t)-2;
        }

        *pc32 = ((char32_t)(s[0] & 15) << 12)
              | ((char32_t)(s[1] & 63) << 6)
              | (char32_t)(s[2] & 63);
        return 3;
    }
    else if ((*s & 0xF8) == 0xF0) /* 11110xxx needs 4 bytes */
    {
        if (n < 4)
        {
            return (size_t)-2;
        }

        /* The last continuation byte is s[3]; s[4] is past the sequence. */
        *pc32 = ((char32_t)(s[0] & 7) << 18)
              | ((char32_t)(s[1] & 63) << 12)
              | ((char32_t)(s[2] & 63) << 6)
              | (char32_t)(s[3] & 63);
        return 4;
    }

    errno = EILSEQ;
    return (size_t)-1;
}

and for c32rtomb:

/*
 * Encode the UTF-32 value c32 as a UTF-8 byte sequence stored at s.
 *
 * The output is always UTF-8: the current C locale is not consulted and the
 * conversion state ps is unused. No terminating NUL is written.
 *
 * Returns the number of bytes written (1..4), or (size_t)-1 with
 * errno = EILSEQ when c32 is not a Unicode scalar value (a surrogate in
 * U+D800..U+DFFF, or anything above U+10FFFF). If s is NULL the call behaves
 * like encoding U'\0' into an internal buffer and returns 1.
 */
size_t c32rtomb (char *__restrict__ s,
         char32_t c32,
         mbstate_t *__restrict__ __UNUSED_PARAM(ps))
{
    /* C11: a null s is equivalent to c32rtomb(buf, U'\0', ps). */
    if (s == NULL)
    {
        return 1;
    }

    if (c32 <= 0x7F) /* 7 bits needs 1 byte */
    {
        s[0] = (char)c32;
        return 1;
    }

    if (c32 <= 0x7FF) /* 11 bits needs 2 bytes */
    {
        s[0] = (char)(0xC0 | (c32 >> 6));
        s[1] = (char)(0x80 | (c32 & 0x3F));
        return 2;
    }

    /* Surrogate code points have no UTF-8 encoding. */
    if (c32 >= 0xD800 && c32 <= 0xDFFF)
    {
        errno = EILSEQ;
        return (size_t)-1;
    }

    if (c32 <= 0xFFFF) /* 16 bits needs 3 bytes */
    {
        s[0] = (char)(0xE0 | (c32 >> 12));
        s[1] = (char)(0x80 | ((c32 >> 6) & 0x3F));
        s[2] = (char)(0x80 | (c32 & 0x3F));
        return 3;
    }

    if (c32 <= 0x10FFFF) /* up to U+10FFFF needs 4 bytes */
    {
        s[0] = (char)(0xF0 | (c32 >> 18));
        s[1] = (char)(0x80 | ((c32 >> 12) & 0x3F));
        s[2] = (char)(0x80 | ((c32 >> 6) & 0x3F));
        s[3] = (char)(0x80 | (c32 & 0x3F));
        return 4;
    }

    errno = EILSEQ;
    return (size_t)-1;
}

Both of these functions expect the given multibyte string to be in UTF-8, regardless of the locale settings. On glibc, mbrtoc32 and c32rtomb simply call their wide-character counterparts to convert the characters. As wide-character conversions work properly on Mingw-w64, I used mbrtowc and wcrtomb in place of mbrtoc32 and c32rtomb respectively, the same way glibc does:

#include <errno.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <uchar.h>
#include <wchar.h>

/* Conversion state for wcrtomb/mbrtowc. Objects with static storage
 * duration are zero-initialized, which is the initial conversion state. */
mbstate_t state;

/*
 * Workaround program: use the locale-aware wide-character functions
 * (wcrtomb / mbrtowc) in place of c32rtomb / mbrtoc32, which on the
 * Mingw-w64 runtime in question always assume UTF-8.
 *
 * Note: where wchar_t is 16 bits wide (as on Windows), a single wchar_t can
 * only carry characters from the Basic Multilingual Plane, so this
 * substitution is limited to code points <= U+FFFF there.
 *
 * Returns 0 on success, otherwise the errno value of the failed call.
 */
int main(void){
    enum { BUF_SIZE = 32 };

    setlocale(LC_CTYPE, "");
    const char *locale_name = setlocale(LC_CTYPE, NULL);
    printf("Your locale is: %s\n", locale_name != NULL ? locale_name : "(unknown)");

    const char *src_mbs = "\xad\xbb"; // "香" in Big5 encoding
    char32_t src_c32 = 0x9999; // "香" code point

    /* calloc zero-fills, so the buffer is NUL-terminated for the %s below. */
    char *r_mbc = calloc(BUF_SIZE, sizeof *r_mbc);
    if (r_mbc == NULL){
        int saved_errno = errno; /* perror may overwrite errno */
        perror("Failed to allocate memory");
        return saved_errno;
    }

    // c32rtomb(r_mbc, src_c32, &state) produced 0xE9A699 here (UTF-8), not the expected Big5.
    size_t mb_len = wcrtomb(r_mbc, (wchar_t)src_c32, &state); // Returns 0xADBB, Big5 of "香", OK
    if (mb_len == (size_t)-1){
        /* Without this check the loop below would run (size_t)-1 times. */
        int saved_errno = errno;
        perror("Encoding error");
        free(r_mbc);
        return saved_errno;
    }
    printf("Character U+9999 is %s, ( ", r_mbc);
    for (size_t i = 0; i < mb_len; i++){
        printf("%#hhx ", (unsigned char)r_mbc[i]);
    }
    printf(")\n");

    // mbrtoc32(&r_c32, src_mbs, (size_t)3, &state) returned (size_t)-1 here, encoding error.
    /* Decode into a real wchar_t and widen afterwards: writing through a
     * (wchar_t *) cast of a char32_t object violates aliasing rules and only
     * fills part of the object when wchar_t is narrower than char32_t. */
    wchar_t r_wc = 0;
    mb_len = mbrtowc(&r_wc, src_mbs, (size_t)3, &state); // Returns 0x9999, OK
    if (mb_len == (size_t)-1){
        int saved_errno = errno;
        perror("Encoding error");
        free(r_mbc);
        return saved_errno;
    }
    else {
        char32_t r_c32 = (char32_t)r_wc;
        printf("\nThe 32-bit character representation of U+9999 is:\n%#x", (unsigned)r_c32);
    }

    free(r_mbc);
    return 0;
}
like image 123
FearlessSniper Avatar answered Nov 23 '25 10:11

FearlessSniper



Donate For Us

If you love us, you can donate to us via PayPal or buy us a coffee so we can maintain and grow the site. Thank you!