Skip to content
Snippets Groups Projects
Commit a8ceed4b authored by Rui Ueyama
Browse files

Add UTF-32 string literal

parent 7f39f986
Branches
No related merge requests found
......@@ -41,6 +41,17 @@ int main() {
ASSERT(u'b', u"βb"[1]);
ASSERT(0, u"βb"[2]);
ASSERT(4, sizeof(U""));
ASSERT(20, sizeof(U"\xffzzz"));
ASSERT(0, memcmp(U"", "\0\0\0\0", 4));
ASSERT(0, memcmp(U"abc", "a\0\0\0b\0\0\0c\0\0\0\0\0\0\0", 16));
ASSERT(0, memcmp(U"日本語", "\345e\0\0,g\0\0\236\212\0\0\0\0\0\0", 16));
ASSERT(0, memcmp(U"🍣", "c\363\001\0\0\0\0\0", 8));
ASSERT(u'β', U"βb"[0]);
ASSERT(u'b', U"βb"[1]);
ASSERT(0, U"βb"[2]);
ASSERT(1, U"\xffffffff"[0] >> 31);
printf("OK\n");
return 0;
}
......@@ -257,6 +257,28 @@ static Token *read_utf16_string_literal(Token *cur, char *start) {
return tok;
}
// Tokenize a string literal whose source text is UTF-8 and whose
// in-memory form is UTF-32 (one fixed-width 4-byte unit per element).
//
// `start` is the first character of the literal proper; scanning begins
// at `start + 1`, so it is presumably the opening double quote. `ty` is
// the element type used for the resulting array type. The returned
// token is appended after `cur` by new_token().
static Token *read_utf32_string_literal(Token *cur, char *start, Type *ty) {
  char *end = string_literal_end(start + 1);

  // One zero-initialized 4-byte slot per source byte in [start, end).
  // NOTE(review): this is sufficient as long as every escape sequence
  // or UTF-8 sequence consumes at least one source byte; confirm
  // against read_escaped_char() and decode_utf8().
  uint32_t *buf = calloc(end - start, sizeof *buf);
  int len = 0;

  char *p = start + 1;
  while (p < end) {
    uint32_t c;

    // Both helpers advance `p` past whatever they consumed.
    if (*p == '\\')
      c = read_escaped_char(&p, p + 1);
    else
      c = decode_utf8(&p, p);

    buf[len++] = c;
  }

  // The token text spans `start` through `end` inclusive.
  Token *tok = new_token(TK_STR, cur, start, end - start + 1);

  // `len + 1` elements: the extra one is the terminator, which is
  // already zero because the buffer came from calloc().
  tok->ty = array_of(ty, len + 1);
  tok->str = (char *)buf;
  return tok;
}
static Token *read_char_literal(Token *cur, char *start, Type *ty) {
char *p = start + 1;
if (*p == '\0')
......@@ -497,6 +519,13 @@ Token *tokenize(File *file) {
continue;
}
// UTF-32 string literal
if (startswith(p, "U\"")) {
cur = read_utf32_string_literal(cur, p + 1, ty_uint);
p += cur->len + 1;
continue;
}
// Character literal
if (*p == '\'') {
cur = read_char_literal(cur, p, ty_int);
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment