Skip to content

Commit

Permalink
Implemented valid UTF8 character checks
Browse files Browse the repository at this point in the history
  • Loading branch information
zbalkan committed Sep 26, 2024
1 parent 22600e0 commit 48e2945
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 9 deletions.
8 changes: 4 additions & 4 deletions src/common/utf8_op/src/utf8_op.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,21 @@
#define REPLACEMENT_INC 4096

/* Single byte: 0xxxxxxx */
/* True when the high bit of the first byte is clear (plain ASCII, NUL included). */
#define valid_1(x) (!((x)[0] & 0x80))

/* Two bytes: 110xxxxx 10xxxxxx */
/* Starting bytes 0xC0 and 0xC1 are forbidden (overlong). They are the only
 * 110xxxxx lead bytes whose bits 1..4 are all zero, so a mask test rejects
 * them without comparing against a (char) constant; the result therefore
 * does not depend on whether the element type of x is signed or unsigned. */
#define valid_2(x) ((((x)[0] & 0xE0) == 0xC0) && (((x)[0] & 0x1E) != 0) && (((x)[1] & 0xC0) == 0x80))

/* Three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
/* 0xE0 followed by 0x80..0x9F is an overlong encoding */
/* 0xED followed by 0xA0..0xBF encodes U+D800..U+DFFF (UTF-16 surrogate halves) */
/* The lead byte is masked with 0xFF before comparison so the checks hold for
 * both signed and unsigned element types. x[1] is verified to be a
 * continuation byte before x[2] is read, so evaluation stops at a NUL. */
#define valid_3(x) ((((x)[0] & 0xF0) == 0xE0) \
    && ((((x)[0] & 0xFF) != 0xE0) || (((x)[1] & 0xE0) != 0x80)) \
    && ((((x)[0] & 0xFF) != 0xED) || (((x)[1] & 0xE0) != 0xA0)) \
    && (((x)[1] & 0xC0) == 0x80) \
    && (((x)[2] & 0xC0) == 0x80))

/* Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
/* 0xF0 followed by 0x80..0x8F is an overlong encoding */
/* 0xF4 is only allowed with a second byte in 0x80..0x8F (Unicode limit U+10FFFF) */
/* Starting bytes 0xF5..0xF7 are always beyond the Unicode limit and are rejected */
/* The lead byte is masked with 0xFF before comparison so the checks hold for
 * both signed and unsigned element types. Each continuation byte is verified
 * before the next one is read, so evaluation stops at a NUL. */
#define valid_4(x) ((((x)[0] & 0xF8) == 0xF0) \
    && (((x)[0] & 0xFF) <= 0xF4) \
    && ((((x)[0] & 0xFF) != 0xF0) || (((x)[1] & 0xF0) != 0x80)) \
    && ((((x)[0] & 0xFF) != 0xF4) || (((x)[1] & 0xF0) == 0x80)) \
    && (((x)[1] & 0xC0) == 0x80) \
    && (((x)[2] & 0xC0) == 0x80) \
    && (((x)[3] & 0xC0) == 0x80))

/* Return whether a string is UTF-8 */
bool w_utf8_valid(const char * string) {
Expand Down
83 changes: 78 additions & 5 deletions src/common/utf8_op/tests/unit/tests/test_utf8_op.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,57 @@
#include "../../headers/shared.h"
#include "../wrappers/common.h"

// Tests
/**
 * Run `input` through w_utf8_filter() and assert on the validity of the result.
 *
 * @param input        NUL-terminated byte string handed to w_utf8_filter().
 * @param replacement  Forwarded unchanged as w_utf8_filter()'s second argument.
 * @param expect_valid When true, w_utf8_valid() on the filtered copy must
 *                     yield 1; when false it must yield 0.
 *
 * NOTE(review): the check runs on the filtered copy, not on `input` itself.
 * If w_utf8_filter() strips invalid bytes when `replacement` is false, a call
 * with expect_valid == false can never pass -- confirm against the filter.
 */
void assert_valid_utf8(const char *input, bool replacement, bool expect_valid) {
    char *output = w_utf8_filter(input, replacement);
    const int expected = expect_valid ? 1 : 0;

    assert_int_equal(w_utf8_valid(output), expected);

    /* The filtered copy is released with free() once it has been checked. */
    free(output);
}

// Well-formed inputs: the filtered output must validate both with and
// without replacement enabled.
void test_valid_utf8_sequences(void **state)
{
    (void)state;

    const char *samples[] = {
        "Hello, World!",     // ASCII only (1 byte per character)
        "\xC3\x9C",          // U+00DC 'Ü', 2-byte sequence
        "\xC3\xBC",          // U+00FC 'ü', 2-byte sequence
        "\xE2\x98\x83",      // U+2603 '☃', 3-byte sequence
        "\xF0\x9F\x98\x81",  // U+1F601 '😁', 4-byte sequence
        "Σὲ γνωρίζω",        // Greek text, mixed multi-byte sequences
        "中文字符",          // Chinese characters, 3 bytes each
        NULL                 // Sentinel terminating the list
    };

    const char **cursor = samples;
    while (*cursor != NULL) {
        assert_valid_utf8(*cursor, false, true);
        assert_valid_utf8(*cursor, true, true);
        ++cursor;
    }
}

// Malformed inputs: expected to be reported invalid when filtered without
// replacement, and valid once replacement is enabled.
// NOTE(review): the helper validates the filtered copy; confirm that
// w_utf8_filter(..., false) keeps invalid bytes, otherwise the first
// expectation below cannot hold.
void test_invalid_utf8_sequences(void **state)
{
    (void)state;

    const char *samples[] = {
        "\xC0\xAF",              // 2-byte overlong encoding of '/' (U+002F)
        "\xE0\x80\xAF",          // 3-byte overlong encoding of '/' (U+002F)
        "\xED\xA0\x80",          // U+D800, a UTF-16 surrogate half
        "\xF8\x88\x80\x80\x80",  // 5-byte form; UTF-8 stops at 4 bytes
        "\xFF",                  // Byte that never appears in UTF-8
        "\x80",                  // Continuation byte with no lead byte
        "\xC3\x28",              // 2-byte lead followed by a non-continuation byte
        NULL                     // Sentinel terminating the list
    };

    const char **cursor = samples;
    while (*cursor != NULL) {
        assert_valid_utf8(*cursor, false, false);
        assert_valid_utf8(*cursor, true, true);  // Replaced, thus valid output
        ++cursor;
    }
}

void test_utf8_random_replace(void **state)
{
Expand All @@ -38,6 +88,10 @@ void test_utf8_random_replace(void **state)

char * copy = w_utf8_filter(buffer, true);
int r = w_utf8_valid(copy);

/* Check if the output is valid */
assert_int_equal(r, 1);

free(copy);
}

Expand All @@ -50,7 +104,6 @@ void test_utf8_random_not_replace(void **state)
randombytes(buffer, LENGTH - 1);

/* Avoid zeroes */

for (i = 0; i < LENGTH - 1; i++) {
buffer[i] = buffer[i] ? buffer[i] : '0';
}
Expand All @@ -59,13 +112,33 @@ void test_utf8_random_not_replace(void **state)

char * copy = w_utf8_filter(buffer, false);
int r = w_utf8_valid(copy);

/* The result could be either valid or invalid */
(void)r; // Use (void) to avoid unused variable warning in case you don't assert

free(copy);
}

// Boundary of the Unicode range: U+10FFFF must be accepted, and the first
// encoding past it must be rejected (both checked without replacement).
void test_utf8_edge_cases(void **state)
{
    (void)state;

    const char *highest_code_point = "\xF4\x8F\xBF\xBF";  // U+10FFFF
    const char *past_unicode_limit = "\xF4\x90\x80\x80";  // would decode to U+110000

    assert_valid_utf8(highest_code_point, false, true);   // Should be valid
    assert_valid_utf8(past_unicode_limit, false, false);  // Should be invalid
}

/* Test runner: registers every test in this file exactly once and returns
 * the value of cmocka_run_group_tests(). No group setup or teardown
 * callbacks are installed (both are passed as NULL). */
int main(void) {
    const struct CMUnitTest tests[] = {
        cmocka_unit_test(test_valid_utf8_sequences),
        cmocka_unit_test(test_invalid_utf8_sequences),
        cmocka_unit_test(test_utf8_random_replace),
        cmocka_unit_test(test_utf8_random_not_replace),
        cmocka_unit_test(test_utf8_edge_cases),
    };

    return cmocka_run_group_tests(tests, NULL, NULL);
}

0 comments on commit 48e2945

Please sign in to comment.