#include #include #include #include #include typedef enum { VALID, INCOMPLETE, NOTUNICODE, OVERLONG, MALFORMED } Status; static gboolean ucs4_equal (gunichar *a, gunichar *b) { while (*a && *b && (*a == *b)) { a++; b++; } return (*a == *b); } static gboolean utf16_equal (gunichar2 *a, gunichar2 *b) { while (*a && *b && (*a == *b)) { a++; b++; } return (*a == *b); } static gint utf16_count (gunichar2 *a) { gint result = 0; while (a[result]) result++; return result; } static void process (gint line, gchar *utf8, Status status, gunichar *ucs4, gint ucs4_len) { const gchar *end; gboolean is_valid = g_utf8_validate (utf8, -1, &end); GError *error = NULL; glong items_read, items_written; switch (status) { case VALID: g_assert_true (is_valid); break; case NOTUNICODE: case INCOMPLETE: case OVERLONG: case MALFORMED: g_assert_false (is_valid); break; } if (status == INCOMPLETE) { gunichar *ucs4_result; ucs4_result = g_utf8_to_ucs4 (utf8, -1, NULL, NULL, &error); g_assert_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT); g_clear_error (&error); ucs4_result = g_utf8_to_ucs4 (utf8, -1, &items_read, NULL, &error); g_assert_no_error (error); g_assert_nonnull (ucs4_result); g_assert_cmpint (items_read, !=, strlen (utf8)); g_free (ucs4_result); } if (status == VALID || status == NOTUNICODE) { gunichar *ucs4_result; ucs4_result = g_utf8_to_ucs4 (utf8, -1, &items_read, &items_written, &error); g_assert_no_error (error); g_assert_nonnull (ucs4_result); g_assert_true (ucs4_equal (ucs4_result, ucs4)); g_assert_cmpint (items_read, ==, strlen (utf8)); g_assert_cmpint (items_written, ==, ucs4_len); g_free (ucs4_result); } if (status == VALID) { gunichar *ucs4_result; gchar *utf8_result; ucs4_result = g_utf8_to_ucs4_fast (utf8, -1, &items_written); g_assert_nonnull (ucs4_result); g_assert_true (ucs4_equal (ucs4_result, ucs4)); g_assert_cmpint (items_written, ==, ucs4_len); utf8_result = g_ucs4_to_utf8 (ucs4_result, -1, &items_read, &items_written, &error); g_assert_no_error (error); g_assert_nonnull (utf8_result); g_assert_cmpstr ((char *) utf8_result, ==, (char *) utf8); g_assert_cmpint (items_read, ==, ucs4_len); g_assert_cmpint (items_written, ==, strlen (utf8)); g_free (utf8_result); g_free (ucs4_result); } if (status == VALID) { gunichar2 *utf16_expected_tmp; gunichar2 *utf16_expected; gunichar2 *utf16_from_utf8; gunichar2 *utf16_from_ucs4; gunichar *ucs4_result; gsize bytes_written; gint n_chars; gchar *utf8_result; #if G_BYTE_ORDER == G_LITTLE_ENDIAN #define TARGET "UTF-16LE" #else #define TARGET "UTF-16" #endif utf16_expected_tmp = (gunichar2 *) g_convert (utf8, -1, TARGET, "UTF-8", NULL, &bytes_written, NULL); g_assert_nonnull (utf16_expected_tmp); /* zero-terminate and remove BOM */ n_chars = bytes_written / 2; if (utf16_expected_tmp[0] == 0xfeff) /* BOM */ { n_chars--; utf16_expected = g_new (gunichar2, n_chars + 1); memcpy (utf16_expected, utf16_expected_tmp + 1, sizeof(gunichar2) * n_chars); } else { /* We expect the result of the conversion via iconv() to be native-endian. */ g_assert_false (utf16_expected_tmp[0] == 0xfffe); utf16_expected = g_new (gunichar2, n_chars + 1); memcpy (utf16_expected, utf16_expected_tmp, sizeof(gunichar2) * n_chars); } utf16_expected[n_chars] = '\0'; utf16_from_utf8 = g_utf8_to_utf16 (utf8, -1, &items_read, &items_written, &error); g_assert_no_error (error); g_assert_nonnull (utf16_from_utf8); g_assert_cmpint (items_read, ==, (glong) strlen (utf8)); g_assert_cmpint (utf16_count (utf16_from_utf8), ==, items_written); utf16_from_ucs4 = g_ucs4_to_utf16 (ucs4, -1, &items_read, &items_written, &error); g_assert_no_error (error); g_assert_nonnull (utf16_from_ucs4); g_assert_cmpint (items_read, ==, ucs4_len); g_assert_cmpint (utf16_count (utf16_from_ucs4), ==, items_written); g_assert_true (utf16_equal (utf16_from_utf8, utf16_expected)); g_assert_true (utf16_equal (utf16_from_ucs4, utf16_expected)); g_assert_cmpstr ((char *) utf16_from_utf8, ==, (char *) utf16_expected); g_assert_cmpstr ((char *) utf16_from_ucs4, ==, (char *) utf16_expected); utf8_result = g_utf16_to_utf8 (utf16_from_utf8, -1, &items_read, &items_written, &error); g_assert_no_error (error); g_assert_nonnull (utf8_result); g_assert_cmpint (items_read, ==, utf16_count (utf16_from_utf8)); g_assert_cmpint (items_written, ==, (glong) strlen (utf8)); ucs4_result = g_utf16_to_ucs4 (utf16_from_ucs4, -1, &items_read, &items_written, &error); g_assert_no_error (error); g_assert_nonnull (ucs4_result); g_assert_cmpint (items_read, ==, utf16_count (utf16_from_utf8)); g_assert_cmpint (items_written, ==, ucs4_len); g_assert_cmpstr (utf8, ==, utf8_result); g_assert_cmpstr ((char *) ucs4, ==, (char *) ucs4_result); g_free (utf16_expected_tmp); g_free (utf16_expected); g_free (utf16_from_utf8); g_free (utf16_from_ucs4); g_free (utf8_result); g_free (ucs4_result); } } static void test_unicode_encoding (void) { gchar *testfile; gchar *contents; GError *error = NULL; gchar *p, *end; char *tmp; gint state = 0; gint line = 1; gint start_line = 0; /* Quiet GCC */ gchar *utf8 = NULL; /* Quiet GCC */ GArray *ucs4; Status status = VALID; /* Quiet GCC */ testfile = g_test_build_filename (G_TEST_DIST, "utf8.txt", NULL); g_file_get_contents (testfile, &contents, NULL, &error); g_assert_no_error (error); ucs4 = g_array_new (TRUE, FALSE, sizeof(gunichar)); p = contents; /* Loop over lines */ while (*p) { while (*p && (*p == ' ' || *p == '\t')) p++; end = p; while (*end && (*end != '\r' && *end != '\n')) end++; if (!*p || *p == '#' || *p == '\r' || *p == '\n') goto next_line; tmp = g_strstrip (g_strndup (p, end - p)); switch (state) { case 0: /* UTF-8 string */ start_line = line; utf8 = tmp; tmp = NULL; break; case 1: /* Status */ if (!strcmp (tmp, "VALID")) status = VALID; else if (!strcmp (tmp, "INCOMPLETE")) status = INCOMPLETE; else if (!strcmp (tmp, "NOTUNICODE")) status = NOTUNICODE; else if (!strcmp (tmp, "OVERLONG")) status = OVERLONG; else if (!strcmp (tmp, "MALFORMED")) status = MALFORMED; else g_assert_not_reached (); if (status != VALID && status != NOTUNICODE) state++; /* No UCS-4 data */ break; case 2: /* UCS-4 version */ p = strtok (tmp, " \t"); while (p) { gchar *endptr; gunichar ch = strtoul (p, &endptr, 16); g_assert_cmpint (*endptr, == ,'\0'); g_array_append_val (ucs4, ch); p = strtok (NULL, " \t"); } break; } g_free (tmp); state = (state + 1) % 3; if (state == 0) { process (start_line, utf8, status, (gunichar *)ucs4->data, ucs4->len); g_array_set_size (ucs4, 0); g_free (utf8); } next_line: p = end; if (*p && *p == '\r') p++; if (*p && *p == '\n') p++; line++; } g_free (testfile); g_array_free (ucs4, TRUE); g_free (contents); } int main (int argc, char **argv) { g_test_init (&argc, &argv, NULL); g_test_add_func ("/unicode/encoding", test_unicode_encoding); return g_test_run (); }