| /* Copyright 2016 The Chromium OS Authors. All rights reserved. |
| * Use of this source code is governed by a BSD-style license that can be |
| * found in the LICENSE file. |
| */ |
| |
| #include <stdlib.h> |
| #include <stdint.h> |
| #include <sys/types.h> |
| |
| #ifdef CRAS_DBUS |
| #include <dbus/dbus.h> |
| #endif |
| |
| #include "cras_utf8.h" |
| #include "cras_util.h" |
| |
/* UTF-8 encoding of the byte order mark (U+FEFF). */
static const uint8_t kUTF8ByteOrderMask[3] = { 0xef, 0xbb, 0xbf };

/* Inclusive range of values accepted for one byte of a sequence. */
typedef struct u8range {
	uint8_t min;
	uint8_t max;
} u8range_t;

/*
 * Each table below describes one family of multi-byte sequences. Entry 0 is
 * the range of the lead byte, the following entries are the ranges of the
 * continuation bytes in order, and a { 0, 0 } entry terminates the table.
 */

/* Two-byte sequences. Lead bytes 0xc0 and 0xc1 (overlong forms) are absent. */
static const u8range_t kUTF8TwoByteSeq[] = {
	{ 0xc2, 0xdf },
	{ 0x80, 0xbf },
	{ 0, 0 },
};

/* Three-byte sequences led by 0xe0; second byte starts at 0xa0. */
static const u8range_t kUTF8ByteSeqE0[] = {
	{ 0xe0, 0xe0 },
	{ 0xa0, 0xbf },
	{ 0x80, 0xbf },
	{ 0, 0 },
};

/* Three-byte sequences led by 0xe1..0xec. */
static const u8range_t kUTF8ByteSeqE1EC[] = {
	{ 0xe1, 0xec },
	{ 0x80, 0xbf },
	{ 0x80, 0xbf },
	{ 0, 0 },
};

/* Three-byte sequences led by 0xed; second byte stops at 0x9f, which keeps
 * the surrogate code points U+D800..U+DFFF out. */
static const u8range_t kUTF8ByteSeqED[] = {
	{ 0xed, 0xed },
	{ 0x80, 0x9f },
	{ 0x80, 0xbf },
	{ 0, 0 },
};

/* Three-byte sequences led by 0xee..0xef. */
static const u8range_t kUTF8ByteSeqEEEF[] = {
	{ 0xee, 0xef },
	{ 0x80, 0xbf },
	{ 0x80, 0xbf },
	{ 0, 0 },
};

/* Four-byte sequences led by 0xf0; second byte starts at 0x90. */
static const u8range_t kUTF8ByteSeqF0[] = {
	{ 0xf0, 0xf0 },
	{ 0x90, 0xbf },
	{ 0x80, 0xbf },
	{ 0x80, 0xbf },
	{ 0, 0 },
};

/* Four-byte sequences led by 0xf1..0xf3. */
static const u8range_t kUTF8ByteSeqF1F3[] = {
	{ 0xf1, 0xf3 },
	{ 0x80, 0xbf },
	{ 0x80, 0xbf },
	{ 0x80, 0xbf },
	{ 0, 0 },
};

/* Four-byte sequences led by 0xf4; second byte stops at 0x8f, which caps the
 * encodable code points at U+10FFFF. */
static const u8range_t kUTF8ByteSeqF4[] = {
	{ 0xf4, 0xf4 },
	{ 0x80, 0x8f },
	{ 0x80, 0xbf },
	{ 0x80, 0xbf },
	{ 0, 0 },
};

/* Terminator for kUTF8Sequences: a table whose lead-byte range is { 0, 0 }. */
static const u8range_t kUTF8NullRange[] = { { 0, 0 } };

/* One entry of the sequence list: a pointer to a range table. */
typedef struct utf8seq {
	const u8range_t *ranges;
} utf8seq_t;

/* All accepted multi-byte sequence families, terminated by kUTF8NullRange. */
static const utf8seq_t kUTF8Sequences[] = {
	{ kUTF8TwoByteSeq },  { kUTF8ByteSeqE0 },   { kUTF8ByteSeqE1EC },
	{ kUTF8ByteSeqED },   { kUTF8ByteSeqEEEF }, { kUTF8ByteSeqF0 },
	{ kUTF8ByteSeqF1F3 }, { kUTF8ByteSeqF4 },   { kUTF8NullRange }
};

/* Returns the length of the byte order mark if |string| starts with the
 * complete mark, 0 otherwise. Never reads past the terminating NUL: every
 * byte of the mark is non-zero, so the comparison fails at the terminator. */
static size_t utf8_bom_length(const char *string)
{
	size_t i;

	for (i = 0; i < sizeof(kUTF8ByteOrderMask); i++) {
		if ((uint8_t)string[i] != kUTF8ByteOrderMask[i])
			return 0;
	}
	return sizeof(kUTF8ByteOrderMask);
}

/* Looks up the sequence family whose lead-byte range contains |lead|.
 * Returns a pointer to the range of the first continuation byte, or NULL if
 * |lead| cannot start any sequence. */
static const u8range_t *utf8_continuation_ranges(uint8_t lead)
{
	const utf8seq_t *seq;

	for (seq = kUTF8Sequences; seq->ranges->min != 0; seq++) {
		if (lead >= seq->ranges->min && lead <= seq->ranges->max)
			return seq->ranges + 1;
	}
	return NULL;
}

/* Checks whether a NUL-terminated string is valid UTF-8.
 *
 * A byte order mark is accepted only as a complete three-byte prefix at the
 * very start of the string. Every other non-ASCII byte has to be part of a
 * sequence described by the range tables above.
 *
 * Args:
 *    string - The string to check. May be NULL, which is reported as invalid.
 *    bad_pos - Optional output. For a non-NULL string it receives the index
 *        of the first offending byte, or the length of the string when the
 *        string is valid or ends in the middle of a sequence. For a NULL
 *        string it receives (size_t)-1.
 *
 * Returns:
 *    1 if the string is valid UTF-8, 0 otherwise.
 */
int valid_utf8_string(const char *string, size_t *bad_pos)
{
	/* Range expected for the next byte; NULL or a { 0, 0 } entry means
	 * that no multi-byte sequence is currently open. */
	const u8range_t *expect = NULL;
	const char *pos;
	uint8_t byte;
	int ret = 1;

	if (!string) {
		/* No pointer arithmetic on NULL; report a fixed position. */
		if (bad_pos)
			*bad_pos = (size_t)-1;
		return 0;
	}

	pos = string + utf8_bom_length(string);

	for (; (byte = (uint8_t)*pos) != 0; pos++) {
		if (expect && expect->min != 0) {
			/* Inside a sequence: byte must be in the next range. */
			if (byte < expect->min || byte > expect->max) {
				ret = 0;
				break;
			}
			expect++;
			continue;
		}

		if (byte < 128) {
			/* Ascii character. */
			continue;
		}

		/* Lead byte: select the sequence family it belongs to. */
		expect = utf8_continuation_ranges(byte);
		if (!expect) {
			/* This character doesn't belong in UTF8. */
			ret = 0;
			break;
		}
	}

	if (ret && expect && expect->min != 0) {
		/* Stopped in the middle of a sequence. */
		ret = 0;
	}

	if (bad_pos)
		*bad_pos = (size_t)(pos - string);
	return ret;
}
| |
#ifdef CRAS_DBUS

/* With DBus support compiled in, defer to the DBus validator so that the
 * verdict agrees with the UTF-8 sequences the DBus implementation expects.
 * The result is normalized to 0 or 1. */
int is_utf8_string(const char *string)
{
	if (dbus_validate_utf8(string, NULL))
		return 1;
	return 0;
}

#else

/* Without DBus, use the local validator. The position of the first bad byte
 * is not needed, so no output pointer is passed. */
int is_utf8_string(const char *string)
{
	size_t *no_position = NULL;

	return valid_utf8_string(string, no_position);
}

#endif