Go Back

0 /*

1 * MIT License

2 * Copyright (c) 2010-2020 nsf <no.smile.face@gmail.com>

3 * 2015-2022 Adam Saponara <as@php.net>

4 * 2023-2024 RMF <rawmonk@rmf-dev.com>

5 */

6 #include <stdio.h>

7 #include <stddef.h>

8 #include <stdint.h>

9 #include "wcwidth.h"

10

/*
 * Length in bytes of a UTF-8 sequence, indexed by the value of its first
 * byte. Bytes 0x80-0xBF as well as 0xFE and 0xFF are given a length of 1.
 * The 5- and 6-byte forms (lead bytes 0xF8-0xFD) are included.
 */
static const unsigned char utf8_length[256] = {
	/* 0x00-0x0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x10-0x1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x20-0x2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x30-0x3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x40-0x4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50-0x5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x60-0x6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70-0x7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x80-0x8F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x90-0x9F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xA0-0xAF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xB0-0xBF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xC0-0xCF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xD0-0xDF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xE0-0xEF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	/* 0xF0-0xFF */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
};

/*
 * Mask selecting the payload bits of a sequence's first byte, indexed by
 * (sequence length - 1).
 */
static const unsigned char utf8_mask[6] = {
	0x7f, /* 1 byte  */
	0x1f, /* 2 bytes */
	0x0f, /* 3 bytes */
	0x07, /* 4 bytes */
	0x03, /* 5 bytes */
	0x01  /* 6 bytes */
};

25

26 int utf8_char_length(char c) {

27 return utf8_length[(unsigned char)c];

28 }

29

/*
 * Returns how many bytes are needed to encode code point `c` in UTF-8,
 * using the extended scheme that goes up to 6 bytes.
 */
int utf8_unicode_length(uint32_t c) {
	if (c < 0x80)
		return 1;
	if (c < 0x800)
		return 2;
	if (c < 0x10000)
		return 3;
	if (c < 0x200000)
		return 4;
	if (c < 0x4000000)
		return 5;
	return 6;
}

47

48 int utf8_char_to_unicode(uint32_t *out, const char *c) {

49

50 int i;

51 unsigned char len, mask;

52 uint32_t result;

53

54 len = utf8_char_length(*c);

55 mask = utf8_mask[len - 1];

56 result = c[0] & mask;

57 for (i = 1; i < len; ++i) {

58 result <<= 6;

59 result |= c[i] & 0x3f;

60 }

61

62 *out = result;

63 return (int)len;

64 }

65

/*
 * Encodes code point `c` as UTF-8 into `out` (extended scheme, up to 6
 * bytes). No terminator is written.
 *
 * out: destination buffer; must have room for the encoded sequence.
 * c:   value to encode.
 *
 * Returns the number of bytes written.
 */
int utf8_unicode_to_char(char *out, uint32_t c) {
	/* First value that no longer fits in 1, 2, 3, 4 and 5 bytes. */
	static const uint32_t limit[5] = {
		0x80, 0x800, 0x10000, 0x200000, 0x4000000
	};
	/* Marker bits of the first byte, indexed by (length - 1). */
	static const unsigned char lead_bits[6] = {
		0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
	};
	int size = 1;
	int pos;

	while (size < 6 && c >= limit[size - 1])
		size++;

	/* Fill continuation bytes from the last one back to the second. */
	pos = size;
	while (--pos > 0) {
		out[pos] = (c & 0x3f) | 0x80;
		c >>= 6;
	}
	out[0] = c | lead_bits[size - 1];

	return size;
}

99

/*
 * Advances *ptr past the UTF-8 sequence it currently points at, using the
 * length implied by the first byte, and returns the new position.
 */
const char *utf8_next(const char **ptr) {
	const char *cur = *ptr;

	cur += utf8_char_length(*cur);
	*ptr = cur;
	return cur;
}

105

/*
 * Given byte offset `i` into `ptr`, returns the offset of the start of the
 * preceding UTF-8 sequence: steps back one byte, then keeps stepping back
 * over continuation bytes (10xxxxxx). Never goes below offset 0 when `i` is
 * non-negative; an `i` of 0 is returned unchanged.
 */
int utf8_previous(const char *ptr, int i) {
	if (i == 0)
		return 0;

	for (i -= 1; i > 0; i--) {
		if ((ptr[i] & 0xC0) != 0x80)
			break;
	}
	return i;
}

111

/*
 * Sums the mk_wcwidth() value of every code point in a UTF-8 string.
 *
 * ptr:    start of the text.
 * length: maximum number of bytes to examine.
 *
 * Scanning stops at the first NUL code point, at `length`, or at a trailing
 * sequence whose first byte announces more bytes than remain within
 * `length` (such a sequence is ignored rather than decoded).
 *
 * Returns the accumulated width.
 */
int utf8_width(const char *ptr, size_t length) {
	int width = 0;
	size_t i = 0;

	while (i < length) {
		uint32_t ch;
		size_t need = (size_t)utf8_char_length(ptr[i]);

		/*
		 * The decoder reads `need` bytes unconditionally; refuse a
		 * sequence that would cross the end of the buffer.
		 */
		if (need > length - i)
			break;

		i += utf8_char_to_unicode(&ch, &ptr[i]);
		if (!ch)
			break;
		width += mk_wcwidth(ch);
	}
	return width;
}

126

/*
 * Copies a UTF-8 string into a buffer of `length` bytes without splitting a
 * multi-byte sequence.
 *
 * dst:    destination buffer of at least `length` bytes.
 * src:    NUL-terminated source text.
 * length: capacity of `dst`, terminator included.
 *
 * Copying stops at the source terminator, or before the first sequence that
 * would not leave room for the terminator. When `length` is non-zero `dst`
 * is always NUL-terminated; when it is zero nothing is written.
 *
 * Returns the number of bytes copied, terminator excluded.
 */
int utf8_cpy(char *dst, const char *src, size_t length) {
	size_t i = 0;

	while (i < length) {
		size_t len = (size_t)utf8_char_length(src[i]);
		size_t end;

		/*
		 * Stop at the end of the source so nothing past its
		 * terminator is read, or when the next sequence plus the
		 * terminator no longer fits.
		 */
		if (src[i] == '\0' || i + len >= length) {
			dst[i] = '\0';
			break;
		}

		/*
		 * Copy the sequence; bail out early if the source ends in
		 * the middle of it, the outer loop then terminates dst.
		 */
		for (end = i + len; i < end && src[i] != '\0'; i++)
			dst[i] = src[i];
	}
	return (int)i;
}

142

/*
 * Reads one UTF-8 encoded code point from a stream.
 *
 * f:   stream to read from.
 * out: receives the code point; left untouched on failure.
 *
 * A first byte with a table length of 1 is stored as-is. For longer
 * sequences the remaining bytes are read and the whole sequence is decoded.
 *
 * Returns 0 on success, EOF if the stream ends (also in the middle of a
 * sequence), and -1 if the sequence would not fit the local buffer.
 */
int utf8_fgetc(FILE *f, uint32_t *out) {
	char buf[8];
	uint32_t decoded;
	int ch, len, pos;

	ch = fgetc(f);
	/* Check for EOF before interpreting the value as a byte. */
	if (ch == EOF)
		return EOF;

	len = utf8_char_length((char)ch);
	if (len <= 1) {
		*out = (uint32_t)ch;
		return 0;
	}
	if ((size_t)len >= sizeof(buf))
		return -1;

	buf[0] = (char)ch;
	for (pos = 1; pos < len; pos++) {
		ch = fgetc(f);
		if (ch == EOF)
			return EOF;
		buf[pos] = (char)ch;
	}
	buf[pos] = '\0';

	/* Decode into a real uint32_t rather than through a cast int. */
	utf8_char_to_unicode(&decoded, buf);
	*out = decoded;

	return 0;
}

167

/*
 * Returns the number of leading bytes of `ptr` that form whole UTF-8
 * sequences and fit strictly inside `length` bytes.
 *
 * If a NUL byte is met before `length`, its offset is returned. Otherwise
 * the last sequence visited (the one that reaches or crosses `length`) is
 * left out and the offset of its first byte is returned. A `length` of 0
 * yields 0.
 */
int utf8_len(const char *ptr, size_t length) {
	size_t pos = 0;
	size_t prev = 0;

	while (pos < length && ptr[pos] != '\0') {
		prev = pos;
		pos += utf8_char_length(ptr[pos]);
	}

	/* Reached or overshot the limit: fall back to the last start. */
	if (pos >= length)
		pos = prev;

	return (int)pos;
}

175

/*
 * Writes `buf` to stream `f`, limited to the byte count that utf8_len()
 * reports for (`buf`, `length`). The bytes are written with fwrite(); no
 * formatting is performed.
 *
 * Returns the value returned by fwrite().
 */
int utf8_fprintf(FILE *f, const char *buf, size_t length) {
	const int bytes = utf8_len(buf, length);
	const size_t written = fwrite(buf, 1, (size_t)bytes, f);

	return (int)written;
}

180