blob: ee02a15dc7e33584699a1d6317fa7770b82451ea [file] [log] [blame]
Matthias Andreas Benkardb382b102021-01-02 15:32:21 +01001<?php
2
3declare(strict_types=1);
4
5namespace Ddeboer\Imap\Message;
6
7use Ddeboer\Imap\Exception\UnsupportedCharsetException;
8
/**
 * Converts text from an arbitrary (possibly non-standard) charset label to UTF-8.
 *
 * The charset label is first normalised through an alias table, then the
 * conversion is attempted with iconv and, if that fails, with mbstring.
 */
final class Transcoder
{
    /**
     * Maps lower-cased charset labels to the charset name handed to iconv/mbstring.
     *
     * Keys must be lower case because lookups are done with the lower-cased
     * label. Purely numeric keys (e.g. '128') are stored by PHP as integer
     * keys; lookups with the numeric string still resolve to them.
     *
     * @var array<int|string, string>
     *
     * @see https://encoding.spec.whatwg.org/#encodings
     * @see https://dxr.mozilla.org/mozilla-central/source/dom/encoding/labelsencodings.properties
     * @see https://dxr.mozilla.org/mozilla1.9.1/source/intl/uconv/src/charsetalias.properties
     * @see https://msdn.microsoft.com/en-us/library/cc194829.aspx
     */
    private static $charsetAliases = [
        '128' => 'Shift_JIS',
        '129' => 'EUC-KR',
        '134' => 'GB2312',
        '136' => 'Big5',
        '161' => 'windows-1253',
        '162' => 'windows-1254',
        '177' => 'windows-1255',
        '178' => 'windows-1256',
        '186' => 'windows-1257',
        '204' => 'windows-1251',
        '222' => 'windows-874',
        '238' => 'windows-1250',
        '5601' => 'EUC-KR',
        '646' => 'us-ascii',
        '850' => 'IBM850',
        '852' => 'IBM852',
        '855' => 'IBM855',
        '857' => 'IBM857',
        '862' => 'IBM862',
        '864' => 'IBM864',
        '864i' => 'IBM864i',
        '866' => 'IBM866',
        'ansi-1251' => 'windows-1251',
        'ansi_x3.4-1968' => 'us-ascii',
        'arabic' => 'ISO-8859-6',
        'ascii' => 'us-ascii',
        'asmo-708' => 'ISO-8859-6',
        'big5-hkscs' => 'Big5',
        'chinese' => 'GB2312',
        'cn-big5' => 'Big5',
        'cns11643' => 'x-euc-tw',
        'cp-866' => 'IBM866',
        'cp1250' => 'windows-1250',
        'cp1251' => 'windows-1251',
        'cp1252' => 'windows-1252',
        'cp1253' => 'windows-1253',
        'cp1254' => 'windows-1254',
        'cp1255' => 'windows-1255',
        'cp1256' => 'windows-1256',
        'cp1257' => 'windows-1257',
        'cp1258' => 'windows-1258',
        'cp819' => 'ISO-8859-1',
        'cp850' => 'IBM850',
        'cp852' => 'IBM852',
        'cp855' => 'IBM855',
        'cp857' => 'IBM857',
        'cp862' => 'IBM862',
        'cp864' => 'IBM864',
        'cp864i' => 'IBM864i',
        'cp866' => 'IBM866',
        'cp932' => 'Shift_JIS',
        'csbig5' => 'Big5',
        'cseucjpkdfmtjapanese' => 'EUC-JP',
        'cseuckr' => 'EUC-KR',
        'cseucpkdfmtjapanese' => 'EUC-JP',
        'csgb2312' => 'GB2312',
        'csibm850' => 'IBM850',
        'csibm852' => 'IBM852',
        'csibm855' => 'IBM855',
        'csibm857' => 'IBM857',
        'csibm862' => 'IBM862',
        'csibm864' => 'IBM864',
        'csibm864i' => 'IBM864i',
        'csibm866' => 'IBM866',
        'csiso103t618bit' => 'T.61-8bit',
        'csiso111ecmacyrillic' => 'ISO-IR-111',
        'csiso2022jp' => 'ISO-2022-JP',
        'csiso2022jp2' => 'ISO-2022-JP',
        'csiso2022kr' => 'ISO-2022-KR',
        'csiso58gb231280' => 'GB2312',
        'csiso88596e' => 'ISO-8859-6-E',
        'csiso88596i' => 'ISO-8859-6-I',
        'csiso88598e' => 'ISO-8859-8-E',
        'csiso88598i' => 'ISO-8859-8-I',
        'csisolatin1' => 'ISO-8859-1',
        'csisolatin2' => 'ISO-8859-2',
        'csisolatin3' => 'ISO-8859-3',
        'csisolatin4' => 'ISO-8859-4',
        'csisolatin5' => 'ISO-8859-9',
        'csisolatin6' => 'ISO-8859-10',
        'csisolatin9' => 'ISO-8859-15',
        'csisolatinarabic' => 'ISO-8859-6',
        'csisolatincyrillic' => 'ISO-8859-5',
        'csisolatingreek' => 'ISO-8859-7',
        'csisolatinhebrew' => 'ISO-8859-8',
        'cskoi8r' => 'KOI8-R',
        'csksc56011987' => 'EUC-KR',
        'csmacintosh' => 'x-mac-roman',
        'csshiftjis' => 'Shift_JIS',
        'csueckr' => 'EUC-KR',
        'csunicode' => 'UTF-16BE',
        'csunicode11' => 'UTF-16BE',
        'csunicode11utf7' => 'UTF-7',
        'csunicodeascii' => 'UTF-16BE',
        'csunicodelatin1' => 'UTF-16BE',
        'csviqr' => 'VIQR',
        'csviscii' => 'VISCII',
        'cyrillic' => 'ISO-8859-5',
        'dos-874' => 'windows-874',
        'ecma-114' => 'ISO-8859-6',
        'ecma-118' => 'ISO-8859-7',
        'ecma-cyrillic' => 'ISO-IR-111',
        'elot_928' => 'ISO-8859-7',
        'gb_2312' => 'GB2312',
        'gb_2312-80' => 'GB2312',
        'greek' => 'ISO-8859-7',
        'greek8' => 'ISO-8859-7',
        'hebrew' => 'ISO-8859-8',
        'ibm-864' => 'IBM864',
        'ibm-864i' => 'IBM864i',
        'ibm819' => 'ISO-8859-1',
        'ibm874' => 'windows-874',
        'iso-10646' => 'UTF-16BE',
        'iso-10646-j-1' => 'UTF-16BE',
        'iso-10646-ucs-2' => 'UTF-16BE',
        'iso-10646-ucs-4' => 'UTF-32BE',
        'iso-10646-ucs-basic' => 'UTF-16BE',
        'iso-10646-unicode-latin1' => 'UTF-16BE',
        'iso-2022-cn-ext' => 'ISO-2022-CN',
        'iso-2022-jp-2' => 'ISO-2022-JP',
        'iso-8859-8i' => 'ISO-8859-8-I',
        'iso-ir-100' => 'ISO-8859-1',
        'iso-ir-101' => 'ISO-8859-2',
        'iso-ir-103' => 'T.61-8bit',
        'iso-ir-109' => 'ISO-8859-3',
        'iso-ir-110' => 'ISO-8859-4',
        'iso-ir-126' => 'ISO-8859-7',
        'iso-ir-127' => 'ISO-8859-6',
        'iso-ir-138' => 'ISO-8859-8',
        'iso-ir-144' => 'ISO-8859-5',
        'iso-ir-148' => 'ISO-8859-9',
        'iso-ir-149' => 'EUC-KR',
        'iso-ir-157' => 'ISO-8859-10',
        'iso-ir-58' => 'GB2312',
        'iso8859-1' => 'ISO-8859-1',
        'iso8859-10' => 'ISO-8859-10',
        'iso8859-11' => 'ISO-8859-11',
        'iso8859-13' => 'ISO-8859-13',
        'iso8859-14' => 'ISO-8859-14',
        'iso8859-15' => 'ISO-8859-15',
        'iso8859-2' => 'ISO-8859-2',
        'iso8859-3' => 'ISO-8859-3',
        'iso8859-4' => 'ISO-8859-4',
        'iso8859-5' => 'ISO-8859-5',
        'iso8859-6' => 'ISO-8859-6',
        'iso8859-7' => 'ISO-8859-7',
        'iso8859-8' => 'ISO-8859-8',
        'iso8859-9' => 'ISO-8859-9',
        'iso88591' => 'ISO-8859-1',
        'iso885910' => 'ISO-8859-10',
        'iso885911' => 'ISO-8859-11',
        'iso885912' => 'ISO-8859-12',
        'iso885913' => 'ISO-8859-13',
        'iso885914' => 'ISO-8859-14',
        'iso885915' => 'ISO-8859-15',
        'iso88592' => 'ISO-8859-2',
        'iso88593' => 'ISO-8859-3',
        'iso88594' => 'ISO-8859-4',
        'iso88595' => 'ISO-8859-5',
        'iso88596' => 'ISO-8859-6',
        'iso88597' => 'ISO-8859-7',
        'iso88598' => 'ISO-8859-8',
        'iso88599' => 'ISO-8859-9',
        'iso_8859-1' => 'ISO-8859-1',
        'iso_8859-15' => 'ISO-8859-15',
        'iso_8859-1:1987' => 'ISO-8859-1',
        'iso_8859-2' => 'ISO-8859-2',
        'iso_8859-2:1987' => 'ISO-8859-2',
        'iso_8859-3' => 'ISO-8859-3',
        'iso_8859-3:1988' => 'ISO-8859-3',
        'iso_8859-4' => 'ISO-8859-4',
        'iso_8859-4:1988' => 'ISO-8859-4',
        'iso_8859-5' => 'ISO-8859-5',
        'iso_8859-5:1988' => 'ISO-8859-5',
        'iso_8859-6' => 'ISO-8859-6',
        'iso_8859-6:1987' => 'ISO-8859-6',
        'iso_8859-7' => 'ISO-8859-7',
        'iso_8859-7:1987' => 'ISO-8859-7',
        'iso_8859-8' => 'ISO-8859-8',
        'iso_8859-8:1988' => 'ISO-8859-8',
        'iso_8859-9' => 'ISO-8859-9',
        'iso_8859-9:1989' => 'ISO-8859-9',
        'koi' => 'KOI8-R',
        'koi8' => 'KOI8-R',
        'koi8-ru' => 'KOI8-U',
        'koi8_r' => 'KOI8-R',
        'korean' => 'EUC-KR',
        'ks_c_5601-1987' => 'EUC-KR',
        'ks_c_5601-1989' => 'EUC-KR',
        'ksc5601' => 'EUC-KR',
        'ksc_5601' => 'EUC-KR',
        'l1' => 'ISO-8859-1',
        'l2' => 'ISO-8859-2',
        'l3' => 'ISO-8859-3',
        'l4' => 'ISO-8859-4',
        'l5' => 'ISO-8859-9',
        'l6' => 'ISO-8859-10',
        'l9' => 'ISO-8859-15',
        'latin1' => 'ISO-8859-1',
        'latin2' => 'ISO-8859-2',
        'latin3' => 'ISO-8859-3',
        'latin4' => 'ISO-8859-4',
        'latin5' => 'ISO-8859-9',
        'latin6' => 'ISO-8859-10',
        'logical' => 'ISO-8859-8-I',
        'mac' => 'x-mac-roman',
        'macintosh' => 'x-mac-roman',
        'ms932' => 'Shift_JIS',
        'ms_kanji' => 'Shift_JIS',
        'shift-jis' => 'Shift_JIS',
        'sjis' => 'Shift_JIS',
        'sun_eu_greek' => 'ISO-8859-7',
        't.61' => 'T.61-8bit',
        'tis620' => 'TIS-620',
        'unicode-1-1-utf-7' => 'UTF-7',
        'unicode-1-1-utf-8' => 'UTF-8',
        'unicode-2-0-utf-7' => 'UTF-7',
        'visual' => 'ISO-8859-8',
        'windows-31j' => 'Shift_JIS',
        'windows-949' => 'EUC-KR',
        'x-cp1250' => 'windows-1250',
        'x-cp1251' => 'windows-1251',
        'x-cp1252' => 'windows-1252',
        'x-cp1253' => 'windows-1253',
        'x-cp1254' => 'windows-1254',
        'x-cp1255' => 'windows-1255',
        'x-cp1256' => 'windows-1256',
        'x-cp1257' => 'windows-1257',
        'x-cp1258' => 'windows-1258',
        'x-euc-jp' => 'EUC-JP',
        'x-gbk' => 'gbk',
        'x-iso-10646-ucs-2-be' => 'UTF-16BE',
        'x-iso-10646-ucs-2-le' => 'UTF-16LE',
        'x-iso-10646-ucs-4-be' => 'UTF-32BE',
        'x-iso-10646-ucs-4-le' => 'UTF-32LE',
        'x-sjis' => 'Shift_JIS',
        'x-unicode-2-0-utf-7' => 'UTF-7',
        'x-x-big5' => 'Big5',
        'zh_cn.euc' => 'GB2312',
        'zh_tw-big5' => 'Big5',
        'zh_tw-euc' => 'x-euc-tw',
    ];

    /**
     * Decode text to UTF-8.
     *
     * Text already labelled as UTF-8 (in any letter case) is returned
     * untouched. Otherwise the label is resolved through the alias table and
     * the text is converted with iconv (aliased name first, then the label as
     * given); if iconv cannot convert it, mbstring is tried with the aliased
     * name.
     *
     * @param string $text        Text to decode
     * @param string $fromCharset Original charset
     *
     * @throws UnsupportedCharsetException when neither iconv nor mbstring can
     *                                     convert from the given charset, or
     *                                     mbstring raised any diagnostic
     */
    public static function decode(string $text, string $fromCharset): string
    {
        static $utf8Aliases = [
            'unicode-1-1-utf-8' => true,
            'utf8' => true,
            'utf-8' => true,
        ];

        // Charset labels are case-insensitive, so compare the lower-cased
        // label: "Utf-8" must take the same pass-through path as "UTF-8".
        $lowercaseFromCharset = \strtolower($fromCharset);
        if (isset($utf8Aliases[$lowercaseFromCharset])) {
            return $text;
        }

        $originalFromCharset = $fromCharset;
        $fromCharset = self::$charsetAliases[$lowercaseFromCharset] ?? $fromCharset;

        // iconv reports unknown charsets / illegal sequences as notices or
        // warnings; silence them here because failure is detected through the
        // false return value and mbstring is tried next.
        \set_error_handler(static function (): bool {
            return true;
        });

        try {
            $iconvDecodedText = \iconv($fromCharset, 'UTF-8', $text);
            // Retry with the label as given only when aliasing changed it;
            // otherwise the second call would repeat the first one verbatim.
            if (false === $iconvDecodedText && $fromCharset !== $originalFromCharset) {
                $iconvDecodedText = \iconv($originalFromCharset, 'UTF-8', $text);
            }
        } finally {
            // Always uninstall the silencing handler, even if iconv() throws.
            \restore_error_handler();
        }

        if (false !== $iconvDecodedText) {
            return $iconvDecodedText;
        }

        // Capture whatever mbstring reports so it can be surfaced in the
        // exception message and code.
        $errorMessage = null;
        $errorNumber = 0;
        \set_error_handler(static function ($nr, $message) use (&$errorMessage, &$errorNumber): bool {
            $errorMessage = $message;
            $errorNumber = $nr;

            return true;
        });

        $decodedText = false;

        try {
            $decodedText = \mb_convert_encoding($text, 'UTF-8', $fromCharset);
        } catch (\Error $error) {
            // A thrown \Error (e.g. for an unknown encoding name) is reported
            // the same way as a captured warning.
            $errorMessage = $error->getMessage();
        } finally {
            \restore_error_handler();
        }

        // A non-string result without any diagnostic would otherwise violate
        // the declared return type and surface as an unrelated TypeError.
        if (null === $errorMessage && !\is_string($decodedText)) {
            $errorMessage = 'mb_convert_encoding() failed to convert the text';
        }

        if (null !== $errorMessage) {
            throw new UnsupportedCharsetException(\sprintf(
                'Unsupported charset "%s"%s: %s',
                $originalFromCharset,
                ($fromCharset !== $originalFromCharset) ? \sprintf(' (alias found: "%s")', $fromCharset) : '',
                $errorMessage
            ), $errorNumber);
        }

        return $decodedText;
    }
}