blob: bd439e0680aa3849af6baa6ad7ec54c683e55641 [file] [log] [blame]
Matthias Andreas Benkardb382b102021-01-02 15:32:21 +01001<?php
2
3declare(strict_types=1);
4
5namespace Ddeboer\Imap\Message;
6
7use Ddeboer\Imap\Exception\UnsupportedCharsetException;
8
/**
 * Converts text from an arbitrary (possibly non-standard) charset label to UTF-8.
 *
 * The charset label is first normalised through a table of known aliases, then
 * conversion is attempted with iconv and, if that fails, with mbstring.
 */
final class Transcoder
{
    /**
     * Maps lowercase charset labels to the charset name handed to the converters.
     *
     * Keys must be lowercase: lookups are done on the lowercased input label.
     *
     * @see https://encoding.spec.whatwg.org/#encodings
     * @see https://dxr.mozilla.org/mozilla-central/source/dom/encoding/labelsencodings.properties
     * @see https://dxr.mozilla.org/mozilla1.9.1/source/intl/uconv/src/charsetalias.properties
     * @see https://msdn.microsoft.com/en-us/library/cc194829.aspx
     */
    private const CHARSET_ALIASES = [
        '128' => 'Shift_JIS',
        '129' => 'EUC-KR',
        '134' => 'GB2312',
        '136' => 'Big5',
        '161' => 'windows-1253',
        '162' => 'windows-1254',
        '177' => 'windows-1255',
        '178' => 'windows-1256',
        '186' => 'windows-1257',
        '204' => 'windows-1251',
        '222' => 'windows-874',
        '238' => 'windows-1250',
        '5601' => 'EUC-KR',
        '646' => 'us-ascii',
        '850' => 'IBM850',
        '852' => 'IBM852',
        '855' => 'IBM855',
        '857' => 'IBM857',
        '862' => 'IBM862',
        '864' => 'IBM864',
        '864i' => 'IBM864i',
        '866' => 'IBM866',
        'ansi-1251' => 'windows-1251',
        'ansi_x3.4-1968' => 'us-ascii',
        'arabic' => 'ISO-8859-6',
        'ascii' => 'us-ascii',
        'asmo-708' => 'ISO-8859-6',
        'big5-hkscs' => 'Big5',
        'chinese' => 'GB2312',
        'cn-big5' => 'Big5',
        'cns11643' => 'x-euc-tw',
        'cp-866' => 'IBM866',
        'cp1250' => 'windows-1250',
        'cp1251' => 'windows-1251',
        'cp1252' => 'windows-1252',
        'cp1253' => 'windows-1253',
        'cp1254' => 'windows-1254',
        'cp1255' => 'windows-1255',
        'cp1256' => 'windows-1256',
        'cp1257' => 'windows-1257',
        'cp1258' => 'windows-1258',
        'cp819' => 'ISO-8859-1',
        'cp850' => 'IBM850',
        'cp852' => 'IBM852',
        'cp855' => 'IBM855',
        'cp857' => 'IBM857',
        'cp862' => 'IBM862',
        'cp864' => 'IBM864',
        'cp864i' => 'IBM864i',
        'cp866' => 'IBM866',
        'cp932' => 'Shift_JIS',
        'csbig5' => 'Big5',
        'cseucjpkdfmtjapanese' => 'EUC-JP',
        'cseuckr' => 'EUC-KR',
        'cseucpkdfmtjapanese' => 'EUC-JP',
        'csgb2312' => 'GB2312',
        'csibm850' => 'IBM850',
        'csibm852' => 'IBM852',
        'csibm855' => 'IBM855',
        'csibm857' => 'IBM857',
        'csibm862' => 'IBM862',
        'csibm864' => 'IBM864',
        'csibm864i' => 'IBM864i',
        'csibm866' => 'IBM866',
        'csiso103t618bit' => 'T.61-8bit',
        'csiso111ecmacyrillic' => 'ISO-IR-111',
        'csiso2022jp' => 'ISO-2022-JP',
        'csiso2022jp2' => 'ISO-2022-JP',
        'csiso2022kr' => 'ISO-2022-KR',
        'csiso58gb231280' => 'GB2312',
        'csiso88596e' => 'ISO-8859-6-E',
        'csiso88596i' => 'ISO-8859-6-I',
        'csiso88598e' => 'ISO-8859-8-E',
        'csiso88598i' => 'ISO-8859-8-I',
        'csisolatin1' => 'ISO-8859-1',
        'csisolatin2' => 'ISO-8859-2',
        'csisolatin3' => 'ISO-8859-3',
        'csisolatin4' => 'ISO-8859-4',
        'csisolatin5' => 'ISO-8859-9',
        'csisolatin6' => 'ISO-8859-10',
        'csisolatin9' => 'ISO-8859-15',
        'csisolatinarabic' => 'ISO-8859-6',
        'csisolatincyrillic' => 'ISO-8859-5',
        'csisolatingreek' => 'ISO-8859-7',
        'csisolatinhebrew' => 'ISO-8859-8',
        'cskoi8r' => 'KOI8-R',
        'csksc56011987' => 'EUC-KR',
        'csmacintosh' => 'x-mac-roman',
        'csshiftjis' => 'Shift_JIS',
        'csueckr' => 'EUC-KR',
        'csunicode' => 'UTF-16BE',
        'csunicode11' => 'UTF-16BE',
        'csunicode11utf7' => 'UTF-7',
        'csunicodeascii' => 'UTF-16BE',
        'csunicodelatin1' => 'UTF-16BE',
        'csviqr' => 'VIQR',
        'csviscii' => 'VISCII',
        'cyrillic' => 'ISO-8859-5',
        'dos-874' => 'windows-874',
        'ecma-114' => 'ISO-8859-6',
        'ecma-118' => 'ISO-8859-7',
        'ecma-cyrillic' => 'ISO-IR-111',
        'elot_928' => 'ISO-8859-7',
        'gb_2312' => 'GB2312',
        'gb_2312-80' => 'GB2312',
        'greek' => 'ISO-8859-7',
        'greek8' => 'ISO-8859-7',
        'hebrew' => 'ISO-8859-8',
        'ibm-864' => 'IBM864',
        'ibm-864i' => 'IBM864i',
        'ibm819' => 'ISO-8859-1',
        'ibm874' => 'windows-874',
        'iso-10646' => 'UTF-16BE',
        'iso-10646-j-1' => 'UTF-16BE',
        'iso-10646-ucs-2' => 'UTF-16BE',
        'iso-10646-ucs-4' => 'UTF-32BE',
        'iso-10646-ucs-basic' => 'UTF-16BE',
        'iso-10646-unicode-latin1' => 'UTF-16BE',
        'iso-2022-cn-ext' => 'ISO-2022-CN',
        'iso-2022-jp-2' => 'ISO-2022-JP',
        'iso-8859-8i' => 'ISO-8859-8-I',
        'iso-ir-100' => 'ISO-8859-1',
        'iso-ir-101' => 'ISO-8859-2',
        'iso-ir-103' => 'T.61-8bit',
        'iso-ir-109' => 'ISO-8859-3',
        'iso-ir-110' => 'ISO-8859-4',
        'iso-ir-126' => 'ISO-8859-7',
        'iso-ir-127' => 'ISO-8859-6',
        'iso-ir-138' => 'ISO-8859-8',
        'iso-ir-144' => 'ISO-8859-5',
        'iso-ir-148' => 'ISO-8859-9',
        'iso-ir-149' => 'EUC-KR',
        'iso-ir-157' => 'ISO-8859-10',
        'iso-ir-58' => 'GB2312',
        'iso8859-1' => 'ISO-8859-1',
        'iso8859-10' => 'ISO-8859-10',
        'iso8859-11' => 'ISO-8859-11',
        'iso8859-13' => 'ISO-8859-13',
        'iso8859-14' => 'ISO-8859-14',
        'iso8859-15' => 'ISO-8859-15',
        'iso8859-2' => 'ISO-8859-2',
        'iso8859-3' => 'ISO-8859-3',
        'iso8859-4' => 'ISO-8859-4',
        'iso8859-5' => 'ISO-8859-5',
        'iso8859-6' => 'ISO-8859-6',
        'iso8859-7' => 'ISO-8859-7',
        'iso8859-8' => 'ISO-8859-8',
        'iso8859-9' => 'ISO-8859-9',
        'iso88591' => 'ISO-8859-1',
        'iso885910' => 'ISO-8859-10',
        'iso885911' => 'ISO-8859-11',
        'iso885912' => 'ISO-8859-12',
        'iso885913' => 'ISO-8859-13',
        'iso885914' => 'ISO-8859-14',
        'iso885915' => 'ISO-8859-15',
        'iso88592' => 'ISO-8859-2',
        'iso88593' => 'ISO-8859-3',
        'iso88594' => 'ISO-8859-4',
        'iso88595' => 'ISO-8859-5',
        'iso88596' => 'ISO-8859-6',
        'iso88597' => 'ISO-8859-7',
        'iso88598' => 'ISO-8859-8',
        'iso88599' => 'ISO-8859-9',
        'iso_8859-1' => 'ISO-8859-1',
        'iso_8859-15' => 'ISO-8859-15',
        'iso_8859-1:1987' => 'ISO-8859-1',
        'iso_8859-2' => 'ISO-8859-2',
        'iso_8859-2:1987' => 'ISO-8859-2',
        'iso_8859-3' => 'ISO-8859-3',
        'iso_8859-3:1988' => 'ISO-8859-3',
        'iso_8859-4' => 'ISO-8859-4',
        'iso_8859-4:1988' => 'ISO-8859-4',
        'iso_8859-5' => 'ISO-8859-5',
        'iso_8859-5:1988' => 'ISO-8859-5',
        'iso_8859-6' => 'ISO-8859-6',
        'iso_8859-6:1987' => 'ISO-8859-6',
        'iso_8859-7' => 'ISO-8859-7',
        'iso_8859-7:1987' => 'ISO-8859-7',
        'iso_8859-8' => 'ISO-8859-8',
        'iso_8859-8:1988' => 'ISO-8859-8',
        'iso_8859-9' => 'ISO-8859-9',
        'iso_8859-9:1989' => 'ISO-8859-9',
        'koi' => 'KOI8-R',
        'koi8' => 'KOI8-R',
        'koi8-ru' => 'KOI8-U',
        'koi8_r' => 'KOI8-R',
        'korean' => 'EUC-KR',
        'ks_c_5601-1987' => 'EUC-KR',
        'ks_c_5601-1989' => 'EUC-KR',
        'ksc5601' => 'EUC-KR',
        'ksc_5601' => 'EUC-KR',
        'l1' => 'ISO-8859-1',
        'l2' => 'ISO-8859-2',
        'l3' => 'ISO-8859-3',
        'l4' => 'ISO-8859-4',
        'l5' => 'ISO-8859-9',
        'l6' => 'ISO-8859-10',
        'l9' => 'ISO-8859-15',
        'latin1' => 'ISO-8859-1',
        'latin2' => 'ISO-8859-2',
        'latin3' => 'ISO-8859-3',
        'latin4' => 'ISO-8859-4',
        'latin5' => 'ISO-8859-9',
        'latin6' => 'ISO-8859-10',
        'logical' => 'ISO-8859-8-I',
        'mac' => 'x-mac-roman',
        'macintosh' => 'x-mac-roman',
        'ms932' => 'Shift_JIS',
        'ms_kanji' => 'Shift_JIS',
        'shift-jis' => 'Shift_JIS',
        'sjis' => 'Shift_JIS',
        'sun_eu_greek' => 'ISO-8859-7',
        't.61' => 'T.61-8bit',
        'tis620' => 'TIS-620',
        'unicode-1-1-utf-7' => 'UTF-7',
        'unicode-1-1-utf-8' => 'UTF-8',
        'unicode-2-0-utf-7' => 'UTF-7',
        'visual' => 'ISO-8859-8',
        'windows-31j' => 'Shift_JIS',
        'windows-949' => 'EUC-KR',
        'x-cp1250' => 'windows-1250',
        'x-cp1251' => 'windows-1251',
        'x-cp1252' => 'windows-1252',
        'x-cp1253' => 'windows-1253',
        'x-cp1254' => 'windows-1254',
        'x-cp1255' => 'windows-1255',
        'x-cp1256' => 'windows-1256',
        'x-cp1257' => 'windows-1257',
        'x-cp1258' => 'windows-1258',
        'x-euc-jp' => 'EUC-JP',
        'x-gbk' => 'gbk',
        'x-iso-10646-ucs-2-be' => 'UTF-16BE',
        'x-iso-10646-ucs-2-le' => 'UTF-16LE',
        'x-iso-10646-ucs-4-be' => 'UTF-32BE',
        'x-iso-10646-ucs-4-le' => 'UTF-32LE',
        // NOTE(review): Mac OS Central European is mapped onto windows-1250 here; the two
        // are different code pages, so this looks like an approximation - confirm.
        'x-mac-ce' => 'windows-1250',
        'x-sjis' => 'Shift_JIS',
        'x-unicode-2-0-utf-7' => 'UTF-7',
        'x-x-big5' => 'Big5',
        'zh_cn.euc' => 'GB2312',
        'zh_tw-big5' => 'Big5',
        'zh_tw-euc' => 'x-euc-tw',
    ];

    /**
     * Lowercase labels that already denote UTF-8; text declared with one of
     * these is returned untouched, without any validation or conversion.
     */
    private const UTF8_ALIASES = [
        'unicode-1-1-utf-8' => true,
        'utf8' => true,
        'utf-8' => true,
    ];

    /**
     * Decode text to UTF-8.
     *
     * Steps, in order:
     *  1. If the (case-insensitive) charset label denotes UTF-8, return the text as-is.
     *  2. Resolve the label through CHARSET_ALIASES.
     *  3. Try iconv with the resolved name, then with the original label.
     *  4. Fall back to mb_convert_encoding with the resolved name.
     *
     * @param string $text        Text to decode
     * @param string $fromCharset Original charset
     *
     * @throws UnsupportedCharsetException when neither iconv nor mbstring can convert from the charset
     */
    public static function decode(string $text, string $fromCharset): string
    {
        // Charset labels are matched case-insensitively, both for the UTF-8
        // shortcut and for the alias table (whose keys are all lowercase).
        $lowercaseFromCharset = \strtolower($fromCharset);

        if (isset(self::UTF8_ALIASES[$lowercaseFromCharset])) {
            return $text;
        }

        $originalFromCharset = $fromCharset;
        if (isset(self::CHARSET_ALIASES[$lowercaseFromCharset])) {
            $fromCharset = self::CHARSET_ALIASES[$lowercaseFromCharset];
        }

        // First attempt: iconv. Its diagnostics are silenced on purpose because a
        // failure here is not final; the `false` return value is what gets checked.
        \set_error_handler(static function (): bool {
            return true;
        });

        try {
            $iconvDecodedText = \iconv($fromCharset, 'UTF-8', $text);
            // Retry with the label exactly as received, but only if the alias
            // actually changed it; otherwise it would be the same failing call.
            if (false === $iconvDecodedText && $fromCharset !== $originalFromCharset) {
                $iconvDecodedText = \iconv($originalFromCharset, 'UTF-8', $text);
            }
        } finally {
            // Always put the previous handler back, whatever happened above.
            \restore_error_handler();
        }

        if (false !== $iconvDecodedText) {
            return $iconvDecodedText;
        }

        // Second attempt: mbstring. This time the diagnostic is recorded so it can
        // be reported to the caller if the conversion fails.
        $errorMessage = null;
        $errorNumber = 0;
        \set_error_handler(static function (int $nr, string $message) use (&$errorMessage, &$errorNumber): bool {
            $errorMessage = $message;
            $errorNumber = $nr;

            return true;
        });

        $decodedText = '';

        try {
            $decodedText = \mb_convert_encoding($text, 'UTF-8', $fromCharset);
        } catch (\Error $error) {
            // A failure may surface as a thrown \Error rather than as a
            // warning routed through the handler; treat both the same way.
            $errorMessage = $error->getMessage();
        } finally {
            \restore_error_handler();
        }

        // A non-string result without any diagnostic would otherwise violate the
        // declared return type; report it as an unsupported charset instead.
        if (null === $errorMessage && !\is_string($decodedText)) {
            $errorMessage = 'mb_convert_encoding() did not return a string';
        }

        if (null !== $errorMessage) {
            throw new UnsupportedCharsetException(\sprintf(
                'Unsupported charset "%s"%s: %s',
                $originalFromCharset,
                ($fromCharset !== $originalFromCharset) ? \sprintf(' (alias found: "%s")', $fromCharset) : '',
                $errorMessage
            ), $errorNumber);
        }

        return $decodedText;
    }
}