blob: cd219f2268059d6fd013c116735aad78cc9c260b [file] [log] [blame]
Matthias Andreas Benkardb382b102021-01-02 15:32:21 +01001<?php namespace PhpMimeMailParser;
2
3use PhpMimeMailParser\Contracts\CharsetManager;
4
/**
 * Converts text from a declared charset to UTF-8.
 *
 * A charset label is first normalised through an alias table to a canonical
 * name; the text is then converted with mbstring when that extension knows
 * the charset, and with iconv otherwise.
 */
class Charset implements CharsetManager
{
    /**
     * Charset aliases.
     *
     * Maps a lowercase charset label to the canonical (lowercase) charset name
     * used for conversion. Canonical names map to themselves.
     *
     * @var array
     */
    private $charsetAlias = [
        'ascii' => 'us-ascii',
        'us-ascii' => 'us-ascii',
        'ansi_x3.4-1968' => 'us-ascii',
        '646' => 'us-ascii',
        'iso-8859-1' => 'iso-8859-1',
        'iso-8859-2' => 'iso-8859-2',
        'iso-8859-3' => 'iso-8859-3',
        'iso-8859-4' => 'iso-8859-4',
        'iso-8859-5' => 'iso-8859-5',
        'iso-8859-6' => 'iso-8859-6',
        'iso-8859-6-i' => 'iso-8859-6-i',
        'iso-8859-6-e' => 'iso-8859-6-e',
        'iso-8859-7' => 'iso-8859-7',
        'iso-8859-8' => 'iso-8859-8',
        'iso-8859-8-i' => 'iso-8859-8',
        'iso-8859-8-e' => 'iso-8859-8-e',
        'iso-8859-9' => 'iso-8859-9',
        'iso-8859-10' => 'iso-8859-10',
        'iso-8859-11' => 'iso-8859-11',
        'iso-8859-13' => 'iso-8859-13',
        'iso-8859-14' => 'iso-8859-14',
        'iso-8859-15' => 'iso-8859-15',
        'iso-8859-16' => 'iso-8859-16',
        'iso-ir-111' => 'iso-ir-111',
        'iso-2022-cn' => 'iso-2022-cn',
        'iso-2022-cn-ext' => 'iso-2022-cn',
        'iso-2022-kr' => 'iso-2022-kr',
        'iso-2022-jp' => 'iso-2022-jp',
        'utf-16be' => 'utf-16be',
        'utf-16le' => 'utf-16le',
        'utf-16' => 'utf-16',
        'windows-1250' => 'windows-1250',
        'windows-1251' => 'windows-1251',
        'windows-1252' => 'windows-1252',
        'windows-1253' => 'windows-1253',
        'windows-1254' => 'windows-1254',
        'windows-1255' => 'windows-1255',
        'windows-1256' => 'windows-1256',
        'windows-1257' => 'windows-1257',
        'windows-1258' => 'windows-1258',
        'ibm866' => 'ibm866',
        'ibm850' => 'ibm850',
        'ibm852' => 'ibm852',
        'ibm855' => 'ibm855',
        'ibm857' => 'ibm857',
        'ibm862' => 'ibm862',
        'ibm864' => 'ibm864',
        'utf-8' => 'utf-8',
        'utf-7' => 'utf-7',
        'shift_jis' => 'shift_jis',
        'big5' => 'big5',
        'euc-jp' => 'euc-jp',
        'euc-kr' => 'euc-kr',
        'gb2312' => 'gb2312',
        'gb18030' => 'gb18030',
        'viscii' => 'viscii',
        'koi8-r' => 'koi8-r',
        'koi8_r' => 'koi8-r',
        'cskoi8r' => 'koi8-r',
        'koi' => 'koi8-r',
        'koi8' => 'koi8-r',
        'koi8-u' => 'koi8-u',
        'tis-620' => 'tis-620',
        't.61-8bit' => 't.61-8bit',
        'hz-gb-2312' => 'hz-gb-2312',
        'big5-hkscs' => 'big5-hkscs',
        'gbk' => 'gbk',
        'cns11643' => 'x-euc-tw',
        'x-imap4-modified-utf7' => 'x-imap4-modified-utf7',
        'x-euc-tw' => 'x-euc-tw',
        'x-mac-ce' => 'macce',
        'x-mac-turkish' => 'macturkish',
        'x-mac-greek' => 'macgreek',
        'x-mac-icelandic' => 'macicelandic',
        'x-mac-croatian' => 'maccroatian',
        'x-mac-romanian' => 'macromanian',
        'x-mac-cyrillic' => 'maccyrillic',
        'x-mac-ukrainian' => 'macukrainian',
        'x-mac-hebrew' => 'machebrew',
        'x-mac-arabic' => 'macarabic',
        'x-mac-farsi' => 'macfarsi',
        'x-mac-devanagari' => 'macdevanagari',
        'x-mac-gujarati' => 'macgujarati',
        'x-mac-gurmukhi' => 'macgurmukhi',
        'armscii-8' => 'armscii-8',
        'x-viet-tcvn5712' => 'x-viet-tcvn5712',
        'x-viet-vps' => 'x-viet-vps',
        'iso-10646-ucs-2' => 'utf-16be',
        'x-iso-10646-ucs-2-be' => 'utf-16be',
        'x-iso-10646-ucs-2-le' => 'utf-16le',
        'x-user-defined' => 'x-user-defined',
        'x-johab' => 'x-johab',
        'latin1' => 'iso-8859-1',
        'iso_8859-1' => 'iso-8859-1',
        'iso8859-1' => 'iso-8859-1',
        'iso8859-2' => 'iso-8859-2',
        'iso8859-3' => 'iso-8859-3',
        'iso8859-4' => 'iso-8859-4',
        'iso8859-5' => 'iso-8859-5',
        'iso8859-6' => 'iso-8859-6',
        'iso8859-7' => 'iso-8859-7',
        'iso8859-8' => 'iso-8859-8',
        'iso8859-9' => 'iso-8859-9',
        'iso8859-10' => 'iso-8859-10',
        'iso8859-11' => 'iso-8859-11',
        'iso8859-13' => 'iso-8859-13',
        'iso8859-14' => 'iso-8859-14',
        'iso8859-15' => 'iso-8859-15',
        'iso_8859-1:1987' => 'iso-8859-1',
        'iso-ir-100' => 'iso-8859-1',
        'l1' => 'iso-8859-1',
        'ibm819' => 'iso-8859-1',
        'cp819' => 'iso-8859-1',
        'csisolatin1' => 'iso-8859-1',
        'latin2' => 'iso-8859-2',
        'iso_8859-2' => 'iso-8859-2',
        'iso_8859-2:1987' => 'iso-8859-2',
        'iso-ir-101' => 'iso-8859-2',
        'l2' => 'iso-8859-2',
        'csisolatin2' => 'iso-8859-2',
        'latin3' => 'iso-8859-3',
        'iso_8859-3' => 'iso-8859-3',
        'iso_8859-3:1988' => 'iso-8859-3',
        'iso-ir-109' => 'iso-8859-3',
        'l3' => 'iso-8859-3',
        'csisolatin3' => 'iso-8859-3',
        'latin4' => 'iso-8859-4',
        'iso_8859-4' => 'iso-8859-4',
        'iso_8859-4:1988' => 'iso-8859-4',
        'iso-ir-110' => 'iso-8859-4',
        'l4' => 'iso-8859-4',
        'csisolatin4' => 'iso-8859-4',
        'cyrillic' => 'iso-8859-5',
        'iso_8859-5' => 'iso-8859-5',
        'iso_8859-5:1988' => 'iso-8859-5',
        'iso-ir-144' => 'iso-8859-5',
        'csisolatincyrillic' => 'iso-8859-5',
        'arabic' => 'iso-8859-6',
        'iso_8859-6' => 'iso-8859-6',
        'iso_8859-6:1987' => 'iso-8859-6',
        'iso-ir-127' => 'iso-8859-6',
        'ecma-114' => 'iso-8859-6',
        'asmo-708' => 'iso-8859-6',
        'csisolatinarabic' => 'iso-8859-6',
        'csiso88596i' => 'iso-8859-6-i',
        'csiso88596e' => 'iso-8859-6-e',
        'greek' => 'iso-8859-7',
        'greek8' => 'iso-8859-7',
        'sun_eu_greek' => 'iso-8859-7',
        'iso_8859-7' => 'iso-8859-7',
        'iso_8859-7:1987' => 'iso-8859-7',
        'iso-ir-126' => 'iso-8859-7',
        'elot_928' => 'iso-8859-7',
        'ecma-118' => 'iso-8859-7',
        'csisolatingreek' => 'iso-8859-7',
        'hebrew' => 'iso-8859-8',
        'iso_8859-8' => 'iso-8859-8',
        'visual' => 'iso-8859-8',
        'iso_8859-8:1988' => 'iso-8859-8',
        'iso-ir-138' => 'iso-8859-8',
        'csisolatinhebrew' => 'iso-8859-8',
        'csiso88598i' => 'iso-8859-8',
        'iso-8859-8i' => 'iso-8859-8',
        'logical' => 'iso-8859-8',
        'csiso88598e' => 'iso-8859-8-e',
        'latin5' => 'iso-8859-9',
        'iso_8859-9' => 'iso-8859-9',
        'iso_8859-9:1989' => 'iso-8859-9',
        'iso-ir-148' => 'iso-8859-9',
        'l5' => 'iso-8859-9',
        'csisolatin5' => 'iso-8859-9',
        'unicode-1-1-utf-8' => 'utf-8',
        'utf8' => 'utf-8',
        'x-sjis' => 'shift_jis',
        'shift-jis' => 'shift_jis',
        'ms_kanji' => 'shift_jis',
        'csshiftjis' => 'shift_jis',
        'windows-31j' => 'shift_jis',
        'cp932' => 'shift_jis',
        'sjis' => 'shift_jis',
        'cseucpkdfmtjapanese' => 'euc-jp',
        'x-euc-jp' => 'euc-jp',
        'csiso2022jp' => 'iso-2022-jp',
        'iso-2022-jp-2' => 'iso-2022-jp',
        'csiso2022jp2' => 'iso-2022-jp',
        'csbig5' => 'big5',
        'cn-big5' => 'big5',
        'x-x-big5' => 'big5',
        'zh_tw-big5' => 'big5',
        'cseuckr' => 'euc-kr',
        'ks_c_5601-1987' => 'euc-kr',
        'iso-ir-149' => 'euc-kr',
        'ks_c_5601-1989' => 'euc-kr',
        'ksc_5601' => 'euc-kr',
        'ksc5601' => 'euc-kr',
        'korean' => 'euc-kr',
        'csksc56011987' => 'euc-kr',
        '5601' => 'euc-kr',
        'windows-949' => 'euc-kr',
        'gb_2312-80' => 'gb2312',
        'iso-ir-58' => 'gb2312',
        'chinese' => 'gb2312',
        'csiso58gb231280' => 'gb2312',
        'csgb2312' => 'gb2312',
        'zh_cn.euc' => 'gb2312',
        'gb_2312' => 'gb2312',
        'x-cp1250' => 'windows-1250',
        'x-cp1251' => 'windows-1251',
        'x-cp1252' => 'windows-1252',
        'x-cp1253' => 'windows-1253',
        'x-cp1254' => 'windows-1254',
        'x-cp1255' => 'windows-1255',
        'x-cp1256' => 'windows-1256',
        'x-cp1257' => 'windows-1257',
        'x-cp1258' => 'windows-1258',
        'windows-874' => 'windows-874',
        'ibm874' => 'windows-874',
        'dos-874' => 'windows-874',
        'macintosh' => 'macintosh',
        'x-mac-roman' => 'macintosh',
        'mac' => 'macintosh',
        'csmacintosh' => 'macintosh',
        'cp866' => 'ibm866',
        'cp-866' => 'ibm866',
        '866' => 'ibm866',
        'csibm866' => 'ibm866',
        'cp850' => 'ibm850',
        '850' => 'ibm850',
        'csibm850' => 'ibm850',
        'cp852' => 'ibm852',
        '852' => 'ibm852',
        'csibm852' => 'ibm852',
        'cp855' => 'ibm855',
        '855' => 'ibm855',
        'csibm855' => 'ibm855',
        'cp857' => 'ibm857',
        '857' => 'ibm857',
        'csibm857' => 'ibm857',
        'cp862' => 'ibm862',
        '862' => 'ibm862',
        'csibm862' => 'ibm862',
        'cp864' => 'ibm864',
        '864' => 'ibm864',
        'csibm864' => 'ibm864',
        'ibm-864' => 'ibm864',
        't.61' => 't.61-8bit',
        'iso-ir-103' => 't.61-8bit',
        'csiso103t618bit' => 't.61-8bit',
        'x-unicode-2-0-utf-7' => 'utf-7',
        'unicode-2-0-utf-7' => 'utf-7',
        'unicode-1-1-utf-7' => 'utf-7',
        'csunicode11utf7' => 'utf-7',
        'csunicode' => 'utf-16be',
        'csunicode11' => 'utf-16be',
        'iso-10646-ucs-basic' => 'utf-16be',
        'csunicodeascii' => 'utf-16be',
        'iso-10646-unicode-latin1' => 'utf-16be',
        'csunicodelatin1' => 'utf-16be',
        'iso-10646' => 'utf-16be',
        'iso-10646-j-1' => 'utf-16be',
        'latin6' => 'iso-8859-10',
        'iso-ir-157' => 'iso-8859-10',
        'l6' => 'iso-8859-10',
        'csisolatin6' => 'iso-8859-10',
        'iso_8859-15' => 'iso-8859-15',
        'csisolatin9' => 'iso-8859-15',
        'l9' => 'iso-8859-15',
        'ecma-cyrillic' => 'iso-ir-111',
        'csiso111ecmacyrillic' => 'iso-ir-111',
        'csiso2022kr' => 'iso-2022-kr',
        'csviscii' => 'viscii',
        'zh_tw-euc' => 'x-euc-tw',
        'iso88591' => 'iso-8859-1',
        'iso88592' => 'iso-8859-2',
        'iso88593' => 'iso-8859-3',
        'iso88594' => 'iso-8859-4',
        'iso88595' => 'iso-8859-5',
        'iso88596' => 'iso-8859-6',
        'iso88597' => 'iso-8859-7',
        'iso88598' => 'iso-8859-8',
        'iso88599' => 'iso-8859-9',
        'iso885910' => 'iso-8859-10',
        'iso885911' => 'iso-8859-11',
        'iso885912' => 'iso-8859-12',
        'iso885913' => 'iso-8859-13',
        'iso885914' => 'iso-8859-14',
        'iso885915' => 'iso-8859-15',
        'tis620' => 'tis-620',
        'cp1250' => 'windows-1250',
        'cp1251' => 'windows-1251',
        'cp1252' => 'windows-1252',
        'cp1253' => 'windows-1253',
        'cp1254' => 'windows-1254',
        'cp1255' => 'windows-1255',
        'cp1256' => 'windows-1256',
        'cp1257' => 'windows-1257',
        'cp1258' => 'windows-1258',
        'x-gbk' => 'gbk',
        'windows-936' => 'gbk',
        'ansi-1251' => 'windows-1251',
    ];

    /**
     * Lazily built list of the lowercase encoding names and aliases reported
     * by mbstring; null until getSupportedEncodings() is first called.
     *
     * @var string[]|null
     */
    private $supportedEncodings;

    /**
     * Convert a string from the given charset to UTF-8.
     *
     * The charset label is resolved through getCharsetAlias(). Text whose
     * resolved charset is utf-8 or us-ascii is returned untouched; because
     * unknown labels resolve to us-ascii, text with an unrecognised charset
     * is also returned untouched.
     *
     * {@inheritdoc}
     *
     * @param string $encodedString Text in the source charset.
     * @param string $charset       Charset label as declared by the message.
     *
     * @return string|false The UTF-8 text. The iconv fallback returns false
     *                      when it cannot perform the conversion.
     */
    public function decodeCharset($encodedString, $charset)
    {
        $charset = $this->getCharsetAlias($charset);

        // us-ascii is a subset of utf-8, so neither needs converting.
        if ($charset === 'utf-8' || $charset === 'us-ascii') {
            return $encodedString;
        }

        if (function_exists('mb_convert_encoding')) {
            // iso-2022-jp input is decoded with mbstring's iso-2022-jp-ms decoder.
            if ($charset === 'iso-2022-jp') {
                return mb_convert_encoding($encodedString, 'utf-8', 'iso-2022-jp-ms');
            }

            // Strict membership test: array_search() would report a match at
            // index 0 as a falsy value and wrongly skip the mbstring path.
            if (in_array($charset, $this->getSupportedEncodings(), true)) {
                return mb_convert_encoding($encodedString, 'utf-8', $charset);
            }
        }

        // Charsets mbstring does not know (or mbstring missing): use iconv.
        return iconv($charset, 'utf-8//translit//ignore', $encodedString);
    }

    /**
     * Resolve a charset label to its canonical name.
     *
     * The label is cast to string, trimmed and lowercased before the lookup,
     * so matching is case-insensitive and ignores surrounding whitespace.
     *
     * {@inheritdoc}
     *
     * @param string $charset Charset label to resolve.
     *
     * @return string Canonical charset name, or 'us-ascii' when the label is
     *                not present in the alias table.
     */
    public function getCharsetAlias($charset)
    {
        $label = strtolower(trim((string) $charset));

        // No alias value is ever null, so isset() is a safe key test here.
        if (isset($this->charsetAlias[$label])) {
            return $this->charsetAlias[$label];
        }

        return 'us-ascii';
    }

    /**
     * List every encoding name and alias mbstring reports, lowercased.
     *
     * The list is built on first use and cached on the instance, so repeated
     * decodeCharset() calls do not query mbstring again.
     *
     * @return string[]
     */
    private function getSupportedEncodings()
    {
        if ($this->supportedEncodings === null) {
            $names = [];

            foreach (mb_list_encodings() as $encoding) {
                $names[] = strtolower($encoding);

                // Guard against a non-array (failure) result before iterating.
                $aliases = mb_encoding_aliases($encoding);
                if (is_array($aliases)) {
                    foreach ($aliases as $alias) {
                        $names[] = strtolower($alias);
                    }
                }
            }

            $this->supportedEncodings = array_values(array_unique($names));
        }

        return $this->supportedEncodings;
    }
}