blob: f7941701e8304a3bfaa969c406b48236dac7efb5 [file] [log] [blame]
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
4
5// +build ignore
6
7package main
8
9import (
10 "bufio"
11 "fmt"
12 "log"
13 "net/http"
14 "sort"
15 "strings"
16 "unicode/utf8"
17
18 "golang.org/x/text/encoding"
19 "golang.org/x/text/internal/gen"
20)
21
// ascii holds the 128 bytes 0x00 through 0x7f in ascending order, so the byte
// at index i has value i. It is used both as the lower half of generated
// mappings and as the prefix that identifies an ASCII-superset encoding.
const ascii = "" +
	// Control characters 0x00-0x1f.
	"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" +
	"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" +
	// Printable characters 0x20-0x7e, sixteen per row, followed by DEL.
	" !\"#$%&'()*+,-./" +
	"0123456789:;<=>?" +
	"@ABCDEFGHIJKLMNO" +
	"PQRSTUVWXYZ[\\]^_" +
	"`abcdefghijklmno" +
	"pqrstuvwxyz{|}~\x7f"
27
28var encodings = []struct {
29 name string
30 mib string
31 comment string
32 varName string
33 replacement byte
34 mapping string
35}{
36 {
37 "IBM Code Page 037",
38 "IBM037",
39 "",
40 "CodePage037",
41 0x3f,
42 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM037-2.1.2.ucm",
43 },
44 {
45 "IBM Code Page 437",
46 "PC8CodePage437",
47 "",
48 "CodePage437",
49 encoding.ASCIISub,
50 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM437-2.1.2.ucm",
51 },
52 {
53 "IBM Code Page 850",
54 "PC850Multilingual",
55 "",
56 "CodePage850",
57 encoding.ASCIISub,
58 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM850-2.1.2.ucm",
59 },
60 {
61 "IBM Code Page 852",
62 "PCp852",
63 "",
64 "CodePage852",
65 encoding.ASCIISub,
66 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM852-2.1.2.ucm",
67 },
68 {
69 "IBM Code Page 855",
70 "IBM855",
71 "",
72 "CodePage855",
73 encoding.ASCIISub,
74 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM855-2.1.2.ucm",
75 },
76 {
77 "Windows Code Page 858", // PC latin1 with Euro
78 "IBM00858",
79 "",
80 "CodePage858",
81 encoding.ASCIISub,
82 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/windows-858-2000.ucm",
83 },
84 {
85 "IBM Code Page 860",
86 "IBM860",
87 "",
88 "CodePage860",
89 encoding.ASCIISub,
90 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM860-2.1.2.ucm",
91 },
92 {
93 "IBM Code Page 862",
94 "PC862LatinHebrew",
95 "",
96 "CodePage862",
97 encoding.ASCIISub,
98 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM862-2.1.2.ucm",
99 },
100 {
101 "IBM Code Page 863",
102 "IBM863",
103 "",
104 "CodePage863",
105 encoding.ASCIISub,
106 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM863-2.1.2.ucm",
107 },
108 {
109 "IBM Code Page 865",
110 "IBM865",
111 "",
112 "CodePage865",
113 encoding.ASCIISub,
114 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM865-2.1.2.ucm",
115 },
116 {
117 "IBM Code Page 866",
118 "IBM866",
119 "",
120 "CodePage866",
121 encoding.ASCIISub,
122 "http://encoding.spec.whatwg.org/index-ibm866.txt",
123 },
124 {
125 "IBM Code Page 1047",
126 "IBM1047",
127 "",
128 "CodePage1047",
129 0x3f,
130 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM1047-2.1.2.ucm",
131 },
132 {
133 "IBM Code Page 1140",
134 "IBM01140",
135 "",
136 "CodePage1140",
137 0x3f,
138 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/ibm-1140_P100-1997.ucm",
139 },
140 {
141 "ISO 8859-1",
142 "ISOLatin1",
143 "",
144 "ISO8859_1",
145 encoding.ASCIISub,
146 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/iso-8859_1-1998.ucm",
147 },
148 {
149 "ISO 8859-2",
150 "ISOLatin2",
151 "",
152 "ISO8859_2",
153 encoding.ASCIISub,
154 "http://encoding.spec.whatwg.org/index-iso-8859-2.txt",
155 },
156 {
157 "ISO 8859-3",
158 "ISOLatin3",
159 "",
160 "ISO8859_3",
161 encoding.ASCIISub,
162 "http://encoding.spec.whatwg.org/index-iso-8859-3.txt",
163 },
164 {
165 "ISO 8859-4",
166 "ISOLatin4",
167 "",
168 "ISO8859_4",
169 encoding.ASCIISub,
170 "http://encoding.spec.whatwg.org/index-iso-8859-4.txt",
171 },
172 {
173 "ISO 8859-5",
174 "ISOLatinCyrillic",
175 "",
176 "ISO8859_5",
177 encoding.ASCIISub,
178 "http://encoding.spec.whatwg.org/index-iso-8859-5.txt",
179 },
180 {
181 "ISO 8859-6",
182 "ISOLatinArabic",
183 "",
184 "ISO8859_6,ISO8859_6E,ISO8859_6I",
185 encoding.ASCIISub,
186 "http://encoding.spec.whatwg.org/index-iso-8859-6.txt",
187 },
188 {
189 "ISO 8859-7",
190 "ISOLatinGreek",
191 "",
192 "ISO8859_7",
193 encoding.ASCIISub,
194 "http://encoding.spec.whatwg.org/index-iso-8859-7.txt",
195 },
196 {
197 "ISO 8859-8",
198 "ISOLatinHebrew",
199 "",
200 "ISO8859_8,ISO8859_8E,ISO8859_8I",
201 encoding.ASCIISub,
202 "http://encoding.spec.whatwg.org/index-iso-8859-8.txt",
203 },
204 {
205 "ISO 8859-9",
206 "ISOLatin5",
207 "",
208 "ISO8859_9",
209 encoding.ASCIISub,
210 "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/iso-8859_9-1999.ucm",
211 },
212 {
213 "ISO 8859-10",
214 "ISOLatin6",
215 "",
216 "ISO8859_10",
217 encoding.ASCIISub,
218 "http://encoding.spec.whatwg.org/index-iso-8859-10.txt",
219 },
220 {
221 "ISO 8859-13",
222 "ISO885913",
223 "",
224 "ISO8859_13",
225 encoding.ASCIISub,
226 "http://encoding.spec.whatwg.org/index-iso-8859-13.txt",
227 },
228 {
229 "ISO 8859-14",
230 "ISO885914",
231 "",
232 "ISO8859_14",
233 encoding.ASCIISub,
234 "http://encoding.spec.whatwg.org/index-iso-8859-14.txt",
235 },
236 {
237 "ISO 8859-15",
238 "ISO885915",
239 "",
240 "ISO8859_15",
241 encoding.ASCIISub,
242 "http://encoding.spec.whatwg.org/index-iso-8859-15.txt",
243 },
244 {
245 "ISO 8859-16",
246 "ISO885916",
247 "",
248 "ISO8859_16",
249 encoding.ASCIISub,
250 "http://encoding.spec.whatwg.org/index-iso-8859-16.txt",
251 },
252 {
253 "KOI8-R",
254 "KOI8R",
255 "",
256 "KOI8R",
257 encoding.ASCIISub,
258 "http://encoding.spec.whatwg.org/index-koi8-r.txt",
259 },
260 {
261 "KOI8-U",
262 "KOI8U",
263 "",
264 "KOI8U",
265 encoding.ASCIISub,
266 "http://encoding.spec.whatwg.org/index-koi8-u.txt",
267 },
268 {
269 "Macintosh",
270 "Macintosh",
271 "",
272 "Macintosh",
273 encoding.ASCIISub,
274 "http://encoding.spec.whatwg.org/index-macintosh.txt",
275 },
276 {
277 "Macintosh Cyrillic",
278 "MacintoshCyrillic",
279 "",
280 "MacintoshCyrillic",
281 encoding.ASCIISub,
282 "http://encoding.spec.whatwg.org/index-x-mac-cyrillic.txt",
283 },
284 {
285 "Windows 874",
286 "Windows874",
287 "",
288 "Windows874",
289 encoding.ASCIISub,
290 "http://encoding.spec.whatwg.org/index-windows-874.txt",
291 },
292 {
293 "Windows 1250",
294 "Windows1250",
295 "",
296 "Windows1250",
297 encoding.ASCIISub,
298 "http://encoding.spec.whatwg.org/index-windows-1250.txt",
299 },
300 {
301 "Windows 1251",
302 "Windows1251",
303 "",
304 "Windows1251",
305 encoding.ASCIISub,
306 "http://encoding.spec.whatwg.org/index-windows-1251.txt",
307 },
308 {
309 "Windows 1252",
310 "Windows1252",
311 "",
312 "Windows1252",
313 encoding.ASCIISub,
314 "http://encoding.spec.whatwg.org/index-windows-1252.txt",
315 },
316 {
317 "Windows 1253",
318 "Windows1253",
319 "",
320 "Windows1253",
321 encoding.ASCIISub,
322 "http://encoding.spec.whatwg.org/index-windows-1253.txt",
323 },
324 {
325 "Windows 1254",
326 "Windows1254",
327 "",
328 "Windows1254",
329 encoding.ASCIISub,
330 "http://encoding.spec.whatwg.org/index-windows-1254.txt",
331 },
332 {
333 "Windows 1255",
334 "Windows1255",
335 "",
336 "Windows1255",
337 encoding.ASCIISub,
338 "http://encoding.spec.whatwg.org/index-windows-1255.txt",
339 },
340 {
341 "Windows 1256",
342 "Windows1256",
343 "",
344 "Windows1256",
345 encoding.ASCIISub,
346 "http://encoding.spec.whatwg.org/index-windows-1256.txt",
347 },
348 {
349 "Windows 1257",
350 "Windows1257",
351 "",
352 "Windows1257",
353 encoding.ASCIISub,
354 "http://encoding.spec.whatwg.org/index-windows-1257.txt",
355 },
356 {
357 "Windows 1258",
358 "Windows1258",
359 "",
360 "Windows1258",
361 encoding.ASCIISub,
362 "http://encoding.spec.whatwg.org/index-windows-1258.txt",
363 },
364 {
365 "X-User-Defined",
366 "XUserDefined",
367 "It is defined at http://encoding.spec.whatwg.org/#x-user-defined",
368 "XUserDefined",
369 encoding.ASCIISub,
370 ascii +
371 "\uf780\uf781\uf782\uf783\uf784\uf785\uf786\uf787" +
372 "\uf788\uf789\uf78a\uf78b\uf78c\uf78d\uf78e\uf78f" +
373 "\uf790\uf791\uf792\uf793\uf794\uf795\uf796\uf797" +
374 "\uf798\uf799\uf79a\uf79b\uf79c\uf79d\uf79e\uf79f" +
375 "\uf7a0\uf7a1\uf7a2\uf7a3\uf7a4\uf7a5\uf7a6\uf7a7" +
376 "\uf7a8\uf7a9\uf7aa\uf7ab\uf7ac\uf7ad\uf7ae\uf7af" +
377 "\uf7b0\uf7b1\uf7b2\uf7b3\uf7b4\uf7b5\uf7b6\uf7b7" +
378 "\uf7b8\uf7b9\uf7ba\uf7bb\uf7bc\uf7bd\uf7be\uf7bf" +
379 "\uf7c0\uf7c1\uf7c2\uf7c3\uf7c4\uf7c5\uf7c6\uf7c7" +
380 "\uf7c8\uf7c9\uf7ca\uf7cb\uf7cc\uf7cd\uf7ce\uf7cf" +
381 "\uf7d0\uf7d1\uf7d2\uf7d3\uf7d4\uf7d5\uf7d6\uf7d7" +
382 "\uf7d8\uf7d9\uf7da\uf7db\uf7dc\uf7dd\uf7de\uf7df" +
383 "\uf7e0\uf7e1\uf7e2\uf7e3\uf7e4\uf7e5\uf7e6\uf7e7" +
384 "\uf7e8\uf7e9\uf7ea\uf7eb\uf7ec\uf7ed\uf7ee\uf7ef" +
385 "\uf7f0\uf7f1\uf7f2\uf7f3\uf7f4\uf7f5\uf7f6\uf7f7" +
386 "\uf7f8\uf7f9\uf7fa\uf7fb\uf7fc\uf7fd\uf7fe\uf7ff",
387 },
388}
389
390func getWHATWG(url string) string {
391 res, err := http.Get(url)
392 if err != nil {
393 log.Fatalf("%q: Get: %v", url, err)
394 }
395 defer res.Body.Close()
396
397 mapping := make([]rune, 128)
398 for i := range mapping {
399 mapping[i] = '\ufffd'
400 }
401
402 scanner := bufio.NewScanner(res.Body)
403 for scanner.Scan() {
404 s := strings.TrimSpace(scanner.Text())
405 if s == "" || s[0] == '#' {
406 continue
407 }
408 x, y := 0, 0
409 if _, err := fmt.Sscanf(s, "%d\t0x%x", &x, &y); err != nil {
410 log.Fatalf("could not parse %q", s)
411 }
412 if x < 0 || 128 <= x {
413 log.Fatalf("code %d is out of range", x)
414 }
415 if 0x80 <= y && y < 0xa0 {
416 // We diverge from the WHATWG spec by mapping control characters
417 // in the range [0x80, 0xa0) to U+FFFD.
418 continue
419 }
420 mapping[x] = rune(y)
421 }
422 return ascii + string(mapping)
423}
424
// getUCM downloads the UCM file at url and returns a 256-rune mapping string
// whose i'th rune is the code point assigned to byte i.
//
// Only lines of the form "<Uxxxx> \xNN |0" are used; every other line
// (comments, headers, entries with a different trailing flag) is skipped.
// Bytes that no accepted line mentions stay at U+FFFD.
//
// The program is terminated via log.Fatalf if the request fails, the server
// answers with a non-200 status, the body cannot be read completely, or fewer
// than 200 entries are found.
func getUCM(url string) string {
	res, err := http.Get(url)
	if err != nil {
		log.Fatalf("%q: Get: %v", url, err)
	}
	defer res.Body.Close()
	// Do not try to interpret the body of an error response as a table.
	if res.StatusCode != http.StatusOK {
		log.Fatalf("%q: Get: unexpected status %s", url, res.Status)
	}

	mapping := make([]rune, 256)
	for i := range mapping {
		mapping[i] = '\ufffd'
	}

	charsFound := 0
	scanner := bufio.NewScanner(res.Body)
	for scanner.Scan() {
		s := strings.TrimSpace(scanner.Text())
		if s == "" || s[0] == '#' {
			continue
		}
		var c byte
		var r rune
		// A scan error just means this is not a "|0" mapping line.
		if _, err := fmt.Sscanf(s, `<U%x> \x%x |0`, &r, &c); err != nil {
			continue
		}
		mapping[c] = r
		charsFound++
	}
	// Scan returns false on both EOF and error; a truncated download that
	// happened to contain 200+ entries would otherwise pass the check below.
	if err := scanner.Err(); err != nil {
		log.Fatalf("%q: reading response: %v", url, err)
	}

	if charsFound < 200 {
		log.Fatalf("%q: only %d characters found (wrong page format?)", url, charsFound)
	}

	return string(mapping)
}
459
460func main() {
461 mibs := map[string]bool{}
462 all := []string{}
463
464 w := gen.NewCodeWriter()
465 defer w.WriteGoFile("tables.go", "charmap")
466
467 printf := func(s string, a ...interface{}) { fmt.Fprintf(w, s, a...) }
468
469 printf("import (\n")
470 printf("\t\"golang.org/x/text/encoding\"\n")
471 printf("\t\"golang.org/x/text/encoding/internal/identifier\"\n")
472 printf(")\n\n")
473 for _, e := range encodings {
474 varNames := strings.Split(e.varName, ",")
475 all = append(all, varNames...)
476 varName := varNames[0]
477 switch {
478 case strings.HasPrefix(e.mapping, "http://encoding.spec.whatwg.org/"):
479 e.mapping = getWHATWG(e.mapping)
480 case strings.HasPrefix(e.mapping, "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/"):
481 e.mapping = getUCM(e.mapping)
482 }
483
484 asciiSuperset, low := strings.HasPrefix(e.mapping, ascii), 0x00
485 if asciiSuperset {
486 low = 0x80
487 }
488 lvn := 1
489 if strings.HasPrefix(varName, "ISO") || strings.HasPrefix(varName, "KOI") {
490 lvn = 3
491 }
492 lowerVarName := strings.ToLower(varName[:lvn]) + varName[lvn:]
493 printf("// %s is the %s encoding.\n", varName, e.name)
494 if e.comment != "" {
495 printf("//\n// %s\n", e.comment)
496 }
497 printf("var %s *Charmap = &%s\n\nvar %s = Charmap{\nname: %q,\n",
498 varName, lowerVarName, lowerVarName, e.name)
499 if mibs[e.mib] {
500 log.Fatalf("MIB type %q declared multiple times.", e.mib)
501 }
502 printf("mib: identifier.%s,\n", e.mib)
503 printf("asciiSuperset: %t,\n", asciiSuperset)
504 printf("low: 0x%02x,\n", low)
505 printf("replacement: 0x%02x,\n", e.replacement)
506
507 printf("decode: [256]utf8Enc{\n")
508 i, backMapping := 0, map[rune]byte{}
509 for _, c := range e.mapping {
510 if _, ok := backMapping[c]; !ok && c != utf8.RuneError {
511 backMapping[c] = byte(i)
512 }
513 var buf [8]byte
514 n := utf8.EncodeRune(buf[:], c)
515 if n > 3 {
516 panic(fmt.Sprintf("rune %q (%U) is too long", c, c))
517 }
518 printf("{%d,[3]byte{0x%02x,0x%02x,0x%02x}},", n, buf[0], buf[1], buf[2])
519 if i%2 == 1 {
520 printf("\n")
521 }
522 i++
523 }
524 printf("},\n")
525
526 printf("encode: [256]uint32{\n")
527 encode := make([]uint32, 0, 256)
528 for c, i := range backMapping {
529 encode = append(encode, uint32(i)<<24|uint32(c))
530 }
531 sort.Sort(byRune(encode))
532 for len(encode) < cap(encode) {
533 encode = append(encode, encode[len(encode)-1])
534 }
535 for i, enc := range encode {
536 printf("0x%08x,", enc)
537 if i%8 == 7 {
538 printf("\n")
539 }
540 }
541 printf("},\n}\n")
542
543 // Add an estimate of the size of a single Charmap{} struct value, which
544 // includes two 256 elem arrays of 4 bytes and some extra fields, which
545 // align to 3 uint64s on 64-bit architectures.
546 w.Size += 2*4*256 + 3*8
547 }
548 // TODO: add proper line breaking.
549 printf("var listAll = []encoding.Encoding{\n%s,\n}\n\n", strings.Join(all, ",\n"))
550}
551
// byRune orders packed uint32 entries by their low 24 bits only; the top
// 8 bits of each entry are ignored when comparing. It provides the Len, Less
// and Swap methods required by sort.Sort.
type byRune []uint32

// Len reports the number of entries.
func (b byRune) Len() int {
	return len(b)
}

// Less reports whether the low 24 bits of entry i are smaller than those of
// entry j.
func (b byRune) Less(i, j int) bool {
	const low24 = 1<<24 - 1
	left, right := b[i]&low24, b[j]&low24
	return left < right
}

// Swap exchanges entries i and j.
func (b byRune) Swap(i, j int) {
	tmp := b[i]
	b[i] = b[j]
	b[j] = tmp
}