// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
| 4 | |
| 5 | // +build ignore |
| 6 | |
| 7 | package main |
| 8 | |
// This program generates tables.go:
//	go run maketables.go | gofmt > tables.go
| 11 | |
// TODO: Emoji extensions?
// http://www.unicode.org/faq/emoji_dingbats.html
// http://www.unicode.org/Public/UNIDATA/EmojiSources.txt
| 15 | |
| 16 | import ( |
| 17 | "bufio" |
| 18 | "fmt" |
| 19 | "log" |
| 20 | "net/http" |
| 21 | "sort" |
| 22 | "strings" |
| 23 | ) |
| 24 | |
// entry is one slot of the reverse (Unicode -> JIS) map built by main.
type entry struct {
	// jisCode is the JIS code (the index into the source table).
	jisCode int
	// table is the index of the source table the code came from; main
	// initialises it to -1 to mark a slot that has not been assigned.
	table int
}
| 28 | |
| 29 | func main() { |
| 30 | fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n") |
| 31 | fmt.Printf("// Package japanese provides Japanese encodings such as EUC-JP and Shift JIS.\n") |
| 32 | fmt.Printf(`package japanese // import "golang.org/x/text/encoding/japanese"` + "\n\n") |
| 33 | |
| 34 | reverse := [65536]entry{} |
| 35 | for i := range reverse { |
| 36 | reverse[i].table = -1 |
| 37 | } |
| 38 | |
| 39 | tables := []struct { |
| 40 | url string |
| 41 | name string |
| 42 | }{ |
| 43 | {"http://encoding.spec.whatwg.org/index-jis0208.txt", "0208"}, |
| 44 | {"http://encoding.spec.whatwg.org/index-jis0212.txt", "0212"}, |
| 45 | } |
| 46 | for i, table := range tables { |
| 47 | res, err := http.Get(table.url) |
| 48 | if err != nil { |
| 49 | log.Fatalf("%q: Get: %v", table.url, err) |
| 50 | } |
| 51 | defer res.Body.Close() |
| 52 | |
| 53 | mapping := [65536]uint16{} |
| 54 | |
| 55 | scanner := bufio.NewScanner(res.Body) |
| 56 | for scanner.Scan() { |
| 57 | s := strings.TrimSpace(scanner.Text()) |
| 58 | if s == "" || s[0] == '#' { |
| 59 | continue |
| 60 | } |
| 61 | x, y := 0, uint16(0) |
| 62 | if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil { |
| 63 | log.Fatalf("%q: could not parse %q", table.url, s) |
| 64 | } |
| 65 | if x < 0 || 120*94 <= x { |
| 66 | log.Fatalf("%q: JIS code %d is out of range", table.url, x) |
| 67 | } |
| 68 | mapping[x] = y |
| 69 | if reverse[y].table == -1 { |
| 70 | reverse[y] = entry{jisCode: x, table: i} |
| 71 | } |
| 72 | } |
| 73 | if err := scanner.Err(); err != nil { |
| 74 | log.Fatalf("%q: scanner error: %v", table.url, err) |
| 75 | } |
| 76 | |
| 77 | fmt.Printf("// jis%sDecode is the decoding table from JIS %s code to Unicode.\n// It is defined at %s\n", |
| 78 | table.name, table.name, table.url) |
| 79 | fmt.Printf("var jis%sDecode = [...]uint16{\n", table.name) |
| 80 | for i, m := range mapping { |
| 81 | if m != 0 { |
| 82 | fmt.Printf("\t%d: 0x%04X,\n", i, m) |
| 83 | } |
| 84 | } |
| 85 | fmt.Printf("}\n\n") |
| 86 | } |
| 87 | |
| 88 | // Any run of at least separation continuous zero entries in the reverse map will |
| 89 | // be a separate encode table. |
| 90 | const separation = 1024 |
| 91 | |
| 92 | intervals := []interval(nil) |
| 93 | low, high := -1, -1 |
| 94 | for i, v := range reverse { |
| 95 | if v.table == -1 { |
| 96 | continue |
| 97 | } |
| 98 | if low < 0 { |
| 99 | low = i |
| 100 | } else if i-high >= separation { |
| 101 | if high >= 0 { |
| 102 | intervals = append(intervals, interval{low, high}) |
| 103 | } |
| 104 | low = i |
| 105 | } |
| 106 | high = i + 1 |
| 107 | } |
| 108 | if high >= 0 { |
| 109 | intervals = append(intervals, interval{low, high}) |
| 110 | } |
| 111 | sort.Sort(byDecreasingLength(intervals)) |
| 112 | |
| 113 | fmt.Printf("const (\n") |
| 114 | fmt.Printf("\tjis0208 = 1\n") |
| 115 | fmt.Printf("\tjis0212 = 2\n") |
| 116 | fmt.Printf("\tcodeMask = 0x7f\n") |
| 117 | fmt.Printf("\tcodeShift = 7\n") |
| 118 | fmt.Printf("\ttableShift = 14\n") |
| 119 | fmt.Printf(")\n\n") |
| 120 | |
| 121 | fmt.Printf("const numEncodeTables = %d\n\n", len(intervals)) |
| 122 | fmt.Printf("// encodeX are the encoding tables from Unicode to JIS code,\n") |
| 123 | fmt.Printf("// sorted by decreasing length.\n") |
| 124 | for i, v := range intervals { |
| 125 | fmt.Printf("// encode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high) |
| 126 | } |
| 127 | fmt.Printf("//\n") |
| 128 | fmt.Printf("// The high two bits of the value record whether the JIS code comes from the\n") |
| 129 | fmt.Printf("// JIS0208 table (high bits == 1) or the JIS0212 table (high bits == 2).\n") |
| 130 | fmt.Printf("// The low 14 bits are two 7-bit unsigned integers j1 and j2 that form the\n") |
| 131 | fmt.Printf("// JIS code (94*j1 + j2) within that table.\n") |
| 132 | fmt.Printf("\n") |
| 133 | |
| 134 | for i, v := range intervals { |
| 135 | fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high) |
| 136 | fmt.Printf("var encode%d = [...]uint16{\n", i) |
| 137 | for j := v.low; j < v.high; j++ { |
| 138 | x := reverse[j] |
| 139 | if x.table == -1 { |
| 140 | continue |
| 141 | } |
| 142 | fmt.Printf("\t%d - %d: jis%s<<14 | 0x%02X<<7 | 0x%02X,\n", |
| 143 | j, v.low, tables[x.table].name, x.jisCode/94, x.jisCode%94) |
| 144 | } |
| 145 | fmt.Printf("}\n\n") |
| 146 | } |
| 147 | } |
| 148 | |
// interval describes the half-open range [low, high).
type interval struct {
	low  int
	high int
}

// len reports how many values the interval spans, i.e. high minus low.
func (iv interval) len() int {
	return iv.high - iv.low
}
| 155 | |
| 156 | // byDecreasingLength sorts intervals by decreasing length. |
| 157 | type byDecreasingLength []interval |
| 158 | |
| 159 | func (b byDecreasingLength) Len() int { return len(b) } |
| 160 | func (b byDecreasingLength) Less(i, j int) bool { return b[i].len() > b[j].len() } |
| 161 | func (b byDecreasingLength) Swap(i, j int) { b[i], b[j] = b[j], b[i] } |