blob: 5d052781bceaf8504923cff6da7d4a5f22044169 [file] [log] [blame]
Matthias Andreas Benkard832a54e2019-01-29 09:27:38 +01001// Copyright 2012 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// +build ignore
6
7//go:generate go run gen.go
8//go:generate go run gen.go -test
9
10package main
11
12import (
13 "bytes"
14 "flag"
15 "fmt"
16 "go/format"
17 "io/ioutil"
18 "math/rand"
19 "os"
20 "sort"
21 "strings"
22)
23
24// identifier converts s to a Go exported identifier.
25// It converts "div" to "Div" and "accept-charset" to "AcceptCharset".
26func identifier(s string) string {
27 b := make([]byte, 0, len(s))
28 cap := true
29 for _, c := range s {
30 if c == '-' {
31 cap = true
32 continue
33 }
34 if cap && 'a' <= c && c <= 'z' {
35 c -= 'a' - 'A'
36 }
37 cap = false
38 b = append(b, byte(c))
39 }
40 return string(b)
41}
42
43var test = flag.Bool("test", false, "generate table_test.go")
44
45func genFile(name string, buf *bytes.Buffer) {
46 b, err := format.Source(buf.Bytes())
47 if err != nil {
48 fmt.Fprintln(os.Stderr, err)
49 os.Exit(1)
50 }
51 if err := ioutil.WriteFile(name, b, 0644); err != nil {
52 fmt.Fprintln(os.Stderr, err)
53 os.Exit(1)
54 }
55}
56
57func main() {
58 flag.Parse()
59
60 var all []string
61 all = append(all, elements...)
62 all = append(all, attributes...)
63 all = append(all, eventHandlers...)
64 all = append(all, extra...)
65 sort.Strings(all)
66
67 // uniq - lists have dups
68 w := 0
69 for _, s := range all {
70 if w == 0 || all[w-1] != s {
71 all[w] = s
72 w++
73 }
74 }
75 all = all[:w]
76
77 if *test {
78 var buf bytes.Buffer
79 fmt.Fprintln(&buf, "// Code generated by go generate gen.go; DO NOT EDIT.\n")
80 fmt.Fprintln(&buf, "//go:generate go run gen.go -test\n")
81 fmt.Fprintln(&buf, "package atom\n")
82 fmt.Fprintln(&buf, "var testAtomList = []string{")
83 for _, s := range all {
84 fmt.Fprintf(&buf, "\t%q,\n", s)
85 }
86 fmt.Fprintln(&buf, "}")
87
88 genFile("table_test.go", &buf)
89 return
90 }
91
92 // Find hash that minimizes table size.
93 var best *table
94 for i := 0; i < 1000000; i++ {
95 if best != nil && 1<<(best.k-1) < len(all) {
96 break
97 }
98 h := rand.Uint32()
99 for k := uint(0); k <= 16; k++ {
100 if best != nil && k >= best.k {
101 break
102 }
103 var t table
104 if t.init(h, k, all) {
105 best = &t
106 break
107 }
108 }
109 }
110 if best == nil {
111 fmt.Fprintf(os.Stderr, "failed to construct string table\n")
112 os.Exit(1)
113 }
114
115 // Lay out strings, using overlaps when possible.
116 layout := append([]string{}, all...)
117
118 // Remove strings that are substrings of other strings
119 for changed := true; changed; {
120 changed = false
121 for i, s := range layout {
122 if s == "" {
123 continue
124 }
125 for j, t := range layout {
126 if i != j && t != "" && strings.Contains(s, t) {
127 changed = true
128 layout[j] = ""
129 }
130 }
131 }
132 }
133
134 // Join strings where one suffix matches another prefix.
135 for {
136 // Find best i, j, k such that layout[i][len-k:] == layout[j][:k],
137 // maximizing overlap length k.
138 besti := -1
139 bestj := -1
140 bestk := 0
141 for i, s := range layout {
142 if s == "" {
143 continue
144 }
145 for j, t := range layout {
146 if i == j {
147 continue
148 }
149 for k := bestk + 1; k <= len(s) && k <= len(t); k++ {
150 if s[len(s)-k:] == t[:k] {
151 besti = i
152 bestj = j
153 bestk = k
154 }
155 }
156 }
157 }
158 if bestk > 0 {
159 layout[besti] += layout[bestj][bestk:]
160 layout[bestj] = ""
161 continue
162 }
163 break
164 }
165
166 text := strings.Join(layout, "")
167
168 atom := map[string]uint32{}
169 for _, s := range all {
170 off := strings.Index(text, s)
171 if off < 0 {
172 panic("lost string " + s)
173 }
174 atom[s] = uint32(off<<8 | len(s))
175 }
176
177 var buf bytes.Buffer
178 // Generate the Go code.
179 fmt.Fprintln(&buf, "// Code generated by go generate gen.go; DO NOT EDIT.\n")
180 fmt.Fprintln(&buf, "//go:generate go run gen.go\n")
181 fmt.Fprintln(&buf, "package atom\n\nconst (")
182
183 // compute max len
184 maxLen := 0
185 for _, s := range all {
186 if maxLen < len(s) {
187 maxLen = len(s)
188 }
189 fmt.Fprintf(&buf, "\t%s Atom = %#x\n", identifier(s), atom[s])
190 }
191 fmt.Fprintln(&buf, ")\n")
192
193 fmt.Fprintf(&buf, "const hash0 = %#x\n\n", best.h0)
194 fmt.Fprintf(&buf, "const maxAtomLen = %d\n\n", maxLen)
195
196 fmt.Fprintf(&buf, "var table = [1<<%d]Atom{\n", best.k)
197 for i, s := range best.tab {
198 if s == "" {
199 continue
200 }
201 fmt.Fprintf(&buf, "\t%#x: %#x, // %s\n", i, atom[s], s)
202 }
203 fmt.Fprintf(&buf, "}\n")
204 datasize := (1 << best.k) * 4
205
206 fmt.Fprintln(&buf, "const atomText =")
207 textsize := len(text)
208 for len(text) > 60 {
209 fmt.Fprintf(&buf, "\t%q +\n", text[:60])
210 text = text[60:]
211 }
212 fmt.Fprintf(&buf, "\t%q\n\n", text)
213
214 genFile("table.go", &buf)
215
216 fmt.Fprintf(os.Stdout, "%d atoms; %d string bytes + %d tables = %d total data\n", len(all), textsize, datasize, textsize+datasize)
217}
218
219type byLen []string
220
221func (x byLen) Less(i, j int) bool { return len(x[i]) > len(x[j]) }
222func (x byLen) Swap(i, j int) { x[i], x[j] = x[j], x[i] }
223func (x byLen) Len() int { return len(x) }
224
225// fnv computes the FNV hash with an arbitrary starting value h.
226func fnv(h uint32, s string) uint32 {
227 for i := 0; i < len(s); i++ {
228 h ^= uint32(s[i])
229 h *= 16777619
230 }
231 return h
232}
233
234// A table represents an attempt at constructing the lookup table.
235// The lookup table uses cuckoo hashing, meaning that each string
236// can be found in one of two positions.
237type table struct {
238 h0 uint32
239 k uint
240 mask uint32
241 tab []string
242}
243
244// hash returns the two hashes for s.
245func (t *table) hash(s string) (h1, h2 uint32) {
246 h := fnv(t.h0, s)
247 h1 = h & t.mask
248 h2 = (h >> 16) & t.mask
249 return
250}
251
252// init initializes the table with the given parameters.
253// h0 is the initial hash value,
254// k is the number of bits of hash value to use, and
255// x is the list of strings to store in the table.
256// init returns false if the table cannot be constructed.
257func (t *table) init(h0 uint32, k uint, x []string) bool {
258 t.h0 = h0
259 t.k = k
260 t.tab = make([]string, 1<<k)
261 t.mask = 1<<k - 1
262 for _, s := range x {
263 if !t.insert(s) {
264 return false
265 }
266 }
267 return true
268}
269
270// insert inserts s in the table.
271func (t *table) insert(s string) bool {
272 h1, h2 := t.hash(s)
273 if t.tab[h1] == "" {
274 t.tab[h1] = s
275 return true
276 }
277 if t.tab[h2] == "" {
278 t.tab[h2] = s
279 return true
280 }
281 if t.push(h1, 0) {
282 t.tab[h1] = s
283 return true
284 }
285 if t.push(h2, 0) {
286 t.tab[h2] = s
287 return true
288 }
289 return false
290}
291
292// push attempts to push aside the entry in slot i.
293func (t *table) push(i uint32, depth int) bool {
294 if depth > len(t.tab) {
295 return false
296 }
297 s := t.tab[i]
298 h1, h2 := t.hash(s)
299 j := h1 + h2 - i
300 if t.tab[j] != "" && !t.push(j, depth+1) {
301 return false
302 }
303 t.tab[j] = s
304 return true
305}
306
307// The lists of element names and attribute keys were taken from
308// https://html.spec.whatwg.org/multipage/indices.html#index
309// as of the "HTML Living Standard - Last Updated 16 April 2018" version.
310
311// "command", "keygen" and "menuitem" have been removed from the spec,
312// but are kept here for backwards compatibility.
313var elements = []string{
314 "a",
315 "abbr",
316 "address",
317 "area",
318 "article",
319 "aside",
320 "audio",
321 "b",
322 "base",
323 "bdi",
324 "bdo",
325 "blockquote",
326 "body",
327 "br",
328 "button",
329 "canvas",
330 "caption",
331 "cite",
332 "code",
333 "col",
334 "colgroup",
335 "command",
336 "data",
337 "datalist",
338 "dd",
339 "del",
340 "details",
341 "dfn",
342 "dialog",
343 "div",
344 "dl",
345 "dt",
346 "em",
347 "embed",
348 "fieldset",
349 "figcaption",
350 "figure",
351 "footer",
352 "form",
353 "h1",
354 "h2",
355 "h3",
356 "h4",
357 "h5",
358 "h6",
359 "head",
360 "header",
361 "hgroup",
362 "hr",
363 "html",
364 "i",
365 "iframe",
366 "img",
367 "input",
368 "ins",
369 "kbd",
370 "keygen",
371 "label",
372 "legend",
373 "li",
374 "link",
375 "main",
376 "map",
377 "mark",
378 "menu",
379 "menuitem",
380 "meta",
381 "meter",
382 "nav",
383 "noscript",
384 "object",
385 "ol",
386 "optgroup",
387 "option",
388 "output",
389 "p",
390 "param",
391 "picture",
392 "pre",
393 "progress",
394 "q",
395 "rp",
396 "rt",
397 "ruby",
398 "s",
399 "samp",
400 "script",
401 "section",
402 "select",
403 "slot",
404 "small",
405 "source",
406 "span",
407 "strong",
408 "style",
409 "sub",
410 "summary",
411 "sup",
412 "table",
413 "tbody",
414 "td",
415 "template",
416 "textarea",
417 "tfoot",
418 "th",
419 "thead",
420 "time",
421 "title",
422 "tr",
423 "track",
424 "u",
425 "ul",
426 "var",
427 "video",
428 "wbr",
429}
430
431// https://html.spec.whatwg.org/multipage/indices.html#attributes-3
432//
433// "challenge", "command", "contextmenu", "dropzone", "icon", "keytype", "mediagroup",
434// "radiogroup", "spellcheck", "scoped", "seamless", "sortable" and "sorted" have been removed from the spec,
435// but are kept here for backwards compatibility.
436var attributes = []string{
437 "abbr",
438 "accept",
439 "accept-charset",
440 "accesskey",
441 "action",
442 "allowfullscreen",
443 "allowpaymentrequest",
444 "allowusermedia",
445 "alt",
446 "as",
447 "async",
448 "autocomplete",
449 "autofocus",
450 "autoplay",
451 "challenge",
452 "charset",
453 "checked",
454 "cite",
455 "class",
456 "color",
457 "cols",
458 "colspan",
459 "command",
460 "content",
461 "contenteditable",
462 "contextmenu",
463 "controls",
464 "coords",
465 "crossorigin",
466 "data",
467 "datetime",
468 "default",
469 "defer",
470 "dir",
471 "dirname",
472 "disabled",
473 "download",
474 "draggable",
475 "dropzone",
476 "enctype",
477 "for",
478 "form",
479 "formaction",
480 "formenctype",
481 "formmethod",
482 "formnovalidate",
483 "formtarget",
484 "headers",
485 "height",
486 "hidden",
487 "high",
488 "href",
489 "hreflang",
490 "http-equiv",
491 "icon",
492 "id",
493 "inputmode",
494 "integrity",
495 "is",
496 "ismap",
497 "itemid",
498 "itemprop",
499 "itemref",
500 "itemscope",
501 "itemtype",
502 "keytype",
503 "kind",
504 "label",
505 "lang",
506 "list",
507 "loop",
508 "low",
509 "manifest",
510 "max",
511 "maxlength",
512 "media",
513 "mediagroup",
514 "method",
515 "min",
516 "minlength",
517 "multiple",
518 "muted",
519 "name",
520 "nomodule",
521 "nonce",
522 "novalidate",
523 "open",
524 "optimum",
525 "pattern",
526 "ping",
527 "placeholder",
528 "playsinline",
529 "poster",
530 "preload",
531 "radiogroup",
532 "readonly",
533 "referrerpolicy",
534 "rel",
535 "required",
536 "reversed",
537 "rows",
538 "rowspan",
539 "sandbox",
540 "spellcheck",
541 "scope",
542 "scoped",
543 "seamless",
544 "selected",
545 "shape",
546 "size",
547 "sizes",
548 "sortable",
549 "sorted",
550 "slot",
551 "span",
552 "spellcheck",
553 "src",
554 "srcdoc",
555 "srclang",
556 "srcset",
557 "start",
558 "step",
559 "style",
560 "tabindex",
561 "target",
562 "title",
563 "translate",
564 "type",
565 "typemustmatch",
566 "updateviacache",
567 "usemap",
568 "value",
569 "width",
570 "workertype",
571 "wrap",
572}
573
574// "onautocomplete", "onautocompleteerror", "onmousewheel",
575// "onshow" and "onsort" have been removed from the spec,
576// but are kept here for backwards compatibility.
577var eventHandlers = []string{
578 "onabort",
579 "onautocomplete",
580 "onautocompleteerror",
581 "onauxclick",
582 "onafterprint",
583 "onbeforeprint",
584 "onbeforeunload",
585 "onblur",
586 "oncancel",
587 "oncanplay",
588 "oncanplaythrough",
589 "onchange",
590 "onclick",
591 "onclose",
592 "oncontextmenu",
593 "oncopy",
594 "oncuechange",
595 "oncut",
596 "ondblclick",
597 "ondrag",
598 "ondragend",
599 "ondragenter",
600 "ondragexit",
601 "ondragleave",
602 "ondragover",
603 "ondragstart",
604 "ondrop",
605 "ondurationchange",
606 "onemptied",
607 "onended",
608 "onerror",
609 "onfocus",
610 "onhashchange",
611 "oninput",
612 "oninvalid",
613 "onkeydown",
614 "onkeypress",
615 "onkeyup",
616 "onlanguagechange",
617 "onload",
618 "onloadeddata",
619 "onloadedmetadata",
620 "onloadend",
621 "onloadstart",
622 "onmessage",
623 "onmessageerror",
624 "onmousedown",
625 "onmouseenter",
626 "onmouseleave",
627 "onmousemove",
628 "onmouseout",
629 "onmouseover",
630 "onmouseup",
631 "onmousewheel",
632 "onwheel",
633 "onoffline",
634 "ononline",
635 "onpagehide",
636 "onpageshow",
637 "onpaste",
638 "onpause",
639 "onplay",
640 "onplaying",
641 "onpopstate",
642 "onprogress",
643 "onratechange",
644 "onreset",
645 "onresize",
646 "onrejectionhandled",
647 "onscroll",
648 "onsecuritypolicyviolation",
649 "onseeked",
650 "onseeking",
651 "onselect",
652 "onshow",
653 "onsort",
654 "onstalled",
655 "onstorage",
656 "onsubmit",
657 "onsuspend",
658 "ontimeupdate",
659 "ontoggle",
660 "onunhandledrejection",
661 "onunload",
662 "onvolumechange",
663 "onwaiting",
664}
665
666// extra are ad-hoc values not covered by any of the lists above.
667var extra = []string{
668 "acronym",
669 "align",
670 "annotation",
671 "annotation-xml",
672 "applet",
673 "basefont",
674 "bgsound",
675 "big",
676 "blink",
677 "center",
678 "color",
679 "desc",
680 "face",
681 "font",
682 "foreignObject", // HTML is case-insensitive, but SVG-embedded-in-HTML is case-sensitive.
683 "foreignobject",
684 "frame",
685 "frameset",
686 "image",
687 "isindex",
688 "listing",
689 "malignmark",
690 "marquee",
691 "math",
692 "mglyph",
693 "mi",
694 "mn",
695 "mo",
696 "ms",
697 "mtext",
698 "nobr",
699 "noembed",
700 "noframes",
701 "plaintext",
702 "prompt",
703 "public",
704 "rb",
705 "rtc",
706 "spacer",
707 "strike",
708 "svg",
709 "system",
710 "tt",
711 "xmp",
712}