Matthias Andreas Benkard | 832a54e | 2019-01-29 09:27:38 +0100 | [diff] [blame] | 1 | // Copyright 2012 The Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | // +build ignore |
| 6 | |
| 7 | //go:generate go run gen.go |
| 8 | //go:generate go run gen.go -test |
| 9 | |
| 10 | package main |
| 11 | |
| 12 | import ( |
| 13 | "bytes" |
| 14 | "flag" |
| 15 | "fmt" |
| 16 | "go/format" |
| 17 | "io/ioutil" |
| 18 | "math/rand" |
| 19 | "os" |
| 20 | "sort" |
| 21 | "strings" |
| 22 | ) |
| 23 | |
| 24 | // identifier converts s to a Go exported identifier. |
| 25 | // It converts "div" to "Div" and "accept-charset" to "AcceptCharset". |
| 26 | func identifier(s string) string { |
| 27 | b := make([]byte, 0, len(s)) |
| 28 | cap := true |
| 29 | for _, c := range s { |
| 30 | if c == '-' { |
| 31 | cap = true |
| 32 | continue |
| 33 | } |
| 34 | if cap && 'a' <= c && c <= 'z' { |
| 35 | c -= 'a' - 'A' |
| 36 | } |
| 37 | cap = false |
| 38 | b = append(b, byte(c)) |
| 39 | } |
| 40 | return string(b) |
| 41 | } |
| 42 | |
| 43 | var test = flag.Bool("test", false, "generate table_test.go") |
| 44 | |
| 45 | func genFile(name string, buf *bytes.Buffer) { |
| 46 | b, err := format.Source(buf.Bytes()) |
| 47 | if err != nil { |
| 48 | fmt.Fprintln(os.Stderr, err) |
| 49 | os.Exit(1) |
| 50 | } |
| 51 | if err := ioutil.WriteFile(name, b, 0644); err != nil { |
| 52 | fmt.Fprintln(os.Stderr, err) |
| 53 | os.Exit(1) |
| 54 | } |
| 55 | } |
| 56 | |
| 57 | func main() { |
| 58 | flag.Parse() |
| 59 | |
| 60 | var all []string |
| 61 | all = append(all, elements...) |
| 62 | all = append(all, attributes...) |
| 63 | all = append(all, eventHandlers...) |
| 64 | all = append(all, extra...) |
| 65 | sort.Strings(all) |
| 66 | |
| 67 | // uniq - lists have dups |
| 68 | w := 0 |
| 69 | for _, s := range all { |
| 70 | if w == 0 || all[w-1] != s { |
| 71 | all[w] = s |
| 72 | w++ |
| 73 | } |
| 74 | } |
| 75 | all = all[:w] |
| 76 | |
| 77 | if *test { |
| 78 | var buf bytes.Buffer |
| 79 | fmt.Fprintln(&buf, "// Code generated by go generate gen.go; DO NOT EDIT.\n") |
| 80 | fmt.Fprintln(&buf, "//go:generate go run gen.go -test\n") |
| 81 | fmt.Fprintln(&buf, "package atom\n") |
| 82 | fmt.Fprintln(&buf, "var testAtomList = []string{") |
| 83 | for _, s := range all { |
| 84 | fmt.Fprintf(&buf, "\t%q,\n", s) |
| 85 | } |
| 86 | fmt.Fprintln(&buf, "}") |
| 87 | |
| 88 | genFile("table_test.go", &buf) |
| 89 | return |
| 90 | } |
| 91 | |
| 92 | // Find hash that minimizes table size. |
| 93 | var best *table |
| 94 | for i := 0; i < 1000000; i++ { |
| 95 | if best != nil && 1<<(best.k-1) < len(all) { |
| 96 | break |
| 97 | } |
| 98 | h := rand.Uint32() |
| 99 | for k := uint(0); k <= 16; k++ { |
| 100 | if best != nil && k >= best.k { |
| 101 | break |
| 102 | } |
| 103 | var t table |
| 104 | if t.init(h, k, all) { |
| 105 | best = &t |
| 106 | break |
| 107 | } |
| 108 | } |
| 109 | } |
| 110 | if best == nil { |
| 111 | fmt.Fprintf(os.Stderr, "failed to construct string table\n") |
| 112 | os.Exit(1) |
| 113 | } |
| 114 | |
| 115 | // Lay out strings, using overlaps when possible. |
| 116 | layout := append([]string{}, all...) |
| 117 | |
| 118 | // Remove strings that are substrings of other strings |
| 119 | for changed := true; changed; { |
| 120 | changed = false |
| 121 | for i, s := range layout { |
| 122 | if s == "" { |
| 123 | continue |
| 124 | } |
| 125 | for j, t := range layout { |
| 126 | if i != j && t != "" && strings.Contains(s, t) { |
| 127 | changed = true |
| 128 | layout[j] = "" |
| 129 | } |
| 130 | } |
| 131 | } |
| 132 | } |
| 133 | |
| 134 | // Join strings where one suffix matches another prefix. |
| 135 | for { |
| 136 | // Find best i, j, k such that layout[i][len-k:] == layout[j][:k], |
| 137 | // maximizing overlap length k. |
| 138 | besti := -1 |
| 139 | bestj := -1 |
| 140 | bestk := 0 |
| 141 | for i, s := range layout { |
| 142 | if s == "" { |
| 143 | continue |
| 144 | } |
| 145 | for j, t := range layout { |
| 146 | if i == j { |
| 147 | continue |
| 148 | } |
| 149 | for k := bestk + 1; k <= len(s) && k <= len(t); k++ { |
| 150 | if s[len(s)-k:] == t[:k] { |
| 151 | besti = i |
| 152 | bestj = j |
| 153 | bestk = k |
| 154 | } |
| 155 | } |
| 156 | } |
| 157 | } |
| 158 | if bestk > 0 { |
| 159 | layout[besti] += layout[bestj][bestk:] |
| 160 | layout[bestj] = "" |
| 161 | continue |
| 162 | } |
| 163 | break |
| 164 | } |
| 165 | |
| 166 | text := strings.Join(layout, "") |
| 167 | |
| 168 | atom := map[string]uint32{} |
| 169 | for _, s := range all { |
| 170 | off := strings.Index(text, s) |
| 171 | if off < 0 { |
| 172 | panic("lost string " + s) |
| 173 | } |
| 174 | atom[s] = uint32(off<<8 | len(s)) |
| 175 | } |
| 176 | |
| 177 | var buf bytes.Buffer |
| 178 | // Generate the Go code. |
| 179 | fmt.Fprintln(&buf, "// Code generated by go generate gen.go; DO NOT EDIT.\n") |
| 180 | fmt.Fprintln(&buf, "//go:generate go run gen.go\n") |
| 181 | fmt.Fprintln(&buf, "package atom\n\nconst (") |
| 182 | |
| 183 | // compute max len |
| 184 | maxLen := 0 |
| 185 | for _, s := range all { |
| 186 | if maxLen < len(s) { |
| 187 | maxLen = len(s) |
| 188 | } |
| 189 | fmt.Fprintf(&buf, "\t%s Atom = %#x\n", identifier(s), atom[s]) |
| 190 | } |
| 191 | fmt.Fprintln(&buf, ")\n") |
| 192 | |
| 193 | fmt.Fprintf(&buf, "const hash0 = %#x\n\n", best.h0) |
| 194 | fmt.Fprintf(&buf, "const maxAtomLen = %d\n\n", maxLen) |
| 195 | |
| 196 | fmt.Fprintf(&buf, "var table = [1<<%d]Atom{\n", best.k) |
| 197 | for i, s := range best.tab { |
| 198 | if s == "" { |
| 199 | continue |
| 200 | } |
| 201 | fmt.Fprintf(&buf, "\t%#x: %#x, // %s\n", i, atom[s], s) |
| 202 | } |
| 203 | fmt.Fprintf(&buf, "}\n") |
| 204 | datasize := (1 << best.k) * 4 |
| 205 | |
| 206 | fmt.Fprintln(&buf, "const atomText =") |
| 207 | textsize := len(text) |
| 208 | for len(text) > 60 { |
| 209 | fmt.Fprintf(&buf, "\t%q +\n", text[:60]) |
| 210 | text = text[60:] |
| 211 | } |
| 212 | fmt.Fprintf(&buf, "\t%q\n\n", text) |
| 213 | |
| 214 | genFile("table.go", &buf) |
| 215 | |
| 216 | fmt.Fprintf(os.Stdout, "%d atoms; %d string bytes + %d tables = %d total data\n", len(all), textsize, datasize, textsize+datasize) |
| 217 | } |
| 218 | |
| 219 | type byLen []string |
| 220 | |
| 221 | func (x byLen) Less(i, j int) bool { return len(x[i]) > len(x[j]) } |
| 222 | func (x byLen) Swap(i, j int) { x[i], x[j] = x[j], x[i] } |
| 223 | func (x byLen) Len() int { return len(x) } |
| 224 | |
| 225 | // fnv computes the FNV hash with an arbitrary starting value h. |
| 226 | func fnv(h uint32, s string) uint32 { |
| 227 | for i := 0; i < len(s); i++ { |
| 228 | h ^= uint32(s[i]) |
| 229 | h *= 16777619 |
| 230 | } |
| 231 | return h |
| 232 | } |
| 233 | |
| 234 | // A table represents an attempt at constructing the lookup table. |
| 235 | // The lookup table uses cuckoo hashing, meaning that each string |
| 236 | // can be found in one of two positions. |
| 237 | type table struct { |
| 238 | h0 uint32 |
| 239 | k uint |
| 240 | mask uint32 |
| 241 | tab []string |
| 242 | } |
| 243 | |
| 244 | // hash returns the two hashes for s. |
| 245 | func (t *table) hash(s string) (h1, h2 uint32) { |
| 246 | h := fnv(t.h0, s) |
| 247 | h1 = h & t.mask |
| 248 | h2 = (h >> 16) & t.mask |
| 249 | return |
| 250 | } |
| 251 | |
| 252 | // init initializes the table with the given parameters. |
| 253 | // h0 is the initial hash value, |
| 254 | // k is the number of bits of hash value to use, and |
| 255 | // x is the list of strings to store in the table. |
| 256 | // init returns false if the table cannot be constructed. |
| 257 | func (t *table) init(h0 uint32, k uint, x []string) bool { |
| 258 | t.h0 = h0 |
| 259 | t.k = k |
| 260 | t.tab = make([]string, 1<<k) |
| 261 | t.mask = 1<<k - 1 |
| 262 | for _, s := range x { |
| 263 | if !t.insert(s) { |
| 264 | return false |
| 265 | } |
| 266 | } |
| 267 | return true |
| 268 | } |
| 269 | |
| 270 | // insert inserts s in the table. |
| 271 | func (t *table) insert(s string) bool { |
| 272 | h1, h2 := t.hash(s) |
| 273 | if t.tab[h1] == "" { |
| 274 | t.tab[h1] = s |
| 275 | return true |
| 276 | } |
| 277 | if t.tab[h2] == "" { |
| 278 | t.tab[h2] = s |
| 279 | return true |
| 280 | } |
| 281 | if t.push(h1, 0) { |
| 282 | t.tab[h1] = s |
| 283 | return true |
| 284 | } |
| 285 | if t.push(h2, 0) { |
| 286 | t.tab[h2] = s |
| 287 | return true |
| 288 | } |
| 289 | return false |
| 290 | } |
| 291 | |
| 292 | // push attempts to push aside the entry in slot i. |
| 293 | func (t *table) push(i uint32, depth int) bool { |
| 294 | if depth > len(t.tab) { |
| 295 | return false |
| 296 | } |
| 297 | s := t.tab[i] |
| 298 | h1, h2 := t.hash(s) |
| 299 | j := h1 + h2 - i |
| 300 | if t.tab[j] != "" && !t.push(j, depth+1) { |
| 301 | return false |
| 302 | } |
| 303 | t.tab[j] = s |
| 304 | return true |
| 305 | } |
| 306 | |
| 307 | // The lists of element names and attribute keys were taken from |
| 308 | // https://html.spec.whatwg.org/multipage/indices.html#index |
| 309 | // as of the "HTML Living Standard - Last Updated 16 April 2018" version. |
| 310 | |
| 311 | // "command", "keygen" and "menuitem" have been removed from the spec, |
| 312 | // but are kept here for backwards compatibility. |
| 313 | var elements = []string{ |
| 314 | "a", |
| 315 | "abbr", |
| 316 | "address", |
| 317 | "area", |
| 318 | "article", |
| 319 | "aside", |
| 320 | "audio", |
| 321 | "b", |
| 322 | "base", |
| 323 | "bdi", |
| 324 | "bdo", |
| 325 | "blockquote", |
| 326 | "body", |
| 327 | "br", |
| 328 | "button", |
| 329 | "canvas", |
| 330 | "caption", |
| 331 | "cite", |
| 332 | "code", |
| 333 | "col", |
| 334 | "colgroup", |
| 335 | "command", |
| 336 | "data", |
| 337 | "datalist", |
| 338 | "dd", |
| 339 | "del", |
| 340 | "details", |
| 341 | "dfn", |
| 342 | "dialog", |
| 343 | "div", |
| 344 | "dl", |
| 345 | "dt", |
| 346 | "em", |
| 347 | "embed", |
| 348 | "fieldset", |
| 349 | "figcaption", |
| 350 | "figure", |
| 351 | "footer", |
| 352 | "form", |
| 353 | "h1", |
| 354 | "h2", |
| 355 | "h3", |
| 356 | "h4", |
| 357 | "h5", |
| 358 | "h6", |
| 359 | "head", |
| 360 | "header", |
| 361 | "hgroup", |
| 362 | "hr", |
| 363 | "html", |
| 364 | "i", |
| 365 | "iframe", |
| 366 | "img", |
| 367 | "input", |
| 368 | "ins", |
| 369 | "kbd", |
| 370 | "keygen", |
| 371 | "label", |
| 372 | "legend", |
| 373 | "li", |
| 374 | "link", |
| 375 | "main", |
| 376 | "map", |
| 377 | "mark", |
| 378 | "menu", |
| 379 | "menuitem", |
| 380 | "meta", |
| 381 | "meter", |
| 382 | "nav", |
| 383 | "noscript", |
| 384 | "object", |
| 385 | "ol", |
| 386 | "optgroup", |
| 387 | "option", |
| 388 | "output", |
| 389 | "p", |
| 390 | "param", |
| 391 | "picture", |
| 392 | "pre", |
| 393 | "progress", |
| 394 | "q", |
| 395 | "rp", |
| 396 | "rt", |
| 397 | "ruby", |
| 398 | "s", |
| 399 | "samp", |
| 400 | "script", |
| 401 | "section", |
| 402 | "select", |
| 403 | "slot", |
| 404 | "small", |
| 405 | "source", |
| 406 | "span", |
| 407 | "strong", |
| 408 | "style", |
| 409 | "sub", |
| 410 | "summary", |
| 411 | "sup", |
| 412 | "table", |
| 413 | "tbody", |
| 414 | "td", |
| 415 | "template", |
| 416 | "textarea", |
| 417 | "tfoot", |
| 418 | "th", |
| 419 | "thead", |
| 420 | "time", |
| 421 | "title", |
| 422 | "tr", |
| 423 | "track", |
| 424 | "u", |
| 425 | "ul", |
| 426 | "var", |
| 427 | "video", |
| 428 | "wbr", |
| 429 | } |
| 430 | |
| 431 | // https://html.spec.whatwg.org/multipage/indices.html#attributes-3 |
| 432 | // |
| 433 | // "challenge", "command", "contextmenu", "dropzone", "icon", "keytype", "mediagroup", |
| 434 | // "radiogroup", "spellcheck", "scoped", "seamless", "sortable" and "sorted" have been removed from the spec, |
| 435 | // but are kept here for backwards compatibility. |
| 436 | var attributes = []string{ |
| 437 | "abbr", |
| 438 | "accept", |
| 439 | "accept-charset", |
| 440 | "accesskey", |
| 441 | "action", |
| 442 | "allowfullscreen", |
| 443 | "allowpaymentrequest", |
| 444 | "allowusermedia", |
| 445 | "alt", |
| 446 | "as", |
| 447 | "async", |
| 448 | "autocomplete", |
| 449 | "autofocus", |
| 450 | "autoplay", |
| 451 | "challenge", |
| 452 | "charset", |
| 453 | "checked", |
| 454 | "cite", |
| 455 | "class", |
| 456 | "color", |
| 457 | "cols", |
| 458 | "colspan", |
| 459 | "command", |
| 460 | "content", |
| 461 | "contenteditable", |
| 462 | "contextmenu", |
| 463 | "controls", |
| 464 | "coords", |
| 465 | "crossorigin", |
| 466 | "data", |
| 467 | "datetime", |
| 468 | "default", |
| 469 | "defer", |
| 470 | "dir", |
| 471 | "dirname", |
| 472 | "disabled", |
| 473 | "download", |
| 474 | "draggable", |
| 475 | "dropzone", |
| 476 | "enctype", |
| 477 | "for", |
| 478 | "form", |
| 479 | "formaction", |
| 480 | "formenctype", |
| 481 | "formmethod", |
| 482 | "formnovalidate", |
| 483 | "formtarget", |
| 484 | "headers", |
| 485 | "height", |
| 486 | "hidden", |
| 487 | "high", |
| 488 | "href", |
| 489 | "hreflang", |
| 490 | "http-equiv", |
| 491 | "icon", |
| 492 | "id", |
| 493 | "inputmode", |
| 494 | "integrity", |
| 495 | "is", |
| 496 | "ismap", |
| 497 | "itemid", |
| 498 | "itemprop", |
| 499 | "itemref", |
| 500 | "itemscope", |
| 501 | "itemtype", |
| 502 | "keytype", |
| 503 | "kind", |
| 504 | "label", |
| 505 | "lang", |
| 506 | "list", |
| 507 | "loop", |
| 508 | "low", |
| 509 | "manifest", |
| 510 | "max", |
| 511 | "maxlength", |
| 512 | "media", |
| 513 | "mediagroup", |
| 514 | "method", |
| 515 | "min", |
| 516 | "minlength", |
| 517 | "multiple", |
| 518 | "muted", |
| 519 | "name", |
| 520 | "nomodule", |
| 521 | "nonce", |
| 522 | "novalidate", |
| 523 | "open", |
| 524 | "optimum", |
| 525 | "pattern", |
| 526 | "ping", |
| 527 | "placeholder", |
| 528 | "playsinline", |
| 529 | "poster", |
| 530 | "preload", |
| 531 | "radiogroup", |
| 532 | "readonly", |
| 533 | "referrerpolicy", |
| 534 | "rel", |
| 535 | "required", |
| 536 | "reversed", |
| 537 | "rows", |
| 538 | "rowspan", |
| 539 | "sandbox", |
| 540 | "spellcheck", |
| 541 | "scope", |
| 542 | "scoped", |
| 543 | "seamless", |
| 544 | "selected", |
| 545 | "shape", |
| 546 | "size", |
| 547 | "sizes", |
| 548 | "sortable", |
| 549 | "sorted", |
| 550 | "slot", |
| 551 | "span", |
| 552 | "spellcheck", |
| 553 | "src", |
| 554 | "srcdoc", |
| 555 | "srclang", |
| 556 | "srcset", |
| 557 | "start", |
| 558 | "step", |
| 559 | "style", |
| 560 | "tabindex", |
| 561 | "target", |
| 562 | "title", |
| 563 | "translate", |
| 564 | "type", |
| 565 | "typemustmatch", |
| 566 | "updateviacache", |
| 567 | "usemap", |
| 568 | "value", |
| 569 | "width", |
| 570 | "workertype", |
| 571 | "wrap", |
| 572 | } |
| 573 | |
| 574 | // "onautocomplete", "onautocompleteerror", "onmousewheel", |
| 575 | // "onshow" and "onsort" have been removed from the spec, |
| 576 | // but are kept here for backwards compatibility. |
| 577 | var eventHandlers = []string{ |
| 578 | "onabort", |
| 579 | "onautocomplete", |
| 580 | "onautocompleteerror", |
| 581 | "onauxclick", |
| 582 | "onafterprint", |
| 583 | "onbeforeprint", |
| 584 | "onbeforeunload", |
| 585 | "onblur", |
| 586 | "oncancel", |
| 587 | "oncanplay", |
| 588 | "oncanplaythrough", |
| 589 | "onchange", |
| 590 | "onclick", |
| 591 | "onclose", |
| 592 | "oncontextmenu", |
| 593 | "oncopy", |
| 594 | "oncuechange", |
| 595 | "oncut", |
| 596 | "ondblclick", |
| 597 | "ondrag", |
| 598 | "ondragend", |
| 599 | "ondragenter", |
| 600 | "ondragexit", |
| 601 | "ondragleave", |
| 602 | "ondragover", |
| 603 | "ondragstart", |
| 604 | "ondrop", |
| 605 | "ondurationchange", |
| 606 | "onemptied", |
| 607 | "onended", |
| 608 | "onerror", |
| 609 | "onfocus", |
| 610 | "onhashchange", |
| 611 | "oninput", |
| 612 | "oninvalid", |
| 613 | "onkeydown", |
| 614 | "onkeypress", |
| 615 | "onkeyup", |
| 616 | "onlanguagechange", |
| 617 | "onload", |
| 618 | "onloadeddata", |
| 619 | "onloadedmetadata", |
| 620 | "onloadend", |
| 621 | "onloadstart", |
| 622 | "onmessage", |
| 623 | "onmessageerror", |
| 624 | "onmousedown", |
| 625 | "onmouseenter", |
| 626 | "onmouseleave", |
| 627 | "onmousemove", |
| 628 | "onmouseout", |
| 629 | "onmouseover", |
| 630 | "onmouseup", |
| 631 | "onmousewheel", |
| 632 | "onwheel", |
| 633 | "onoffline", |
| 634 | "ononline", |
| 635 | "onpagehide", |
| 636 | "onpageshow", |
| 637 | "onpaste", |
| 638 | "onpause", |
| 639 | "onplay", |
| 640 | "onplaying", |
| 641 | "onpopstate", |
| 642 | "onprogress", |
| 643 | "onratechange", |
| 644 | "onreset", |
| 645 | "onresize", |
| 646 | "onrejectionhandled", |
| 647 | "onscroll", |
| 648 | "onsecuritypolicyviolation", |
| 649 | "onseeked", |
| 650 | "onseeking", |
| 651 | "onselect", |
| 652 | "onshow", |
| 653 | "onsort", |
| 654 | "onstalled", |
| 655 | "onstorage", |
| 656 | "onsubmit", |
| 657 | "onsuspend", |
| 658 | "ontimeupdate", |
| 659 | "ontoggle", |
| 660 | "onunhandledrejection", |
| 661 | "onunload", |
| 662 | "onvolumechange", |
| 663 | "onwaiting", |
| 664 | } |
| 665 | |
| 666 | // extra are ad-hoc values not covered by any of the lists above. |
| 667 | var extra = []string{ |
| 668 | "acronym", |
| 669 | "align", |
| 670 | "annotation", |
| 671 | "annotation-xml", |
| 672 | "applet", |
| 673 | "basefont", |
| 674 | "bgsound", |
| 675 | "big", |
| 676 | "blink", |
| 677 | "center", |
| 678 | "color", |
| 679 | "desc", |
| 680 | "face", |
| 681 | "font", |
| 682 | "foreignObject", // HTML is case-insensitive, but SVG-embedded-in-HTML is case-sensitive. |
| 683 | "foreignobject", |
| 684 | "frame", |
| 685 | "frameset", |
| 686 | "image", |
| 687 | "isindex", |
| 688 | "listing", |
| 689 | "malignmark", |
| 690 | "marquee", |
| 691 | "math", |
| 692 | "mglyph", |
| 693 | "mi", |
| 694 | "mn", |
| 695 | "mo", |
| 696 | "ms", |
| 697 | "mtext", |
| 698 | "nobr", |
| 699 | "noembed", |
| 700 | "noframes", |
| 701 | "plaintext", |
| 702 | "prompt", |
| 703 | "public", |
| 704 | "rb", |
| 705 | "rtc", |
| 706 | "spacer", |
| 707 | "strike", |
| 708 | "svg", |
| 709 | "system", |
| 710 | "tt", |
| 711 | "xmp", |
| 712 | } |