Matthias Andreas Benkard | 832a54e | 2019-01-29 09:27:38 +0100 | [diff] [blame] | 1 | /* |
| 2 | Package purell offers URL normalization as described on the wikipedia page: |
| 3 | http://en.wikipedia.org/wiki/URL_normalization |
| 4 | */ |
| 5 | package purell |
| 6 | |
| 7 | import ( |
| 8 | "bytes" |
| 9 | "fmt" |
| 10 | "net/url" |
| 11 | "regexp" |
| 12 | "sort" |
| 13 | "strconv" |
| 14 | "strings" |
| 15 | |
| 16 | "github.com/PuerkitoBio/urlesc" |
| 17 | "golang.org/x/net/idna" |
| 18 | "golang.org/x/text/unicode/norm" |
| 19 | "golang.org/x/text/width" |
| 20 | ) |
| 21 | |
// NormalizationFlags is a bit set selecting which normalizations are applied
// to a URL. Individual flags are combined with the bitwise OR operator.
type NormalizationFlags uint
| 25 | |
// Normalization flags. Each individual flag occupies a single bit (1 << iota),
// so the declaration order below determines the numeric values and must not be
// changed. The Flags* constants are convenience combinations of those bits.
const (
	// Safe normalizations
	FlagLowercaseScheme           NormalizationFlags = 1 << iota // HTTP://host -> http://host, applied by default in Go1.1
	FlagLowercaseHost                                            // http://HOST -> http://host
	FlagUppercaseEscapes                                         // http://host/t%ef -> http://host/t%EF
	FlagDecodeUnnecessaryEscapes                                 // http://host/t%41 -> http://host/tA
	FlagEncodeNecessaryEscapes                                   // http://host/!"#$ -> http://host/%21%22#$
	FlagRemoveDefaultPort                                        // http://host:80 -> http://host
	FlagRemoveEmptyQuerySeparator                                // http://host/path? -> http://host/path

	// Usually safe normalizations
	FlagRemoveTrailingSlash // http://host/path/ -> http://host/path
	FlagAddTrailingSlash    // http://host/path -> http://host/path/ (should choose only one of these add/remove trailing slash flags)
	FlagRemoveDotSegments   // http://host/path/./a/b/../c -> http://host/path/a/c

	// Unsafe normalizations
	FlagRemoveDirectoryIndex   // http://host/path/index.html -> http://host/path/
	FlagRemoveFragment         // http://host/path#fragment -> http://host/path
	FlagForceHTTP              // https://host -> http://host
	FlagRemoveDuplicateSlashes // http://host/path//a///b -> http://host/path/a/b
	FlagRemoveWWW              // http://www.host/ -> http://host/
	FlagAddWWW                 // http://host/ -> http://www.host/ (should choose only one of these add/remove WWW flags)
	FlagSortQuery              // http://host/path?c=3&b=2&a=1&b=1 -> http://host/path?a=1&b=1&b=2&c=3

	// Normalizations not in the wikipedia article, required to cover test cases
	// submitted by jehiah
	FlagDecodeDWORDHost           // http://1113982867 -> http://66.102.7.147
	FlagDecodeOctalHost           // http://0102.0146.07.0223 -> http://66.102.7.147
	FlagDecodeHexHost             // http://0x42660793 -> http://66.102.7.147
	FlagRemoveUnnecessaryHostDots // http://.host../path -> http://host/path
	FlagRemoveEmptyPortSeparator  // http://host:/path -> http://host/path

	// Convenience set of safe normalizations
	FlagsSafe NormalizationFlags = FlagLowercaseHost | FlagLowercaseScheme | FlagUppercaseEscapes | FlagDecodeUnnecessaryEscapes | FlagEncodeNecessaryEscapes | FlagRemoveDefaultPort | FlagRemoveEmptyQuerySeparator

	// For convenience sets, "greedy" uses the "remove trailing slash" and "remove www. prefix" flags,
	// while "non-greedy" uses the "add (or keep) the trailing slash" and "add www. prefix".

	// Convenience sets of usually safe normalizations (each includes FlagsSafe)
	FlagsUsuallySafeGreedy    NormalizationFlags = FlagsSafe | FlagRemoveTrailingSlash | FlagRemoveDotSegments
	FlagsUsuallySafeNonGreedy NormalizationFlags = FlagsSafe | FlagAddTrailingSlash | FlagRemoveDotSegments

	// Convenience sets of unsafe normalizations (each includes the matching FlagsUsuallySafe set)
	FlagsUnsafeGreedy    NormalizationFlags = FlagsUsuallySafeGreedy | FlagRemoveDirectoryIndex | FlagRemoveFragment | FlagForceHTTP | FlagRemoveDuplicateSlashes | FlagRemoveWWW | FlagSortQuery
	FlagsUnsafeNonGreedy NormalizationFlags = FlagsUsuallySafeNonGreedy | FlagRemoveDirectoryIndex | FlagRemoveFragment | FlagForceHTTP | FlagRemoveDuplicateSlashes | FlagAddWWW | FlagSortQuery

	// Convenience sets of all available flags (each includes the matching FlagsUnsafe set)
	FlagsAllGreedy    = FlagsUnsafeGreedy | FlagDecodeDWORDHost | FlagDecodeOctalHost | FlagDecodeHexHost | FlagRemoveUnnecessaryHostDots | FlagRemoveEmptyPortSeparator
	FlagsAllNonGreedy = FlagsUnsafeNonGreedy | FlagDecodeDWORDHost | FlagDecodeOctalHost | FlagDecodeHexHost | FlagRemoveUnnecessaryHostDots | FlagRemoveEmptyPortSeparator
)
| 76 | |
// Default ports for the http and https schemes, written with the leading
// colon so they can be compared directly against the port suffix of a host.
const (
	defaultHttpPort  = ":80"
	defaultHttpsPort = ":443"
)
| 81 | |
// Regular expressions used by the normalizations

// rxPort matches a ":digits" port, optionally followed by one "/", at the end
// of the string. Group 1 is the port including its colon.
var rxPort = regexp.MustCompile(`(:\d+)/?$`)

// rxDirIndex matches a final path segment named "default" or "index" with an
// extension of 1 to 4 word characters. Group 1 is the preceding "/" (or the
// empty start of the string), group 2 is the file name.
var rxDirIndex = regexp.MustCompile(`(^|/)((?:default|index)\.\w{1,4})$`)

// rxDupSlashes matches any run of two or more consecutive slashes.
var rxDupSlashes = regexp.MustCompile(`/{2,}`)

// rxDWORDHost matches a host made only of decimal digits. Group 1 is the
// number; group 2 is the optional suffix of trailing dots and/or a ":port"
// (whose digits may be empty).
var rxDWORDHost = regexp.MustCompile(`^(\d+)((?:\.+)?(?:\:\d*)?)$`)

// rxOctalHost matches four dot-separated components that each start with "0".
// Groups 1-4 are the components; group 5 is the optional trailing dots/port
// suffix. Note that \d also accepts the digits 8 and 9.
var rxOctalHost = regexp.MustCompile(`^(0\d*)\.(0\d*)\.(0\d*)\.(0\d*)((?:\.+)?(?:\:\d*)?)$`)

// rxHexHost matches a host of the form "0x" followed by hexadecimal digits.
// Group 1 is the hex digits; group 2 is the optional trailing dots/port suffix.
var rxHexHost = regexp.MustCompile(`^0x([0-9A-Fa-f]+)((?:\.+)?(?:\:\d*)?)$`)

// rxHostDots splits a host into its name (group 1, matched lazily) and an
// optional ":digits" port (group 2).
var rxHostDots = regexp.MustCompile(`^(.+?)(:\d+)?$`)

// rxEmptyPort matches one or more colons at the end of the string.
var rxEmptyPort = regexp.MustCompile(`:+$`)
| 91 | |
// Map of flags to implementation function.
// FlagDecodeUnnecessaryEscapes has no action, since it is done automatically
// by parsing the string as an URL. Same for FlagUppercaseEscapes and FlagRemoveEmptyQuerySeparator.
// FlagEncodeNecessaryEscapes likewise has no entry in the list or the map below.

// flagsOrder lists the flags that have an implementation function, in the
// order in which those functions are applied. Since maps have undefined
// traversing order, this slice of ordered keys drives the iteration.
var flagsOrder = []NormalizationFlags{
	FlagLowercaseScheme,
	FlagLowercaseHost,
	FlagRemoveDefaultPort,
	FlagRemoveDirectoryIndex,
	FlagRemoveDotSegments,
	FlagRemoveFragment,
	FlagForceHTTP, // Must be after remove default port (because https=443/http=80)
	FlagRemoveDuplicateSlashes,
	FlagRemoveWWW,
	FlagAddWWW,
	FlagSortQuery,
	FlagDecodeDWORDHost,
	FlagDecodeOctalHost,
	FlagDecodeHexHost,
	FlagRemoveUnnecessaryHostDots,
	FlagRemoveEmptyPortSeparator,
	FlagRemoveTrailingSlash, // These two (add/remove trailing slash) must be last
	FlagAddTrailingSlash,
}
| 117 | |
// flags maps each flag listed in flagsOrder to the function implementing it.
// Every function mutates the *url.URL it receives in place. Order is
// unimportant here; flagsOrder defines the order of application.
var flags = map[NormalizationFlags]func(*url.URL){
	FlagLowercaseScheme:           lowercaseScheme,
	FlagLowercaseHost:             lowercaseHost,
	FlagRemoveDefaultPort:         removeDefaultPort,
	FlagRemoveDirectoryIndex:      removeDirectoryIndex,
	FlagRemoveDotSegments:         removeDotSegments,
	FlagRemoveFragment:            removeFragment,
	FlagForceHTTP:                 forceHTTP,
	FlagRemoveDuplicateSlashes:    removeDuplicateSlashes,
	FlagRemoveWWW:                 removeWWW,
	FlagAddWWW:                    addWWW,
	FlagSortQuery:                 sortQuery,
	FlagDecodeDWORDHost:           decodeDWORDHost,
	FlagDecodeOctalHost:           decodeOctalHost,
	FlagDecodeHexHost:             decodeHexHost,
	FlagRemoveUnnecessaryHostDots: removeUnncessaryHostDots,
	FlagRemoveEmptyPortSeparator:  removeEmptyPortSeparator,
	FlagRemoveTrailingSlash:       removeTrailingSlash,
	FlagAddTrailingSlash:          addTrailingSlash,
}
| 139 | |
| 140 | // MustNormalizeURLString returns the normalized string, and panics if an error occurs. |
| 141 | // It takes an URL string as input, as well as the normalization flags. |
| 142 | func MustNormalizeURLString(u string, f NormalizationFlags) string { |
| 143 | result, e := NormalizeURLString(u, f) |
| 144 | if e != nil { |
| 145 | panic(e) |
| 146 | } |
| 147 | return result |
| 148 | } |
| 149 | |
| 150 | // NormalizeURLString returns the normalized string, or an error if it can't be parsed into an URL object. |
| 151 | // It takes an URL string as input, as well as the normalization flags. |
| 152 | func NormalizeURLString(u string, f NormalizationFlags) (string, error) { |
| 153 | parsed, err := url.Parse(u) |
| 154 | if err != nil { |
| 155 | return "", err |
| 156 | } |
| 157 | |
| 158 | if f&FlagLowercaseHost == FlagLowercaseHost { |
| 159 | parsed.Host = strings.ToLower(parsed.Host) |
| 160 | } |
| 161 | |
| 162 | // The idna package doesn't fully conform to RFC 5895 |
| 163 | // (https://tools.ietf.org/html/rfc5895), so we do it here. |
| 164 | // Taken from Go 1.8 cycle source, courtesy of bradfitz. |
| 165 | // TODO: Remove when (if?) idna package conforms to RFC 5895. |
| 166 | parsed.Host = width.Fold.String(parsed.Host) |
| 167 | parsed.Host = norm.NFC.String(parsed.Host) |
| 168 | if parsed.Host, err = idna.ToASCII(parsed.Host); err != nil { |
| 169 | return "", err |
| 170 | } |
| 171 | |
| 172 | return NormalizeURL(parsed, f), nil |
| 173 | } |
| 174 | |
| 175 | // NormalizeURL returns the normalized string. |
| 176 | // It takes a parsed URL object as input, as well as the normalization flags. |
| 177 | func NormalizeURL(u *url.URL, f NormalizationFlags) string { |
| 178 | for _, k := range flagsOrder { |
| 179 | if f&k == k { |
| 180 | flags[k](u) |
| 181 | } |
| 182 | } |
| 183 | return urlesc.Escape(u) |
| 184 | } |
| 185 | |
// lowercaseScheme converts a non-empty scheme of u to lower case.
func lowercaseScheme(u *url.URL) {
	if u.Scheme == "" {
		return
	}
	u.Scheme = strings.ToLower(u.Scheme)
}
| 191 | |
// lowercaseHost converts a non-empty host of u to lower case.
func lowercaseHost(u *url.URL) {
	if u.Host == "" {
		return
	}
	u.Host = strings.ToLower(u.Host)
}
| 197 | |
| 198 | func removeDefaultPort(u *url.URL) { |
| 199 | if len(u.Host) > 0 { |
| 200 | scheme := strings.ToLower(u.Scheme) |
| 201 | u.Host = rxPort.ReplaceAllStringFunc(u.Host, func(val string) string { |
| 202 | if (scheme == "http" && val == defaultHttpPort) || (scheme == "https" && val == defaultHttpsPort) { |
| 203 | return "" |
| 204 | } |
| 205 | return val |
| 206 | }) |
| 207 | } |
| 208 | } |
| 209 | |
// removeTrailingSlash drops a single trailing "/" from the path of u. When
// the path is empty, the slash is dropped from the end of the host instead.
func removeTrailingSlash(u *url.URL) {
	switch {
	case u.Path != "":
		u.Path = strings.TrimSuffix(u.Path, "/")
	case u.Host != "":
		u.Host = strings.TrimSuffix(u.Host, "/")
	}
}
| 221 | |
// addTrailingSlash makes sure the path of u ends with "/". When the path is
// empty, the slash is appended to a non-empty host instead.
func addTrailingSlash(u *url.URL) {
	switch {
	case u.Path != "":
		if !strings.HasSuffix(u.Path, "/") {
			u.Path += "/"
		}
	case u.Host != "":
		if !strings.HasSuffix(u.Host, "/") {
			u.Host += "/"
		}
	}
}
| 233 | |
// removeDotSegments removes "." and ".." segments from the path of u.
//
// A ".." removes the preceding segment, if any. For a path that starts with
// "/", the leading (root) segment is never removed, so "/../a" becomes "/a"
// rather than the relative path "a" (see RFC 3986, section 5.2.4). When the
// last segment was "." or "..", the resulting path ends with a slash.
func removeDotSegments(u *url.URL) {
	if len(u.Path) == 0 {
		return
	}

	// For a rooted path the first element produced by Split is the empty
	// string standing for the root; ".." must not pop below it.
	floor := 0
	if strings.HasPrefix(u.Path, "/") {
		floor = 1
	}

	var dotFree []string
	var lastIsDot bool
	for _, s := range strings.Split(u.Path, "/") {
		switch s {
		case "..":
			if len(dotFree) > floor {
				dotFree = dotFree[:len(dotFree)-1]
			}
		case ".":
			// Dropped.
		default:
			dotFree = append(dotFree, s)
		}
		lastIsDot = s == "." || s == ".."
	}

	u.Path = strings.Join(dotFree, "/")
	// Special case if host does not end with / and new path does not begin with /
	if u.Host != "" && !strings.HasSuffix(u.Host, "/") && !strings.HasPrefix(u.Path, "/") {
		u.Path = "/" + u.Path
	}
	// Special case if the last segment was a dot, make sure the path ends with a slash
	if lastIsDot && !strings.HasSuffix(u.Path, "/") {
		u.Path += "/"
	}
}
| 261 | |
| 262 | func removeDirectoryIndex(u *url.URL) { |
| 263 | if len(u.Path) > 0 { |
| 264 | u.Path = rxDirIndex.ReplaceAllString(u.Path, "$1") |
| 265 | } |
| 266 | } |
| 267 | |
// removeFragment clears the fragment of u unconditionally.
func removeFragment(u *url.URL) {
	u.Fragment = ""
}
| 271 | |
// forceHTTP replaces an "https" scheme (compared after lowercasing) with
// "http". Any other scheme is left as is.
func forceHTTP(u *url.URL) {
	if scheme := strings.ToLower(u.Scheme); scheme != "https" {
		return
	}
	u.Scheme = "http"
}
| 277 | |
| 278 | func removeDuplicateSlashes(u *url.URL) { |
| 279 | if len(u.Path) > 0 { |
| 280 | u.Path = rxDupSlashes.ReplaceAllString(u.Path, "/") |
| 281 | } |
| 282 | } |
| 283 | |
// removeWWW strips a leading "www." (matched case-insensitively) from the
// host of u.
func removeWWW(u *url.URL) {
	if u.Host == "" {
		return
	}
	if lowered := strings.ToLower(u.Host); strings.HasPrefix(lowered, "www.") {
		u.Host = u.Host[4:]
	}
}
| 289 | |
// addWWW prefixes a non-empty host of u with "www." unless it already starts
// with that prefix (matched case-insensitively).
func addWWW(u *url.URL) {
	if u.Host == "" {
		return
	}
	if strings.HasPrefix(strings.ToLower(u.Host), "www.") {
		return
	}
	u.Host = "www." + u.Host
}
| 295 | |
| 296 | func sortQuery(u *url.URL) { |
| 297 | q := u.Query() |
| 298 | |
| 299 | if len(q) > 0 { |
| 300 | arKeys := make([]string, len(q)) |
| 301 | i := 0 |
| 302 | for k, _ := range q { |
| 303 | arKeys[i] = k |
| 304 | i++ |
| 305 | } |
| 306 | sort.Strings(arKeys) |
| 307 | buf := new(bytes.Buffer) |
| 308 | for _, k := range arKeys { |
| 309 | sort.Strings(q[k]) |
| 310 | for _, v := range q[k] { |
| 311 | if buf.Len() > 0 { |
| 312 | buf.WriteRune('&') |
| 313 | } |
| 314 | buf.WriteString(fmt.Sprintf("%s=%s", k, urlesc.QueryEscape(v))) |
| 315 | } |
| 316 | } |
| 317 | |
| 318 | // Rebuild the raw query string |
| 319 | u.RawQuery = buf.String() |
| 320 | } |
| 321 | } |
| 322 | |
| 323 | func decodeDWORDHost(u *url.URL) { |
| 324 | if len(u.Host) > 0 { |
| 325 | if matches := rxDWORDHost.FindStringSubmatch(u.Host); len(matches) > 2 { |
| 326 | var parts [4]int64 |
| 327 | |
| 328 | dword, _ := strconv.ParseInt(matches[1], 10, 0) |
| 329 | for i, shift := range []uint{24, 16, 8, 0} { |
| 330 | parts[i] = dword >> shift & 0xFF |
| 331 | } |
| 332 | u.Host = fmt.Sprintf("%d.%d.%d.%d%s", parts[0], parts[1], parts[2], parts[3], matches[2]) |
| 333 | } |
| 334 | } |
| 335 | } |
| 336 | |
| 337 | func decodeOctalHost(u *url.URL) { |
| 338 | if len(u.Host) > 0 { |
| 339 | if matches := rxOctalHost.FindStringSubmatch(u.Host); len(matches) > 5 { |
| 340 | var parts [4]int64 |
| 341 | |
| 342 | for i := 1; i <= 4; i++ { |
| 343 | parts[i-1], _ = strconv.ParseInt(matches[i], 8, 0) |
| 344 | } |
| 345 | u.Host = fmt.Sprintf("%d.%d.%d.%d%s", parts[0], parts[1], parts[2], parts[3], matches[5]) |
| 346 | } |
| 347 | } |
| 348 | } |
| 349 | |
| 350 | func decodeHexHost(u *url.URL) { |
| 351 | if len(u.Host) > 0 { |
| 352 | if matches := rxHexHost.FindStringSubmatch(u.Host); len(matches) > 2 { |
| 353 | // Conversion is safe because of regex validation |
| 354 | parsed, _ := strconv.ParseInt(matches[1], 16, 0) |
| 355 | // Set host as DWORD (base 10) encoded host |
| 356 | u.Host = fmt.Sprintf("%d%s", parsed, matches[2]) |
| 357 | // The rest is the same as decoding a DWORD host |
| 358 | decodeDWORDHost(u) |
| 359 | } |
| 360 | } |
| 361 | } |
| 362 | |
| 363 | func removeUnncessaryHostDots(u *url.URL) { |
| 364 | if len(u.Host) > 0 { |
| 365 | if matches := rxHostDots.FindStringSubmatch(u.Host); len(matches) > 1 { |
| 366 | // Trim the leading and trailing dots |
| 367 | u.Host = strings.Trim(matches[1], ".") |
| 368 | if len(matches) > 2 { |
| 369 | u.Host += matches[2] |
| 370 | } |
| 371 | } |
| 372 | } |
| 373 | } |
| 374 | |
| 375 | func removeEmptyPortSeparator(u *url.URL) { |
| 376 | if len(u.Host) > 0 { |
| 377 | u.Host = rxEmptyPort.ReplaceAllString(u.Host, "") |
| 378 | } |
| 379 | } |