Matthias Andreas Benkard | 832a54e | 2019-01-29 09:27:38 +0100 | [diff] [blame] | 1 | // Copyright 2013 The Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | // Package encoding defines an interface for character encodings, such as Shift |
| 6 | // JIS and Windows 1252, that can convert to and from UTF-8. |
| 7 | // |
| 8 | // Encoding implementations are provided in other packages, such as |
| 9 | // golang.org/x/text/encoding/charmap and |
| 10 | // golang.org/x/text/encoding/japanese. |
| 11 | package encoding // import "golang.org/x/text/encoding" |
| 12 | |
| 13 | import ( |
| 14 | "errors" |
| 15 | "io" |
| 16 | "strconv" |
| 17 | "unicode/utf8" |
| 18 | |
| 19 | "golang.org/x/text/encoding/internal/identifier" |
| 20 | "golang.org/x/text/transform" |
| 21 | ) |
| 22 | |
| 23 | // TODO: |
| 24 | // - There seems to be some inconsistency in when decoders return errors |
| 25 | // and when not. Also documentation seems to suggest they shouldn't return |
| 26 | // errors at all (except for UTF-16). |
| 27 | // - Encoders seem to rely on or at least benefit from the input being in NFC |
| 28 | // normal form. Perhaps add an example how users could prepare their output. |
| 29 | |
// Encoding is a character set encoding that can be transformed to and from
// UTF-8.
//
// The two methods hand out the transformers for each direction: a Decoder
// converts bytes in this encoding to UTF-8, an Encoder converts UTF-8 to bytes
// in this encoding.
type Encoding interface {
	// NewDecoder returns a Decoder.
	NewDecoder() *Decoder

	// NewEncoder returns an Encoder.
	NewEncoder() *Encoder
}
| 39 | |
// A Decoder converts bytes to UTF-8. It implements transform.Transformer.
//
// Transforming source bytes that are not of that encoding will not result in an
// error per se. Each byte that cannot be transcoded will be represented in the
// output by the UTF-8 encoding of '\uFFFD', the replacement rune.
type Decoder struct {
	// Transformer is the embedded transformer that performs the conversion;
	// the Bytes, String and Reader helpers pass the Decoder itself to the
	// transform package, which invokes it.
	transform.Transformer

	// This forces external creators of Decoders to use names in struct
	// initializers, allowing for future extensibility without having to break
	// code.
	_ struct{}
}
| 53 | |
| 54 | // Bytes converts the given encoded bytes to UTF-8. It returns the converted |
| 55 | // bytes or nil, err if any error occurred. |
| 56 | func (d *Decoder) Bytes(b []byte) ([]byte, error) { |
| 57 | b, _, err := transform.Bytes(d, b) |
| 58 | if err != nil { |
| 59 | return nil, err |
| 60 | } |
| 61 | return b, nil |
| 62 | } |
| 63 | |
| 64 | // String converts the given encoded string to UTF-8. It returns the converted |
| 65 | // string or "", err if any error occurred. |
| 66 | func (d *Decoder) String(s string) (string, error) { |
| 67 | s, _, err := transform.String(d, s) |
| 68 | if err != nil { |
| 69 | return "", err |
| 70 | } |
| 71 | return s, nil |
| 72 | } |
| 73 | |
| 74 | // Reader wraps another Reader to decode its bytes. |
| 75 | // |
| 76 | // The Decoder may not be used for any other operation as long as the returned |
| 77 | // Reader is in use. |
| 78 | func (d *Decoder) Reader(r io.Reader) io.Reader { |
| 79 | return transform.NewReader(r, d) |
| 80 | } |
| 81 | |
// An Encoder converts bytes from UTF-8. It implements transform.Transformer.
//
// Each rune that cannot be transcoded will result in an error. In this case,
// the transform will consume all source bytes up to, but not including, the
// offending rune. Source bytes that are not valid UTF-8 will be replaced by
// `\uFFFD`. To return early with an error instead, use transform.Chain to
// preprocess the data with a UTF8Validator.
type Encoder struct {
	// Transformer is the embedded transformer that performs the conversion;
	// the Bytes, String and Writer helpers pass the Encoder itself to the
	// transform package, which invokes it.
	transform.Transformer

	// This forces external creators of Encoders to use names in struct
	// initializers, allowing for future extensibility without having to break
	// code.
	_ struct{}
}
| 97 | |
| 98 | // Bytes converts bytes from UTF-8. It returns the converted bytes or nil, err if |
| 99 | // any error occurred. |
| 100 | func (e *Encoder) Bytes(b []byte) ([]byte, error) { |
| 101 | b, _, err := transform.Bytes(e, b) |
| 102 | if err != nil { |
| 103 | return nil, err |
| 104 | } |
| 105 | return b, nil |
| 106 | } |
| 107 | |
| 108 | // String converts a string from UTF-8. It returns the converted string or |
| 109 | // "", err if any error occurred. |
| 110 | func (e *Encoder) String(s string) (string, error) { |
| 111 | s, _, err := transform.String(e, s) |
| 112 | if err != nil { |
| 113 | return "", err |
| 114 | } |
| 115 | return s, nil |
| 116 | } |
| 117 | |
| 118 | // Writer wraps another Writer to encode its UTF-8 output. |
| 119 | // |
| 120 | // The Encoder may not be used for any other operation as long as the returned |
| 121 | // Writer is in use. |
| 122 | func (e *Encoder) Writer(w io.Writer) io.Writer { |
| 123 | return transform.NewWriter(w, e) |
| 124 | } |
| 125 | |
// ASCIISub is the ASCII substitute character (SUB, 0x1A), as recommended by
// http://unicode.org/reports/tr36/#Text_Comparison
const ASCIISub = '\x1a'
| 129 | |
| 130 | // Nop is the nop encoding. Its transformed bytes are the same as the source |
| 131 | // bytes; it does not replace invalid UTF-8 sequences. |
| 132 | var Nop Encoding = nop{} |
| 133 | |
| 134 | type nop struct{} |
| 135 | |
| 136 | func (nop) NewDecoder() *Decoder { |
| 137 | return &Decoder{Transformer: transform.Nop} |
| 138 | } |
| 139 | func (nop) NewEncoder() *Encoder { |
| 140 | return &Encoder{Transformer: transform.Nop} |
| 141 | } |
| 142 | |
| 143 | // Replacement is the replacement encoding. Decoding from the replacement |
| 144 | // encoding yields a single '\uFFFD' replacement rune. Encoding from UTF-8 to |
| 145 | // the replacement encoding yields the same as the source bytes except that |
| 146 | // invalid UTF-8 is converted to '\uFFFD'. |
| 147 | // |
| 148 | // It is defined at http://encoding.spec.whatwg.org/#replacement |
| 149 | var Replacement Encoding = replacement{} |
| 150 | |
| 151 | type replacement struct{} |
| 152 | |
| 153 | func (replacement) NewDecoder() *Decoder { |
| 154 | return &Decoder{Transformer: replacementDecoder{}} |
| 155 | } |
| 156 | |
| 157 | func (replacement) NewEncoder() *Encoder { |
| 158 | return &Encoder{Transformer: replacementEncoder{}} |
| 159 | } |
| 160 | |
| 161 | func (replacement) ID() (mib identifier.MIB, other string) { |
| 162 | return identifier.Replacement, "" |
| 163 | } |
| 164 | |
| 165 | type replacementDecoder struct{ transform.NopResetter } |
| 166 | |
| 167 | func (replacementDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
| 168 | if len(dst) < 3 { |
| 169 | return 0, 0, transform.ErrShortDst |
| 170 | } |
| 171 | if atEOF { |
| 172 | const fffd = "\ufffd" |
| 173 | dst[0] = fffd[0] |
| 174 | dst[1] = fffd[1] |
| 175 | dst[2] = fffd[2] |
| 176 | nDst = 3 |
| 177 | } |
| 178 | return nDst, len(src), nil |
| 179 | } |
| 180 | |
| 181 | type replacementEncoder struct{ transform.NopResetter } |
| 182 | |
| 183 | func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
| 184 | r, size := rune(0), 0 |
| 185 | |
| 186 | for ; nSrc < len(src); nSrc += size { |
| 187 | r = rune(src[nSrc]) |
| 188 | |
| 189 | // Decode a 1-byte rune. |
| 190 | if r < utf8.RuneSelf { |
| 191 | size = 1 |
| 192 | |
| 193 | } else { |
| 194 | // Decode a multi-byte rune. |
| 195 | r, size = utf8.DecodeRune(src[nSrc:]) |
| 196 | if size == 1 { |
| 197 | // All valid runes of size 1 (those below utf8.RuneSelf) were |
| 198 | // handled above. We have invalid UTF-8 or we haven't seen the |
| 199 | // full character yet. |
| 200 | if !atEOF && !utf8.FullRune(src[nSrc:]) { |
| 201 | err = transform.ErrShortSrc |
| 202 | break |
| 203 | } |
| 204 | r = '\ufffd' |
| 205 | } |
| 206 | } |
| 207 | |
| 208 | if nDst+utf8.RuneLen(r) > len(dst) { |
| 209 | err = transform.ErrShortDst |
| 210 | break |
| 211 | } |
| 212 | nDst += utf8.EncodeRune(dst[nDst:], r) |
| 213 | } |
| 214 | return nDst, nSrc, err |
| 215 | } |
| 216 | |
| 217 | // HTMLEscapeUnsupported wraps encoders to replace source runes outside the |
| 218 | // repertoire of the destination encoding with HTML escape sequences. |
| 219 | // |
| 220 | // This wrapper exists to comply to URL and HTML forms requiring a |
| 221 | // non-terminating legacy encoder. The produced sequences may lead to data |
| 222 | // loss as they are indistinguishable from legitimate input. To avoid this |
| 223 | // issue, use UTF-8 encodings whenever possible. |
| 224 | func HTMLEscapeUnsupported(e *Encoder) *Encoder { |
| 225 | return &Encoder{Transformer: &errorHandler{e, errorToHTML}} |
| 226 | } |
| 227 | |
| 228 | // ReplaceUnsupported wraps encoders to replace source runes outside the |
| 229 | // repertoire of the destination encoding with an encoding-specific |
| 230 | // replacement. |
| 231 | // |
| 232 | // This wrapper is only provided for backwards compatibility and legacy |
| 233 | // handling. Its use is strongly discouraged. Use UTF-8 whenever possible. |
| 234 | func ReplaceUnsupported(e *Encoder) *Encoder { |
| 235 | return &Encoder{Transformer: &errorHandler{e, errorToReplacement}} |
| 236 | } |
| 237 | |
// errorHandler wraps an Encoder and substitutes output for runes the wrapped
// encoder rejects with a repertoireError instead of failing on them.
type errorHandler struct {
	*Encoder
	// handler writes the substitute for rune r at the start of dst. It returns
	// the number of bytes written and ok == false if dst is too small.
	handler func(dst []byte, r rune, err repertoireError) (n int, ok bool)
}

// repertoireError is satisfied by errors reporting a rune that is outside the
// repertoire of the destination encoding.
//
// TODO: consider making this error public in some form.
type repertoireError interface {
	// Replacement returns the byte that errorToReplacement writes in place of
	// the unsupported rune.
	Replacement() byte
}
| 247 | |
| 248 | func (h errorHandler) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
| 249 | nDst, nSrc, err = h.Transformer.Transform(dst, src, atEOF) |
| 250 | for err != nil { |
| 251 | rerr, ok := err.(repertoireError) |
| 252 | if !ok { |
| 253 | return nDst, nSrc, err |
| 254 | } |
| 255 | r, sz := utf8.DecodeRune(src[nSrc:]) |
| 256 | n, ok := h.handler(dst[nDst:], r, rerr) |
| 257 | if !ok { |
| 258 | return nDst, nSrc, transform.ErrShortDst |
| 259 | } |
| 260 | err = nil |
| 261 | nDst += n |
| 262 | if nSrc += sz; nSrc < len(src) { |
| 263 | var dn, sn int |
| 264 | dn, sn, err = h.Transformer.Transform(dst[nDst:], src[nSrc:], atEOF) |
| 265 | nDst += dn |
| 266 | nSrc += sn |
| 267 | } |
| 268 | } |
| 269 | return nDst, nSrc, err |
| 270 | } |
| 271 | |
| 272 | func errorToHTML(dst []byte, r rune, err repertoireError) (n int, ok bool) { |
| 273 | buf := [8]byte{} |
| 274 | b := strconv.AppendUint(buf[:0], uint64(r), 10) |
| 275 | if n = len(b) + len("&#;"); n >= len(dst) { |
| 276 | return 0, false |
| 277 | } |
| 278 | dst[0] = '&' |
| 279 | dst[1] = '#' |
| 280 | dst[copy(dst[2:], b)+2] = ';' |
| 281 | return n, true |
| 282 | } |
| 283 | |
| 284 | func errorToReplacement(dst []byte, r rune, err repertoireError) (n int, ok bool) { |
| 285 | if len(dst) == 0 { |
| 286 | return 0, false |
| 287 | } |
| 288 | dst[0] = err.Replacement() |
| 289 | return 1, true |
| 290 | } |
| 291 | |
// ErrInvalidUTF8 means that a transformer encountered invalid UTF-8. It is the
// error UTF8Validator reports for input that is not valid UTF-8.
var ErrInvalidUTF8 = errors.New("encoding: invalid UTF-8")
| 294 | |
| 295 | // UTF8Validator is a transformer that returns ErrInvalidUTF8 on the first |
| 296 | // input byte that is not valid UTF-8. |
| 297 | var UTF8Validator transform.Transformer = utf8Validator{} |
| 298 | |
| 299 | type utf8Validator struct{ transform.NopResetter } |
| 300 | |
| 301 | func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
| 302 | n := len(src) |
| 303 | if n > len(dst) { |
| 304 | n = len(dst) |
| 305 | } |
| 306 | for i := 0; i < n; { |
| 307 | if c := src[i]; c < utf8.RuneSelf { |
| 308 | dst[i] = c |
| 309 | i++ |
| 310 | continue |
| 311 | } |
| 312 | _, size := utf8.DecodeRune(src[i:]) |
| 313 | if size == 1 { |
| 314 | // All valid runes of size 1 (those below utf8.RuneSelf) were |
| 315 | // handled above. We have invalid UTF-8 or we haven't seen the |
| 316 | // full character yet. |
| 317 | err = ErrInvalidUTF8 |
| 318 | if !atEOF && !utf8.FullRune(src[i:]) { |
| 319 | err = transform.ErrShortSrc |
| 320 | } |
| 321 | return i, i, err |
| 322 | } |
| 323 | if i+size > len(dst) { |
| 324 | return i, i, transform.ErrShortDst |
| 325 | } |
| 326 | for ; size > 0; size-- { |
| 327 | dst[i] = src[i] |
| 328 | i++ |
| 329 | } |
| 330 | } |
| 331 | if len(src) > len(dst) { |
| 332 | err = transform.ErrShortDst |
| 333 | } |
| 334 | return n, n, err |
| 335 | } |