blob: 71933696f592a5501e1a3572507a857a830cd1fb [file] [log] [blame]
Matthias Andreas Benkard832a54e2019-01-29 09:27:38 +01001// Copyright 2014 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
// Package runes provides transforms for UTF-8 encoded text.
6package runes // import "golang.org/x/text/runes"
7
8import (
9 "unicode"
10 "unicode/utf8"
11
12 "golang.org/x/text/transform"
13)
14
// A Set is a collection of runes, described solely by its membership test.
// The functions In, NotIn and Predicate in this package construct Sets; any
// other type with a matching Contains method satisfies the interface as well.
type Set interface {
	// Contains returns true if r is contained in the set.
	Contains(r rune) bool
}
20
// setFunc adapts a plain membership function so that it satisfies the Set
// interface.
type setFunc func(rune) bool

// Contains reports whether r is in the set by invoking the wrapped function.
func (fn setFunc) Contains(r rune) bool {
	inSet := fn(r)
	return inSet
}
26
// Note: using funcs here instead of wrapping types results in cleaner
// documentation and a smaller API.
29
30// In creates a Set with a Contains method that returns true for all runes in
31// the given RangeTable.
32func In(rt *unicode.RangeTable) Set {
33 return setFunc(func(r rune) bool { return unicode.Is(rt, r) })
34}
35
36// In creates a Set with a Contains method that returns true for all runes not
37// in the given RangeTable.
38func NotIn(rt *unicode.RangeTable) Set {
39 return setFunc(func(r rune) bool { return !unicode.Is(rt, r) })
40}
41
42// Predicate creates a Set with a Contains method that returns f(r).
43func Predicate(f func(rune) bool) Set {
44 return setFunc(f)
45}
46
47// Transformer implements the transform.Transformer interface.
48type Transformer struct {
49 t transform.SpanningTransformer
50}
51
52func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
53 return t.t.Transform(dst, src, atEOF)
54}
55
56func (t Transformer) Span(b []byte, atEOF bool) (n int, err error) {
57 return t.t.Span(b, atEOF)
58}
59
60func (t Transformer) Reset() { t.t.Reset() }
61
62// Bytes returns a new byte slice with the result of converting b using t. It
63// calls Reset on t. It returns nil if any error was found. This can only happen
64// if an error-producing Transformer is passed to If.
65func (t Transformer) Bytes(b []byte) []byte {
66 b, _, err := transform.Bytes(t, b)
67 if err != nil {
68 return nil
69 }
70 return b
71}
72
73// String returns a string with the result of converting s using t. It calls
74// Reset on t. It returns the empty string if any error was found. This can only
75// happen if an error-producing Transformer is passed to If.
76func (t Transformer) String(s string) string {
77 s, _, err := transform.String(t, s)
78 if err != nil {
79 return ""
80 }
81 return s
82}
83
84// TODO:
85// - Copy: copying strings and bytes in whole-rune units.
86// - Validation (maybe)
87// - Well-formed-ness (maybe)
88
// runeErrorString is utf8.RuneError converted to a string, i.e. the UTF-8
// encoding of the replacement character. The transformers in this file write
// its bytes to the destination buffer in place of ill-formed input.
const runeErrorString = string(utf8.RuneError)
90
91// Remove returns a Transformer that removes runes r for which s.Contains(r).
92// Illegal input bytes are replaced by RuneError before being passed to f.
93func Remove(s Set) Transformer {
94 if f, ok := s.(setFunc); ok {
95 // This little trick cuts the running time of BenchmarkRemove for sets
96 // created by Predicate roughly in half.
97 // TODO: special-case RangeTables as well.
98 return Transformer{remove(f)}
99 }
100 return Transformer{remove(s.Contains)}
101}
102
103// TODO: remove transform.RemoveFunc.
104
// remove is a rune predicate used as a transformer: runes for which it
// returns true are dropped from the output.
type remove func(r rune) bool

// Reset is a no-op.
func (remove) Reset() {
	// Nothing to do: remove has no fields to clear.
}
108
109// Span implements transform.Spanner.
110func (t remove) Span(src []byte, atEOF bool) (n int, err error) {
111 for r, size := rune(0), 0; n < len(src); {
112 if r = rune(src[n]); r < utf8.RuneSelf {
113 size = 1
114 } else if r, size = utf8.DecodeRune(src[n:]); size == 1 {
115 // Invalid rune.
116 if !atEOF && !utf8.FullRune(src[n:]) {
117 err = transform.ErrShortSrc
118 } else {
119 err = transform.ErrEndOfSpan
120 }
121 break
122 }
123 if t(r) {
124 err = transform.ErrEndOfSpan
125 break
126 }
127 n += size
128 }
129 return
130}
131
132// Transform implements transform.Transformer.
133func (t remove) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
134 for r, size := rune(0), 0; nSrc < len(src); {
135 if r = rune(src[nSrc]); r < utf8.RuneSelf {
136 size = 1
137 } else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
138 // Invalid rune.
139 if !atEOF && !utf8.FullRune(src[nSrc:]) {
140 err = transform.ErrShortSrc
141 break
142 }
143 // We replace illegal bytes with RuneError. Not doing so might
144 // otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
145 // The resulting byte sequence may subsequently contain runes
146 // for which t(r) is true that were passed unnoticed.
147 if !t(utf8.RuneError) {
148 if nDst+3 > len(dst) {
149 err = transform.ErrShortDst
150 break
151 }
152 dst[nDst+0] = runeErrorString[0]
153 dst[nDst+1] = runeErrorString[1]
154 dst[nDst+2] = runeErrorString[2]
155 nDst += 3
156 }
157 nSrc++
158 continue
159 }
160 if t(r) {
161 nSrc += size
162 continue
163 }
164 if nDst+size > len(dst) {
165 err = transform.ErrShortDst
166 break
167 }
168 for i := 0; i < size; i++ {
169 dst[nDst] = src[nSrc]
170 nDst++
171 nSrc++
172 }
173 }
174 return
175}
176
177// Map returns a Transformer that maps the runes in the input using the given
178// mapping. Illegal bytes in the input are converted to utf8.RuneError before
179// being passed to the mapping func.
180func Map(mapping func(rune) rune) Transformer {
181 return Transformer{mapper(mapping)}
182}
183
// mapper is a rune-to-rune mapping function used as a transformer: every
// input rune is replaced by the rune the function returns for it.
type mapper func(rune) rune

// Reset is a no-op.
func (mapper) Reset() {
	// Nothing to do: mapper has no fields to clear.
}
187
188// Span implements transform.Spanner.
189func (t mapper) Span(src []byte, atEOF bool) (n int, err error) {
190 for r, size := rune(0), 0; n < len(src); n += size {
191 if r = rune(src[n]); r < utf8.RuneSelf {
192 size = 1
193 } else if r, size = utf8.DecodeRune(src[n:]); size == 1 {
194 // Invalid rune.
195 if !atEOF && !utf8.FullRune(src[n:]) {
196 err = transform.ErrShortSrc
197 } else {
198 err = transform.ErrEndOfSpan
199 }
200 break
201 }
202 if t(r) != r {
203 err = transform.ErrEndOfSpan
204 break
205 }
206 }
207 return n, err
208}
209
210// Transform implements transform.Transformer.
211func (t mapper) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
212 var replacement rune
213 var b [utf8.UTFMax]byte
214
215 for r, size := rune(0), 0; nSrc < len(src); {
216 if r = rune(src[nSrc]); r < utf8.RuneSelf {
217 if replacement = t(r); replacement < utf8.RuneSelf {
218 if nDst == len(dst) {
219 err = transform.ErrShortDst
220 break
221 }
222 dst[nDst] = byte(replacement)
223 nDst++
224 nSrc++
225 continue
226 }
227 size = 1
228 } else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
229 // Invalid rune.
230 if !atEOF && !utf8.FullRune(src[nSrc:]) {
231 err = transform.ErrShortSrc
232 break
233 }
234
235 if replacement = t(utf8.RuneError); replacement == utf8.RuneError {
236 if nDst+3 > len(dst) {
237 err = transform.ErrShortDst
238 break
239 }
240 dst[nDst+0] = runeErrorString[0]
241 dst[nDst+1] = runeErrorString[1]
242 dst[nDst+2] = runeErrorString[2]
243 nDst += 3
244 nSrc++
245 continue
246 }
247 } else if replacement = t(r); replacement == r {
248 if nDst+size > len(dst) {
249 err = transform.ErrShortDst
250 break
251 }
252 for i := 0; i < size; i++ {
253 dst[nDst] = src[nSrc]
254 nDst++
255 nSrc++
256 }
257 continue
258 }
259
260 n := utf8.EncodeRune(b[:], replacement)
261
262 if nDst+n > len(dst) {
263 err = transform.ErrShortDst
264 break
265 }
266 for i := 0; i < n; i++ {
267 dst[nDst] = b[i]
268 nDst++
269 }
270 nSrc += size
271 }
272 return
273}
274
275// ReplaceIllFormed returns a transformer that replaces all input bytes that are
276// not part of a well-formed UTF-8 code sequence with utf8.RuneError.
277func ReplaceIllFormed() Transformer {
278 return Transformer{&replaceIllFormed{}}
279}
280
281type replaceIllFormed struct{ transform.NopResetter }
282
283func (t replaceIllFormed) Span(src []byte, atEOF bool) (n int, err error) {
284 for n < len(src) {
285 // ASCII fast path.
286 if src[n] < utf8.RuneSelf {
287 n++
288 continue
289 }
290
291 r, size := utf8.DecodeRune(src[n:])
292
293 // Look for a valid non-ASCII rune.
294 if r != utf8.RuneError || size != 1 {
295 n += size
296 continue
297 }
298
299 // Look for short source data.
300 if !atEOF && !utf8.FullRune(src[n:]) {
301 err = transform.ErrShortSrc
302 break
303 }
304
305 // We have an invalid rune.
306 err = transform.ErrEndOfSpan
307 break
308 }
309 return n, err
310}
311
312func (t replaceIllFormed) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
313 for nSrc < len(src) {
314 // ASCII fast path.
315 if r := src[nSrc]; r < utf8.RuneSelf {
316 if nDst == len(dst) {
317 err = transform.ErrShortDst
318 break
319 }
320 dst[nDst] = r
321 nDst++
322 nSrc++
323 continue
324 }
325
326 // Look for a valid non-ASCII rune.
327 if _, size := utf8.DecodeRune(src[nSrc:]); size != 1 {
328 if size != copy(dst[nDst:], src[nSrc:nSrc+size]) {
329 err = transform.ErrShortDst
330 break
331 }
332 nDst += size
333 nSrc += size
334 continue
335 }
336
337 // Look for short source data.
338 if !atEOF && !utf8.FullRune(src[nSrc:]) {
339 err = transform.ErrShortSrc
340 break
341 }
342
343 // We have an invalid rune.
344 if nDst+3 > len(dst) {
345 err = transform.ErrShortDst
346 break
347 }
348 dst[nDst+0] = runeErrorString[0]
349 dst[nDst+1] = runeErrorString[1]
350 dst[nDst+2] = runeErrorString[2]
351 nDst += 3
352 nSrc++
353 }
354 return nDst, nSrc, err
355}