001/*
002 * Copyright (C) 2008 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
005 * in compliance with the License. You may obtain a copy of the License at
006 *
007 * http://www.apache.org/licenses/LICENSE-2.0
008 *
009 * Unless required by applicable law or agreed to in writing, software distributed under the License
010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
011 * or implied. See the License for the specific language governing permissions and limitations under
012 * the License.
013 */
014
015package com.google.common.base;
016
017import static com.google.common.base.Preconditions.checkArgument;
018import static com.google.common.base.Preconditions.checkNotNull;
019import static com.google.common.base.Preconditions.checkPositionIndex;
020
021import com.google.common.annotations.GwtCompatible;
022import com.google.common.annotations.GwtIncompatible;
023import com.google.common.annotations.VisibleForTesting;
024import java.util.Arrays;
025import java.util.BitSet;
026
027/**
028 * Determines a true or false value for any Java {@code char} value, just as {@link Predicate} does
029 * for any {@link Object}. Also offers basic text processing methods based on this function.
030 * Implementations are strongly encouraged to be side-effect-free and immutable.
031 *
032 * <p>Throughout the documentation of this class, the phrase "matching character" is used to mean
033 * "any {@code char} value {@code c} for which {@code this.matches(c)} returns {@code true}".
034 *
035 * <p><b>Warning:</b> This class deals only with {@code char} values, that is, <a
036 * href="http://www.unicode.org/glossary/#BMP_character">BMP characters</a>. It does not understand
037 * <a href="http://www.unicode.org/glossary/#supplementary_code_point">supplementary Unicode code
038 * points</a> in the range {@code 0x10000} to {@code 0x10FFFF} which includes the majority of
039 * assigned characters, including important CJK characters and emoji.
040 *
041 * <p>Supplementary characters are <a
042 * href="https://docs.oracle.com/javase/8/docs/api/java/lang/Character.html#supplementary">encoded
043 * into a {@code String} using surrogate pairs</a>, and a {@code CharMatcher} treats these just as
044 * two separate characters. {@link #countIn} counts each supplementary character as 2 {@code char}s.
045 *
046 * <p>For up-to-date Unicode character properties (digit, letter, etc.) and support for
047 * supplementary code points, use ICU4J UCharacter and UnicodeSet (freeze() after building). For
048 * basic text processing based on UnicodeSet use the ICU4J UnicodeSetSpanner.
049 *
050 * <p>Example usages:
051 *
052 * <pre>
053 *   String trimmed = {@link #whitespace() whitespace()}.{@link #trimFrom trimFrom}(userInput);
054 *   if ({@link #ascii() ascii()}.{@link #matchesAllOf matchesAllOf}(s)) { ... }</pre>
055 *
056 * <p>See the Guava User Guide article on <a
057 * href="https://github.com/google/guava/wiki/StringsExplained#charmatcher">{@code CharMatcher}
058 * </a>.
059 *
060 * @author Kevin Bourrillion
061 * @since 1.0
062 */
063@GwtCompatible(emulated = true)
064public abstract class CharMatcher implements Predicate<Character> {
065  /*
066   *           N777777777NO
067   *         N7777777777777N
068   *        M777777777777777N
069   *        $N877777777D77777M
070   *       N M77777777ONND777M
071   *       MN777777777NN  D777
072   *     N7ZN777777777NN ~M7778
073   *    N777777777777MMNN88777N
074   *    N777777777777MNZZZ7777O
075   *    DZN7777O77777777777777
076   *     N7OONND7777777D77777N
077   *      8$M++++?N???$77777$
078   *       M7++++N+M77777777N
079   *        N77O777777777777$                              M
080   *          DNNM$$$$777777N                              D
081   *         N$N:=N$777N7777M                             NZ
082   *        77Z::::N777777777                          ODZZZ
083   *       77N::::::N77777777M                         NNZZZ$
084   *     $777:::::::77777777MN                        ZM8ZZZZZ
085   *     777M::::::Z7777777Z77                        N++ZZZZNN
086   *    7777M:::::M7777777$777M                       $++IZZZZM
087   *   M777$:::::N777777$M7777M                       +++++ZZZDN
088   *     NN$::::::7777$$M777777N                      N+++ZZZZNZ
089   *       N::::::N:7$O:77777777                      N++++ZZZZN
090   *       M::::::::::::N77777777+                   +?+++++ZZZM
091   *       8::::::::::::D77777777M                    O+++++ZZ
092   *        ::::::::::::M777777777N                      O+?D
093   *        M:::::::::::M77777777778                     77=
094   *        D=::::::::::N7777777777N                    777
095   *       INN===::::::=77777777777N                  I777N
096   *      ?777N========N7777777777787M               N7777
097   *      77777$D======N77777777777N777N?         N777777
098   *     I77777$$$N7===M$$77777777$77777777$MMZ77777777N
099   *      $$$$$$$$$$$NIZN$$$$$$$$$M$$7777777777777777ON
100   *       M$$$$$$$$M    M$$$$$$$$N=N$$$$7777777$$$ND
101   *      O77Z$$$$$$$     M$$$$$$$$MNI==$DNNNNM=~N
102   *   7 :N MNN$$$$M$      $$$777$8      8D8I
103   *     NMM.:7O           777777778
104   *                       7777777MN
105   *                       M NO .7:
106   *                       M   :   M
107   *                            8
108   */
109
110  // Constant matcher factory methods
111
112  /**
113   * Matches any character.
114   *
115   * @since 19.0 (since 1.0 as constant {@code ANY})
116   */
117  public static CharMatcher any() {
118    return Any.INSTANCE;
119  }
120
121  /**
122   * Matches no characters.
123   *
124   * @since 19.0 (since 1.0 as constant {@code NONE})
125   */
126  public static CharMatcher none() {
127    return None.INSTANCE;
128  }
129
130  /**
131   * Determines whether a character is whitespace according to the latest Unicode standard, as
132   * illustrated <a
133   * href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>.
134   * This is not the same definition used by other Java APIs. (See a <a
135   * href="https://goo.gl/Y6SLWx">comparison of several definitions of "whitespace"</a>.)
136   *
137   * <p>All Unicode White_Space characters are on the BMP and thus supported by this API.
138   *
139   * <p><b>Note:</b> as the Unicode definition evolves, we will modify this matcher to keep it up to
140   * date.
141   *
142   * @since 19.0 (since 1.0 as constant {@code WHITESPACE})
143   */
144  public static CharMatcher whitespace() {
145    return Whitespace.INSTANCE;
146  }
147
148  /**
149   * Determines whether a character is a breaking whitespace (that is, a whitespace which can be
150   * interpreted as a break between words for formatting purposes). See {@link #whitespace()} for a
151   * discussion of that term.
152   *
153   * @since 19.0 (since 2.0 as constant {@code BREAKING_WHITESPACE})
154   */
155  public static CharMatcher breakingWhitespace() {
156    return BreakingWhitespace.INSTANCE;
157  }
158
159  /**
160   * Determines whether a character is ASCII, meaning that its code point is less than 128.
161   *
162   * @since 19.0 (since 1.0 as constant {@code ASCII})
163   */
164  public static CharMatcher ascii() {
165    return Ascii.INSTANCE;
166  }
167
168  /**
169   * Determines whether a character is a BMP digit according to <a
170   * href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">Unicode</a>. If
171   * you only care to match ASCII digits, you can use {@code inRange('0', '9')}.
172   *
173   * @deprecated Many digits are supplementary characters; see the class documentation.
174   * @since 19.0 (since 1.0 as constant {@code DIGIT})
175   */
176  @Deprecated
177  public static CharMatcher digit() {
178    return Digit.INSTANCE;
179  }
180
181  /**
182   * Determines whether a character is a BMP digit according to {@linkplain Character#isDigit(char)
183   * Java's definition}. If you only care to match ASCII digits, you can use {@code inRange('0',
184   * '9')}.
185   *
186   * @deprecated Many digits are supplementary characters; see the class documentation.
187   * @since 19.0 (since 1.0 as constant {@code JAVA_DIGIT})
188   */
189  @Deprecated
190  public static CharMatcher javaDigit() {
191    return JavaDigit.INSTANCE;
192  }
193
194  /**
195   * Determines whether a character is a BMP letter according to {@linkplain
196   * Character#isLetter(char) Java's definition}. If you only care to match letters of the Latin
197   * alphabet, you can use {@code inRange('a', 'z').or(inRange('A', 'Z'))}.
198   *
199   * @deprecated Most letters are supplementary characters; see the class documentation.
200   * @since 19.0 (since 1.0 as constant {@code JAVA_LETTER})
201   */
202  @Deprecated
203  public static CharMatcher javaLetter() {
204    return JavaLetter.INSTANCE;
205  }
206
207  /**
208   * Determines whether a character is a BMP letter or digit according to {@linkplain
209   * Character#isLetterOrDigit(char) Java's definition}.
210   *
211   * @deprecated Most letters and digits are supplementary characters; see the class documentation.
212   * @since 19.0 (since 1.0 as constant {@code JAVA_LETTER_OR_DIGIT}).
213   */
214  @Deprecated
215  public static CharMatcher javaLetterOrDigit() {
216    return JavaLetterOrDigit.INSTANCE;
217  }
218
219  /**
220   * Determines whether a BMP character is upper case according to {@linkplain
221   * Character#isUpperCase(char) Java's definition}.
222   *
223   * @deprecated Some uppercase characters are supplementary characters; see the class
224   *     documentation.
225   * @since 19.0 (since 1.0 as constant {@code JAVA_UPPER_CASE})
226   */
227  @Deprecated
228  public static CharMatcher javaUpperCase() {
229    return JavaUpperCase.INSTANCE;
230  }
231
232  /**
233   * Determines whether a BMP character is lower case according to {@linkplain
234   * Character#isLowerCase(char) Java's definition}.
235   *
236   * @deprecated Some lowercase characters are supplementary characters; see the class
237   *     documentation.
238   * @since 19.0 (since 1.0 as constant {@code JAVA_LOWER_CASE})
239   */
240  @Deprecated
241  public static CharMatcher javaLowerCase() {
242    return JavaLowerCase.INSTANCE;
243  }
244
245  /**
246   * Determines whether a character is an ISO control character as specified by {@link
247   * Character#isISOControl(char)}.
248   *
249   * <p>All ISO control codes are on the BMP and thus supported by this API.
250   *
251   * @since 19.0 (since 1.0 as constant {@code JAVA_ISO_CONTROL})
252   */
253  public static CharMatcher javaIsoControl() {
254    return JavaIsoControl.INSTANCE;
255  }
256
257  /**
258   * Determines whether a character is invisible; that is, if its Unicode category is any of
259   * SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT, SURROGATE, and
260   * PRIVATE_USE according to ICU4J.
261   *
262   * <p>See also the Unicode Default_Ignorable_Code_Point property (available via ICU).
263   *
264   * @deprecated Most invisible characters are supplementary characters; see the class
265   *     documentation.
266   * @since 19.0 (since 1.0 as constant {@code INVISIBLE})
267   */
268  @Deprecated
269  public static CharMatcher invisible() {
270    return Invisible.INSTANCE;
271  }
272
273  /**
274   * Determines whether a character is single-width (not double-width). When in doubt, this matcher
275   * errs on the side of returning {@code false} (that is, it tends to assume a character is
276   * double-width).
277   *
278   * <p><b>Note:</b> as the reference file evolves, we will modify this matcher to keep it up to
279   * date.
280   *
281   * <p>See also <a href="http://www.unicode.org/reports/tr11/">UAX #11 East Asian Width</a>.
282   *
283   * @deprecated Many such characters are supplementary characters; see the class documentation.
284   * @since 19.0 (since 1.0 as constant {@code SINGLE_WIDTH})
285   */
286  @Deprecated
287  public static CharMatcher singleWidth() {
288    return SingleWidth.INSTANCE;
289  }
290
291  // Legacy constants
292
293  /**
294   * Determines whether a character is whitespace according to the latest Unicode
295   * standard, as illustrated
   * <a
   * href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bwhitespace%7D">here</a>.
298   * This is not the same definition used by other Java APIs. (See a
299   * <a href="https://goo.gl/Y6SLWx">comparison of several definitions of
300   * "whitespace"</a>.)
301   *
302   * <p><b>Note:</b> as the Unicode definition evolves, we will modify this constant
303   * to keep it up to date.
304   *
305   * @deprecated Use {@link #whitespace()} instead. This constant is scheduled to be
306   *     removed in June 2018.
307   */
308  @com.google.common.annotations.Beta
309  @Deprecated
310  public static final CharMatcher WHITESPACE = whitespace();
311
312  /**
313   * Determines whether a character is a breaking whitespace (that is, a whitespace
314   * which can be interpreted as a break between words for formatting purposes). See
315   * {@link #whitespace} for a discussion of that term.
316   *
317   * @since 2.0
318   * @deprecated Use {@link #breakingWhitespace()} instead. This constant is scheduled
319   *     to be removed in June 2018.
320   */
321  @com.google.common.annotations.Beta
322  @Deprecated
323  public static final CharMatcher BREAKING_WHITESPACE = breakingWhitespace();
324
325  /**
326   * Determines whether a character is ASCII, meaning that its code point is less than
327   * 128.
328   *
329   * @deprecated Use {@link #ascii()} instead. This constant is scheduled to be
330   *     removed in June 2018.
331   */
332  @com.google.common.annotations.Beta
333  @Deprecated
334  public static final CharMatcher ASCII = ascii();
335
336  /**
337   * Determines whether a character is a digit according to
338   * <a href="http://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5Cp%7Bdigit%7D">
339   * Unicode</a>. If you only care to match ASCII digits, you can use
340   * {@code inRange('0', '9')}.
341   *
342   * @deprecated Many digits are supplementary characters; see the class
343   *     documentation. If you need to use this, use {@link #digit()} instead. This
   *     constant is scheduled to be removed in June 2018.
345   */
346  @com.google.common.annotations.Beta
347  @Deprecated
348  public static final CharMatcher DIGIT = digit();
349
350  /**
351   * Determines whether a character is a digit according to
352   * {@linkplain Character#isDigit(char) Java's definition}. If you only care to match
353   * ASCII digits, you can use {@code inRange('0', '9')}.
354   *
355   * @deprecated Many digits are supplementary characters; see the class
356   *     documentation. If you need to use this, use {@link #javaDigit()} instead.
357   *     This constant is scheduled to be removed in June 2018.
358   */
359  @com.google.common.annotations.Beta
360  @Deprecated
361  public static final CharMatcher JAVA_DIGIT = javaDigit();
362
363  /**
364   * Determines whether a character is a letter according to
365   * {@linkplain Character#isLetter(char) Java's definition}. If you only care to
366   * match letters of the Latin alphabet, you can use
367   * {@code inRange('a', 'z').or(inRange('A', 'Z'))}.
368   *
369   * @deprecated Most letters are supplementary characters; see the class
370   *     documentation. If you need to use this, use {@link #javaLetter()} instead.
371   *     This constant is scheduled to be removed in June 2018.
372   */
373  @com.google.common.annotations.Beta
374  @Deprecated
375  public static final CharMatcher JAVA_LETTER = javaLetter();
376
377  /**
378   * Determines whether a character is a letter or digit according to
379   * {@linkplain Character#isLetterOrDigit(char) Java's definition}.
380   *
381   * @deprecated Most letters and digits are supplementary characters; see the class
382   *     documentation. If you need to use this, use {@link #javaLetterOrDigit()}
383   *     instead. This constant is scheduled to be removed in June 2018.
384   */
385  @com.google.common.annotations.Beta
386  @Deprecated
387  public static final CharMatcher JAVA_LETTER_OR_DIGIT = javaLetterOrDigit();
388
389  /**
390   * Determines whether a character is upper case according to
391   * {@linkplain Character#isUpperCase(char) Java's definition}.
392   *
393   * @deprecated Some uppercase letters are supplementary characters; see the class
394   *     documentation. If you need to use this, use {@link #javaUpperCase()} instead.
395   *     This constant is scheduled to be removed in June 2018.
396   */
397  @com.google.common.annotations.Beta
398  @Deprecated
399  public static final CharMatcher JAVA_UPPER_CASE = javaUpperCase();
400
401  /**
402   * Determines whether a character is lower case according to
403   * {@linkplain Character#isLowerCase(char) Java's definition}.
404   *
405   * @deprecated Some lowercase letters are supplementary characters; see the class
406   *     documentation. If you need to use this, use {@link #javaLowerCase()} instead.
407   *     This constant is scheduled to be removed in June 2018.
408   */
409  @com.google.common.annotations.Beta
410  @Deprecated
411  public static final CharMatcher JAVA_LOWER_CASE = javaLowerCase();
412
413  /**
414   * Determines whether a character is an ISO control character as specified by
415   * {@link Character#isISOControl(char)}.
416   *
417   * @deprecated Use {@link #javaIsoControl()} instead. This constant is scheduled to
418   *     be removed in June 2018.
419   */
420  @com.google.common.annotations.Beta
421  @Deprecated
422  public static final CharMatcher JAVA_ISO_CONTROL = javaIsoControl();
423
424  /**
425   * Determines whether a character is invisible; that is, if its Unicode category is
426   * any of SPACE_SEPARATOR, LINE_SEPARATOR, PARAGRAPH_SEPARATOR, CONTROL, FORMAT,
427   * SURROGATE, and PRIVATE_USE according to ICU4J.
428   *
429   * @deprecated Most invisible characters are supplementary characters; see the class
430   *     documentation. If you need to use this, use {@link #invisible()} instead.
431   *     This constant is scheduled to be removed in June 2018.
432   */
433  @com.google.common.annotations.Beta
434  @Deprecated
435  public static final CharMatcher INVISIBLE = invisible();
436
437  /**
438   * Determines whether a character is single-width (not double-width). When in doubt,
439   * this matcher errs on the side of returning {@code false} (that is, it tends to
440   * assume a character is double-width).
441   *
442   * <p><b>Note:</b> as the reference file evolves, we will modify this constant to
443   * keep it up to date.
444   *
445   * @deprecated Many such characters are supplementary characters; see the class
446   *     documentation. If you need to use this, use {@link #singleWidth()} instead.
447   *     This constant is scheduled to be removed in June 2018.
448   */
449  @com.google.common.annotations.Beta
450  @Deprecated
451  public static final CharMatcher SINGLE_WIDTH = singleWidth();
452
453  /**
454   * Matches any character.
455   *
456   * @deprecated Use {@link #any()} instead. This constant is scheduled to be
457   *     removed in June 2018.
458   */
459  @com.google.common.annotations.Beta
460  @Deprecated
461  public static final CharMatcher ANY = any();
462
463  /**
464   * Matches no characters.
465   *
466   * @deprecated Use {@link #none()} instead. This constant is scheduled to be
467   *     removed in June 2018.
468   */
469  @com.google.common.annotations.Beta
470  @Deprecated
471  public static final CharMatcher NONE = none();
472
473  // Static factories
474
475  /** Returns a {@code char} matcher that matches only one specified BMP character. */
476  public static CharMatcher is(final char match) {
477    return new Is(match);
478  }
479
480  /**
481   * Returns a {@code char} matcher that matches any character except the BMP character specified.
482   *
483   * <p>To negate another {@code CharMatcher}, use {@link #negate()}.
484   */
485  public static CharMatcher isNot(final char match) {
486    return new IsNot(match);
487  }
488
489  /**
490   * Returns a {@code char} matcher that matches any BMP character present in the given character
491   * sequence. Returns a bogus matcher if the sequence contains supplementary characters.
492   */
493  public static CharMatcher anyOf(final CharSequence sequence) {
494    switch (sequence.length()) {
495      case 0:
496        return none();
497      case 1:
498        return is(sequence.charAt(0));
499      case 2:
500        return isEither(sequence.charAt(0), sequence.charAt(1));
501      default:
502        // TODO(lowasser): is it potentially worth just going ahead and building a precomputed
503        // matcher?
504        return new AnyOf(sequence);
505    }
506  }
507
508  /**
509   * Returns a {@code char} matcher that matches any BMP character not present in the given
510   * character sequence. Returns a bogus matcher if the sequence contains supplementary characters.
511   */
512  public static CharMatcher noneOf(CharSequence sequence) {
513    return anyOf(sequence).negate();
514  }
515
516  /**
517   * Returns a {@code char} matcher that matches any character in a given BMP range (both endpoints
518   * are inclusive). For example, to match any lowercase letter of the English alphabet, use {@code
519   * CharMatcher.inRange('a', 'z')}.
520   *
521   * @throws IllegalArgumentException if {@code endInclusive < startInclusive}
522   */
523  public static CharMatcher inRange(final char startInclusive, final char endInclusive) {
524    return new InRange(startInclusive, endInclusive);
525  }
526
527  /**
528   * Returns a matcher with identical behavior to the given {@link Character}-based predicate, but
529   * which operates on primitive {@code char} instances instead.
530   */
531  public static CharMatcher forPredicate(final Predicate<? super Character> predicate) {
532    return predicate instanceof CharMatcher ? (CharMatcher) predicate : new ForPredicate(predicate);
533  }
534
535  // Constructors
536
537  /**
538   * Constructor for use by subclasses. When subclassing, you may want to override {@code
539   * toString()} to provide a useful description.
540   */
541  protected CharMatcher() {}
542
543  // Abstract methods
544
545  /** Determines a true or false value for the given character. */
546  public abstract boolean matches(char c);
547
548  // Non-static factories
549
550  /** Returns a matcher that matches any character not matched by this matcher. */
551  // @Override under Java 8 but not under Java 7
552  @Override
553  public CharMatcher negate() {
554    return new Negated(this);
555  }
556
557  /**
558   * Returns a matcher that matches any character matched by both this matcher and {@code other}.
559   */
560  public CharMatcher and(CharMatcher other) {
561    return new And(this, other);
562  }
563
564  /**
565   * Returns a matcher that matches any character matched by either this matcher or {@code other}.
566   */
567  public CharMatcher or(CharMatcher other) {
568    return new Or(this, other);
569  }
570
571  /**
572   * Returns a {@code char} matcher functionally equivalent to this one, but which may be faster to
573   * query than the original; your mileage may vary. Precomputation takes time and is likely to be
574   * worthwhile only if the precomputed matcher is queried many thousands of times.
575   *
576   * <p>This method has no effect (returns {@code this}) when called in GWT: it's unclear whether a
577   * precomputed matcher is faster, but it certainly consumes more memory, which doesn't seem like a
578   * worthwhile tradeoff in a browser.
579   */
580  public CharMatcher precomputed() {
581    return Platform.precomputeCharMatcher(this);
582  }
583
584  private static final int DISTINCT_CHARS = Character.MAX_VALUE - Character.MIN_VALUE + 1;
585
586  /**
587   * This is the actual implementation of {@link #precomputed}, but we bounce calls through a method
588   * on {@link Platform} so that we can have different behavior in GWT.
589   *
590   * <p>This implementation tries to be smart in a number of ways. It recognizes cases where the
591   * negation is cheaper to precompute than the matcher itself; it tries to build small hash tables
592   * for matchers that only match a few characters, and so on. In the worst-case scenario, it
593   * constructs an eight-kilobyte bit array and queries that. In many situations this produces a
594   * matcher which is faster to query than the original.
595   */
596  @GwtIncompatible // SmallCharMatcher
597  CharMatcher precomputedInternal() {
598    final BitSet table = new BitSet();
599    setBits(table);
600    int totalCharacters = table.cardinality();
601    if (totalCharacters * 2 <= DISTINCT_CHARS) {
602      return precomputedPositive(totalCharacters, table, toString());
603    } else {
604      // TODO(lowasser): is it worth it to worry about the last character of large matchers?
605      table.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1);
606      int negatedCharacters = DISTINCT_CHARS - totalCharacters;
607      String suffix = ".negate()";
608      final String description = toString();
609      String negatedDescription =
610          description.endsWith(suffix)
611              ? description.substring(0, description.length() - suffix.length())
612              : description + suffix;
613      return new NegatedFastMatcher(
614          precomputedPositive(negatedCharacters, table, negatedDescription)) {
615        @Override
616        public String toString() {
617          return description;
618        }
619      };
620    }
621  }
622
623  /**
624   * Helper method for {@link #precomputedInternal} that doesn't test if the negation is cheaper.
625   */
626  @GwtIncompatible // SmallCharMatcher
627  private static CharMatcher precomputedPositive(
628      int totalCharacters, BitSet table, String description) {
629    switch (totalCharacters) {
630      case 0:
631        return none();
632      case 1:
633        return is((char) table.nextSetBit(0));
634      case 2:
635        char c1 = (char) table.nextSetBit(0);
636        char c2 = (char) table.nextSetBit(c1 + 1);
637        return isEither(c1, c2);
638      default:
639        return isSmall(totalCharacters, table.length())
640            ? SmallCharMatcher.from(table, description)
641            : new BitSetMatcher(table, description);
642    }
643  }
644
645  @GwtIncompatible // SmallCharMatcher
646  private static boolean isSmall(int totalCharacters, int tableLength) {
647    return totalCharacters <= SmallCharMatcher.MAX_SIZE
648        && tableLength > (totalCharacters * 4 * Character.SIZE);
649    // err on the side of BitSetMatcher
650  }
651
652  /** Sets bits in {@code table} matched by this matcher. */
653  @GwtIncompatible // used only from other GwtIncompatible code
654  void setBits(BitSet table) {
655    for (int c = Character.MAX_VALUE; c >= Character.MIN_VALUE; c--) {
656      if (matches((char) c)) {
657        table.set(c);
658      }
659    }
660  }
661
662  // Text processing routines
663
664  /**
665   * Returns {@code true} if a character sequence contains at least one matching BMP character.
666   * Equivalent to {@code !matchesNoneOf(sequence)}.
667   *
668   * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
669   * character, until this returns {@code true} or the end is reached.
670   *
671   * @param sequence the character sequence to examine, possibly empty
672   * @return {@code true} if this matcher matches at least one character in the sequence
673   * @since 8.0
674   */
675  public boolean matchesAnyOf(CharSequence sequence) {
676    return !matchesNoneOf(sequence);
677  }
678
679  /**
680   * Returns {@code true} if a character sequence contains only matching BMP characters.
681   *
682   * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
683   * character, until this returns {@code false} or the end is reached.
684   *
685   * @param sequence the character sequence to examine, possibly empty
686   * @return {@code true} if this matcher matches every character in the sequence, including when
687   *     the sequence is empty
688   */
689  public boolean matchesAllOf(CharSequence sequence) {
690    for (int i = sequence.length() - 1; i >= 0; i--) {
691      if (!matches(sequence.charAt(i))) {
692        return false;
693      }
694    }
695    return true;
696  }
697
698  /**
699   * Returns {@code true} if a character sequence contains no matching BMP characters. Equivalent to
700   * {@code !matchesAnyOf(sequence)}.
701   *
702   * <p>The default implementation iterates over the sequence, invoking {@link #matches} for each
703   * character, until this returns {@code true} or the end is reached.
704   *
705   * @param sequence the character sequence to examine, possibly empty
706   * @return {@code true} if this matcher matches no characters in the sequence, including when the
707   *     sequence is empty
708   */
709  public boolean matchesNoneOf(CharSequence sequence) {
710    return indexIn(sequence) == -1;
711  }
712
713  /**
714   * Returns the index of the first matching BMP character in a character sequence, or {@code -1} if
715   * no matching character is present.
716   *
717   * <p>The default implementation iterates over the sequence in forward order calling {@link
718   * #matches} for each character.
719   *
720   * @param sequence the character sequence to examine from the beginning
721   * @return an index, or {@code -1} if no character matches
722   */
723  public int indexIn(CharSequence sequence) {
724    return indexIn(sequence, 0);
725  }
726
727  /**
728   * Returns the index of the first matching BMP character in a character sequence, starting from a
729   * given position, or {@code -1} if no character matches after that position.
730   *
731   * <p>The default implementation iterates over the sequence in forward order, beginning at {@code
732   * start}, calling {@link #matches} for each character.
733   *
734   * @param sequence the character sequence to examine
735   * @param start the first index to examine; must be nonnegative and no greater than {@code
736   *     sequence.length()}
737   * @return the index of the first matching character, guaranteed to be no less than {@code start},
738   *     or {@code -1} if no character matches
739   * @throws IndexOutOfBoundsException if start is negative or greater than {@code
740   *     sequence.length()}
741   */
742  public int indexIn(CharSequence sequence, int start) {
743    int length = sequence.length();
744    checkPositionIndex(start, length);
745    for (int i = start; i < length; i++) {
746      if (matches(sequence.charAt(i))) {
747        return i;
748      }
749    }
750    return -1;
751  }
752
753  /**
754   * Returns the index of the last matching BMP character in a character sequence, or {@code -1} if
755   * no matching character is present.
756   *
757   * <p>The default implementation iterates over the sequence in reverse order calling {@link
758   * #matches} for each character.
759   *
760   * @param sequence the character sequence to examine from the end
761   * @return an index, or {@code -1} if no character matches
762   */
763  public int lastIndexIn(CharSequence sequence) {
764    for (int i = sequence.length() - 1; i >= 0; i--) {
765      if (matches(sequence.charAt(i))) {
766        return i;
767      }
768    }
769    return -1;
770  }
771
772  /**
773   * Returns the number of matching {@code char}s found in a character sequence.
774   *
775   * <p>Counts 2 per supplementary character, such as for {@link #whitespace}().{@link #negate}().
776   */
777  public int countIn(CharSequence sequence) {
778    int count = 0;
779    for (int i = 0; i < sequence.length(); i++) {
780      if (matches(sequence.charAt(i))) {
781        count++;
782      }
783    }
784    return count;
785  }
786
787  /**
788   * Returns a string containing all non-matching characters of a character sequence, in order. For
789   * example:
790   *
791   * <pre>{@code
792   * CharMatcher.is('a').removeFrom("bazaar")
793   * }</pre>
794   *
795   * ... returns {@code "bzr"}.
796   */
797  public String removeFrom(CharSequence sequence) {
798    String string = sequence.toString();
799    int pos = indexIn(string);
800    if (pos == -1) {
801      return string;
802    }
803
804    char[] chars = string.toCharArray();
805    int spread = 1;
806
807    // This unusual loop comes from extensive benchmarking
808    OUT:
809    while (true) {
810      pos++;
811      while (true) {
812        if (pos == chars.length) {
813          break OUT;
814        }
815        if (matches(chars[pos])) {
816          break;
817        }
818        chars[pos - spread] = chars[pos];
819        pos++;
820      }
821      spread++;
822    }
823    return new String(chars, 0, pos - spread);
824  }
825
826  /**
827   * Returns a string containing all matching BMP characters of a character sequence, in order. For
828   * example:
829   *
830   * <pre>{@code
831   * CharMatcher.is('a').retainFrom("bazaar")
832   * }</pre>
833   *
834   * ... returns {@code "aaa"}.
835   */
836  public String retainFrom(CharSequence sequence) {
837    return negate().removeFrom(sequence);
838  }
839
840  /**
841   * Returns a string copy of the input character sequence, with each matching BMP character
842   * replaced by a given replacement character. For example:
843   *
844   * <pre>{@code
845   * CharMatcher.is('a').replaceFrom("radar", 'o')
846   * }</pre>
847   *
848   * ... returns {@code "rodor"}.
849   *
850   * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
851   * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
852   * character.
853   *
854   * @param sequence the character sequence to replace matching characters in
855   * @param replacement the character to append to the result string in place of each matching
856   *     character in {@code sequence}
857   * @return the new string
858   */
859  public String replaceFrom(CharSequence sequence, char replacement) {
860    String string = sequence.toString();
861    int pos = indexIn(string);
862    if (pos == -1) {
863      return string;
864    }
865    char[] chars = string.toCharArray();
866    chars[pos] = replacement;
867    for (int i = pos + 1; i < chars.length; i++) {
868      if (matches(chars[i])) {
869        chars[i] = replacement;
870      }
871    }
872    return new String(chars);
873  }
874
875  /**
876   * Returns a string copy of the input character sequence, with each matching BMP character
877   * replaced by a given replacement sequence. For example:
878   *
879   * <pre>{@code
880   * CharMatcher.is('a').replaceFrom("yaha", "oo")
881   * }</pre>
882   *
883   * ... returns {@code "yoohoo"}.
884   *
885   * <p><b>Note:</b> If the replacement is a fixed string with only one character, you are better
886   * off calling {@link #replaceFrom(CharSequence, char)} directly.
887   *
888   * @param sequence the character sequence to replace matching characters in
889   * @param replacement the characters to append to the result string in place of each matching
890   *     character in {@code sequence}
891   * @return the new string
892   */
893  public String replaceFrom(CharSequence sequence, CharSequence replacement) {
894    int replacementLen = replacement.length();
895    if (replacementLen == 0) {
896      return removeFrom(sequence);
897    }
898    if (replacementLen == 1) {
899      return replaceFrom(sequence, replacement.charAt(0));
900    }
901
902    String string = sequence.toString();
903    int pos = indexIn(string);
904    if (pos == -1) {
905      return string;
906    }
907
908    int len = string.length();
909    StringBuilder buf = new StringBuilder((len * 3 / 2) + 16);
910
911    int oldpos = 0;
912    do {
913      buf.append(string, oldpos, pos);
914      buf.append(replacement);
915      oldpos = pos + 1;
916      pos = indexIn(string, oldpos);
917    } while (pos != -1);
918
919    buf.append(string, oldpos, len);
920    return buf.toString();
921  }
922
923  /**
924   * Returns a substring of the input character sequence that omits all matching BMP characters from
925   * the beginning and from the end of the string. For example:
926   *
927   * <pre>{@code
928   * CharMatcher.anyOf("ab").trimFrom("abacatbab")
929   * }</pre>
930   *
931   * ... returns {@code "cat"}.
932   *
933   * <p>Note that:
934   *
935   * <pre>{@code
936   * CharMatcher.inRange('\0', ' ').trimFrom(str)
937   * }</pre>
938   *
939   * ... is equivalent to {@link String#trim()}.
940   */
941  public String trimFrom(CharSequence sequence) {
942    int len = sequence.length();
943    int first;
944    int last;
945
946    for (first = 0; first < len; first++) {
947      if (!matches(sequence.charAt(first))) {
948        break;
949      }
950    }
951    for (last = len - 1; last > first; last--) {
952      if (!matches(sequence.charAt(last))) {
953        break;
954      }
955    }
956
957    return sequence.subSequence(first, last + 1).toString();
958  }
959
960  /**
961   * Returns a substring of the input character sequence that omits all matching BMP characters from
962   * the beginning of the string. For example:
963   *
964   * <pre>{@code
965   * CharMatcher.anyOf("ab").trimLeadingFrom("abacatbab")
966   * }</pre>
967   *
968   * ... returns {@code "catbab"}.
969   */
970  public String trimLeadingFrom(CharSequence sequence) {
971    int len = sequence.length();
972    for (int first = 0; first < len; first++) {
973      if (!matches(sequence.charAt(first))) {
974        return sequence.subSequence(first, len).toString();
975      }
976    }
977    return "";
978  }
979
980  /**
981   * Returns a substring of the input character sequence that omits all matching BMP characters from
982   * the end of the string. For example:
983   *
984   * <pre>{@code
985   * CharMatcher.anyOf("ab").trimTrailingFrom("abacatbab")
986   * }</pre>
987   *
988   * ... returns {@code "abacat"}.
989   */
990  public String trimTrailingFrom(CharSequence sequence) {
991    int len = sequence.length();
992    for (int last = len - 1; last >= 0; last--) {
993      if (!matches(sequence.charAt(last))) {
994        return sequence.subSequence(0, last + 1).toString();
995      }
996    }
997    return "";
998  }
999
1000  /**
1001   * Returns a string copy of the input character sequence, with each group of consecutive matching
1002   * BMP characters replaced by a single replacement character. For example:
1003   *
1004   * <pre>{@code
1005   * CharMatcher.anyOf("eko").collapseFrom("bookkeeper", '-')
1006   * }</pre>
1007   *
1008   * ... returns {@code "b-p-r"}.
1009   *
1010   * <p>The default implementation uses {@link #indexIn(CharSequence)} to find the first matching
1011   * character, then iterates the remainder of the sequence calling {@link #matches(char)} for each
1012   * character.
1013   *
1014   * @param sequence the character sequence to replace matching groups of characters in
1015   * @param replacement the character to append to the result string in place of each group of
1016   *     matching characters in {@code sequence}
1017   * @return the new string
1018   */
1019  public String collapseFrom(CharSequence sequence, char replacement) {
1020    // This implementation avoids unnecessary allocation.
1021    int len = sequence.length();
1022    for (int i = 0; i < len; i++) {
1023      char c = sequence.charAt(i);
1024      if (matches(c)) {
1025        if (c == replacement && (i == len - 1 || !matches(sequence.charAt(i + 1)))) {
1026          // a no-op replacement
1027          i++;
1028        } else {
1029          StringBuilder builder = new StringBuilder(len).append(sequence, 0, i).append(replacement);
1030          return finishCollapseFrom(sequence, i + 1, len, replacement, builder, true);
1031        }
1032      }
1033    }
1034    // no replacement needed
1035    return sequence.toString();
1036  }
1037
1038  /**
1039   * Collapses groups of matching characters exactly as {@link #collapseFrom} does, except that
1040   * groups of matching BMP characters at the start or end of the sequence are removed without
1041   * replacement.
1042   */
1043  public String trimAndCollapseFrom(CharSequence sequence, char replacement) {
1044    // This implementation avoids unnecessary allocation.
1045    int len = sequence.length();
1046    int first = 0;
1047    int last = len - 1;
1048
1049    while (first < len && matches(sequence.charAt(first))) {
1050      first++;
1051    }
1052
1053    while (last > first && matches(sequence.charAt(last))) {
1054      last--;
1055    }
1056
1057    return (first == 0 && last == len - 1)
1058        ? collapseFrom(sequence, replacement)
1059        : finishCollapseFrom(
1060            sequence, first, last + 1, replacement, new StringBuilder(last + 1 - first), false);
1061  }
1062
1063  private String finishCollapseFrom(
1064      CharSequence sequence,
1065      int start,
1066      int end,
1067      char replacement,
1068      StringBuilder builder,
1069      boolean inMatchingGroup) {
1070    for (int i = start; i < end; i++) {
1071      char c = sequence.charAt(i);
1072      if (matches(c)) {
1073        if (!inMatchingGroup) {
1074          builder.append(replacement);
1075          inMatchingGroup = true;
1076        }
1077      } else {
1078        builder.append(c);
1079        inMatchingGroup = false;
1080      }
1081    }
1082    return builder.toString();
1083  }
1084
1085  /**
1086   * @deprecated Provided only to satisfy the {@link Predicate} interface; use {@link #matches}
1087   *     instead.
1088   */
1089  @Deprecated
1090  @Override
1091  public boolean apply(Character character) {
1092    return matches(character);
1093  }
1094
1095  /**
1096   * Returns a string representation of this {@code CharMatcher}, such as {@code
1097   * CharMatcher.or(WHITESPACE, JAVA_DIGIT)}.
1098   */
1099  @Override
1100  public String toString() {
1101    return super.toString();
1102  }
1103
1104  /**
1105   * Returns the Java Unicode escape sequence for the given {@code char}, in the form "\u12AB" where
1106   * "12AB" is the four hexadecimal digits representing the 16-bit code unit.
1107   */
1108  private static String showCharacter(char c) {
1109    String hex = "0123456789ABCDEF";
1110    char[] tmp = {'\\', 'u', '\0', '\0', '\0', '\0'};
1111    for (int i = 0; i < 4; i++) {
1112      tmp[5 - i] = hex.charAt(c & 0xF);
1113      c = (char) (c >> 4);
1114    }
1115    return String.copyValueOf(tmp);
1116  }
1117
1118  // Fast matchers
1119
1120  /** A matcher for which precomputation will not yield any significant benefit. */
1121  abstract static class FastMatcher extends CharMatcher {
1122
1123    @Override
1124    public final CharMatcher precomputed() {
1125      return this;
1126    }
1127
1128    @Override
1129    public CharMatcher negate() {
1130      return new NegatedFastMatcher(this);
1131    }
1132  }
1133
1134  /** {@link FastMatcher} which overrides {@code toString()} with a custom name. */
1135  abstract static class NamedFastMatcher extends FastMatcher {
1136
1137    private final String description;
1138
1139    NamedFastMatcher(String description) {
1140      this.description = checkNotNull(description);
1141    }
1142
1143    @Override
1144    public final String toString() {
1145      return description;
1146    }
1147  }
1148
1149  /** Negation of a {@link FastMatcher}. */
1150  static class NegatedFastMatcher extends Negated {
1151
1152    NegatedFastMatcher(CharMatcher original) {
1153      super(original);
1154    }
1155
1156    @Override
1157    public final CharMatcher precomputed() {
1158      return this;
1159    }
1160  }
1161
1162  /** Fast matcher using a {@link BitSet} table of matching characters. */
1163  @GwtIncompatible // used only from other GwtIncompatible code
1164  private static final class BitSetMatcher extends NamedFastMatcher {
1165
1166    private final BitSet table;
1167
1168    private BitSetMatcher(BitSet table, String description) {
1169      super(description);
1170      if (table.length() + Long.SIZE < table.size()) {
1171        table = (BitSet) table.clone();
1172        // If only we could actually call BitSet.trimToSize() ourselves...
1173      }
1174      this.table = table;
1175    }
1176
1177    @Override
1178    public boolean matches(char c) {
1179      return table.get(c);
1180    }
1181
1182    @Override
1183    void setBits(BitSet bitSet) {
1184      bitSet.or(table);
1185    }
1186  }
1187
1188  // Static constant implementation classes
1189
1190  /** Implementation of {@link #any()}. */
1191  private static final class Any extends NamedFastMatcher {
1192
1193    static final Any INSTANCE = new Any();
1194
1195    private Any() {
1196      super("CharMatcher.any()");
1197    }
1198
1199    @Override
1200    public boolean matches(char c) {
1201      return true;
1202    }
1203
1204    @Override
1205    public int indexIn(CharSequence sequence) {
1206      return (sequence.length() == 0) ? -1 : 0;
1207    }
1208
1209    @Override
1210    public int indexIn(CharSequence sequence, int start) {
1211      int length = sequence.length();
1212      checkPositionIndex(start, length);
1213      return (start == length) ? -1 : start;
1214    }
1215
1216    @Override
1217    public int lastIndexIn(CharSequence sequence) {
1218      return sequence.length() - 1;
1219    }
1220
1221    @Override
1222    public boolean matchesAllOf(CharSequence sequence) {
1223      checkNotNull(sequence);
1224      return true;
1225    }
1226
1227    @Override
1228    public boolean matchesNoneOf(CharSequence sequence) {
1229      return sequence.length() == 0;
1230    }
1231
1232    @Override
1233    public String removeFrom(CharSequence sequence) {
1234      checkNotNull(sequence);
1235      return "";
1236    }
1237
1238    @Override
1239    public String replaceFrom(CharSequence sequence, char replacement) {
1240      char[] array = new char[sequence.length()];
1241      Arrays.fill(array, replacement);
1242      return new String(array);
1243    }
1244
1245    @Override
1246    public String replaceFrom(CharSequence sequence, CharSequence replacement) {
1247      StringBuilder result = new StringBuilder(sequence.length() * replacement.length());
1248      for (int i = 0; i < sequence.length(); i++) {
1249        result.append(replacement);
1250      }
1251      return result.toString();
1252    }
1253
1254    @Override
1255    public String collapseFrom(CharSequence sequence, char replacement) {
1256      return (sequence.length() == 0) ? "" : String.valueOf(replacement);
1257    }
1258
1259    @Override
1260    public String trimFrom(CharSequence sequence) {
1261      checkNotNull(sequence);
1262      return "";
1263    }
1264
1265    @Override
1266    public int countIn(CharSequence sequence) {
1267      return sequence.length();
1268    }
1269
1270    @Override
1271    public CharMatcher and(CharMatcher other) {
1272      return checkNotNull(other);
1273    }
1274
1275    @Override
1276    public CharMatcher or(CharMatcher other) {
1277      checkNotNull(other);
1278      return this;
1279    }
1280
1281    @Override
1282    public CharMatcher negate() {
1283      return none();
1284    }
1285  }
1286
1287  /** Implementation of {@link #none()}. */
1288  private static final class None extends NamedFastMatcher {
1289
1290    static final None INSTANCE = new None();
1291
1292    private None() {
1293      super("CharMatcher.none()");
1294    }
1295
1296    @Override
1297    public boolean matches(char c) {
1298      return false;
1299    }
1300
1301    @Override
1302    public int indexIn(CharSequence sequence) {
1303      checkNotNull(sequence);
1304      return -1;
1305    }
1306
1307    @Override
1308    public int indexIn(CharSequence sequence, int start) {
1309      int length = sequence.length();
1310      checkPositionIndex(start, length);
1311      return -1;
1312    }
1313
1314    @Override
1315    public int lastIndexIn(CharSequence sequence) {
1316      checkNotNull(sequence);
1317      return -1;
1318    }
1319
1320    @Override
1321    public boolean matchesAllOf(CharSequence sequence) {
1322      return sequence.length() == 0;
1323    }
1324
1325    @Override
1326    public boolean matchesNoneOf(CharSequence sequence) {
1327      checkNotNull(sequence);
1328      return true;
1329    }
1330
1331    @Override
1332    public String removeFrom(CharSequence sequence) {
1333      return sequence.toString();
1334    }
1335
1336    @Override
1337    public String replaceFrom(CharSequence sequence, char replacement) {
1338      return sequence.toString();
1339    }
1340
1341    @Override
1342    public String replaceFrom(CharSequence sequence, CharSequence replacement) {
1343      checkNotNull(replacement);
1344      return sequence.toString();
1345    }
1346
1347    @Override
1348    public String collapseFrom(CharSequence sequence, char replacement) {
1349      return sequence.toString();
1350    }
1351
1352    @Override
1353    public String trimFrom(CharSequence sequence) {
1354      return sequence.toString();
1355    }
1356
1357    @Override
1358    public String trimLeadingFrom(CharSequence sequence) {
1359      return sequence.toString();
1360    }
1361
1362    @Override
1363    public String trimTrailingFrom(CharSequence sequence) {
1364      return sequence.toString();
1365    }
1366
1367    @Override
1368    public int countIn(CharSequence sequence) {
1369      checkNotNull(sequence);
1370      return 0;
1371    }
1372
1373    @Override
1374    public CharMatcher and(CharMatcher other) {
1375      checkNotNull(other);
1376      return this;
1377    }
1378
1379    @Override
1380    public CharMatcher or(CharMatcher other) {
1381      return checkNotNull(other);
1382    }
1383
1384    @Override
1385    public CharMatcher negate() {
1386      return any();
1387    }
1388  }
1389
1390  /** Implementation of {@link #whitespace()}. */
1391  @VisibleForTesting
1392  static final class Whitespace extends NamedFastMatcher {
1393
1394    // TABLE is a precomputed hashset of whitespace characters. MULTIPLIER serves as a hash function
1395    // whose key property is that it maps 25 characters into the 32-slot table without collision.
1396    // Basically this is an opportunistic fast implementation as opposed to "good code". For most
1397    // other use-cases, the reduction in readability isn't worth it.
1398    static final String TABLE =
1399        "\u2002\u3000\r\u0085\u200A\u2005\u2000\u3000"
1400            + "\u2029\u000B\u3000\u2008\u2003\u205F\u3000\u1680"
1401            + "\u0009\u0020\u2006\u2001\u202F\u00A0\u000C\u2009"
1402            + "\u3000\u2004\u3000\u3000\u2028\n\u2007\u3000";
1403    static final int MULTIPLIER = 1682554634;
1404    static final int SHIFT = Integer.numberOfLeadingZeros(TABLE.length() - 1);
1405
1406    static final Whitespace INSTANCE = new Whitespace();
1407
1408    Whitespace() {
1409      super("CharMatcher.whitespace()");
1410    }
1411
1412    @Override
1413    public boolean matches(char c) {
1414      return TABLE.charAt((MULTIPLIER * c) >>> SHIFT) == c;
1415    }
1416
1417    @GwtIncompatible // used only from other GwtIncompatible code
1418    @Override
1419    void setBits(BitSet table) {
1420      for (int i = 0; i < TABLE.length(); i++) {
1421        table.set(TABLE.charAt(i));
1422      }
1423    }
1424  }
1425
1426  /** Implementation of {@link #breakingWhitespace()}. */
1427  private static final class BreakingWhitespace extends CharMatcher {
1428
1429    static final CharMatcher INSTANCE = new BreakingWhitespace();
1430
1431    @Override
1432    public boolean matches(char c) {
1433      switch (c) {
1434        case '\t':
1435        case '\n':
1436        case '\013':
1437        case '\f':
1438        case '\r':
1439        case ' ':
1440        case '\u0085':
1441        case '\u1680':
1442        case '\u2028':
1443        case '\u2029':
1444        case '\u205f':
1445        case '\u3000':
1446          return true;
1447        case '\u2007':
1448          return false;
1449        default:
1450          return c >= '\u2000' && c <= '\u200a';
1451      }
1452    }
1453
1454    @Override
1455    public String toString() {
1456      return "CharMatcher.breakingWhitespace()";
1457    }
1458  }
1459
1460  /** Implementation of {@link #ascii()}. */
1461  private static final class Ascii extends NamedFastMatcher {
1462
1463    static final Ascii INSTANCE = new Ascii();
1464
1465    Ascii() {
1466      super("CharMatcher.ascii()");
1467    }
1468
1469    @Override
1470    public boolean matches(char c) {
1471      return c <= '\u007f';
1472    }
1473  }
1474
1475  /** Implementation that matches characters that fall within multiple ranges. */
1476  private static class RangesMatcher extends CharMatcher {
1477
1478    private final String description;
1479    private final char[] rangeStarts;
1480    private final char[] rangeEnds;
1481
1482    RangesMatcher(String description, char[] rangeStarts, char[] rangeEnds) {
1483      this.description = description;
1484      this.rangeStarts = rangeStarts;
1485      this.rangeEnds = rangeEnds;
1486      checkArgument(rangeStarts.length == rangeEnds.length);
1487      for (int i = 0; i < rangeStarts.length; i++) {
1488        checkArgument(rangeStarts[i] <= rangeEnds[i]);
1489        if (i + 1 < rangeStarts.length) {
1490          checkArgument(rangeEnds[i] < rangeStarts[i + 1]);
1491        }
1492      }
1493    }
1494
1495    @Override
1496    public boolean matches(char c) {
1497      int index = Arrays.binarySearch(rangeStarts, c);
1498      if (index >= 0) {
1499        return true;
1500      } else {
1501        index = ~index - 1;
1502        return index >= 0 && c <= rangeEnds[index];
1503      }
1504    }
1505
1506    @Override
1507    public String toString() {
1508      return description;
1509    }
1510  }
1511
1512  /** Implementation of {@link #digit()}. */
1513  private static final class Digit extends RangesMatcher {
1514    // Plug the following UnicodeSet pattern into
1515    // https://unicode.org/cldr/utility/list-unicodeset.jsp
1516    // [[:Nd:]&[:nv=0:]&[\u0000-\uFFFF]]
1517    // and get the zeroes from there.
1518
1519    // Must be in ascending order.
1520    private static final String ZEROES =
1521        "0\u0660\u06f0\u07c0\u0966\u09e6\u0a66\u0ae6\u0b66\u0be6\u0c66\u0ce6\u0d66\u0de6"
1522            + "\u0e50\u0ed0\u0f20\u1040\u1090\u17e0\u1810\u1946\u19d0\u1a80\u1a90\u1b50\u1bb0"
1523            + "\u1c40\u1c50\ua620\ua8d0\ua900\ua9d0\ua9f0\uaa50\uabf0\uff10";
1524
1525    private static char[] zeroes() {
1526      return ZEROES.toCharArray();
1527    }
1528
1529    private static char[] nines() {
1530      char[] nines = new char[ZEROES.length()];
1531      for (int i = 0; i < ZEROES.length(); i++) {
1532        nines[i] = (char) (ZEROES.charAt(i) + 9);
1533      }
1534      return nines;
1535    }
1536
1537    static final Digit INSTANCE = new Digit();
1538
1539    private Digit() {
1540      super("CharMatcher.digit()", zeroes(), nines());
1541    }
1542  }
1543
1544  /** Implementation of {@link #javaDigit()}. */
1545  private static final class JavaDigit extends CharMatcher {
1546
1547    static final JavaDigit INSTANCE = new JavaDigit();
1548
1549    @Override
1550    public boolean matches(char c) {
1551      return Character.isDigit(c);
1552    }
1553
1554    @Override
1555    public String toString() {
1556      return "CharMatcher.javaDigit()";
1557    }
1558  }
1559
1560  /** Implementation of {@link #javaLetter()}. */
1561  private static final class JavaLetter extends CharMatcher {
1562
1563    static final JavaLetter INSTANCE = new JavaLetter();
1564
1565    @Override
1566    public boolean matches(char c) {
1567      return Character.isLetter(c);
1568    }
1569
1570    @Override
1571    public String toString() {
1572      return "CharMatcher.javaLetter()";
1573    }
1574  }
1575
1576  /** Implementation of {@link #javaLetterOrDigit()}. */
1577  private static final class JavaLetterOrDigit extends CharMatcher {
1578
1579    static final JavaLetterOrDigit INSTANCE = new JavaLetterOrDigit();
1580
1581    @Override
1582    public boolean matches(char c) {
1583      return Character.isLetterOrDigit(c);
1584    }
1585
1586    @Override
1587    public String toString() {
1588      return "CharMatcher.javaLetterOrDigit()";
1589    }
1590  }
1591
1592  /** Implementation of {@link #javaUpperCase()}. */
1593  private static final class JavaUpperCase extends CharMatcher {
1594
1595    static final JavaUpperCase INSTANCE = new JavaUpperCase();
1596
1597    @Override
1598    public boolean matches(char c) {
1599      return Character.isUpperCase(c);
1600    }
1601
1602    @Override
1603    public String toString() {
1604      return "CharMatcher.javaUpperCase()";
1605    }
1606  }
1607
1608  /** Implementation of {@link #javaLowerCase()}. */
1609  private static final class JavaLowerCase extends CharMatcher {
1610
1611    static final JavaLowerCase INSTANCE = new JavaLowerCase();
1612
1613    @Override
1614    public boolean matches(char c) {
1615      return Character.isLowerCase(c);
1616    }
1617
1618    @Override
1619    public String toString() {
1620      return "CharMatcher.javaLowerCase()";
1621    }
1622  }
1623
1624  /** Implementation of {@link #javaIsoControl()}. */
1625  private static final class JavaIsoControl extends NamedFastMatcher {
1626
1627    static final JavaIsoControl INSTANCE = new JavaIsoControl();
1628
1629    private JavaIsoControl() {
1630      super("CharMatcher.javaIsoControl()");
1631    }
1632
1633    @Override
1634    public boolean matches(char c) {
1635      return c <= '\u001f' || (c >= '\u007f' && c <= '\u009f');
1636    }
1637  }
1638
1639  /** Implementation of {@link #invisible()}. */
1640  private static final class Invisible extends RangesMatcher {
1641    // Plug the following UnicodeSet pattern into
1642    // https://unicode.org/cldr/utility/list-unicodeset.jsp
1643    // [[[:Zs:][:Zl:][:Zp:][:Cc:][:Cf:][:Cs:][:Co:]]&[\u0000-\uFFFF]]
1644    // with the "Abbreviate" option, and get the ranges from there.
1645    private static final String RANGE_STARTS =
1646        "\u0000\u007f\u00ad\u0600\u061c\u06dd\u070f\u08e2\u1680\u180e\u2000\u2028\u205f\u2066"
1647            + "\u3000\ud800\ufeff\ufff9";
1648    private static final String RANGE_ENDS = // inclusive ends
1649        "\u0020\u00a0\u00ad\u0605\u061c\u06dd\u070f\u08e2\u1680\u180e\u200f\u202f\u2064\u206f"
1650            + "\u3000\uf8ff\ufeff\ufffb";
1651
1652    static final Invisible INSTANCE = new Invisible();
1653
1654    private Invisible() {
1655      super("CharMatcher.invisible()", RANGE_STARTS.toCharArray(), RANGE_ENDS.toCharArray());
1656    }
1657  }
1658
1659  /** Implementation of {@link #singleWidth()}. */
1660  private static final class SingleWidth extends RangesMatcher {
1661
1662    static final SingleWidth INSTANCE = new SingleWidth();
1663
1664    private SingleWidth() {
1665      super(
1666          "CharMatcher.singleWidth()",
1667          "\u0000\u05be\u05d0\u05f3\u0600\u0750\u0e00\u1e00\u2100\ufb50\ufe70\uff61".toCharArray(),
1668          "\u04f9\u05be\u05ea\u05f4\u06ff\u077f\u0e7f\u20af\u213a\ufdff\ufeff\uffdc".toCharArray());
1669    }
1670  }
1671
1672  // Non-static factory implementation classes
1673
1674  /** Implementation of {@link #negate()}. */
1675  private static class Negated extends CharMatcher {
1676
1677    final CharMatcher original;
1678
1679    Negated(CharMatcher original) {
1680      this.original = checkNotNull(original);
1681    }
1682
1683    @Override
1684    public boolean matches(char c) {
1685      return !original.matches(c);
1686    }
1687
1688    @Override
1689    public boolean matchesAllOf(CharSequence sequence) {
1690      return original.matchesNoneOf(sequence);
1691    }
1692
1693    @Override
1694    public boolean matchesNoneOf(CharSequence sequence) {
1695      return original.matchesAllOf(sequence);
1696    }
1697
1698    @Override
1699    public int countIn(CharSequence sequence) {
1700      return sequence.length() - original.countIn(sequence);
1701    }
1702
1703    @GwtIncompatible // used only from other GwtIncompatible code
1704    @Override
1705    void setBits(BitSet table) {
1706      BitSet tmp = new BitSet();
1707      original.setBits(tmp);
1708      tmp.flip(Character.MIN_VALUE, Character.MAX_VALUE + 1);
1709      table.or(tmp);
1710    }
1711
1712    @Override
1713    public CharMatcher negate() {
1714      return original;
1715    }
1716
1717    @Override
1718    public String toString() {
1719      return original + ".negate()";
1720    }
1721  }
1722
1723  /** Implementation of {@link #and(CharMatcher)}. */
1724  private static final class And extends CharMatcher {
1725
1726    final CharMatcher first;
1727    final CharMatcher second;
1728
1729    And(CharMatcher a, CharMatcher b) {
1730      first = checkNotNull(a);
1731      second = checkNotNull(b);
1732    }
1733
1734    @Override
1735    public boolean matches(char c) {
1736      return first.matches(c) && second.matches(c);
1737    }
1738
1739    @GwtIncompatible // used only from other GwtIncompatible code
1740    @Override
1741    void setBits(BitSet table) {
1742      BitSet tmp1 = new BitSet();
1743      first.setBits(tmp1);
1744      BitSet tmp2 = new BitSet();
1745      second.setBits(tmp2);
1746      tmp1.and(tmp2);
1747      table.or(tmp1);
1748    }
1749
1750    @Override
1751    public String toString() {
1752      return "CharMatcher.and(" + first + ", " + second + ")";
1753    }
1754  }
1755
1756  /** Implementation of {@link #or(CharMatcher)}. */
1757  private static final class Or extends CharMatcher {
1758
1759    final CharMatcher first;
1760    final CharMatcher second;
1761
1762    Or(CharMatcher a, CharMatcher b) {
1763      first = checkNotNull(a);
1764      second = checkNotNull(b);
1765    }
1766
1767    @GwtIncompatible // used only from other GwtIncompatible code
1768    @Override
1769    void setBits(BitSet table) {
1770      first.setBits(table);
1771      second.setBits(table);
1772    }
1773
1774    @Override
1775    public boolean matches(char c) {
1776      return first.matches(c) || second.matches(c);
1777    }
1778
1779    @Override
1780    public String toString() {
1781      return "CharMatcher.or(" + first + ", " + second + ")";
1782    }
1783  }
1784
1785  // Static factory implementations
1786
1787  /** Implementation of {@link #is(char)}. */
1788  private static final class Is extends FastMatcher {
1789
1790    private final char match;
1791
1792    Is(char match) {
1793      this.match = match;
1794    }
1795
1796    @Override
1797    public boolean matches(char c) {
1798      return c == match;
1799    }
1800
1801    @Override
1802    public String replaceFrom(CharSequence sequence, char replacement) {
1803      return sequence.toString().replace(match, replacement);
1804    }
1805
1806    @Override
1807    public CharMatcher and(CharMatcher other) {
1808      return other.matches(match) ? this : none();
1809    }
1810
1811    @Override
1812    public CharMatcher or(CharMatcher other) {
1813      return other.matches(match) ? other : super.or(other);
1814    }
1815
1816    @Override
1817    public CharMatcher negate() {
1818      return isNot(match);
1819    }
1820
1821    @GwtIncompatible // used only from other GwtIncompatible code
1822    @Override
1823    void setBits(BitSet table) {
1824      table.set(match);
1825    }
1826
1827    @Override
1828    public String toString() {
1829      return "CharMatcher.is('" + showCharacter(match) + "')";
1830    }
1831  }
1832
1833  /** Implementation of {@link #isNot(char)}. */
1834  private static final class IsNot extends FastMatcher {
1835
1836    private final char match;
1837
1838    IsNot(char match) {
1839      this.match = match;
1840    }
1841
1842    @Override
1843    public boolean matches(char c) {
1844      return c != match;
1845    }
1846
1847    @Override
1848    public CharMatcher and(CharMatcher other) {
1849      return other.matches(match) ? super.and(other) : other;
1850    }
1851
1852    @Override
1853    public CharMatcher or(CharMatcher other) {
1854      return other.matches(match) ? any() : this;
1855    }
1856
1857    @GwtIncompatible // used only from other GwtIncompatible code
1858    @Override
1859    void setBits(BitSet table) {
1860      table.set(0, match);
1861      table.set(match + 1, Character.MAX_VALUE + 1);
1862    }
1863
1864    @Override
1865    public CharMatcher negate() {
1866      return is(match);
1867    }
1868
1869    @Override
1870    public String toString() {
1871      return "CharMatcher.isNot('" + showCharacter(match) + "')";
1872    }
1873  }
1874
1875  private static CharMatcher.IsEither isEither(char c1, char c2) {
1876    return new CharMatcher.IsEither(c1, c2);
1877  }
1878
1879  /** Implementation of {@link #anyOf(CharSequence)} for exactly two characters. */
1880  private static final class IsEither extends FastMatcher {
1881
1882    private final char match1;
1883    private final char match2;
1884
1885    IsEither(char match1, char match2) {
1886      this.match1 = match1;
1887      this.match2 = match2;
1888    }
1889
1890    @Override
1891    public boolean matches(char c) {
1892      return c == match1 || c == match2;
1893    }
1894
1895    @GwtIncompatible // used only from other GwtIncompatible code
1896    @Override
1897    void setBits(BitSet table) {
1898      table.set(match1);
1899      table.set(match2);
1900    }
1901
1902    @Override
1903    public String toString() {
1904      return "CharMatcher.anyOf(\"" + showCharacter(match1) + showCharacter(match2) + "\")";
1905    }
1906  }
1907
1908  /** Implementation of {@link #anyOf(CharSequence)} for three or more characters. */
1909  private static final class AnyOf extends CharMatcher {
1910
1911    private final char[] chars;
1912
1913    public AnyOf(CharSequence chars) {
1914      this.chars = chars.toString().toCharArray();
1915      Arrays.sort(this.chars);
1916    }
1917
1918    @Override
1919    public boolean matches(char c) {
1920      return Arrays.binarySearch(chars, c) >= 0;
1921    }
1922
1923    @Override
1924    @GwtIncompatible // used only from other GwtIncompatible code
1925    void setBits(BitSet table) {
1926      for (char c : chars) {
1927        table.set(c);
1928      }
1929    }
1930
1931    @Override
1932    public String toString() {
1933      StringBuilder description = new StringBuilder("CharMatcher.anyOf(\"");
1934      for (char c : chars) {
1935        description.append(showCharacter(c));
1936      }
1937      description.append("\")");
1938      return description.toString();
1939    }
1940  }
1941
1942  /** Implementation of {@link #inRange(char, char)}. */
1943  private static final class InRange extends FastMatcher {
1944
1945    private final char startInclusive;
1946    private final char endInclusive;
1947
1948    InRange(char startInclusive, char endInclusive) {
1949      checkArgument(endInclusive >= startInclusive);
1950      this.startInclusive = startInclusive;
1951      this.endInclusive = endInclusive;
1952    }
1953
1954    @Override
1955    public boolean matches(char c) {
1956      return startInclusive <= c && c <= endInclusive;
1957    }
1958
1959    @GwtIncompatible // used only from other GwtIncompatible code
1960    @Override
1961    void setBits(BitSet table) {
1962      table.set(startInclusive, endInclusive + 1);
1963    }
1964
1965    @Override
1966    public String toString() {
1967      return "CharMatcher.inRange('"
1968          + showCharacter(startInclusive)
1969          + "', '"
1970          + showCharacter(endInclusive)
1971          + "')";
1972    }
1973  }
1974
1975  /** Implementation of {@link #forPredicate(Predicate)}. */
1976  private static final class ForPredicate extends CharMatcher {
1977
1978    private final Predicate<? super Character> predicate;
1979
1980    ForPredicate(Predicate<? super Character> predicate) {
1981      this.predicate = checkNotNull(predicate);
1982    }
1983
1984    @Override
1985    public boolean matches(char c) {
1986      return predicate.apply(c);
1987    }
1988
1989    @SuppressWarnings("deprecation") // intentional; deprecation is for callers primarily
1990    @Override
1991    public boolean apply(Character character) {
1992      return predicate.apply(checkNotNull(character));
1993    }
1994
1995    @Override
1996    public String toString() {
1997      return "CharMatcher.forPredicate(" + predicate + ")";
1998    }
1999  }
2000}