001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.commons.lang3.text;
018
019import java.util.ArrayList;
020import java.util.Arrays;
021import java.util.Collections;
022import java.util.List;
023import java.util.ListIterator;
024import java.util.NoSuchElementException;
025import java.util.StringTokenizer;
026
027import org.apache.commons.lang3.ArrayUtils;
028import org.apache.commons.lang3.StringUtils;
029
030/**
031 * Tokenizes a string based on delimiters (separators)
032 * and supporting quoting and ignored character concepts.
033 * <p>
034 * This class can split a String into many smaller strings. It aims
035 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
036 * however it offers much more control and flexibility including implementing
037 * the {@link ListIterator} interface. By default, it is set up
038 * like {@link StringTokenizer}.
039 * </p>
040 * <p>
041 * The input String is split into a number of <i>tokens</i>.
042 * Each token is separated from the next String by a <i>delimiter</i>.
043 * One or more delimiter characters must be specified.
044 * </p>
045 * <p>
046 * Each token may be surrounded by quotes.
047 * The <i>quote</i> matcher specifies the quote character(s).
048 * A quote may be escaped within a quoted section by duplicating itself.
049 * </p>
050 * <p>
051 * Between each token and the delimiter are potentially characters that need trimming.
052 * The <i>trimmer</i> matcher specifies these characters.
053 * One usage might be to trim whitespace characters.
054 * </p>
055 * <p>
056 * At any point outside the quotes there might potentially be invalid characters.
057 * The <i>ignored</i> matcher specifies these characters to be removed.
058 * One usage might be to remove new line characters.
059 * </p>
060 * <p>
061 * Empty tokens may be removed or returned as null.
062 * </p>
063 * <pre>
064 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
065 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
066 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
067 * </pre>
068 *
069 * <table>
070 *  <caption>StrTokenizer properties and options</caption>
071 *  <tr>
072 *   <th>Property</th><th>Type</th><th>Default</th>
073 *  </tr>
074 *  <tr>
075 *   <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
076 *  </tr>
077 *  <tr>
078 *   <td>quote</td><td>NoneMatcher</td><td>{}</td>
079 *  </tr>
080 *  <tr>
081 *   <td>ignore</td><td>NoneMatcher</td><td>{}</td>
082 *  </tr>
083 *  <tr>
084 *   <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
085 *  </tr>
086 *  <tr>
087 *   <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
088 *  </tr>
089 * </table>
090 *
091 * @since 2.2
092 * @deprecated As of 3.6, use Apache Commons Text
093 * <a href="https://commons.apache.org/proper/commons-text/javadocs/api-release/org/apache/commons/text/StringTokenizer.html">
094 * StringTokenizer</a> instead
095 */
096@Deprecated
097public class StrTokenizer implements ListIterator<String>, Cloneable {
098
099    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
100    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
101    static {
102        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
103        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
104        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
105        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
106        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
107        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
108        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
109
110        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
111        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
112        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
113        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
114        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
115        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
116        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
117    }
118
    /** The text to work on; {@code null} when there is no text to parse. */
    private char[] chars;
    /** The parsed tokens; {@code null} until tokenization has run, and again after {@code reset()}. */
    private String[] tokens;
    /** The current iteration position: the index in {@code tokens} of the next token to return. */
    private int tokenPos;

    /** The delimiter matcher; initially {@code StrMatcher.splitMatcher()}. */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();
    /** The quote matcher; initially {@code StrMatcher.noneMatcher()}. */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
    /** The ignored matcher; initially {@code StrMatcher.noneMatcher()}. */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
    /** The trimmer matcher; initially {@code StrMatcher.noneMatcher()}. */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null; initially {@code false}. */
    private boolean emptyAsNull;
    /** Whether to ignore empty tokens; initially {@code true}. */
    private boolean ignoreEmptyTokens = true;
139
140
141    /**
142     * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
143     *
144     * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
145     */
146    private static StrTokenizer getCSVClone() {
147        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
148    }
149
150    /**
151     * Gets a new tokenizer instance which parses Comma Separated Value strings
152     * initializing it with the given input.  The default for CSV processing
153     * will be trim whitespace from both ends (which can be overridden with
154     * the setTrimmer method).
155     * <p>
156     * You must call a "reset" method to set the string which you want to parse.
157     * </p>
158     * @return a new tokenizer instance which parses Comma Separated Value strings
159     */
160    public static StrTokenizer getCSVInstance() {
161        return getCSVClone();
162    }
163
164    /**
165     * Gets a new tokenizer instance which parses Comma Separated Value strings
166     * initializing it with the given input.  The default for CSV processing
167     * will be trim whitespace from both ends (which can be overridden with
168     * the setTrimmer method).
169     *
170     * @param input  the text to parse
171     * @return a new tokenizer instance which parses Comma Separated Value strings
172     */
173    public static StrTokenizer getCSVInstance(final String input) {
174        final StrTokenizer tok = getCSVClone();
175        tok.reset(input);
176        return tok;
177    }
178
179    /**
180     * Gets a new tokenizer instance which parses Comma Separated Value strings
181     * initializing it with the given input.  The default for CSV processing
182     * will be trim whitespace from both ends (which can be overridden with
183     * the setTrimmer method).
184     *
185     * @param input  the text to parse
186     * @return a new tokenizer instance which parses Comma Separated Value strings
187     */
188    public static StrTokenizer getCSVInstance(final char[] input) {
189        final StrTokenizer tok = getCSVClone();
190        tok.reset(input);
191        return tok;
192    }
193
194    /**
195     * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
196     *
197     * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
198     */
199    private static StrTokenizer getTSVClone() {
200        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
201    }
202
203
204    /**
205     * Gets a new tokenizer instance which parses Tab Separated Value strings.
206     * The default for CSV processing will be trim whitespace from both ends
207     * (which can be overridden with the setTrimmer method).
208     * <p>
209     * You must call a "reset" method to set the string which you want to parse.
210     * </p>
211     * @return a new tokenizer instance which parses Tab Separated Value strings.
212     */
213    public static StrTokenizer getTSVInstance() {
214        return getTSVClone();
215    }
216
217    /**
218     * Gets a new tokenizer instance which parses Tab Separated Value strings.
219     * The default for CSV processing will be trim whitespace from both ends
220     * (which can be overridden with the setTrimmer method).
221     * @param input  the string to parse
222     * @return a new tokenizer instance which parses Tab Separated Value strings.
223     */
224    public static StrTokenizer getTSVInstance(final String input) {
225        final StrTokenizer tok = getTSVClone();
226        tok.reset(input);
227        return tok;
228    }
229
230    /**
231     * Gets a new tokenizer instance which parses Tab Separated Value strings.
232     * The default for CSV processing will be trim whitespace from both ends
233     * (which can be overridden with the setTrimmer method).
234     * @param input  the string to parse
235     * @return a new tokenizer instance which parses Tab Separated Value strings.
236     */
237    public static StrTokenizer getTSVInstance(final char[] input) {
238        final StrTokenizer tok = getTSVClone();
239        tok.reset(input);
240        return tok;
241    }
242
243    /**
244     * Constructs a tokenizer splitting on space, tab, newline and formfeed
245     * as per StringTokenizer, but with no text to tokenize.
246     * <p>
247     * This constructor is normally used with {@link #reset(String)}.
248     * </p>
249     */
250    public StrTokenizer() {
251        this.chars = null;
252    }
253
254    /**
255     * Constructs a tokenizer splitting on space, tab, newline and formfeed
256     * as per StringTokenizer.
257     *
258     * @param input  the string which is to be parsed
259     */
260    public StrTokenizer(final String input) {
261        if (input != null) {
262            chars = input.toCharArray();
263        } else {
264            chars = null;
265        }
266    }
267
268    /**
269     * Constructs a tokenizer splitting on the specified delimiter character.
270     *
271     * @param input  the string which is to be parsed
272     * @param delim  the field delimiter character
273     */
274    public StrTokenizer(final String input, final char delim) {
275        this(input);
276        setDelimiterChar(delim);
277    }
278
279    /**
280     * Constructs a tokenizer splitting on the specified delimiter string.
281     *
282     * @param input  the string which is to be parsed
283     * @param delim  the field delimiter string
284     */
285    public StrTokenizer(final String input, final String delim) {
286        this(input);
287        setDelimiterString(delim);
288    }
289
290    /**
291     * Constructs a tokenizer splitting using the specified delimiter matcher.
292     *
293     * @param input  the string which is to be parsed
294     * @param delim  the field delimiter matcher
295     */
296    public StrTokenizer(final String input, final StrMatcher delim) {
297        this(input);
298        setDelimiterMatcher(delim);
299    }
300
301    /**
302     * Constructs a tokenizer splitting on the specified delimiter character
303     * and handling quotes using the specified quote character.
304     *
305     * @param input  the string which is to be parsed
306     * @param delim  the field delimiter character
307     * @param quote  the field quoted string character
308     */
309    public StrTokenizer(final String input, final char delim, final char quote) {
310        this(input, delim);
311        setQuoteChar(quote);
312    }
313
314    /**
315     * Constructs a tokenizer splitting using the specified delimiter matcher
316     * and handling quotes using the specified quote matcher.
317     *
318     * @param input  the string which is to be parsed
319     * @param delim  the field delimiter matcher
320     * @param quote  the field quoted string matcher
321     */
322    public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
323        this(input, delim);
324        setQuoteMatcher(quote);
325    }
326
327    /**
328     * Constructs a tokenizer splitting on space, tab, newline and formfeed
329     * as per StringTokenizer.
330     *
331     * @param input  the string which is to be parsed, not cloned
332     */
333    public StrTokenizer(final char[] input) {
334        this.chars = ArrayUtils.clone(input);
335    }
336
337    /**
338     * Constructs a tokenizer splitting on the specified character.
339     *
340     * @param input  the string which is to be parsed, not cloned
341     * @param delim the field delimiter character
342     */
343    public StrTokenizer(final char[] input, final char delim) {
344        this(input);
345        setDelimiterChar(delim);
346    }
347
348    /**
349     * Constructs a tokenizer splitting on the specified string.
350     *
351     * @param input  the string which is to be parsed, not cloned
352     * @param delim the field delimiter string
353     */
354    public StrTokenizer(final char[] input, final String delim) {
355        this(input);
356        setDelimiterString(delim);
357    }
358
359    /**
360     * Constructs a tokenizer splitting using the specified delimiter matcher.
361     *
362     * @param input  the string which is to be parsed, not cloned
363     * @param delim  the field delimiter matcher
364     */
365    public StrTokenizer(final char[] input, final StrMatcher delim) {
366        this(input);
367        setDelimiterMatcher(delim);
368    }
369
370    /**
371     * Constructs a tokenizer splitting on the specified delimiter character
372     * and handling quotes using the specified quote character.
373     *
374     * @param input  the string which is to be parsed, not cloned
375     * @param delim  the field delimiter character
376     * @param quote  the field quoted string character
377     */
378    public StrTokenizer(final char[] input, final char delim, final char quote) {
379        this(input, delim);
380        setQuoteChar(quote);
381    }
382
383    /**
384     * Constructs a tokenizer splitting using the specified delimiter matcher
385     * and handling quotes using the specified quote matcher.
386     *
387     * @param input  the string which is to be parsed, not cloned
388     * @param delim  the field delimiter character
389     * @param quote  the field quoted string character
390     */
391    public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
392        this(input, delim);
393        setQuoteMatcher(quote);
394    }
395
396    // API
397    /**
398     * Gets the number of tokens found in the String.
399     *
400     * @return the number of matched tokens
401     */
402    public int size() {
403        checkTokenized();
404        return tokens.length;
405    }
406
407    /**
408     * Gets the next token from the String.
409     * Equivalent to {@link #next()} except it returns null rather than
410     * throwing {@link NoSuchElementException} when no tokens remain.
411     *
412     * @return the next sequential token, or null when no more tokens are found
413     */
414    public String nextToken() {
415        if (hasNext()) {
416            return tokens[tokenPos++];
417        }
418        return null;
419    }
420
421    /**
422     * Gets the previous token from the String.
423     *
424     * @return the previous sequential token, or null when no more tokens are found
425     */
426    public String previousToken() {
427        if (hasPrevious()) {
428            return tokens[--tokenPos];
429        }
430        return null;
431    }
432
433    /**
434     * Gets a copy of the full token list as an independent modifiable array.
435     *
436     * @return the tokens as a String array
437     */
438    public String[] getTokenArray() {
439        checkTokenized();
440        return tokens.clone();
441    }
442
443    /**
444     * Gets a copy of the full token list as an independent modifiable list.
445     *
446     * @return the tokens as a String array
447     */
448    public List<String> getTokenList() {
449        checkTokenized();
450        final List<String> list = new ArrayList<>(tokens.length);
451        list.addAll(Arrays.asList(tokens));
452        return list;
453    }
454
455    /**
456     * Resets this tokenizer, forgetting all parsing and iteration already completed.
457     * <p>
458     * This method allows the same tokenizer to be reused for the same String.
459     * </p>
460     *
461     * @return this, to enable chaining
462     */
463    public StrTokenizer reset() {
464        tokenPos = 0;
465        tokens = null;
466        return this;
467    }
468
469    /**
470     * Reset this tokenizer, giving it a new input string to parse.
471     * In this manner you can re-use a tokenizer with the same settings
472     * on multiple input lines.
473     *
474     * @param input  the new string to tokenize, null sets no text to parse
475     * @return this, to enable chaining
476     */
477    public StrTokenizer reset(final String input) {
478        reset();
479        if (input != null) {
480            this.chars = input.toCharArray();
481        } else {
482            this.chars = null;
483        }
484        return this;
485    }
486
487    /**
488     * Reset this tokenizer, giving it a new input string to parse.
489     * In this manner you can re-use a tokenizer with the same settings
490     * on multiple input lines.
491     *
492     * @param input  the new character array to tokenize, not cloned, null sets no text to parse
493     * @return this, to enable chaining
494     */
495    public StrTokenizer reset(final char[] input) {
496        reset();
497        this.chars = ArrayUtils.clone(input);
498        return this;
499    }
500
501    /**
502     * Checks whether there are any more tokens.
503     *
504     * @return true if there are more tokens
505     */
506    @Override
507    public boolean hasNext() {
508        checkTokenized();
509        return tokenPos < tokens.length;
510    }
511
512    /**
513     * Gets the next token.
514     *
515     * @return the next String token
516     * @throws NoSuchElementException if there are no more elements
517     */
518    @Override
519    public String next() {
520        if (hasNext()) {
521            return tokens[tokenPos++];
522        }
523        throw new NoSuchElementException();
524    }
525
526    /**
527     * Gets the index of the next token to return.
528     *
529     * @return the next token index
530     */
531    @Override
532    public int nextIndex() {
533        return tokenPos;
534    }
535
536    /**
537     * Checks whether there are any previous tokens that can be iterated to.
538     *
539     * @return true if there are previous tokens
540     */
541    @Override
542    public boolean hasPrevious() {
543        checkTokenized();
544        return tokenPos > 0;
545    }
546
547    /**
548     * Gets the token previous to the last returned token.
549     *
550     * @return the previous token
551     */
552    @Override
553    public String previous() {
554        if (hasPrevious()) {
555            return tokens[--tokenPos];
556        }
557        throw new NoSuchElementException();
558    }
559
560    /**
561     * Gets the index of the previous token.
562     *
563     * @return the previous token index
564     */
565    @Override
566    public int previousIndex() {
567        return tokenPos - 1;
568    }
569
570    /**
571     * Unsupported ListIterator operation.
572     *
573     * @throws UnsupportedOperationException always
574     */
575    @Override
576    public void remove() {
577        throw new UnsupportedOperationException("remove() is unsupported");
578    }
579
580    /**
581     * Unsupported ListIterator operation.
582     * @param obj this parameter ignored.
583     * @throws UnsupportedOperationException always
584     */
585    @Override
586    public void set(final String obj) {
587        throw new UnsupportedOperationException("set() is unsupported");
588    }
589
590    /**
591     * Unsupported ListIterator operation.
592     * @param obj this parameter ignored.
593     * @throws UnsupportedOperationException always
594     */
595    @Override
596    public void add(final String obj) {
597        throw new UnsupportedOperationException("add() is unsupported");
598    }
599
600    /**
601     * Checks if tokenization has been done, and if not then do it.
602     */
603    private void checkTokenized() {
604        if (tokens == null) {
605            if (chars == null) {
606                // still call tokenize as subclass may do some work
607                final List<String> split = tokenize(null, 0, 0);
608                tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
609            } else {
610                final List<String> split = tokenize(chars, 0, chars.length);
611                tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
612            }
613        }
614    }
615
616    /**
617     * Internal method to performs the tokenization.
618     * <p>
619     * Most users of this class do not need to call this method. This method
620     * will be called automatically by other (public) methods when required.
621     * </p>
622     * <p>
623     * This method exists to allow subclasses to add code before or after the
624     * tokenization. For example, a subclass could alter the character array,
625     * offset or count to be parsed, or call the tokenizer multiple times on
626     * multiple strings. It is also be possible to filter the results.
627     * </p>
628     * <p>
629     * {@link StrTokenizer} will always pass a zero offset and a count
630     * equal to the length of the array to this method, however a subclass
631     * may pass other values, or even an entirely different array.
632     * </p>
633     *
634     * @param srcChars  the character array being tokenized, may be null
635     * @param offset  the start position within the character array, must be valid
636     * @param count  the number of characters to tokenize, must be valid
637     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
638     */
639    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
640        if (ArrayUtils.isEmpty(srcChars)) {
641            return Collections.emptyList();
642        }
643        final StrBuilder buf = new StrBuilder();
644        final List<String> tokenList = new ArrayList<>();
645        int pos = offset;
646
647        // loop around the entire buffer
648        while (pos >= 0 && pos < count) {
649            // find next token
650            pos = readNextToken(srcChars, pos, count, buf, tokenList);
651
652            // handle case where end of string is a delimiter
653            if (pos >= count) {
654                addToken(tokenList, StringUtils.EMPTY);
655            }
656        }
657        return tokenList;
658    }
659
660    /**
661     * Adds a token to a list, paying attention to the parameters we've set.
662     *
663     * @param list  the list to add to
664     * @param tok  the token to add
665     */
666    private void addToken(final List<String> list, String tok) {
667        if (StringUtils.isEmpty(tok)) {
668            if (isIgnoreEmptyTokens()) {
669                return;
670            }
671            if (isEmptyTokenAsNull()) {
672                tok = null;
673            }
674        }
675        list.add(tok);
676    }
677
678    /**
679     * Reads character by character through the String to get the next token.
680     *
681     * @param srcChars  the character array being tokenized
682     * @param start  the first character of field
683     * @param len  the length of the character array being tokenized
684     * @param workArea  a temporary work area
685     * @param tokenList  the list of parsed tokens
686     * @return the starting position of the next field (the character
687     *  immediately after the delimiter), or -1 if end of string found
688     */
689    private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) {
690        // skip all leading whitespace, unless it is the
691        // field delimiter or the quote character
692        while (start < len) {
693            final int removeLen = Math.max(
694                    getIgnoredMatcher().isMatch(srcChars, start, start, len),
695                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
696            if (removeLen == 0 ||
697                getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
698                getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
699                break;
700            }
701            start += removeLen;
702        }
703
704        // handle reaching end
705        if (start >= len) {
706            addToken(tokenList, StringUtils.EMPTY);
707            return -1;
708        }
709
710        // handle empty token
711        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
712        if (delimLen > 0) {
713            addToken(tokenList, StringUtils.EMPTY);
714            return start + delimLen;
715        }
716
717        // handle found token
718        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
719        if (quoteLen > 0) {
720            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
721        }
722        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
723    }
724
725    /**
726     * Reads a possibly quoted string token.
727     *
728     * @param srcChars  the character array being tokenized
729     * @param start  the first character of field
730     * @param len  the length of the character array being tokenized
731     * @param workArea  a temporary work area
732     * @param tokenList  the list of parsed tokens
733     * @param quoteStart  the start position of the matched quote, 0 if no quoting
734     * @param quoteLen  the length of the matched quote, 0 if no quoting
735     * @return the starting position of the next field (the character
736     *  immediately after the delimiter, or if end of string found,
737     *  then the length of string
738     */
739    private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
740                               final List<String> tokenList, final int quoteStart, final int quoteLen) {
741        // Loop until we've found the end of the quoted
742        // string or the end of the input
743        workArea.clear();
744        int pos = start;
745        boolean quoting = quoteLen > 0;
746        int trimStart = 0;
747
748        while (pos < len) {
749            // quoting mode can occur several times throughout a string
750            // we must switch between quoting and non-quoting until we
751            // encounter a non-quoted delimiter, or end of string
752            if (quoting) {
753                // In quoting mode
754
755                // If we've found a quote character, see if it's
756                // followed by a second quote.  If so, then we need
757                // to actually put the quote character into the token
758                // rather than end the token.
759                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
760                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
761                        // matched pair of quotes, thus an escaped quote
762                        workArea.append(srcChars, pos, quoteLen);
763                        pos += quoteLen * 2;
764                        trimStart = workArea.size();
765                        continue;
766                    }
767
768                    // end of quoting
769                    quoting = false;
770                    pos += quoteLen;
771                    continue;
772                }
773
774            } else {
775                // Not in quoting mode
776
777                // check for delimiter, and thus end of token
778                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
779                if (delimLen > 0) {
780                    // return condition when end of token found
781                    addToken(tokenList, workArea.substring(0, trimStart));
782                    return pos + delimLen;
783                }
784
785                // check for quote, and thus back into quoting mode
786                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
787                    quoting = true;
788                    pos += quoteLen;
789                    continue;
790                }
791
792                // check for ignored (outside quotes), and ignore
793                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
794                if (ignoredLen > 0) {
795                    pos += ignoredLen;
796                    continue;
797                }
798
799                // check for trimmed character
800                // don't yet know if it's at the end, so copy to workArea
801                // use trimStart to keep track of trim at the end
802                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
803                if (trimmedLen > 0) {
804                    workArea.append(srcChars, pos, trimmedLen);
805                    pos += trimmedLen;
806                    continue;
807                }
808            }
809            // copy regular character from inside quotes
810            workArea.append(srcChars[pos++]);
811            trimStart = workArea.size();
812        }
813
814        // return condition when end of string found
815        addToken(tokenList, workArea.substring(0, trimStart));
816        return -1;
817    }
818
819    /**
820     * Checks if the characters at the index specified match the quote
821     * already matched in readNextToken().
822     *
823     * @param srcChars  the character array being tokenized
824     * @param pos  the position to check for a quote
825     * @param len  the length of the character array being tokenized
826     * @param quoteStart  the start position of the matched quote, 0 if no quoting
827     * @param quoteLen  the length of the matched quote, 0 if no quoting
828     * @return true if a quote is matched
829     */
830    private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
831        for (int i = 0; i < quoteLen; i++) {
832            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
833                return false;
834            }
835        }
836        return true;
837    }
838
839    /**
840     * Gets the field delimiter matcher.
841     *
842     * @return the delimiter matcher in use
843     */
844    public StrMatcher getDelimiterMatcher() {
845        return this.delimMatcher;
846    }
847
848    /**
849     * Sets the field delimiter matcher.
850     * <p>
851     * The delimiter is used to separate one token from another.
852     * </p>
853     *
854     * @param delim  the delimiter matcher to use
855     * @return this, to enable chaining
856     */
857    public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
858        if (delim == null) {
859            this.delimMatcher = StrMatcher.noneMatcher();
860        } else {
861            this.delimMatcher = delim;
862        }
863        return this;
864    }
865
866    /**
867     * Sets the field delimiter character.
868     *
869     * @param delim  the delimiter character to use
870     * @return this, to enable chaining
871     */
872    public StrTokenizer setDelimiterChar(final char delim) {
873        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
874    }
875
876    /**
877     * Sets the field delimiter string.
878     *
879     * @param delim  the delimiter string to use
880     * @return this, to enable chaining
881     */
882    public StrTokenizer setDelimiterString(final String delim) {
883        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
884    }
885
886    /**
887     * Gets the quote matcher currently in use.
888     * <p>
889     * The quote character is used to wrap data between the tokens.
890     * This enables delimiters to be entered as data.
891     * The default value is '"' (double quote).
892     * </p>
893     *
894     * @return the quote matcher in use
895     */
896    public StrMatcher getQuoteMatcher() {
897        return quoteMatcher;
898    }
899
900    /**
901     * Set the quote matcher to use.
902     * <p>
903     * The quote character is used to wrap data between the tokens.
904     * This enables delimiters to be entered as data.
905     * </p>
906     *
907     * @param quote  the quote matcher to use, null ignored
908     * @return this, to enable chaining
909     */
910    public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
911        if (quote != null) {
912            this.quoteMatcher = quote;
913        }
914        return this;
915    }
916
917    /**
918     * Sets the quote character to use.
919     * <p>
920     * The quote character is used to wrap data between the tokens.
921     * This enables delimiters to be entered as data.
922     * </p>
923     *
924     * @param quote  the quote character to use
925     * @return this, to enable chaining
926     */
927    public StrTokenizer setQuoteChar(final char quote) {
928        return setQuoteMatcher(StrMatcher.charMatcher(quote));
929    }
930
931    // Ignored
932    /**
933     * Gets the ignored character matcher.
934     * <p>
935     * These characters are ignored when parsing the String, unless they are
936     * within a quoted region.
937     * The default value is not to ignore anything.
938     * </p>
939     *
940     * @return the ignored matcher in use
941     */
942    public StrMatcher getIgnoredMatcher() {
943        return ignoredMatcher;
944    }
945
946    /**
947     * Set the matcher for characters to ignore.
948     * <p>
949     * These characters are ignored when parsing the String, unless they are
950     * within a quoted region.
951     * </p>
952     *
953     * @param ignored  the ignored matcher to use, null ignored
954     * @return this, to enable chaining
955     */
956    public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
957        if (ignored != null) {
958            this.ignoredMatcher = ignored;
959        }
960        return this;
961    }
962
963    /**
964     * Set the character to ignore.
965     * <p>
966     * This character is ignored when parsing the String, unless it is
967     * within a quoted region.
968     *
969     * @param ignored  the ignored character to use
970     * @return this, to enable chaining
971     */
972    public StrTokenizer setIgnoredChar(final char ignored) {
973        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
974    }
975
976    /**
977     * Gets the trimmer character matcher.
978     * <p>
979     * These characters are trimmed off on each side of the delimiter
980     * until the token or quote is found.
981     * The default value is not to trim anything.
982     * </p>
983     *
984     * @return the trimmer matcher in use
985     */
986    public StrMatcher getTrimmerMatcher() {
987        return trimmerMatcher;
988    }
989
990    /**
991     * Sets the matcher for characters to trim.
992     * <p>
993     * These characters are trimmed off on each side of the delimiter
994     * until the token or quote is found.
995     * </p>
996     *
997     * @param trimmer  the trimmer matcher to use, null ignored
998     * @return this, to enable chaining
999     */
1000    public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
1001        if (trimmer != null) {
1002            this.trimmerMatcher = trimmer;
1003        }
1004        return this;
1005    }
1006
1007    /**
1008     * Gets whether the tokenizer currently returns empty tokens as null.
1009     * The default for this property is false.
1010     *
1011     * @return true if empty tokens are returned as null
1012     */
1013    public boolean isEmptyTokenAsNull() {
1014        return this.emptyAsNull;
1015    }
1016
1017    /**
1018     * Sets whether the tokenizer should return empty tokens as null.
1019     * The default for this property is false.
1020     *
1021     * @param emptyAsNull  whether empty tokens are returned as null
1022     * @return this, to enable chaining
1023     */
1024    public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
1025        this.emptyAsNull = emptyAsNull;
1026        return this;
1027    }
1028
1029    /**
1030     * Gets whether the tokenizer currently ignores empty tokens.
1031     * The default for this property is true.
1032     *
1033     * @return true if empty tokens are not returned
1034     */
1035    public boolean isIgnoreEmptyTokens() {
1036        return ignoreEmptyTokens;
1037    }
1038
1039    /**
1040     * Sets whether the tokenizer should ignore and not return empty tokens.
1041     * The default for this property is true.
1042     *
1043     * @param ignoreEmptyTokens  whether empty tokens are not returned
1044     * @return this, to enable chaining
1045     */
1046    public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
1047        this.ignoreEmptyTokens = ignoreEmptyTokens;
1048        return this;
1049    }
1050
1051    /**
1052     * Gets the String content that the tokenizer is parsing.
1053     *
1054     * @return the string content being parsed
1055     */
1056    public String getContent() {
1057        if (chars == null) {
1058            return null;
1059        }
1060        return new String(chars);
1061    }
1062
1063    /**
1064     * Creates a new instance of this Tokenizer. The new instance is reset so
1065     * that it will be at the start of the token list.
1066     * If a {@link CloneNotSupportedException} is caught, return {@code null}.
1067     *
1068     * @return a new instance of this Tokenizer which has been reset.
1069     */
1070    @Override
1071    public Object clone() {
1072        try {
1073            return cloneReset();
1074        } catch (final CloneNotSupportedException ex) {
1075            return null;
1076        }
1077    }
1078
1079    /**
1080     * Creates a new instance of this Tokenizer. The new instance is reset so that
1081     * it will be at the start of the token list.
1082     *
1083     * @return a new instance of this Tokenizer which has been reset.
1084     * @throws CloneNotSupportedException if there is a problem cloning
1085     */
1086    Object cloneReset() throws CloneNotSupportedException {
1087        // this method exists to enable 100% test coverage
1088        final StrTokenizer cloned = (StrTokenizer) super.clone();
1089        if (cloned.chars != null) {
1090            cloned.chars = cloned.chars.clone();
1091        }
1092        cloned.reset();
1093        return cloned;
1094    }
1095
1096    /**
1097     * Gets the String content that the tokenizer is parsing.
1098     *
1099     * @return the string content being parsed
1100     */
1101    @Override
1102    public String toString() {
1103        if (tokens == null) {
1104            return "StrTokenizer[not tokenized yet]";
1105        }
1106        return "StrTokenizer" + getTokenList();
1107    }
1108
1109}