/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.lang3.text;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;
import java.util.StringTokenizer;

import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;

/**
 * Tokenizes a string based on delimiters (separators), with support for
 * quoting and ignored characters.
 * <p>
 * This class can split a String into many smaller strings. It aims
 * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
 * however it offers much more control and flexibility including implementing
 * the {@link ListIterator} interface. By default, it is set up
 * like {@link StringTokenizer}.
 * </p>
 * <p>
 * The input String is split into a number of <i>tokens</i>.
 * Each token is separated from the next by a <i>delimiter</i>.
 * One or more delimiter characters must be specified.
 * </p>
 * <p>
 * Each token may be surrounded by quotes.
 * The <i>quote</i> matcher specifies the quote character(s).
 * A quote may be escaped within a quoted section by duplicating itself.
 * </p>
 * <p>
 * Between each token and the delimiter there may be characters that need trimming.
 * The <i>trimmer</i> matcher specifies these characters.
 * One usage might be to trim whitespace characters.
 * </p>
 * <p>
 * At any point outside the quotes there may be invalid characters.
 * The <i>ignored</i> matcher specifies these characters to be removed.
 * One usage might be to remove new line characters.
 * </p>
 * <p>
 * Empty tokens may be removed or returned as null.
 * </p>
 * <pre>
 * "a,b,c"         - Three tokens "a","b","c"   (comma delimiter)
 * " a, b , c "    - Three tokens "a","b","c"   (default CSV processing trims whitespace)
 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
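 * <p>
 * As a usage sketch of the quoting rules above (using the CSV factory method
 * defined later in this class), a quoted field may contain the delimiter, and
 * a doubled quote is an escaped quote:
 * </p>
 * <pre>{@code
 * StrTokenizer tok = StrTokenizer.getCSVInstance("a,\"b,c\",\"d\"\"e\"");
 * tok.getTokenList(); // three tokens: a | b,c | d"e
 * }</pre>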
 *
 * <table>
 * <caption>StrTokenizer properties and options</caption>
 * <tr>
 *  <th>Property</th><th>Type</th><th>Default</th>
 * </tr>
 * <tr>
 *  <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
 * </tr>
 * <tr>
 *  <td>quote</td><td>NoneMatcher</td><td>{}</td>
 * </tr>
 * <tr>
 *  <td>ignore</td><td>NoneMatcher</td><td>{}</td>
 * </tr>
 * <tr>
 *  <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
 * </tr>
 * <tr>
 *  <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
 * </tr>
 * </table>
 *
 * @since 2.2
 * @deprecated As of 3.6, use Apache Commons Text
 * <a href="https://commons.apache.org/proper/commons-text/javadocs/api-release/org/apache/commons/text/StringTokenizer.html">
 * StringTokenizer</a> instead
 */
@Deprecated
public class StrTokenizer implements ListIterator<String>, Cloneable {

    private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
    private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }

    /** The text to work on. */
    private char[] chars;
    /** The parsed tokens */
    private String[] tokens;
    /** The current iteration position */
    private int tokenPos;

    /** The delimiter matcher */
    private StrMatcher delimMatcher = StrMatcher.splitMatcher();
    /** The quote matcher */
    private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
    /** The ignored matcher */
    private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
    /** The trimmer matcher */
    private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();

    /** Whether to return empty tokens as null */
    private boolean emptyAsNull;
    /** Whether to ignore empty tokens */
    private boolean ignoreEmptyTokens = true;

    /**
     * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
     *
     * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
     */
    private static StrTokenizer getCSVClone() {
        return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings.
     * The default for CSV processing is to trim whitespace from both ends
     * (which can be overridden with the setTrimmerMatcher method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * </p>
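     * <p>
     * A typical usage might look like this (illustrative sketch only):
     * </p>
     * <pre>{@code
     * StrTokenizer tok = StrTokenizer.getCSVInstance();
     * tok.reset(" a, b , c ");
     * tok.getTokenList(); // ["a", "b", "c"] - whitespace trimmed by default
     * }</pre>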
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * is to trim whitespace from both ends (which can be overridden with
     * the setTrimmerMatcher method).
     *
     * @param input the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final String input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings,
     * initializing it with the given input. The default for CSV processing
     * is to trim whitespace from both ends (which can be overridden with
     * the setTrimmerMatcher method).
     *
     * @param input the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StrTokenizer getCSVInstance(final char[] input) {
        final StrTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
     *
     * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
     */
    private static StrTokenizer getTSVClone() {
        return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (which can be overridden with the setTrimmerMatcher method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     * </p>
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (which can be overridden with the setTrimmerMatcher method).
     * @param input the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final String input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings.
     * The default for TSV processing is to trim whitespace from both ends
     * (which can be overridden with the setTrimmerMatcher method).
     * @param input the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StrTokenizer getTSVInstance(final char[] input) {
        final StrTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer, but with no text to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     * </p>
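     * <p>
     * For example (an illustrative sketch):
     * </p>
     * <pre>{@code
     * StrTokenizer tok = new StrTokenizer();
     * tok.reset("one two\tthree");
     * tok.getTokenList(); // ["one", "two", "three"]
     * }</pre>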
     */
    public StrTokenizer() {
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input the string which is to be parsed
     */
    public StrTokenizer(final String input) {
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter character
     */
    public StrTokenizer(final String input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter string
     */
    public StrTokenizer(final String input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter character
     * @param quote the field quoted string character
     */
    public StrTokenizer(final String input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input the string which is to be parsed
     * @param delim the field delimiter matcher
     * @param quote the field quoted string matcher
     */
    public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and formfeed
     * as per StringTokenizer.
     *
     * @param input the character array which is to be parsed, cloned internally
     */
    public StrTokenizer(final char[] input) {
        this.chars = ArrayUtils.clone(input);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input the character array which is to be parsed, cloned internally
     * @param delim the field delimiter character
     */
    public StrTokenizer(final char[] input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input the character array which is to be parsed, cloned internally
     * @param delim the field delimiter string
     */
    public StrTokenizer(final char[] input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
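     * <p>
     * For instance (an illustrative sketch):
     * </p>
     * <pre>{@code
     * char[] data = "a:b:c".toCharArray();
     * StrTokenizer tok = new StrTokenizer(data, StrMatcher.charMatcher(':'));
     * tok.getTokenArray(); // {"a", "b", "c"}
     * }</pre>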
     *
     * @param input the character array which is to be parsed, cloned internally
     * @param delim the field delimiter matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character
     * and handling quotes using the specified quote character.
     *
     * @param input the character array which is to be parsed, cloned internally
     * @param delim the field delimiter character
     * @param quote the field quoted string character
     */
    public StrTokenizer(final char[] input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher
     * and handling quotes using the specified quote matcher.
     *
     * @param input the character array which is to be parsed, cloned internally
     * @param delim the field delimiter matcher
     * @param quote the field quoted string matcher
     */
    public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    // API
    /**
     * Gets the number of tokens found in the String.
     *
     * @return the number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Gets the next token from the String.
     * Equivalent to {@link #next()} except it returns null rather than
     * throwing {@link NoSuchElementException} when no tokens remain.
     *
     * @return the next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the previous token from the String.
     *
     * @return the previous sequential token, or null if there are no previous tokens
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return the tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
     *
     * @return the tokens as a String list
     */
    public List<String> getTokenList() {
        checkTokenized();
        final List<String> list = new ArrayList<>(tokens.length);
        list.addAll(Arrays.asList(tokens));
        return list;
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     * </p>
     *
     * @return this, to enable chaining
     */
    public StrTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
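     * <p>
     * For example, a sketch of re-using one tokenizer across several lines
     * ({@code lines} and {@code process} are assumed to exist in the caller):
     * </p>
     * <pre>{@code
     * StrTokenizer tok = StrTokenizer.getCSVInstance();
     * for (String line : lines) {
     *     tok.reset(line);
     *     process(tok.getTokenList());
     * }
     * }</pre>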
     *
     * @param input the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final String input) {
        reset();
        if (input != null) {
            this.chars = input.toCharArray();
        } else {
            this.chars = null;
        }
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input character array to parse.
     * In this manner you can re-use a tokenizer with the same settings
     * on multiple input lines.
     *
     * @param input the new character array to tokenize, cloned internally, null sets no text to parse
     * @return this, to enable chaining
     */
    public StrTokenizer reset(final char[] input) {
        reset();
        this.chars = ArrayUtils.clone(input);
        return this;
    }

    /**
     * Checks whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    @Override
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Gets the next token.
     *
     * @return the next String token
     * @throws NoSuchElementException if there are no more elements
     */
    @Override
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return the next token index
     */
    @Override
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Checks whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    @Override
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return the previous token
     * @throws NoSuchElementException if there are no previous elements
     */
    @Override
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return the previous token index
     */
    @Override
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj this parameter is ignored.
     * @throws UnsupportedOperationException always
     */
    @Override
    public void set(final String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj this parameter is ignored.
     * @throws UnsupportedOperationException always
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    /**
     * Checks if tokenization has been done, and if not, does it.
     */
    private void checkTokenized() {
        if (tokens == null) {
            if (chars == null) {
                // still call tokenize as subclass may do some work
                final List<String> split = tokenize(null, 0, 0);
                tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
            } else {
                final List<String> split = tokenize(chars, 0, chars.length);
                tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
            }
        }
    }

    /**
     * Internal method that performs the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method
     * will be called automatically by other (public) methods when required.
     * </p>
     * <p>
     * This method exists to allow subclasses to add code before or after the
     * tokenization. For example, a subclass could alter the character array,
     * offset or count to be parsed, or call the tokenizer multiple times on
     * multiple strings. It is also possible to filter the results.
     * </p>
     * <p>
     * {@link StrTokenizer} will always pass a zero offset and a count
     * equal to the length of the array to this method, however a subclass
     * may pass other values, or even an entirely different array.
     * </p>
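     * <p>
     * A minimal sketch of such a subclass (the filtering rule is illustrative only):
     * </p>
     * <pre>{@code
     * class CommentSkippingTokenizer extends StrTokenizer {
     *     @Override
     *     protected List<String> tokenize(char[] chars, int offset, int count) {
     *         List<String> result = super.tokenize(chars, offset, count);
     *         result.removeIf(t -> t != null && t.startsWith("#")); // drop comment tokens
     *         return result;
     *     }
     * }
     * }</pre>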
     *
     * @param srcChars the character array being tokenized, may be null
     * @param offset the start position within the character array, must be valid
     * @param count the number of characters to tokenize, must be valid
     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
        if (ArrayUtils.isEmpty(srcChars)) {
            return Collections.emptyList();
        }
        final StrBuilder buf = new StrBuilder();
        final List<String> tokenList = new ArrayList<>();
        int pos = offset;

        // loop around the entire buffer
        while (pos >= 0 && pos < count) {
            // find next token
            pos = readNextToken(srcChars, pos, count, buf, tokenList);

            // handle case where end of string is a delimiter
            if (pos >= count) {
                addToken(tokenList, StringUtils.EMPTY);
            }
        }
        return tokenList;
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     *
     * @param list the list to add to
     * @param tok the token to add
     */
    private void addToken(final List<String> list, String tok) {
        if (StringUtils.isEmpty(tok)) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

    /**
     * Reads character by character through the String to get the next token.
     *
     * @param srcChars the character array being tokenized
     * @param start the first character of field
     * @param len the length of the character array being tokenized
     * @param workArea a temporary work area
     * @param tokenList the list of parsed tokens
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if end of string found
     */
    private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) {
        // skip all leading ignored and trimmed characters, unless the character
        // is the field delimiter or the quote character
        while (start < len) {
            final int removeLen = Math.max(
                    getIgnoredMatcher().isMatch(srcChars, start, start, len),
                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
            if (removeLen == 0 ||
                    getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
                    getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokenList, StringUtils.EMPTY);
            return -1;
        }

        // handle empty token
        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
        if (delimLen > 0) {
            addToken(tokenList, StringUtils.EMPTY);
            return start + delimLen;
        }

        // handle found token
        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
        }
        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
    }

    /**
     * Reads a possibly quoted string token.
     *
     * @param srcChars the character array being tokenized
     * @param start the first character of field
     * @param len the length of the character array being tokenized
     * @param workArea a temporary work area
     * @param tokenList the list of parsed tokens
     * @param quoteStart the start position of the matched quote, 0 if no quoting
     * @param quoteLen the length of the matched quote, 0 if no quoting
     * @return the starting position of the next field (the character
     *  immediately after the delimiter), or -1 if the end of the string is found
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
                               final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = quoteLen > 0;
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote.  If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }
            }
            // copy regular character from inside quotes
            workArea.append(srcChars[pos++]);
            trimStart = workArea.size();
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Checks if the characters at the index specified match the quote
     * already matched in readNextToken().
     *
     * @param srcChars the character array being tokenized
     * @param pos the position to check for a quote
     * @param len the length of the character array being tokenized
     * @param quoteStart the start position of the matched quote, 0 if no quoting
     * @param quoteLen the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Gets the field delimiter matcher.
     *
     * @return the delimiter matcher in use
     */
    public StrMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     * </p>
     *
     * @param delim the delimiter matcher to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
        if (delim == null) {
            this.delimMatcher = StrMatcher.noneMatcher();
        } else {
            this.delimMatcher = delim;
        }
        return this;
    }

    /**
     * Sets the field delimiter character.
     *
     * @param delim the delimiter character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterChar(final char delim) {
        return setDelimiterMatcher(StrMatcher.charMatcher(delim));
    }

    /**
     * Sets the field delimiter string.
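     * <p>
     * For example (an illustrative sketch):
     * </p>
     * <pre>{@code
     * StrTokenizer tok = new StrTokenizer("a::b::c").setDelimiterString("::");
     * tok.getTokenList(); // ["a", "b", "c"]
     * }</pre>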
     *
     * @param delim the delimiter string to use
     * @return this, to enable chaining
     */
    public StrTokenizer setDelimiterString(final String delim) {
        return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
    }

    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * The default is not to use a quote (none matcher).
     * </p>
     *
     * @return the quote matcher in use
     */
    public StrMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Sets the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * </p>
     *
     * @param quote the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens.
     * This enables delimiters to be entered as data.
     * </p>
     *
     * @param quote the quote character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setQuoteChar(final char quote) {
        return setQuoteMatcher(StrMatcher.charMatcher(quote));
    }

    // Ignored
    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     * The default value is not to ignore anything.
     * </p>
     *
     * @return the ignored matcher in use
     */
    public StrMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Sets the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are
     * within a quoted region.
     * </p>
     *
     * @param ignored the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Sets the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is
     * within a quoted region.
     * </p>
     *
     * @param ignored the ignored character to use
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoredChar(final char ignored) {
        return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
    }

    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     * The default value is not to trim anything.
     * </p>
     *
     * @return the trimmer matcher in use
     */
    public StrMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter
     * until the token or quote is found.
     * </p>
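     * <p>
     * A sketch of trimming with a custom matcher (the hyphen rule is illustrative only):
     * </p>
     * <pre>{@code
     * StrTokenizer tok = new StrTokenizer("--a-, -b--", ',')
     *         .setTrimmerMatcher(StrMatcher.charSetMatcher(" -"));
     * tok.getTokenList(); // ["a", "b"]
     * }</pre>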
     *
     * @param trimmer the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    /**
     * Gets whether the tokenizer currently returns empty tokens as null.
     * The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null.
     * The default for this property is false.
     *
     * @param emptyAsNull whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    /**
     * Gets whether the tokenizer currently ignores empty tokens.
     * The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens.
     * The default for this property is true.
     *
     * @param ignoreEmptyTokens whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }

    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return the string content being parsed
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so
     * that it will be at the start of the token list.
     * If a {@link CloneNotSupportedException} is caught, return {@code null}.
     *
     * @return a new instance of this Tokenizer which has been reset.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (final CloneNotSupportedException ex) {
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that
     * it will be at the start of the token list.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        final StrTokenizer cloned = (StrTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    /**
     * Gets a String representation of this tokenizer, including the full
     * list of tokens if tokenization has been performed.
     *
     * @return the string representation of this tokenizer
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StrTokenizer[not tokenized yet]";
        }
        return "StrTokenizer" + getTokenList();
    }

}