ICU 4.8.1.1
4.8.1.1
|
00001 /* 00002 ********************************************************************** 00003 * Copyright (C) 2002-2011, International Business Machines 00004 * Corporation and others. All Rights Reserved. 00005 ********************************************************************** 00006 * file name: regex.h 00007 * encoding: US-ASCII 00008 * indentation:4 00009 * 00010 * created on: 2002oct22 00011 * created by: Andy Heninger 00012 * 00013 * ICU Regular Expressions, API for C++ 00014 */ 00015 00016 #ifndef REGEX_H 00017 #define REGEX_H 00018 00019 //#define REGEX_DEBUG 00020 00045 #include "unicode/utypes.h" 00046 00047 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 00048 00049 #include "unicode/uobject.h" 00050 #include "unicode/unistr.h" 00051 #include "unicode/utext.h" 00052 #include "unicode/parseerr.h" 00053 00054 #include "unicode/uregex.h" 00055 00056 // Forward Declarations 00057 00058 U_NAMESPACE_BEGIN 00059 00060 struct Regex8BitSet; 00061 class RegexCImpl; 00062 class RegexMatcher; 00063 class RegexPattern; 00064 struct REStackFrame; 00065 class RuleBasedBreakIterator; 00066 class UnicodeSet; 00067 class UVector; 00068 class UVector32; 00069 class UVector64; 00070 00075 #ifdef REGEX_DEBUG 00076 U_INTERNAL void U_EXPORT2 00077 RegexPatternDump(const RegexPattern *pat); 00078 #else 00079 #undef RegexPatternDump 00080 #define RegexPatternDump(pat) 00081 #endif 00082 00083 00084 00096 class U_I18N_API RegexPattern: public UObject { 00097 public: 00098 00106 RegexPattern(); 00107 00114 RegexPattern(const RegexPattern &source); 00115 00121 virtual ~RegexPattern(); 00122 00131 UBool operator==(const RegexPattern& that) const; 00132 00141 inline UBool operator!=(const RegexPattern& that) const {return ! operator ==(that);} 00142 00148 RegexPattern &operator =(const RegexPattern &source); 00149 00157 virtual RegexPattern *clone() const; 00158 00159 00184 static RegexPattern * U_EXPORT2 compile( const UnicodeString ®ex, 00185 UParseError &pe, 00186 UErrorCode &status); 00187 00188 00215 static RegexPattern * U_EXPORT2 compile( UText *regex, 00216 UParseError &pe, 00217 UErrorCode &status); 00218 00243 static RegexPattern * U_EXPORT2 compile( const UnicodeString ®ex, 00244 uint32_t flags, 00245 UParseError &pe, 00246 UErrorCode &status); 00247 00248 00275 static RegexPattern * U_EXPORT2 compile( UText *regex, 00276 uint32_t flags, 00277 UParseError &pe, 00278 UErrorCode &status); 00279 00280 00303 static RegexPattern * U_EXPORT2 compile( const UnicodeString ®ex, 00304 uint32_t flags, 00305 UErrorCode &status); 00306 00307 00332 static RegexPattern * U_EXPORT2 compile( UText *regex, 00333 uint32_t flags, 00334 UErrorCode &status); 00335 00336 00342 virtual uint32_t flags() const; 00343 00361 virtual RegexMatcher *matcher(const UnicodeString &input, 00362 UErrorCode &status) const; 00363 00364 private: 00378 RegexMatcher *matcher(const UChar *input, 00379 UErrorCode &status) const; 00380 public: 00381 00382 00394 virtual RegexMatcher *matcher(UErrorCode &status) const; 00395 00396 00411 static UBool U_EXPORT2 matches(const UnicodeString ®ex, 00412 const UnicodeString &input, 00413 UParseError &pe, 00414 UErrorCode &status); 00415 00416 00431 static UBool U_EXPORT2 matches(UText *regex, 00432 UText *input, 00433 UParseError &pe, 00434 UErrorCode &status); 00435 00436 00445 virtual UnicodeString pattern() const; 00446 00447 00458 virtual UText *patternText(UErrorCode &status) const; 00459 00460 00499 virtual int32_t split(const UnicodeString &input, 00500 UnicodeString dest[], 00501 int32_t destCapacity, 00502 UErrorCode &status) const; 00503 00504 00543 virtual int32_t split(UText *input, 00544 UText *dest[], 00545 int32_t destCapacity, 00546 UErrorCode &status) const; 00547 00548 00554 virtual UClassID getDynamicClassID() const; 00555 00561 static UClassID U_EXPORT2 getStaticClassID(); 00562 00563 private: 00564 // 00565 // Implementation Data 00566 // 00567 UText *fPattern; // The original pattern string. 00568 UnicodeString *fPatternString; // The original pattern UncodeString if relevant 00569 uint32_t fFlags; // The flags used when compiling the pattern. 00570 // 00571 UVector64 *fCompiledPat; // The compiled pattern p-code. 00572 UnicodeString fLiteralText; // Any literal string data from the pattern, 00573 // after un-escaping, for use during the match. 00574 00575 UVector *fSets; // Any UnicodeSets referenced from the pattern. 00576 Regex8BitSet *fSets8; // (and fast sets for latin-1 range.) 00577 00578 00579 UErrorCode fDeferredStatus; // status if some prior error has left this 00580 // RegexPattern in an unusable state. 00581 00582 int32_t fMinMatchLen; // Minimum Match Length. All matches will have length 00583 // >= this value. For some patterns, this calculated 00584 // value may be less than the true shortest 00585 // possible match. 00586 00587 int32_t fFrameSize; // Size of a state stack frame in the 00588 // execution engine. 00589 00590 int32_t fDataSize; // The size of the data needed by the pattern that 00591 // does not go on the state stack, but has just 00592 // a single copy per matcher. 00593 00594 UVector32 *fGroupMap; // Map from capture group number to position of 00595 // the group's variables in the matcher stack frame. 00596 00597 int32_t fMaxCaptureDigits; 00598 00599 UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined 00600 // regex character classes, e.g. Word. 00601 00602 Regex8BitSet *fStaticSets8; // Ptr to the static (shared) latin-1 only 00603 // sets for predefined regex classes. 00604 00605 int32_t fStartType; // Info on how a match must start. 00606 int32_t fInitialStringIdx; // 00607 int32_t fInitialStringLen; 00608 UnicodeSet *fInitialChars; 00609 UChar32 fInitialChar; 00610 Regex8BitSet *fInitialChars8; 00611 UBool fNeedsAltInput; 00612 00613 friend class RegexCompile; 00614 friend class RegexMatcher; 00615 friend class RegexCImpl; 00616 00617 // 00618 // Implementation Methods 00619 // 00620 void init(); // Common initialization, for use by constructors. 00621 void zap(); // Common cleanup 00622 #ifdef REGEX_DEBUG 00623 void dumpOp(int32_t index) const; 00624 friend void U_EXPORT2 RegexPatternDump(const RegexPattern *); 00625 #endif 00626 00627 }; 00628 00629 00630 00640 class U_I18N_API RegexMatcher: public UObject { 00641 public: 00642 00657 RegexMatcher(const UnicodeString ®exp, uint32_t flags, UErrorCode &status); 00658 00674 RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status); 00675 00697 RegexMatcher(const UnicodeString ®exp, const UnicodeString &input, 00698 uint32_t flags, UErrorCode &status); 00699 00721 RegexMatcher(UText *regexp, UText *input, 00722 uint32_t flags, UErrorCode &status); 00723 00724 private: 00738 RegexMatcher(const UnicodeString ®exp, const UChar *input, 00739 uint32_t flags, UErrorCode &status); 00740 public: 00741 00742 00748 virtual ~RegexMatcher(); 00749 00750 00757 virtual UBool matches(UErrorCode &status); 00758 00759 00770 virtual UBool matches(int64_t startIndex, UErrorCode &status); 00771 00772 00786 virtual UBool lookingAt(UErrorCode &status); 00787 00788 00802 virtual UBool lookingAt(int64_t startIndex, UErrorCode &status); 00803 00804 00817 virtual UBool find(); 00818 00819 00829 virtual UBool find(int64_t start, UErrorCode &status); 00830 00831 00841 virtual UnicodeString group(UErrorCode &status) const; 00842 00843 00856 virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const; 00857 00858 00864 virtual int32_t groupCount() const; 00865 00866 00881 virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) const; 00882 00898 virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const; 00899 00915 virtual UText *group(int32_t groupNum, UText *dest, UErrorCode &status) const; 00916 00917 00925 virtual int32_t start(UErrorCode &status) const; 00926 00934 virtual int64_t start64(UErrorCode &status) const; 00935 00936 00950 virtual int32_t start(int32_t group, UErrorCode &status) const; 00951 00965 virtual int64_t start64(int32_t group, UErrorCode &status) const; 00966 00967 00981 virtual int32_t end(UErrorCode &status) const; 00982 00996 virtual int64_t end64(UErrorCode &status) const; 00997 00998 01016 virtual int32_t end(int32_t group, UErrorCode &status) const; 01017 01035 virtual int64_t end64(int32_t group, UErrorCode &status) const; 01036 01037 01046 virtual RegexMatcher &reset(); 01047 01048 01064 virtual RegexMatcher &reset(int64_t index, UErrorCode &status); 01065 01066 01084 virtual RegexMatcher &reset(const UnicodeString &input); 01085 01086 01100 virtual RegexMatcher &reset(UText *input); 01101 01102 01127 virtual RegexMatcher &refreshInputText(UText *input, UErrorCode &status); 01128 01129 private: 01143 RegexMatcher &reset(const UChar *input); 01144 public: 01145 01153 virtual const UnicodeString &input() const; 01154 01163 virtual UText *inputText() const; 01164 01175 virtual UText *getInput(UText *dest, UErrorCode &status) const; 01176 01177 01196 virtual RegexMatcher ®ion(int64_t start, int64_t limit, UErrorCode &status); 01197 01209 virtual RegexMatcher ®ion(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status); 01210 01219 virtual int32_t regionStart() const; 01220 01229 virtual int64_t regionStart64() const; 01230 01231 01240 virtual int32_t regionEnd() const; 01241 01250 virtual int64_t regionEnd64() const; 01251 01260 virtual UBool hasTransparentBounds() const; 01261 01280 virtual RegexMatcher &useTransparentBounds(UBool b); 01281 01282 01290 virtual UBool hasAnchoringBounds() const; 01291 01292 01305 virtual RegexMatcher &useAnchoringBounds(UBool b); 01306 01307 01320 virtual UBool hitEnd() const; 01321 01331 virtual UBool requireEnd() const; 01332 01333 01339 virtual const RegexPattern &pattern() const; 01340 01341 01358 virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status); 01359 01360 01381 virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status); 01382 01383 01404 virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status); 01405 01406 01431 virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status); 01432 01433 01461 virtual RegexMatcher &appendReplacement(UnicodeString &dest, 01462 const UnicodeString &replacement, UErrorCode &status); 01463 01464 01492 virtual RegexMatcher &appendReplacement(UText *dest, 01493 UText *replacement, UErrorCode &status); 01494 01495 01506 virtual UnicodeString &appendTail(UnicodeString &dest); 01507 01508 01522 virtual UText *appendTail(UText *dest, UErrorCode &status); 01523 01524 01548 virtual int32_t split(const UnicodeString &input, 01549 UnicodeString dest[], 01550 int32_t destCapacity, 01551 UErrorCode &status); 01552 01553 01577 virtual int32_t split(UText *input, 01578 UText *dest[], 01579 int32_t destCapacity, 01580 UErrorCode &status); 01581 01603 virtual void setTimeLimit(int32_t limit, UErrorCode &status); 01604 01611 virtual int32_t getTimeLimit() const; 01612 01634 virtual void setStackLimit(int32_t limit, UErrorCode &status); 01635 01643 virtual int32_t getStackLimit() const; 01644 01645 01659 virtual void setMatchCallback(URegexMatchCallback *callback, 01660 const void *context, 01661 UErrorCode &status); 01662 01663 01674 virtual void getMatchCallback(URegexMatchCallback *&callback, 01675 const void *&context, 01676 UErrorCode &status); 01677 01678 01692 virtual void setFindProgressCallback(URegexFindProgressCallback *callback, 01693 const void *context, 01694 UErrorCode &status); 01695 01696 01707 virtual void getFindProgressCallback(URegexFindProgressCallback *&callback, 01708 const void *&context, 01709 UErrorCode &status); 01710 01711 01717 void setTrace(UBool state); 01718 01719 01725 static UClassID U_EXPORT2 getStaticClassID(); 01726 01732 virtual UClassID getDynamicClassID() const; 01733 01734 private: 01735 // Constructors and other object boilerplate are private. 01736 // Instances of RegexMatcher can not be assigned, copied, cloned, etc. 01737 RegexMatcher(); // default constructor not implemented 01738 RegexMatcher(const RegexPattern *pat); 01739 RegexMatcher(const RegexMatcher &other); 01740 RegexMatcher &operator =(const RegexMatcher &rhs); 01741 void init(UErrorCode &status); // Common initialization 01742 void init2(UText *t, UErrorCode &e); // Common initialization, part 2. 01743 01744 friend class RegexPattern; 01745 friend class RegexCImpl; 01746 public: 01748 void resetPreserveRegion(); // Reset matcher state, but preserve any region. 01749 private: 01750 01751 // 01752 // MatchAt This is the internal interface to the match engine itself. 01753 // Match status comes back in matcher member variables. 01754 // 01755 void MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status); 01756 inline void backTrack(int64_t &inputIdx, int32_t &patIdx); 01757 UBool isWordBoundary(int64_t pos); // perform Perl-like \b test 01758 UBool isUWordBoundary(int64_t pos); // perform RBBI based \b test 01759 REStackFrame *resetStack(); 01760 inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status); 01761 void IncrementTime(UErrorCode &status); 01762 UBool ReportFindProgress(int64_t matchIndex, UErrorCode &status); 01763 01764 int64_t appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const; 01765 01766 UBool findUsingChunk(); 01767 void MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status); 01768 UBool isChunkWordBoundary(int32_t pos); 01769 01770 const RegexPattern *fPattern; 01771 RegexPattern *fPatternOwned; // Non-NULL if this matcher owns the pattern, and 01772 // should delete it when through. 01773 01774 const UnicodeString *fInput; // The string being matched. Only used for input() 01775 UText *fInputText; // The text being matched. Is never NULL. 01776 UText *fAltInputText; // A shallow copy of the text being matched. 01777 // Only created if the pattern contains backreferences. 01778 int64_t fInputLength; // Full length of the input text. 01779 int32_t fFrameSize; // The size of a frame in the backtrack stack. 01780 01781 int64_t fRegionStart; // Start of the input region, default = 0. 01782 int64_t fRegionLimit; // End of input region, default to input.length. 01783 01784 int64_t fAnchorStart; // Region bounds for anchoring operations (^ or $). 01785 int64_t fAnchorLimit; // See useAnchoringBounds 01786 01787 int64_t fLookStart; // Region bounds for look-ahead/behind and 01788 int64_t fLookLimit; // and other boundary tests. See 01789 // useTransparentBounds 01790 01791 int64_t fActiveStart; // Currently active bounds for matching. 01792 int64_t fActiveLimit; // Usually is the same as region, but 01793 // is changed to fLookStart/Limit when 01794 // entering look around regions. 01795 01796 UBool fTransparentBounds; // True if using transparent bounds. 01797 UBool fAnchoringBounds; // True if using anchoring bounds. 01798 01799 UBool fMatch; // True if the last attempted match was successful. 01800 int64_t fMatchStart; // Position of the start of the most recent match 01801 int64_t fMatchEnd; // First position after the end of the most recent match 01802 // Zero if no previous match, even when a region 01803 // is active. 01804 int64_t fLastMatchEnd; // First position after the end of the previous match, 01805 // or -1 if there was no previous match. 01806 int64_t fAppendPosition; // First position after the end of the previous 01807 // appendReplacement(). As described by the 01808 // JavaDoc for Java Matcher, where it is called 01809 // "append position" 01810 UBool fHitEnd; // True if the last match touched the end of input. 01811 UBool fRequireEnd; // True if the last match required end-of-input 01812 // (matched $ or Z) 01813 01814 UVector64 *fStack; 01815 REStackFrame *fFrame; // After finding a match, the last active stack frame, 01816 // which will contain the capture group results. 01817 // NOT valid while match engine is running. 01818 01819 int64_t *fData; // Data area for use by the compiled pattern. 01820 int64_t fSmallData[8]; // Use this for data if it's enough. 01821 01822 int32_t fTimeLimit; // Max time (in arbitrary steps) to let the 01823 // match engine run. Zero for unlimited. 01824 01825 int32_t fTime; // Match time, accumulates while matching. 01826 int32_t fTickCounter; // Low bits counter for time. Counts down StateSaves. 01827 // Kept separately from fTime to keep as much 01828 // code as possible out of the inline 01829 // StateSave function. 01830 01831 int32_t fStackLimit; // Maximum memory size to use for the backtrack 01832 // stack, in bytes. Zero for unlimited. 01833 01834 URegexMatchCallback *fCallbackFn; // Pointer to match progress callback funct. 01835 // NULL if there is no callback. 01836 const void *fCallbackContext; // User Context ptr for callback function. 01837 01838 URegexFindProgressCallback *fFindProgressCallbackFn; // Pointer to match progress callback funct. 01839 // NULL if there is no callback. 01840 const void *fFindProgressCallbackContext; // User Context ptr for callback function. 01841 01842 01843 UBool fInputUniStrMaybeMutable; // Set when fInputText wraps a UnicodeString that may be mutable - compatibility. 01844 01845 UBool fTraceDebug; // Set true for debug tracing of match engine. 01846 01847 UErrorCode fDeferredStatus; // Save error state that cannot be immediately 01848 // reported, or that permanently disables this matcher. 01849 01850 RuleBasedBreakIterator *fWordBreakItr; 01851 }; 01852 01853 U_NAMESPACE_END 01854 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS 01855 #endif