ICU 4.8.1.1
regex.h
Go to the documentation of this file.
00001 /*
00002 **********************************************************************
00003 *   Copyright (C) 2002-2011, International Business Machines
00004 *   Corporation and others.  All Rights Reserved.
00005 **********************************************************************
00006 *   file name:  regex.h
00007 *   encoding:   US-ASCII
00008 *   indentation:4
00009 *
00010 *   created on: 2002oct22
00011 *   created by: Andy Heninger
00012 *
00013 *   ICU Regular Expressions, API for C++
00014 */
00015 
00016 #ifndef REGEX_H
00017 #define REGEX_H
00018 
00019 //#define REGEX_DEBUG
00020 
00045 #include "unicode/utypes.h"
00046 
00047 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
00048 
00049 #include "unicode/uobject.h"
00050 #include "unicode/unistr.h"
00051 #include "unicode/utext.h"
00052 #include "unicode/parseerr.h"
00053 
00054 #include "unicode/uregex.h"
00055 
00056 // Forward Declarations
00057 
00058 U_NAMESPACE_BEGIN
00059 
00060 struct Regex8BitSet;
00061 class  RegexCImpl;
00062 class  RegexMatcher;
00063 class  RegexPattern;
00064 struct REStackFrame;
00065 class  RuleBasedBreakIterator;
00066 class  UnicodeSet;
00067 class  UVector;
00068 class  UVector32;
00069 class  UVector64;
00070 
00075 #ifdef REGEX_DEBUG  // Debug builds: RegexPatternDump is declared as a real function.
00076 U_INTERNAL void U_EXPORT2
00077     RegexPatternDump(const RegexPattern *pat);
00078 #else  // Otherwise RegexPatternDump(pat) is a macro that expands to nothing.
00079     #undef RegexPatternDump
00080     #define RegexPatternDump(pat)
00081 #endif  // REGEX_DEBUG
00082 
00083 
00084 
00096 class U_I18N_API RegexPattern: public UObject {  // A compiled regular expression pattern.
00097 public:
00098     // Default constructor.
00106     RegexPattern();
00107     // Copy constructor.
00114     RegexPattern(const RegexPattern &source);
00115     // Destructor (virtual).
00121     virtual ~RegexPattern();
00122     // Equality comparison against another RegexPattern.
00131     UBool           operator==(const RegexPattern& that) const;
00132     // Inequality: the logical negation of operator==.
00141     inline UBool    operator!=(const RegexPattern& that) const {return ! operator ==(that);}
00142     // Assignment operator.
00148     RegexPattern  &operator =(const RegexPattern &source);
00149     // Returns a RegexPattern pointer; presumably a new copy owned by the caller (TODO confirm).
00157     virtual RegexPattern  *clone() const;
00158     // compile() overloads: static factories that take the regex as a UnicodeString or a UText,
00159     // optionally with flags and/or a UParseError. NOTE(review): result presumably caller-owned - confirm.
00184     static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
00185         UParseError          &pe,
00186         UErrorCode           &status);
00187 
00188     // UText form of the overload above.
00215     static RegexPattern * U_EXPORT2 compile( UText *regex,
00216         UParseError          &pe,
00217         UErrorCode           &status);
00218     // UnicodeString regex with flags and a UParseError.
00243     static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
00244         uint32_t             flags,
00245         UParseError          &pe,
00246         UErrorCode           &status);
00247         
00248     // UText regex with flags and a UParseError.
00275     static RegexPattern * U_EXPORT2 compile( UText *regex,
00276         uint32_t             flags,
00277         UParseError          &pe,
00278         UErrorCode           &status);
00279     
00280     // UnicodeString regex with flags, no UParseError parameter.
00303     static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
00304         uint32_t             flags,
00305         UErrorCode           &status);
00306 
00307     // UText regex with flags, no UParseError parameter.
00332     static RegexPattern * U_EXPORT2 compile( UText *regex,
00333         uint32_t             flags,
00334         UErrorCode           &status);
00335     
00336     // Returns a uint32_t flags value; presumably fFlags, the flags used at compile time (TODO confirm).
00342     virtual uint32_t flags() const;
00343     // matcher() overloads return a RegexMatcher pointer; this one takes the input as a UnicodeString.
00361     virtual RegexMatcher *matcher(const UnicodeString &input,
00362         UErrorCode          &status) const;
00363     // The const UChar * overload below is private, so code outside the class cannot call it.
00364 private:
00378     RegexMatcher *matcher(const UChar *input,
00379         UErrorCode          &status) const;
00380 public:
00381 
00382     // matcher() overload that takes no input text.
00394     virtual RegexMatcher *matcher(UErrorCode  &status) const;
00395 
00396     // Static matches(): takes a regex and an input (UnicodeString form) and returns a UBool.
00411     static UBool U_EXPORT2 matches(const UnicodeString   &regex,
00412         const UnicodeString   &input,
00413               UParseError     &pe,
00414               UErrorCode      &status);
00415 
00416     // Static matches(): UText form.
00431     static UBool U_EXPORT2 matches(UText *regex,
00432         UText           *input,
00433         UParseError     &pe,
00434         UErrorCode      &status);
00435 
00436     // Returns the pattern as a UnicodeString, by value.
00445     virtual UnicodeString pattern() const;
00446     
00447     // Returns the pattern as a UText pointer.
00458     virtual UText *patternText(UErrorCode      &status) const;
00459     // split() overloads: take an input and a dest array with capacity destCapacity;
00460     // return an int32_t (presumably the number of dest elements filled - TODO confirm).
00499     virtual int32_t  split(const UnicodeString &input,
00500         UnicodeString    dest[],
00501         int32_t          destCapacity,
00502         UErrorCode       &status) const;
00503 
00504     // UText form of split().
00543     virtual int32_t  split(UText *input,
00544         UText            *dest[],
00545         int32_t          destCapacity,
00546         UErrorCode       &status) const;
00547 
00548     // Class ID accessors: per-object (virtual) and per-class (static).
00554     virtual UClassID getDynamicClassID() const;
00555 
00561     static UClassID U_EXPORT2 getStaticClassID();
00562 
00563 private:
00564     //
00565     //  Implementation Data
00566     //
00567     UText          *fPattern;      // The original pattern string.
00568     UnicodeString  *fPatternString; // The original pattern UnicodeString if relevant
00569     uint32_t        fFlags;        // The flags used when compiling the pattern.
00570                                    //
00571     UVector64       *fCompiledPat; // The compiled pattern p-code.
00572     UnicodeString   fLiteralText;  // Any literal string data from the pattern,
00573                                    //   after un-escaping, for use during the match.
00574 
00575     UVector         *fSets;        // Any UnicodeSets referenced from the pattern.
00576     Regex8BitSet    *fSets8;       //      (and fast sets for latin-1 range.)
00577 
00578 
00579     UErrorCode      fDeferredStatus; // status if some prior error has left this
00580                                    //  RegexPattern in an unusable state.
00581 
00582     int32_t         fMinMatchLen;  // Minimum Match Length.  All matches will have length
00583                                    //   >= this value.  For some patterns, this calculated
00584                                    //   value may be less than the true shortest
00585                                    //   possible match.
00586     
00587     int32_t         fFrameSize;    // Size of a state stack frame in the
00588                                    //   execution engine.
00589 
00590     int32_t         fDataSize;     // The size of the data needed by the pattern that
00591                                    //   does not go on the state stack, but has just
00592                                    //   a single copy per matcher.
00593 
00594     UVector32       *fGroupMap;    // Map from capture group number to position of
00595                                    //   the group's variables in the matcher stack frame.
00596 
00597     int32_t         fMaxCaptureDigits;  // NOTE(review): undocumented here; meaning to be confirmed.
00598 
00599     UnicodeSet     **fStaticSets;  // Ptr to static (shared) sets for predefined
00600                                    //   regex character classes, e.g. Word.
00601 
00602     Regex8BitSet   *fStaticSets8;  // Ptr to the static (shared) latin-1 only
00603                                    //  sets for predefined regex classes.
00604 
00605     int32_t         fStartType;    // Info on how a match must start.
00606     int32_t         fInitialStringIdx;     // NOTE(review): the fInitial* fields presumably go with fStartType - confirm.
00607     int32_t         fInitialStringLen;
00608     UnicodeSet     *fInitialChars;
00609     UChar32         fInitialChar;
00610     Regex8BitSet   *fInitialChars8;
00611     UBool           fNeedsAltInput;
00612 
00613     friend class RegexCompile;
00614     friend class RegexMatcher;
00615     friend class RegexCImpl;
00616 
00617     //
00618     //  Implementation Methods
00619     //
00620     void        init();            // Common initialization, for use by constructors.
00621     void        zap();             // Common cleanup
00622 #ifdef REGEX_DEBUG
00623     void        dumpOp(int32_t index) const;
00624     friend     void U_EXPORT2 RegexPatternDump(const RegexPattern *);
00625 #endif
00626 
00627 };
00628 
00629 
00630 
00640 class U_I18N_API RegexMatcher: public UObject {  // Matches input text against a RegexPattern.
00641 public:
00642     // Constructors taking a regular expression (UnicodeString or UText) and flags.
00657     RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);
00658 
00674     RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status);
00675     // Constructors that additionally take the input text.
00697     RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
00698         uint32_t flags, UErrorCode &status);
00699 
00721     RegexMatcher(UText *regexp, UText *input,
00722         uint32_t flags, UErrorCode &status);
00723     // The const UChar * input overload below is private, so code outside the class cannot call it.
00724 private:
00738     RegexMatcher(const UnicodeString &regexp, const UChar *input,
00739         uint32_t flags, UErrorCode &status);
00740 public:
00741 
00742     // Destructor (virtual).
00748     virtual ~RegexMatcher();
00749     // matches() / lookingAt() / find(): match operations returning a UBool.
00750     // Overloads with an int64_t parameter take a starting index.
00757     virtual UBool matches(UErrorCode &status);
00758 
00759 
00770     virtual UBool matches(int64_t startIndex, UErrorCode &status);
00771 
00772 
00786     virtual UBool lookingAt(UErrorCode &status);
00787 
00788 
00802     virtual UBool lookingAt(int64_t startIndex, UErrorCode &status);
00803 
00804     // NOTE(review): this overload has no UErrorCode parameter, unlike its siblings.
00817     virtual UBool find();
00818 
00819 
00829     virtual UBool find(int64_t start, UErrorCode &status);
00830 
00831     // group() overloads returning a UnicodeString by value; groupNum selects a group.
00841     virtual UnicodeString group(UErrorCode &status) const;
00842 
00843 
00856     virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
00857 
00858     // Returns an int32_t; presumably the number of capture groups in the pattern (TODO confirm).
00864     virtual int32_t groupCount() const;
00865 
00866     // group() overloads working with UText; group_len is an int64_t reference (presumably an output - confirm).
00881     virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) const; 
00882 
00898     virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const;
00899 
00915     virtual UText *group(int32_t groupNum, UText *dest, UErrorCode &status) const;
00916 
00917     // start()/end() return int32_t; the *64 variants return int64_t. Overloads taking `group` select a group.
00925     virtual int32_t start(UErrorCode &status) const;
00926 
00934     virtual int64_t start64(UErrorCode &status) const;
00935 
00936 
00950     virtual int32_t start(int32_t group, UErrorCode &status) const;
00951 
00965     virtual int64_t start64(int32_t group, UErrorCode &status) const;
00966 
00967 
00981     virtual int32_t end(UErrorCode &status) const;
00982 
00996     virtual int64_t end64(UErrorCode &status) const;
00997 
00998 
01016     virtual int32_t end(int32_t group, UErrorCode &status) const;
01017 
01035     virtual int64_t end64(int32_t group, UErrorCode &status) const;
01036 
01037     // reset() overloads return a RegexMatcher reference (presumably *this, for chaining - confirm).
01046     virtual RegexMatcher &reset();
01047 
01048 
01064     virtual RegexMatcher &reset(int64_t index, UErrorCode &status);
01065 
01066     // reset() overloads taking new input text (UnicodeString or UText).
01084     virtual RegexMatcher &reset(const UnicodeString &input);
01085 
01086 
01100     virtual RegexMatcher &reset(UText *input);
01101 
01102 
01127     virtual RegexMatcher &refreshInputText(UText *input, UErrorCode &status);
01128     // The const UChar * reset overload below is private, so code outside the class cannot call it.
01129 private:
01143     RegexMatcher &reset(const UChar *input);
01144 public:
01145     // Input accessors: as a UnicodeString reference, as a UText pointer, or via a dest UText.
01153     virtual const UnicodeString &input() const;
01154     
01163     virtual UText *inputText() const;
01164     
01175     virtual UText *getInput(UText *dest, UErrorCode &status) const;
01176     
01177     // Region control: region() takes int64_t bounds; regionStart/End have int32_t and int64_t (*64) forms.
01196      virtual RegexMatcher &region(int64_t start, int64_t limit, UErrorCode &status);
01197 
01209      virtual RegexMatcher &region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status);
01210 
01219      virtual int32_t regionStart() const;
01220 
01229      virtual int64_t regionStart64() const;
01231 
01240       virtual int32_t regionEnd() const;
01241 
01250       virtual int64_t regionEnd64() const;
01251     // Transparent / anchoring bounds: query and set (see fTransparentBounds, fAnchoringBounds).
01260       virtual UBool hasTransparentBounds() const;
01261 
01280       virtual RegexMatcher &useTransparentBounds(UBool b);
01281 
01282      
01290       virtual UBool hasAnchoringBounds() const;
01291 
01292 
01305       virtual RegexMatcher &useAnchoringBounds(UBool b);
01306 
01307     // hitEnd()/requireEnd(): presumably report fHitEnd / fRequireEnd (TODO confirm).
01320       virtual UBool hitEnd() const;
01321 
01331       virtual UBool requireEnd() const;
01332 
01333     // Returns a const reference to a RegexPattern; presumably the pattern this matcher uses (see fPattern).
01339     virtual const RegexPattern &pattern() const;
01340 
01341     // replaceAll()/replaceFirst(): UnicodeString forms return by value; UText forms take a dest UText.
01358     virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
01359 
01360 
01381     virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status);
01382     
01383 
01404     virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
01405     
01406 
01431     virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status);
01432     
01433     // appendReplacement()/appendTail(): incremental replacement into dest (see fAppendPosition).
01461     virtual RegexMatcher &appendReplacement(UnicodeString &dest,
01462         const UnicodeString &replacement, UErrorCode &status);
01463     
01464     
01492     virtual RegexMatcher &appendReplacement(UText *dest,
01493         UText *replacement, UErrorCode &status);
01494 
01495 
01506     virtual UnicodeString &appendTail(UnicodeString &dest);
01507 
01508 
01522     virtual UText *appendTail(UText *dest, UErrorCode &status);
01523 
01524     // split() overloads: take an input and a dest array with capacity destCapacity; non-const on the matcher.
01548     virtual int32_t  split(const UnicodeString &input,
01549         UnicodeString    dest[],
01550         int32_t          destCapacity,
01551         UErrorCode       &status);
01552 
01553 
01577     virtual int32_t  split(UText *input,
01578         UText           *dest[],
01579         int32_t          destCapacity,
01580         UErrorCode       &status);
01581     // Time limit accessors; see fTimeLimit (arbitrary steps, zero for unlimited).
01603     virtual void setTimeLimit(int32_t limit, UErrorCode &status);
01604 
01611     virtual int32_t getTimeLimit() const;
01612     // Stack limit accessors; see fStackLimit (bytes, zero for unlimited).
01634     virtual void setStackLimit(int32_t  limit, UErrorCode &status);
01635     
01643     virtual int32_t  getStackLimit() const;
01644 
01645     // Match callback: set/get a URegexMatchCallback pointer and its context pointer.
01659     virtual void setMatchCallback(URegexMatchCallback     *callback,
01660                                   const void              *context,
01661                                   UErrorCode              &status);
01662 
01663     // The getter returns both values through reference-to-pointer parameters.
01674     virtual void getMatchCallback(URegexMatchCallback     *&callback,
01675                                   const void              *&context,
01676                                   UErrorCode              &status);
01677 
01678     // Find-progress callback: set/get a URegexFindProgressCallback pointer and its context pointer.
01692     virtual void setFindProgressCallback(URegexFindProgressCallback      *callback,
01693                                               const void                              *context,
01694                                               UErrorCode                              &status);
01695 
01696 
01707     virtual void getFindProgressCallback(URegexFindProgressCallback      *&callback,
01708                                               const void                      *&context,
01709                                               UErrorCode                      &status);
01710 
01711     // Presumably sets fTraceDebug (debug tracing of the match engine) - TODO confirm.
01717     void setTrace(UBool state);
01718 
01719     // Class ID accessors: per-class (static) and per-object (virtual).
01725     static UClassID U_EXPORT2 getStaticClassID();
01726 
01732     virtual UClassID getDynamicClassID() const;
01733 
01734 private:
01735     // Constructors and other object boilerplate are private.
01736     // Instances of RegexMatcher can not be assigned, copied, cloned, etc.
01737     RegexMatcher();                  // default constructor not implemented
01738     RegexMatcher(const RegexPattern *pat);
01739     RegexMatcher(const RegexMatcher &other);
01740     RegexMatcher &operator =(const RegexMatcher &rhs);
01741     void init(UErrorCode &status);                      // Common initialization
01742     void init2(UText *t, UErrorCode &e);  // Common initialization, part 2.
01743 
01744     friend class RegexPattern;
01745     friend class RegexCImpl;
01746 public:
01748     void resetPreserveRegion();  // Reset matcher state, but preserve any region.
01749 private:
01750 
01751     //
01752     //  MatchAt   This is the internal interface to the match engine itself.
01753     //            Match status comes back in matcher member variables.
01754     //
01755     void                 MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status);
01756     inline void          backTrack(int64_t &inputIdx, int32_t &patIdx);
01757     UBool                isWordBoundary(int64_t pos);         // perform Perl-like  \b test
01758     UBool                isUWordBoundary(int64_t pos);        // perform RBBI based \b test
01759     REStackFrame        *resetStack();
01760     inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status);
01761     void                 IncrementTime(UErrorCode &status);
01762     UBool                ReportFindProgress(int64_t matchIndex, UErrorCode &status);
01763     
01764     int64_t              appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const;
01765     
01766     UBool                findUsingChunk();
01767     void                 MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
01768     UBool                isChunkWordBoundary(int32_t pos);
01769 
01770     const RegexPattern  *fPattern;
01771     RegexPattern        *fPatternOwned;    // Non-NULL if this matcher owns the pattern, and
01772                                            //   should delete it when through.
01773 
01774     const UnicodeString *fInput;           // The string being matched. Only used for input()
01775     UText               *fInputText;       // The text being matched. Is never NULL.
01776     UText               *fAltInputText;    // A shallow copy of the text being matched.
01777                                            //   Only created if the pattern contains backreferences.
01778     int64_t              fInputLength;     // Full length of the input text.
01779     int32_t              fFrameSize;       // The size of a frame in the backtrack stack.
01780     
01781     int64_t              fRegionStart;     // Start of the input region, default = 0.
01782     int64_t              fRegionLimit;     // End of input region, default to input.length.
01783     
01784     int64_t              fAnchorStart;     // Region bounds for anchoring operations (^ or $).
01785     int64_t              fAnchorLimit;     //   See useAnchoringBounds
01786     
01787     int64_t              fLookStart;       // Region bounds for look-ahead/behind
01788     int64_t              fLookLimit;       //   and other boundary tests.  See
01789                                            //   useTransparentBounds
01790 
01791     int64_t              fActiveStart;     // Currently active bounds for matching.
01792     int64_t              fActiveLimit;     //   Usually is the same as region, but
01793                                            //   is changed to fLookStart/Limit when
01794                                            //   entering look around regions.
01795 
01796     UBool                fTransparentBounds;  // True if using transparent bounds.
01797     UBool                fAnchoringBounds; // True if using anchoring bounds.
01798 
01799     UBool                fMatch;           // True if the last attempted match was successful.
01800     int64_t              fMatchStart;      // Position of the start of the most recent match
01801     int64_t              fMatchEnd;        // First position after the end of the most recent match
01802                                            //   Zero if no previous match, even when a region
01803                                            //   is active.
01804     int64_t              fLastMatchEnd;    // First position after the end of the previous match,
01805                                            //   or -1 if there was no previous match.
01806     int64_t              fAppendPosition;  // First position after the end of the previous
01807                                            //   appendReplacement().  As described by the
01808                                            //   JavaDoc for Java Matcher, where it is called 
01809                                            //   "append position"
01810     UBool                fHitEnd;          // True if the last match touched the end of input.
01811     UBool                fRequireEnd;      // True if the last match required end-of-input
01812                                            //    (matched $ or Z)
01813 
01814     UVector64           *fStack;
01815     REStackFrame        *fFrame;           // After finding a match, the last active stack frame,
01816                                            //   which will contain the capture group results.
01817                                            //   NOT valid while match engine is running.
01818 
01819     int64_t             *fData;            // Data area for use by the compiled pattern.
01820     int64_t             fSmallData[8];     //   Use this for data if it's enough.
01821 
01822     int32_t             fTimeLimit;        // Max time (in arbitrary steps) to let the
01823                                            //   match engine run.  Zero for unlimited.
01824     
01825     int32_t             fTime;             // Match time, accumulates while matching.
01826     int32_t             fTickCounter;      // Low bits counter for time.  Counts down StateSaves.
01827                                            //   Kept separately from fTime to keep as much
01828                                            //   code as possible out of the inline
01829                                            //   StateSave function.
01830 
01831     int32_t             fStackLimit;       // Maximum memory size to use for the backtrack
01832                                            //   stack, in bytes.  Zero for unlimited.
01833 
01834     URegexMatchCallback *fCallbackFn;       // Pointer to match progress callback funct.
01835                                            //   NULL if there is no callback.
01836     const void         *fCallbackContext;  // User Context ptr for callback function.
01837 
01838     URegexFindProgressCallback  *fFindProgressCallbackFn;  // Pointer to find progress callback funct.
01839                                                            //   NULL if there is no callback.
01840     const void         *fFindProgressCallbackContext;      // User Context ptr for callback function.
01841 
01842 
01843     UBool               fInputUniStrMaybeMutable;  // Set when fInputText wraps a UnicodeString that may be mutable - compatibility.
01844 
01845     UBool               fTraceDebug;       // Set true for debug tracing of match engine.
01846 
01847     UErrorCode          fDeferredStatus;   // Save error state that cannot be immediately
01848                                            //   reported, or that permanently disables this matcher.
01849 
01850     RuleBasedBreakIterator  *fWordBreakItr;
01851 };
01852 
01853 U_NAMESPACE_END
01854 #endif  // UCONFIG_NO_REGULAR_EXPRESSIONS
01855 #endif
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Friends Defines