Main Page | Class Hierarchy | Alphabetical List | Data Structures | File List | Data Fields | Globals | Related Pages

regex.h

Go to the documentation of this file.
00001 /*
00002 **********************************************************************
00003 *   Copyright (C) 2002-2003, International Business Machines
00004 *   Corporation and others.  All Rights Reserved.
00005 **********************************************************************
00006 *   file name:  regex.h
00007 *   encoding:   US-ASCII
00008 *   indentation:4
00009 *
00010 *   created on: 2002oct22
00011 *   created by: Andy Heninger
00012 *
00013 *   ICU Regular Expressions, API for C++
00014 */
00015 
00016 #ifndef REGEX_H
00017 #define REGEX_H
00018 
00019 
00039 #include "unicode/utypes.h"
00040 
00041 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
00042 
00043 #include "unicode/uobject.h"
00044 #include "unicode/unistr.h"
00045 #include "unicode/parseerr.h"
00046 
00047 U_NAMESPACE_BEGIN
00048 
00049 
00050 // Forward declarations.  RegexMatcher is defined further down in this header;
00051 // the other types are only used through pointers in this header, so their full definitions are not needed here.
00052 class RegexMatcher;
00053 class UVector;
00054 class UVector32;
00055 class UnicodeSet;
00056 struct REStackFrame;
00057 struct Regex8BitSet;
00058 
00059 // Regular-expression option constants.  Each value is a distinct power of two, so the constants can be OR-ed together.
00064 enum {    // NOTE(review): presumably these feed the uint32_t 'flags' parameters of compile() / RegexMatcher() below -- confirm.
00066     UREGEX_CANON_EQ         = 128,   // 0x80.  Meaning not visible here; name suggests canonical-equivalence matching -- confirm.
00067     // Entries are listed alphabetically, not in numeric order; 1, 16 and 64 are not assigned in this listing.
00069     UREGEX_CASE_INSENSITIVE = 2,     // 0x02.  Name suggests case-insensitive matching -- confirm.
00070 
00072     UREGEX_COMMENTS         = 4,     // 0x04.  Name suggests comments are permitted inside patterns -- confirm.
00073 
00076     UREGEX_DOTALL           = 32,    // 0x20.  Name suggests '.' also matches line terminators -- confirm.
00077 
00082     UREGEX_MULTILINE        = 8      // 0x08.  Name suggests line-oriented anchors -- confirm.
00083 };
00084 
00085 
00086 // RegexPattern: holds a regular-expression pattern string (fPattern), the flags it was compiled with (fFlags) and the resulting compiled p-code (fCompiledPat).
00098 class U_I18N_API RegexPattern: public UObject {
00099 public:
00100     // Default constructor.
00108     RegexPattern();
00109     // Copy constructor.
00115     RegexPattern(const RegexPattern &source);
00116     // Virtual destructor.
00122     virtual ~RegexPattern();
00123     // Equality comparison with another RegexPattern.
00132     UBool           operator==(const RegexPattern& that) const;
00133     // Inequality; implemented inline as the negation of operator==.
00142     inline UBool    operator!=(const RegexPattern& that) const {return ! operator ==(that);};
00143     // Copy assignment.
00149     RegexPattern  &operator =(const RegexPattern &source);
00150     // Returns a RegexPattern pointer.  NOTE(review): presumably a new copy owned by the caller -- confirm.
00158     virtual RegexPattern  *clone() const;
00159 
00160     // compile(): static, three overloads.  All take the pattern text and a UErrorCode; they differ in whether 'flags' and/or a UParseError are passed.
00181     static RegexPattern *compile( const UnicodeString &regex,
00182         UParseError          &pe,
00183         UErrorCode           &status);
00184     // Overload taking both 'flags' and a UParseError.
00205     static RegexPattern *compile( const UnicodeString &regex,
00206         uint32_t             flags,
00207         UParseError          &pe,
00208         UErrorCode           &status);
00209 
00210     // Overload taking 'flags' but no UParseError.  NOTE(review): ownership of the returned pointer is not visible in this header -- confirm.
00229     static RegexPattern *compile( const UnicodeString &regex,
00230         uint32_t             flags,
00231         UErrorCode           &status);
00232 
00233     // Returns a uint32_t.  NOTE(review): presumably the stored fFlags value -- confirm.
00239     virtual uint32_t flags() const;
00240     // matcher(): returns a RegexMatcher pointer; this overload is given the input text up front.
00253     virtual RegexMatcher *matcher(const UnicodeString &input,
00254         UErrorCode          &status) const;
00255 
00256     // Overload without an input string.  NOTE(review): ownership of the returned matcher is not visible here -- confirm.
00268     virtual RegexMatcher *matcher(UErrorCode  &status) const;
00269 
00270     // Static: takes pattern text and input text directly (no RegexPattern object needed by the caller) and returns a UBool.
00285     static UBool matches(const UnicodeString   &regex,
00286         const UnicodeString   &input,
00287         UParseError     &pe,
00288         UErrorCode      &status);
00289 
00290     // Returns a UnicodeString by value.  NOTE(review): presumably the original pattern text (fPattern) -- confirm.
00295     virtual UnicodeString pattern() const;
00296 
00297     // split(): takes an input string and a caller-supplied array 'dest'.  NOTE(review): presumably destCapacity is the element count of dest[] and the result is the number of entries filled -- confirm.
00323     virtual int32_t  split(const UnicodeString &input,
00324         UnicodeString    dest[],
00325         int32_t          destCapacity,
00326         UErrorCode       &status) const;
00327 
00328 
00329     // Non-virtual.  NOTE(review): looks like a debugging aid that prints the compiled pattern (see dumpOp below) -- confirm.
00334     void dump() const;
00335     // Class-ID accessors; both are defined inline near the end of this header and yield the address of fgClassID.
00341     virtual inline UClassID getDynamicClassID() const; 
00342 
00348     static inline UClassID getStaticClassID(); 
00349 
00350 private:
00351     //
00352     //  Implementation Data
00353     //
00354     UnicodeString   fPattern;      // The original pattern string.
00355     uint32_t        fFlags;        // The flags used when compiling the pattern.
00356                                    //
00357     UVector32       *fCompiledPat; // The compiled pattern p-code.
00358     UnicodeString   fLiteralText;  // Any literal string data from the pattern,
00359                                    //   after un-escaping, for use during the match.
00360 
00361     UVector         *fSets;        // Any UnicodeSets referenced from the pattern.
00362     Regex8BitSet    *fSets8;       //      (and fast sets for latin-1 range.)
00363 
00364 
00365     UErrorCode      fDeferredStatus; // status if some prior error has left this
00366                                    //  RegexPattern in an unusable state.
00367 
00368     int32_t         fMinMatchLen;  // Minimum Match Length.  All matches will have length
00369                                    //   >= this value.  For some patterns, this calculated
00370                                    //   value may be less than the true shortest
00371                                    //   possible match.
00372 
00373     int32_t         fFrameSize;    // Size of a state stack frame in the
00374                                    //   execution engine.
00375 
00376     int32_t         fDataSize;     // The size of the data needed by the pattern that
00377                                    //   does not go on the state stack, but has just
00378                                    //   a single copy per matcher.
00379 
00380     UVector32       *fGroupMap;    // Map from capture group number to position of
00381                                    //   the group's variables in the matcher stack frame.
00382 
00383     int32_t         fMaxCaptureDigits; // NOTE(review): undocumented; presumably the max digit count of a capture-group number -- confirm.
00384 
00385     UnicodeSet     **fStaticSets;  // Ptr to static (shared) sets for predefined
00386                                    //   regex character classes, e.g. Word.
00387 
00388     Regex8BitSet   *fStaticSets8;  // Ptr to the static (shared) latin-1 only
00389                                    //  sets for predefined regex classes.
00390 
00391     int32_t         fStartType;    // Info on how a match must start.
00392     int32_t         fInitialStringIdx;     //  NOTE(review): the fInitial* fields below are undocumented; presumably
00393     int32_t         fInitialStringLen;     //  they describe the required match start selected by fStartType
00394     UnicodeSet     *fInitialChars;         //  (string index/length, character set, single character,
00395     UChar32         fInitialChar;          //  and a latin-1 fast set) -- confirm against the compiler.
00396     Regex8BitSet   *fInitialChars8;
00397     // The address of this static member is what getStaticClassID() returns.
00402     static const char fgClassID;
00403 
00404     friend class RegexCompile;
00405     friend class RegexMatcher;
00406 
00407     //
00408     //  Implementation Methods
00409     //
00410     void        init();            // Common initialization, for use by constructors.
00411     void        zap();             // Common cleanup
00412     void        dumpOp(int32_t index) const;   // NOTE(review): presumably prints the single p-code op at 'index' for dump() -- confirm.
00413 
00414 
00415 };
00416 
00417 
00418 
00419 
00420 
00421 
00422 
00423 
00424 // RegexMatcher: runs a RegexPattern (fPattern) against an input string (fInput); match status is kept in the matcher's member variables.
00434 class U_I18N_API RegexMatcher: public UObject {
00435 public:
00436     // Construct from pattern text and flags.  NOTE(review): presumably compiles 'regexp' into a pattern held in fPatternOwned -- confirm.
00451     RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);
00452     // As above, additionally taking the input string.
00468     RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
00469         uint32_t flags, UErrorCode &status);
00470 
00471     // Virtual destructor.  Per the fPatternOwned comment below, an owned pattern should be deleted when the matcher is through.
00477     virtual ~RegexMatcher();
00478 
00479     // Returns a UBool.  NOTE(review): presumably true only when the entire input matches the pattern -- confirm.
00486     virtual UBool matches(UErrorCode &status);
00487 
00488 
00489     // Returns a UBool.  NOTE(review): presumably a match anchored at the start of the input that need not reach its end -- confirm.
00502     virtual UBool lookingAt(UErrorCode &status);
00503 
00504     // Returns a UBool.  Unlike the other matching functions this overload takes no UErrorCode.
00517     virtual UBool find();
00518 
00519     // Overload taking a start position and a UErrorCode.  NOTE(review): 'start' is presumably an index into the input -- confirm.
00529     virtual UBool find(int32_t start, UErrorCode &status);
00530 
00531     // Returns a UnicodeString by value.  NOTE(review): presumably the text of the most recent match -- confirm.
00541     virtual UnicodeString group(UErrorCode &status) const;
00542 
00543     // Overload selecting a group by number.
00556     virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
00557 
00558     // Returns an int32_t.  NOTE(review): presumably the number of capture groups in the pattern -- confirm.
00564     virtual int32_t groupCount() const;
00565 
00566     // Returns an int32_t.  NOTE(review): presumably fMatchStart -- confirm.
00574     virtual int32_t start(UErrorCode &status) const;
00575 
00576     // Per-group overload.  NOTE(review): 'group' is declared int here but group() above uses int32_t for its group number.
00590     virtual int32_t start(int group, UErrorCode &status) const;
00591 
00592     // Returns an int32_t.  NOTE(review): presumably fMatchEnd, i.e. the first position after the end of the match -- confirm.
00602     virtual int32_t end(UErrorCode &status) const;
00603 
00604     // Per-group overload; same int vs. int32_t note as start(int, ...).
00618     virtual int32_t end(int group, UErrorCode &status) const;
00619 
00620     // Returns a RegexMatcher reference.  NOTE(review): presumably *this, with the same input retained -- confirm.
00629     virtual RegexMatcher &reset();
00630 
00631     // Overload taking a new input string.  fInput is a pointer, so NOTE(review): 'input' presumably must outlive the matcher's use of it -- confirm.
00639     virtual RegexMatcher &reset(const UnicodeString &input);
00640 
00641     // Returns a const reference to a UnicodeString.  NOTE(review): presumably *fInput -- confirm.
00648     virtual const UnicodeString &input() const;
00649 
00650     // Returns a const reference to a RegexPattern.  NOTE(review): presumably *fPattern -- confirm.
00656     virtual const RegexPattern &pattern() const;
00657 
00658     // Takes replacement text and returns a UnicodeString by value.
00675     virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
00676 
00677     // Same signature as replaceAll.  NOTE(review): presumably replaces only the first match -- confirm.
00698     virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
00699     // Takes 'dest' by non-const reference and returns a RegexMatcher reference.
00727     virtual RegexMatcher &appendReplacement(UnicodeString &dest,
00728         const UnicodeString &replacement, UErrorCode &status);
00729 
00730     // Takes 'dest' by non-const reference and returns a UnicodeString reference.  NOTE(review): presumably 'dest' itself -- confirm.
00741     virtual UnicodeString &appendTail(UnicodeString &dest);
00742 
00743 
00744     // Same parameter list as RegexPattern::split, but non-const.
00769     virtual int32_t  split(const UnicodeString &input,
00770         UnicodeString    dest[],
00771         int32_t          destCapacity,
00772         UErrorCode       &status);
00773 
00774 
00775     // Non-virtual.  NOTE(review): presumably stores 'state' in fTraceDebug -- confirm.
00781     void setTrace(UBool state);
00782 
00783     // Class-ID accessors; both are defined inline near the end of this header and yield the address of fgClassID.
00789     static inline UClassID getStaticClassID();
00790 
00796     virtual inline UClassID getDynamicClassID() const;
00797 
00798 private:
00799     // Constructors and other object boilerplate are private.
00800     // Instances of RegexMatcher can not be assigned, copied, cloned, etc.
00801     RegexMatcher(); // default constructor not implemented
00802     RegexMatcher(const RegexPattern *pat);   // NOTE(review): presumably how RegexPattern (a friend) creates matchers -- confirm.
00803     RegexMatcher(const RegexMatcher &other);
00804     RegexMatcher &operator =(const RegexMatcher &rhs);
00805     friend class RegexPattern;
00806 
00807 
00808     //
00809     //  MatchAt   This is the internal interface to the match engine itself.
00810     //            Match status comes back in matcher member variables.
00811     //
00812     void                 MatchAt(int32_t startIdx, UErrorCode &status);
00813     inline void          backTrack(int32_t &inputIdx, int32_t &patIdx);
00814     UBool                isWordBoundary(int32_t pos);         // perform the \b test
00815     REStackFrame        *resetStack();
00816     inline REStackFrame *StateSave(REStackFrame *fp, int32_t savePatIdx,
00817                                    int32_t frameSize, UErrorCode &status);
00818 
00819     // fPattern is a pointer to const; fPatternOwned is the non-const pointer used when this matcher owns its pattern.
00820     const RegexPattern  *fPattern;
00821     RegexPattern        *fPatternOwned;    // Non-NULL if this matcher owns the pattern, and
00822                                            //   should delete it when through.
00823     const UnicodeString *fInput;           // Held by pointer to const, not copied.
00824 
00825     UBool                fMatch;           // True if the last match was successful.
00826     int32_t              fMatchStart;      // Position of the start of the most recent match
00827     int32_t              fMatchEnd;        // First position after the end of the most recent match
00828     int32_t              fLastMatchEnd;    // First position after the end of the previous match.
00829 
00830     UVector32           *fStack;           // NOTE(review): presumably the backing store for the REStackFrame state stack -- confirm.
00831     REStackFrame        *fFrame;           // After finding a match, the last active stack
00832                                            //   frame, which will contain the capture group results.
00833                                            //   NOT valid while match engine is running.
00834 
00835     int32_t             *fData;            // Data area for use by the compiled pattern.
00836     int32_t             fSmallData[8];     //   Use this for data if it's enough.
00837 
00838     UBool               fTraceDebug;       // Set true for debug tracing of match engine.
00839 
00840     UErrorCode          fDeferredStatus;   // Save error state if that cannot be immediately
00841                                            //   reported, or that permanently disables this matcher.
00842     // The address of this static member is what getStaticClassID() returns.
00847     static const char   fgClassID;
00848 
00849 
00850 };
00851 // Class-ID accessors.  The static form returns the address of the class's fgClassID member cast to UClassID; the virtual dynamic form forwards to the static one.
00852 inline UClassID RegexPattern::getStaticClassID() { return (UClassID)&fgClassID; }
00853 inline UClassID RegexPattern::getDynamicClassID() const { return getStaticClassID(); }
00854 // The same pair for RegexMatcher, which has its own separate fgClassID member.
00855 inline UClassID RegexMatcher::getStaticClassID() { return (UClassID)&fgClassID; }
00856 inline UClassID RegexMatcher::getDynamicClassID() const { return getStaticClassID(); }
00857 
00858 
00859 U_NAMESPACE_END
00860 #endif  // UCONFIG_NO_REGULAR_EXPRESSIONS
00861 #endif

Generated on Wed Sep 3 17:47:09 2003 for ICU 2.6 by doxygen 1.3.2