2025-11-27 16:46:48 +09:00

413 lines
16 KiB
C++

// ==========================================================================
// Class Specification :
// COXRegExpression
// ==========================================================================
// Header file : OXRegExpression.h
// Version: 9.3
// This software along with its related components, documentation and files ("The Libraries")
// is © 1994-2007 The Code Project (1612916 Ontario Limited) and use of The Libraries is
// governed by a software license agreement ("Agreement"). Copies of the Agreement are
// available at The Code Project (www.codeproject.com), as part of the package you downloaded
// to obtain this file, or directly from our office. For a copy of the license governing
// this software, you may contact us at legalaffairs@codeproject.com, or by calling 416-849-8900.
// //////////////////////////////////////////////////////////////////////////
// Properties:
// YES Derived from CObject
// NO Is a Cwnd.
// NO Two stage creation
// NO Has a message map
// YES Persistent objects (saveable on disk)
// NO Uses exceptions
// //////////////////////////////////////////////////////////////////////////
// Description :
//
// COXRegExpression - CObject derived class.
//
// COXRegExpression - a class that implements regular expression functionality.
// Most of us have needed to search for some text in text files.
// Yes, we can use the searching capabilities that are provided with
// many programs. But when we have to do a generic (pattern-based) search,
// those are not enough. For such cases the COXRegExpression class is exactly
// what we need. This class provides a generic search according to
// the rule you supply. To search, you have to load a regular
// expression that defines what you want to search for. There are
// different formats and different implementations of regular expressions.
// This implementation is closest to the one described in the
// MSDN that comes with Visual Studio 6.0
//
// The rules:
//
// Character Description
//
// \ Marks the next character as special. Any character that is
// special but that you want to search for literally must be
// preceded by this character.
// ^ Matches the beginning of input or line. In this implementation this
// cannot be defined in a charset.
// $ Matches the end of input or line. In this implementation this
// cannot be defined in a charset.
// * Matches preceding character zero or more times. In this implementation
// cannot be defined if only one character specified in the regular
// expression. That means that /zo*/ matches z and zoo, but
// /z*/ will match nothing because only one character has been specified.
// + Matches preceding character one or more times.
// ? Matches preceding character zero or one time. In this implementation
// cannot be defined if only one character specified in the regular
// expression.
// . Matches any single character except '\n'
// (pattern) Matches pattern and remembers the match. The matched substring can
// be retrieved by using '\0'-'\9' in the regular expression, where '0'-'9'
// is the number of the pattern. Example:
// the regular expression '(re).*\0s+ion' will match 'regular expression':
// first '(re)' matches 're' and the match is remembered as the pattern with
// index 0. '.*' will match 'gular exp' in 'regular expression'.
// Then '\0' retrieves the pattern that has been remembered
// with index 0, which is 're'; it matches the 're' in 'regular expression'
// just before 'ssion' and, finally, 's+ion' matches 'ssion'
// x|y Matches either character 'x' or 'y'. You can combine more than two
// characters like 'x|y|z'
// {n} Means the preceding character will match exactly n times (n is nonnegative)
// {n,} Means the preceding character will match at least n times (n is nonnegative)
// {n,m} Means the preceding character will match at least n times and at
// most m times. (n,m - nonnegative)
// [xyz] A character set. Matches any one of enclosed characters
// [^xyz] A non-matching character set. Matches any character that is not in the set.
// \b Matches word boundary, that is boundary between any character excluding
// space characters (" \f\n\r\t\v") and space characters
// \B Matches non-word boundary. Matches any boundary between space
// characters or between nonspace characters.
// \d Matches any digit /0-9/
// \D Matches any non-digit.
// \f Matches a formfeed.
// \n Matches a new-line character
// \r Matches a carriage return character.
// \s Matches any white space character
// \S Matches any non-white space character
// \t Matches a tab character
// \v Matches any vertical tab character
// \w Matches any word character including underscore. [A-Za-z0-9_]
// \W Matches any non-word character (any character that does not match \w)
// \num Where num is a number between 0 and 9. Matches the remembered pattern
// with that index. (See the description of (pattern))
// /n/ Where n is between 1 and 255. Matches the character whose ASCII code is n
//
//
// Three easy steps to use:
// 1) Derive your own class from COXRegExpression and override the function OnMatch().
// This virtual function is called every time a matching substring has been found.
// When this function is called you can return TRUE to continue the search or FALSE
// if you do not want to continue the search.
// 2) Load your regular expression with the function LoadRule()
// 3) Call the function Match() to do the search in the supplied text.
//
//
// Now how it works.
// There are two steps in the work -1)load regular expression and 2) search the text
//
// 1) load regular expression
// When you load a regular expression, LoadRule() calls the function Parse(), which
// has a huge switch. This function parses the rule and creates tags of different
// types for every sequence of special characters, depending on the type of the special
// characters. For some of the special characters found it calls helper functions
// like GetAsciiChar() or GetEither() to do the special parsing. The tags are added
// to the collection of tags. Any tag can be retrieved by GetTag().
// The function Parse() is virtual, so if you need to process some special characters
// that are not specified here, you can override it. Once Parse() or a called subfunction
// finds an error, Parse() stops parsing and returns FALSE. You can retrieve the error code
// with GetError() and translate it to a description with TranslateError()
//
// 2) search the text
// The text is searched by the function Match(), which calls the recursive function
// MatchNextTag() with start tag number=0. Match() loops, every time incrementing the start
// position in the supplied text, until the end of the text has been reached.
// MatchNextTag() does the main job: once the tag matches the text at the specified
// position it either calls OnMatch(), if the tag was the last in the collection of tags,
// or calls MatchNextTag() to match the next tag from the collection.
//
// The function MatchNextTag(), like Parse(), is virtual, so it is up to you to override it
// if you want to process some special sequences.
#if !defined(_OXREGEXPRESSION_H__)
#define _OXREGEXPRESSION_H__
#if _MSC_VER > 1000
#pragma once
#endif // _MSC_VER > 1000
#include "OXDllExt.h"
#include "UTB64Bit.h"
// --------------------------------------------------------------------------
// Error codes stored while parsing a rule. They are reported by GetError()
// and can be turned into a description with TranslateError().
// --------------------------------------------------------------------------
const int OX_REGEXP_ERROR_MUST_CHAR_BEFORE         = 1;
const int OX_REGEXP_ERROR_UNEXPECTED_SPECCHAR      = 2;
const int OX_REGEXP_ERROR_UNEXPECTED_MATCH         = 3;
const int OX_REGEXP_ERROR_UNEXPECTED_END_EITHER    = 4;
const int OX_REGEXP_ERROR_UNEXPECTED_END_SPECCHAR  = 5;
const int OX_REGEXP_ERROR_MUST_DIGIT               = 6;
const int OX_REGEXP_ERROR_UNEXPECTED_END_ASCII     = 7;
const int OX_REGEXP_ERROR_EMPTY_PARENTHESES_ASCII  = 8;
const int OX_REGEXP_ERROR_WRONG_ASCII_NUMBER       = 9;
const int OX_REGEXP_ERROR_NO_LAST_PARENTHES_ASCII  = 10;
const int OX_REGEXP_ERROR_EMPTY_PARENTHESES        = 11;
const int OX_REGEXP_ERROR_TOO_MANY_COMMAS          = 12;
const int OX_REGEXP_ERROR_NO_LAST_PARENTHES        = 13;
const int OX_REGEXP_ERROR_UNEXPECTED_END_OF_STRING = 14;
const int OX_REGEXP_ERROR_INVALID_MOSTLEAST        = 15;
const int OX_REGEXP_ERROR_NO_REFERENCE             = 16;
// --------------------------------------------------------------------------
// Tag types (value of sRegExpTag::nType). Where the original source noted
// the regular-expression syntax a tag stands for, it is repeated above the
// constant.
// --------------------------------------------------------------------------
const int OX_REGEXP_TAG_COMMON_CHAR                = 1;
const int OX_REGEXP_TAG_NOT_DEFINED                = 4;

// ^
const int OX_REGEXP_TAG_BEGINING_OF_THE_LINE       = 5;
// $
const int OX_REGEXP_TAG_END_OF_THE_LINE            = 6;
// .
const int OX_REGEXP_TAG_CHAR_NOT_NEW_LINE          = 13;
// (PATTERN)
const int OX_REGEXP_TAG_PATTERN                    = 15;
// x|y  (original note: the same as charset)
const int OX_REGEXP_TAG_CHARS_EITHER               = 17;
// original note: the same as common char
const int OX_REGEXP_TAG_CHARS_EXACTLY              = 18;
// {N,}
const int OX_REGEXP_TAG_CHARS_AT_LEAST             = 23;
// {N,M}
const int OX_REGEXP_TAG_CHARS_AT_LEAST_MOST        = 24;

// [XYZ]
const int OX_REGEXP_TAG_CHARSET                    = 25;
// [^XYZ]
const int OX_REGEXP_TAG_NON_CHARSET                = 26;
// [X^YZ]
const int OX_REGEXP_TAG_CHARSET_BEGIN_LINE         = 27;
// [^X^YZ]
const int OX_REGEXP_TAG_NON_CHARSET_BEGIN_LINE     = 28;
// [X$YZ]
const int OX_REGEXP_TAG_CHARSET_END_LINE           = 29;
// [^X$YZ]
const int OX_REGEXP_TAG_NON_CHARSET_END_LINE       = 30;
// [X^$YZ]
const int OX_REGEXP_TAG_CHARSET_BEGIN_END_LINE     = 31;
// [^X^$YZ]
const int OX_REGEXP_TAG_NON_CHARSET_BEGIN_END_LINE = 32;

// \b and \B
const int OX_REGEXP_TAG_WORD_BOUNDARY              = 33;
const int OX_REGEXP_TAG_NON_WORD_BOUNDARY          = 34;
// \d and \D
const int OX_REGEXP_TAG_DIGIT                      = 35;
const int OX_REGEXP_TAG_NON_DIGIT                  = 36;
// \s and \S
const int OX_REGEXP_TAG_WHITESPACE                 = 40;
const int OX_REGEXP_TAG_NON_WHITESPACE             = 41;
// \w and \W
const int OX_REGEXP_TAG_ANY_WORD                   = 44;
const int OX_REGEXP_TAG_ANY_NON_WORD               = 45;
// \num
const int OX_REGEXP_TAG_REFERENCE_BACK             = 46;
// NOTE(review): the use of this value is not visible in this header;
// presumably it signals a search that was stopped early - confirm in the .cpp.
const int OX_REGEXP_CANCELED = 0x02;

// Lower and upper bound of the decimal digit characters, in the build's
// character type (TEXT() yields a narrow or wide literal).
#define CHAR_ZERO           TEXT('0')
#define CHAR_NINE           TEXT('9')

// NOTE(review): presumably the schema version used when the class is
// serialized (the class is DECLARE_SERIAL) - confirm in the .cpp.
#define OX_REGEXP_VERSION   1
// One element ("tag") of a loaded regular expression.
// A default-constructed tag has type OX_REGEXP_TAG_NOT_DEFINED, an empty
// value and both counters set to 0.
struct sRegExpTag
{
	// Text carried by the tag.
	// NOTE(review): presumably the literal character(s) or charset - confirm.
	CString sValue;

	// Tag type; defaults to OX_REGEXP_TAG_NOT_DEFINED.
	// NOTE(review): presumably always one of the OX_REGEXP_TAG_* constants.
	int nType;

	// NOTE(review): presumably the minimum / maximum repetition counts
	// (as in {n,m}) - confirm against AddTag() in the .cpp.
	int nMin;
	int nMax;

	sRegExpTag()
		: nType(OX_REGEXP_TAG_NOT_DEFINED)
		, nMin(0)
		, nMax(0)
	{
	}
};

// Alias kept for the existing code that refers to the tag as tRegExpTag.
typedef sRegExpTag tRegExpTag;
//////////////////////////////////////////////////////////////////////////////
// class COXRegExpression
// Regular-expression matcher. A rule is loaded with LoadRule(), which parses
// it into a collection of heap-allocated tRegExpTag objects owned by this
// object; Match() then searches a text and reports every hit through the
// virtual OnMatch().
class OX_CLASS_DECL COXRegExpression : public CObject
{
	DECLARE_SERIAL(COXRegExpression)

public:
	//////////////////
	// public functions

	// --- In  :
	// --- Out :
	// --- Returns:
	// --- Effect: Constructs the object with no rule loaded, error code 0,
	//				m_nNumber == -1, m_nFound == 0 and the default set of
	//				space characters (" \f\n\r\t\v").
	//				The initializers are listed in member declaration order,
	//				which is the order in which they are executed anyway.
	COXRegExpression()
		: m_nNumber(-1),
		  m_nError(0),
		  m_sSpace(_T(" \f\n\r\t\v")),
		  m_nFound(0)
	{
	}

	// --- In  : regExpression - instance of COXRegExpression to be created
	//							  from
	// --- Out :
	// --- Returns:
	// --- Effect: Copy constructor
	COXRegExpression(const COXRegExpression& regExpression);

	// --- Effect: Destructor (defined in the implementation file)
	virtual ~COXRegExpression();

	// --- In  : sRule - regular expression to load into the object
	//			 pnNumber - start position in this expression to load from
	// --- Out : pnNumber - in case of error, position of the last valid
	//						character in the regular expression
	// --- Returns: TRUE on success, FALSE otherwise.
	//				In case of FALSE you can call GetError()
	//				and TranslateError() to get the reason of the error
	// --- Effect: Loads the regular expression into the object
	BOOL LoadRule(CString sRule, int* pnNumber=NULL);

	// --- In  : lpszRule - regular expression to load into the object
	//			 pnNumber - start position in this expression to load from
	// --- Out : pnNumber - in case of error, position of the last valid
	//						character in the regular expression
	// --- Returns: TRUE on success, FALSE otherwise.
	//				In case of FALSE you can call GetError()
	//				and TranslateError() to get the reason of the error
	// --- Effect: Loads the regular expression into the object
	BOOL LoadRule(LPCTSTR lpszRule, int* pnNumber=NULL);

	// --- In  :
	// --- Out : sRule - receives a copy of the rule stored in the object
	// --- Returns:
	// --- Effect: Retrieves the rule that has been loaded into the object
	inline void GetRule(CString& sRule) const { sRule = m_sRule; }

	// --- In  : sString - text to match against the rule
	// --- Out :
	// --- Returns: number of matches
	// --- Effect: Searches the supplied text for matches of the rule
	//				loaded before by LoadRule().
	//				Every time a matching substring has been found the
	//				virtual function OnMatch() is called
	int Match(CString& sString);

	// --- In  :
	// --- Out :
	// --- Returns: error code (one of OX_REGEXP_ERROR_*; 0 after construction)
	// --- Effect: Returns the error that the object found in the rule
	//				while loading it.
	inline int GetError() const { return m_nError; }

	// --- In  : nError - error code retrieved by GetError()
	// --- Out : sError - description of the error
	// --- Returns:
	// --- Effect: Translates an error code into a description
	void TranslateError(int nError, CString& sError);

	// --- In  : nNumber - zero-based index of the tag
	// --- Out :
	// --- Returns: the requested tag, or NULL if nNumber is out of range.
	//				The tag stays owned by this object.
	// --- Effect: Retrieves a tag, created by LoadRule(), by index
	inline tRegExpTag* GetTag(int nNumber) const
	{
		if (nNumber < 0 || nNumber >= m_arTags.GetSize())
			return NULL;
		return static_cast<tRegExpTag*>(m_arTags.GetAt(nNumber));
	}

	// --- In  :
	// --- Out :
	// --- Returns: number of tags
	// --- Effect: Returns the number of tags currently held in the
	//				collection (filled by LoadRule())
	inline int GetTagCount() const { return PtrToInt(m_arTags.GetSize()); }

	// --- In  : regExpression - instance of COXRegExpression to assign from
	// --- Out :
	// --- Returns: (defined in the implementation file)
	// --- Effect: Assignment operator
	const COXRegExpression& operator=(const COXRegExpression& regExpression);

	// --- In  :
	// --- Out :
	// --- Returns: copy of m_sText
	// --- Effect: Retrieves the searched string
	inline CString GetText() const { return m_sText; }

	// --- In  :
	// --- Out :
	// --- Returns: current value of m_nFound
	// --- Effect: NOTE(review): presumably the number of matches found by
	//				the last Match() call - confirm in the implementation
	//				file (the previous comment here was a copy of GetText()'s).
	inline int GetFound() const { return m_nFound; }

	/////////////////
	// public members

protected:
	/////////////////////
	// protected functions

	// --- In  : nStart - index of the first character of
	//					   the matched substring
	//			 nLength - length of the matching substring
	// --- Out :
	// --- Returns: if you want to continue the search you must
	//				return TRUE, to stop the search return FALSE
	// --- Effect: Override this function to receive the matching
	//				substrings. This function is called by MatchNextTag()
	//				every time the searched text matches the rule
	virtual BOOL OnMatch(int nStart,int nLength);

	// --- In  : sString - text to be searched
	//			 nTag - number of the tag to match
	//			 pNumber - position of the first character
	//					   in the text to match at
	// --- Out :
	// --- Returns: non-zero if this tag matches the text
	//				at the position specified by pNumber
	// --- Effect: This recursive function is called to match the tag
	//				against the supplied text
	virtual BOOL MatchNextTag(CString& sString, int nTag, int* pNumber);

	// --- In  : sString - regular expression to load
	//					   into the object
	//			 pnNumber - first character in this expression
	//						to parse from
	// --- Out :
	// --- Returns: TRUE on success, FALSE otherwise.
	// --- Effect: Called by LoadRule() to parse the supplied
	//				regular expression. If the function does not succeed,
	//				call GetError() and TranslateError() to
	//				get the type of the error
	virtual BOOL Parse(CString& sString, int* pnNumber=NULL);

	// --- In  : sString - regular expression to load
	//			 pnNumber - pointer to the position of the special char in
	//						the regular expression
	// --- Out : pch - special character found, if any
	// --- Returns: type of the tag, depending on the type of
	//				the special character
	// --- Effect: Do not call this function directly.
	//				Override it if you need to process your own special
	//				characters
	virtual int GetSpecialChar(CString& sString, int* pnNumber, LPTSTR pch);

	// Parsing / matching helpers implemented in the .cpp file.
	// NOTE(review): the descriptions below are inferred from the names and
	// from the file header comment; confirm against the implementation.

	// Presumably retrieves the remembered "(pattern)" with index nNumber.
	BOOL GetPattern(int nNumber, CString& sPattern);

	// Presumably appends a new tag of type nType to the tag collection.
	void AddTag(int nType, CString* pValue, int nFirst=1, int nSecond=1);

	// Stores the error code later returned by GetError().
	inline void SetError(int nError) { m_nError = nError; }

	// Presumably parses a "{n}", "{n,}" or "{n,m}" repetition specifier.
	int GetMinMaxCount(CString& sString, int* pnNumber,
		int* pMin, int* pMax);

	// Called by Parse() for the special parsing of "x|y" (see file header).
	BOOL GetEither(CString& sString,int* pnNumber,LPTSTR pchEither);

	// Called by Parse() for the special parsing of "/n/" (see file header).
	BOOL GetAsciiChar(CString& sString, int* pnNumber, LPTSTR pch);

	// --- In  :
	// --- Out :
	// --- Returns:
	// --- Effect: Deletes every tag held in m_arTags, empties the array
	//				and clears the stored rule text. No other member
	//				(error code, counters, searched text) is touched.
	inline void RemoveAll()
	{
		// Free all tags first and drop the pointers in one go; removing
		// element 0 repeatedly would shift the whole array on every step.
		const int nCount = GetTagCount();
		for (int nIndex = 0; nIndex < nCount; ++nIndex)
			delete GetTag(nIndex);
		m_arTags.RemoveAll();
		m_sRule.Empty();
	}

	/////////////////////
	// protected members

	// Collection of tRegExpTag* created while loading a rule; the tags are
	// owned by this object and freed in RemoveAll().
	CPtrArray m_arTags;
	// Rule text returned by GetRule(); cleared by RemoveAll().
	CString m_sRule;
	// Initialised to -1. NOTE(review): purpose not visible in this header.
	int m_nNumber;
	// Last error code, see SetError()/GetError(); 0 after construction.
	int m_nError;
	// Characters treated as space characters; initialised to " \f\n\r\t\v".
	CString m_sSpace;
	// Searched string returned by GetText().
	CString m_sText;
	// Value returned by GetFound(); 0 after construction.
	int m_nFound;
};
#endif // !defined(_OXREGEXPRESSION_H__)