/***********************************************************************
THIS FILE IS AUTOMATICALLY GENERATED. DO NOT MODIFY
DEVELOPER: Zihan Chen(vczh)
***********************************************************************/
#include "VlppRegex.h"

/***********************************************************************
.\REGEX.CPP
***********************************************************************/
/***********************************************************************
Author: Zihan Chen (vczh)
Licensed under https://github.com/vczh-libraries/License
***********************************************************************/

namespace vl
{
	namespace regex
	{
		using namespace collections;
		using namespace regex_internal;

/***********************************************************************
RegexString
***********************************************************************/

		// Creates an empty record positioned at _start (length 0, empty value).
		RegexString::RegexString(vint _start)
			:start(_start)
			,length(0)
		{
		}

		// Creates a record for _string[_start, _start+_length).
		// Sub() is skipped for a zero length so that an empty value is stored.
		RegexString::RegexString(const WString& _string, vint _start, vint _length)
			:value(_length==0?L"":_string.Sub(_start, _length))
			,start(_start)
			,length(_length)
		{
		}

		vint RegexString::Start()const
		{
			return start;
		}

		vint RegexString::Length()const
		{
			return length;
		}

		const WString& RegexString::Value()const
		{
			return value;
		}

		// Two records are equal when position, length and text all agree.
		bool RegexString::operator==(const RegexString& string)const
		{
			return start==string.start && length==string.length && value==string.value;
		}

/***********************************************************************
RegexMatch
***********************************************************************/

		// Successful match produced by the pure interpretor (no captures).
		RegexMatch::RegexMatch(const WString& _string, PureResult* _result)
			:success(true)
			,result(_string, _result->start, _result->length)
		{
		}

		// Successful match produced by the rich interpretor.
		// Captures with id -1 go to the anonymous capture list, all others are
		// stored in the group table under the name registered for that id.
		RegexMatch::RegexMatch(const WString& _string, RichResult* _result, RichInterpretor* _rich)
			:success(true)
			,result(_string, _result->start, _result->length)
		{
			for(vint i=0;i<_result->captures.Count();i++)
			{
				CaptureRecord& capture=_result->captures[i];
				RegexString captured(_string, capture.start, capture.length);
				if(capture.capture==-1)
				{
					captures.Add(captured);
				}
				else
				{
					groups.Add(_rich->CaptureNames().Get(capture.capture), captured);
				}
			}
		}

		// Unmatched fragment (used by Split/Cut for the text between matches).
		RegexMatch::RegexMatch(const RegexString& _result)
			:success(false)
			,result(_result)
		{
		}

		bool RegexMatch::Success()const
		{
			return success;
		}

		const RegexString& RegexMatch::Result()const
		{
			return result;
		}

		const RegexMatch::CaptureList& RegexMatch::Captures()const
		{
			return captures;
		}

		const RegexMatch::CaptureGroup& RegexMatch::Groups()const
		{
			return groups;
		}

/***********************************************************************
Regex
***********************************************************************/

		// Scans text from left to right and appends matched and/or unmatched
		// fragments to matches.
		//   keepEmpty   : also report zero-length unmatched fragments
		//   keepSuccess : report matched fragments
		//   keepFail    : report the text between (and after) matches
		//
		// "segment" is where the not-yet-reported text begins, "search" is where
		// the next search starts. They only differ after a zero-length match:
		// the original code restarted the search at the very same position in
		// that case and therefore found the same empty match forever. The search
		// position is now moved one character forward while the skipped
		// character still belongs to the next unmatched fragment.
		void Regex::Process(const WString& text, bool keepEmpty, bool keepSuccess, bool keepFail, RegexMatch::List& matches)const
		{
			const wchar_t* const start=text.Buffer();
			const wchar_t* segment=start;
			const wchar_t* search=start;

			if(rich)
			{
				RichResult result;
				while(rich->Match(search, start, result))
				{
					const vint offset=segment-start;
					if(keepFail && (result.start>offset || keepEmpty))
					{
						matches.Add(new RegexMatch(RegexString(text, offset, result.start-offset)));
					}
					if(keepSuccess)
					{
						matches.Add(new RegexMatch(text, &result, rich));
					}
					segment=start+result.start+result.length;
					search=segment;
					if(result.length==0)
					{
						// Guarantee progress; never step over the terminator.
						if(*search==L'\0')break;
						search++;
					}
				}
			}
			else
			{
				PureResult result;
				while(pure->Match(search, start, result))
				{
					const vint offset=segment-start;
					if(keepFail && (result.start>offset || keepEmpty))
					{
						matches.Add(new RegexMatch(RegexString(text, offset, result.start-offset)));
					}
					if(keepSuccess)
					{
						matches.Add(new RegexMatch(text, &result));
					}
					segment=start+result.start+result.length;
					search=segment;
					if(result.length==0)
					{
						// Guarantee progress; never step over the terminator.
						if(*search==L'\0')break;
						search++;
					}
				}
			}

			// Text after the last match.
			if(keepFail)
			{
				const vint remain=segment-start;
				const vint length=text.Length()-remain;
				if(length || keepEmpty)
				{
					matches.Add(new RegexMatch(RegexString(text, remain, length)));
				}
			}
		}

		// Compiles code into a pure interpretor, a rich interpretor, or both.
		//   preferPure == false                : rich only
		//   expression has no extension        : pure only
		//   expression can be treated as pure  : both (pure for tests, rich for matches)
		//   otherwise                          : rich only
		// Anything thrown while building is rethrown after releasing whatever
		// interpretor was already created.
		Regex::Regex(const WString& code, bool preferPure)
		{
			CharRange::List subsets;
			RegexExpression::Ref regex=ParseRegexExpression(code);
			Expression::Ref expression=regex->Merge();
			expression->NormalizeCharSet(subsets);

			bool pureRequired=false;
			bool richRequired=false;
			if(preferPure)
			{
				if(expression->HasNoExtension())
				{
					pureRequired=true;
				}
				else if(expression->CanTreatAsPure())
				{
					pureRequired=true;
					richRequired=true;
				}
				else
				{
					richRequired=true;
				}
			}
			else
			{
				richRequired=true;
			}

			try
			{
				if(pureRequired)
				{
					// Template arguments restored; EpsilonNfaToNfa/NfaToDfa store State* keys and values.
					Dictionary<State*, State*> nfaStateMap;
					Group<State*, State*> dfaStateMap;
					Automaton::Ref eNfa=expression->GenerateEpsilonNfa();
					Automaton::Ref nfa=EpsilonNfaToNfa(eNfa, PureEpsilonChecker, nfaStateMap);
					Automaton::Ref dfa=NfaToDfa(nfa, dfaStateMap);
					pure=new PureInterpretor(dfa, subsets);
				}
				if(richRequired)
				{
					Dictionary<State*, State*> nfaStateMap;
					Group<State*, State*> dfaStateMap;
					Automaton::Ref eNfa=expression->GenerateEpsilonNfa();
					Automaton::Ref nfa=EpsilonNfaToNfa(eNfa, RichEpsilonChecker, nfaStateMap);
					Automaton::Ref dfa=NfaToDfa(nfa, dfaStateMap);
					rich=new RichInterpretor(dfa);
				}
			}
			catch(...)
			{
				// The destructor does not run for a throwing constructor.
				delete pure;
				delete rich;
				throw;
			}
		}

		Regex::~Regex()
		{
			delete pure;
			delete rich;
		}

		// True when matching does not need the rich interpretor.
		bool Regex::IsPureMatch()const
		{
			return rich?false:true;
		}

		// True when testing can use the pure interpretor.
		bool Regex::IsPureTest()const
		{
			return pure?true:false;
		}

		// Matches at the beginning of text only; returns null when there is no match.
		RegexMatch::Ref Regex::MatchHead(const WString& text)const
		{
			if(rich)
			{
				RichResult result;
				if(rich->MatchHead(text.Buffer(), text.Buffer(), result))
				{
					return new RegexMatch(text, &result, rich);
				}
				return 0;
			}
			else
			{
				PureResult result;
				if(pure->MatchHead(text.Buffer(), text.Buffer(), result))
				{
					return new RegexMatch(text, &result);
				}
				return 0;
			}
		}

		// Finds the first match anywhere in text; returns null when there is none.
		RegexMatch::Ref Regex::Match(const WString& text)const
		{
			if(rich)
			{
				RichResult result;
				if(rich->Match(text.Buffer(), text.Buffer(), result))
				{
					return new RegexMatch(text, &result, rich);
				}
				return 0;
			}
			else
			{
				PureResult result;
				if(pure->Match(text.Buffer(), text.Buffer(), result))
				{
					return new RegexMatch(text, &result);
				}
				return 0;
			}
		}

		// Like MatchHead but only reports success; prefers the pure interpretor.
		bool Regex::TestHead(const WString& text)const
		{
			if(pure)
			{
				PureResult result;
				return pure->MatchHead(text.Buffer(), text.Buffer(), result);
			}
			else
			{
				RichResult result;
				return rich->MatchHead(text.Buffer(), text.Buffer(), result);
			}
		}

		// Like Match but only reports success; prefers the pure interpretor.
		bool Regex::Test(const WString& text)const
		{
			if(pure)
			{
				PureResult result;
				return pure->Match(text.Buffer(), text.Buffer(), result);
			}
			else
			{
				RichResult result;
				return rich->Match(text.Buffer(), text.Buffer(), result);
			}
		}

		// Collects every matched fragment.
		void Regex::Search(const WString& text, RegexMatch::List& matches)const
		{
			Process(text, false, true, false, matches);
		}

		// Collects the fragments between matches.
		void Regex::Split(const WString& text, bool keepEmptyMatch, RegexMatch::List& matches)const
		{
			Process(text, keepEmptyMatch, false, true, matches);
		}

		// Collects matched and unmatched fragments in text order.
		void Regex::Cut(const WString& text, bool keepEmptyMatch, RegexMatch::List& matches)const
		{
			Process(text, keepEmptyMatch, true, true, matches);
		}

/***********************************************************************
RegexTokens
***********************************************************************/

		// Return type of RegexToken::operator==(const RegexToken&), whose
		// declarator continues on the next source line.
		bool
RegexToken::operator==(const RegexToken& _token)const { return length==_token.length && token==_token.token && reading==_token.reading; } bool RegexToken::operator==(const wchar_t* _token)const { return wcslen(_token)==length && wcsncmp(reading, _token, length)==0; } class RegexTokenEnumerator : public Object, public IEnumerator { protected: RegexToken token; vint index = -1; PureInterpretor* pure; const Array& stateTokens; const wchar_t* start; vint codeIndex; RegexProc proc; const wchar_t* reading; vint rowStart = 0; vint columnStart = 0; bool cacheAvailable = false; RegexToken cacheToken; public: RegexTokenEnumerator(const RegexTokenEnumerator& enumerator) :token(enumerator.token) , index(enumerator.index) , pure(enumerator.pure) , stateTokens(enumerator.stateTokens) , proc(enumerator.proc) , reading(enumerator.reading) , start(enumerator.start) , rowStart(enumerator.rowStart) , columnStart(enumerator.columnStart) , codeIndex(enumerator.codeIndex) , cacheAvailable(enumerator.cacheAvailable) , cacheToken(enumerator.cacheToken) { } RegexTokenEnumerator(PureInterpretor* _pure, const Array& _stateTokens, const wchar_t* _start, vint _codeIndex, RegexProc _proc) :index(-1) , pure(_pure) , stateTokens(_stateTokens) , start(_start) , codeIndex(_codeIndex) , proc(_proc) , reading(_start) { } IEnumerator* Clone()const { return new RegexTokenEnumerator(*this); } const RegexToken& Current()const { return token; } vint Index()const { return index; } bool Next() { if (!cacheAvailable && !*reading) return false; if (cacheAvailable) { token = cacheToken; cacheAvailable = false; } else { token.reading = reading; token.start = 0; token.length = 0; token.token = -2; token.completeToken = true; } token.rowStart = rowStart; token.columnStart = columnStart; token.rowEnd = rowStart; token.columnEnd = columnStart; token.codeIndex = codeIndex; PureResult result; while (*reading) { vint id = -1; bool completeToken = true; if (!pure->MatchHead(reading, start, result)) { result.start = 
reading - start; if (id == -1 && result.terminateState != -1) { vint state = pure->GetRelatedFinalState(result.terminateState); if (state != -1) { id = stateTokens[state]; } } if (id == -1) { result.length = 1; } else { completeToken = false; } } else { id = stateTokens.Get(result.finalState); } if (id != -1 && proc.extendProc) { RegexProcessingToken token(result.start, result.length, id, completeToken, nullptr); proc.extendProc(proc.argument, reading, -1, true, token); #if _DEBUG CHECK_ERROR(token.interTokenState == nullptr, L"RegexTokenEnumerator::Next()#The extendProc is only allowed to create interTokenState in RegexLexerColorizer."); #endif result.length = token.length; id = token.token; completeToken = token.completeToken; } if (token.token == -2) { token.start = result.start; token.length = result.length; token.token = id; token.completeToken = completeToken; } else if (token.token == id && id == -1) { token.length += result.length; } else { cacheAvailable = true; cacheToken.reading = reading; cacheToken.start = result.start; cacheToken.length = result.length; cacheToken.codeIndex = codeIndex; cacheToken.token = id; cacheToken.completeToken = completeToken; } reading += result.length; if (cacheAvailable) { break; } } index++; for (vint i = 0; i < token.length; i++) { token.rowEnd = rowStart; token.columnEnd = columnStart; if (token.reading[i] == L'\n') { rowStart++; columnStart = 0; } else { columnStart++; } } return true; } void Reset() { index = -1; reading = start; cacheAvailable = false; } void ReadToEnd(List& tokens, bool(*discard)(vint)) { while (Next()) { if (!discard(token.token)) { tokens.Add(token); } } } }; RegexTokens::RegexTokens(PureInterpretor* _pure, const Array& _stateTokens, const WString& _code, vint _codeIndex, RegexProc _proc) :pure(_pure) , stateTokens(_stateTokens) , code(_code) , codeIndex(_codeIndex) , proc(_proc) { } RegexTokens::RegexTokens(const RegexTokens& tokens) :pure(tokens.pure) , stateTokens(tokens.stateTokens) , 
code(tokens.code) , codeIndex(tokens.codeIndex) , proc(tokens.proc) { } IEnumerator* RegexTokens::CreateEnumerator()const { return new RegexTokenEnumerator(pure, stateTokens, code.Buffer(), codeIndex, proc); } bool DefaultDiscard(vint token) { return false; } void RegexTokens::ReadToEnd(collections::List& tokens, bool(*discard)(vint))const { if(discard==0) { discard=&DefaultDiscard; } RegexTokenEnumerator(pure, stateTokens, code.Buffer(), codeIndex, proc).ReadToEnd(tokens, discard); } /*********************************************************************** RegexLexerWalker ***********************************************************************/ RegexLexerWalker::RegexLexerWalker(PureInterpretor* _pure, const Array& _stateTokens) :pure(_pure) , stateTokens(_stateTokens) { } RegexLexerWalker::RegexLexerWalker(const RegexLexerWalker& tokens) : pure(tokens.pure) , stateTokens(tokens.stateTokens) { } RegexLexerWalker::~RegexLexerWalker() { } RegexTokens::~RegexTokens() { } vint RegexLexerWalker::GetStartState()const { return pure->GetStartState(); } vint RegexLexerWalker::GetRelatedToken(vint state)const { vint finalState = state == -1 ? -1 : pure->GetRelatedFinalState(state); return finalState == -1 ? 
-1 : stateTokens.Get(finalState); } void RegexLexerWalker::Walk(wchar_t input, vint& state, vint& token, bool& finalState, bool& previousTokenStop)const { vint previousState=state; token=-1; finalState=false; previousTokenStop=false; if(state==-1) { state=pure->GetStartState(); previousTokenStop=true; } state=pure->Transit(input, state); if(state==-1) { previousTokenStop=true; if(previousState==-1) { finalState=true; return; } else if(pure->IsFinalState(previousState)) { state=pure->Transit(input, pure->GetStartState()); } } if(pure->IsFinalState(state)) { token=stateTokens.Get(state); finalState=true; return; } else { finalState=state==-1; return; } } vint RegexLexerWalker::Walk(wchar_t input, vint state)const { vint token=-1; bool finalState=false; bool previousTokenStop=false; Walk(input, state, token, finalState, previousTokenStop); return state; } bool RegexLexerWalker::IsClosedToken(const wchar_t* input, vint length)const { vint state=pure->GetStartState(); for(vint i=0;iTransit(input[i], state); if(state==-1) return true; if(pure->IsDeadState(state)) return true; } return false; } bool RegexLexerWalker::IsClosedToken(const WString& input)const { return IsClosedToken(input.Buffer(), input.Length()); } /*********************************************************************** RegexLexerColorizer ***********************************************************************/ RegexLexerColorizer::RegexLexerColorizer(const RegexLexerWalker& _walker, RegexProc _proc) :walker(_walker) , proc(_proc) { internalState.currentState = walker.GetStartState(); } RegexLexerColorizer::RegexLexerColorizer(const RegexLexerColorizer& colorizer) :walker(colorizer.walker) , proc(colorizer.proc) , internalState(colorizer.internalState) { } RegexLexerColorizer::~RegexLexerColorizer() { } RegexLexerColorizer::InternalState RegexLexerColorizer::GetInternalState() { return internalState; } void RegexLexerColorizer::SetInternalState(InternalState state) { internalState = state; } void 
// (return type `void` is the last token of the previous source line)
// Feeds a single character through WalkOneToken without colorizing, so only
// the internal state is updated.
		RegexLexerColorizer::Pass(wchar_t input)
		{
			WalkOneToken(&input, 1, 0, false);
		}

		vint RegexLexerColorizer::GetStartState()const
		{
			return walker.GetStartState();
		}

		// Lets extendProc adjust a token found by the DFA, records the inter
		// token state it returns, and optionally reports the token to colorizeProc.
		//   input/length : the whole buffer being colorized
		//   token        : in/out; extendProc receives the buffer starting at token.start
		//   colorize     : when false only the internal state is updated
		// The caller is responsible for checking that proc.extendProc is set.
		void RegexLexerColorizer::CallExtendProcAndColorizeProc(const wchar_t* input, vint length, RegexProcessingToken& token, bool colorize)
		{
			// Only read by the debug checks below.
			vint oldTokenLength = token.length;
			proc.extendProc(proc.argument, input + token.start, length - token.start, false, token);
#if _DEBUG
			{
				// An incomplete token is only acceptable when it reaches the end of the buffer.
				bool pausedAtTheEnd = token.start + token.length == length && !token.completeToken;
				CHECK_ERROR(
					token.completeToken || pausedAtTheEnd,
					L"RegexLexerColorizer::WalkOneToken(const wchar_t*, vint, vint, bool)#The extendProc is not allowed pause before the end of the input."
					);
				CHECK_ERROR(
					token.completeToken || token.token != -1,
					L"RegexLexerColorizer::WalkOneToken(const wchar_t*, vint, vint, bool)#The extendProc is not allowed to pause without a valid token id."
					);
				CHECK_ERROR(
					oldTokenLength <= token.length,
					L"RegexLexerColorizer::WalkOneToken(const wchar_t*, vint, vint, bool)#The extendProc is not allowed to decrease the token length."
					);
				CHECK_ERROR(
					(token.interTokenState == nullptr) == !pausedAtTheEnd,
					L"RegexLexerColorizer::Colorize(const wchar_t*, vint, void*)#The extendProc should return an inter token state object if and only if a valid token does not end at the end of the input."
					);
			}
#endif
			// Assignment inside the condition is intentional: remember the
			// state, and remember the token id only when a state was returned.
			if ((internalState.interTokenState = token.interTokenState))
			{
				internalState.interTokenId = token.token;
			}
			if (colorize)
			{
				proc.colorizeProc(proc.argument, token.start, token.length, token.token);
			}
		}

		// Consumes one token from input[start, length) and returns the index at
		// which the next token begins.
		//   colorize : when true the consumed range is reported to proc.colorizeProc
		// When an inter token state is pending, extendProc is asked to continue
		// that token over the whole buffer and "start" is not used.
		vint RegexLexerColorizer::WalkOneToken(const wchar_t* input, vint length, vint start, bool colorize)
		{
			if (internalState.interTokenState)
			{
				// Resume the token that was paused at the end of the previous buffer.
				RegexProcessingToken token(-1, -1, internalState.interTokenId, false, internalState.interTokenState);
				proc.extendProc(proc.argument, input, length, false, token);
#if _DEBUG
				{
					bool pausedAtTheEnd = token.length == length && !token.completeToken;
					CHECK_ERROR(
						token.completeToken || pausedAtTheEnd,
						L"RegexLexerColorizer::WalkOneToken(const wchar_t*, vint, vint, bool)#The extendProc is not allowed to pause before the end of the input."
						);
					CHECK_ERROR(
						token.completeToken || token.token == internalState.interTokenId,
						L"RegexLexerColorizer::WalkOneToken(const wchar_t*, vint, vint, bool)#The extendProc is not allowed to continue pausing with a different token id."
						);
					CHECK_ERROR(
						(token.interTokenState == nullptr) == !pausedAtTheEnd,
						L"RegexLexerColorizer::Colorize(const wchar_t*, vint, void*)#The extendProc should return an inter token state object if and only if a valid token does not end at the end of the input."
						);
				}
#endif
				if (colorize)
				{
					proc.colorizeProc(proc.argument, 0, token.length, token.token);
				}
				// Keep the new state; clear the remembered id once no state is left.
				if (!(internalState.interTokenState = token.interTokenState))
				{
					internalState.interTokenId = -1;
				}
				return token.length;
			}

			// Longest final state seen so far for the token beginning at "start".
			vint lastFinalStateLength = 0;
			vint lastFinalStateToken = -1;
			vint lastFinalStateState = -1;

			vint tokenStartState = internalState.currentState;
			for (vint i = start; i < length; i++)
			{
				vint currentToken = -1;
				bool finalState = false;
				bool previousTokenStop = false;
				walker.Walk(input[i], internalState.currentState, currentToken, finalState, previousTokenStop);

				if (previousTokenStop)
				{
					if (proc.extendProc && lastFinalStateToken != -1)
					{
						// A token was recognized; give extendProc the chance to adjust it.
						RegexProcessingToken token(start, lastFinalStateLength, lastFinalStateToken, true, nullptr);
						CallExtendProcAndColorizeProc(input, length, token, colorize);
						if (token.completeToken)
						{
							internalState.currentState = walker.GetStartState();
						}
						return start + token.length;
					}
					else if (i == start)
					{
						// The very first character stopped the walk. When the walk
						// began in the start state the character is reported on its
						// own with token id -1; otherwise the loop continues.
						if (tokenStartState == GetStartState())
						{
							if (colorize)
							{
								proc.colorizeProc(proc.argument, start, 1, -1);
							}
							internalState.currentState = walker.GetStartState();
							return i + 1;
						}
					}
					else
					{
						// Report what was recorded at the last final state and
						// resume from the state recorded there.
						if (colorize)
						{
							proc.colorizeProc(proc.argument, start, lastFinalStateLength, lastFinalStateToken);
						}
						internalState.currentState = lastFinalStateState;
						return start + lastFinalStateLength;
					}
				}

				if (finalState)
				{
					lastFinalStateLength = i + 1 - start;
					lastFinalStateToken = currentToken;
					lastFinalStateState = internalState.currentState;
				}
			}

			// The buffer ended before the token was stopped by another character.
			if (lastFinalStateToken != -1 && start + lastFinalStateLength == length)
			{
				if (proc.extendProc)
				{
					RegexProcessingToken token(start, lastFinalStateLength, lastFinalStateToken, true, nullptr);
					CallExtendProcAndColorizeProc(input, length, token, colorize);
				}
				else if (colorize)
				{
					proc.colorizeProc(proc.argument, start, lastFinalStateLength, lastFinalStateToken);
				}
			}
			else if (colorize)
			{
				// Report the rest of the buffer with the token related to the current state.
				proc.colorizeProc(proc.argument, start, length - start, walker.GetRelatedToken(internalState.currentState));
			}
			return length;
		}

		// Return type of RegexLexerColorizer::Colorize, whose declarator
		// continues on the next source line.
		void*
RegexLexerColorizer::Colorize(const wchar_t* input, vint length) { vint index = 0; while (index != length) { index = WalkOneToken(input, length, index, true); } return internalState.interTokenState; } /*********************************************************************** RegexLexer ***********************************************************************/ RegexLexer::RegexLexer(const collections::IEnumerable& tokens, RegexProc _proc) :proc(_proc) { // Build DFA for all tokens List expressions; List dfas; CharRange::List subsets; Ptr> enumerator = tokens.CreateEnumerator(); while (enumerator->Next()) { const WString& code = enumerator->Current(); RegexExpression::Ref regex = ParseRegexExpression(code); Expression::Ref expression = regex->Merge(); expression->CollectCharSet(subsets); expressions.Add(expression); } for (vint i = 0; i < expressions.Count(); i++) { Dictionary nfaStateMap; Group dfaStateMap; Expression::Ref expression = expressions[i]; expression->ApplyCharSet(subsets); Automaton::Ref eNfa = expression->GenerateEpsilonNfa(); Automaton::Ref nfa = EpsilonNfaToNfa(eNfa, PureEpsilonChecker, nfaStateMap); Automaton::Ref dfa = NfaToDfa(nfa, dfaStateMap); dfas.Add(dfa); } // Mark all states in DFAs for (vint i = 0; i < dfas.Count(); i++) { Automaton::Ref dfa = dfas[i]; for (vint j = 0; j < dfa->states.Count(); j++) { if (dfa->states[j]->finalState) { dfa->states[j]->userData = (void*)i; } else { dfa->states[j]->userData = (void*)dfas.Count(); } } } // Connect all DFAs to an e-NFA Automaton::Ref bigEnfa = new Automaton; for (vint i = 0; i < dfas.Count(); i++) { CopyFrom(bigEnfa->states, dfas[i]->states); CopyFrom(bigEnfa->transitions, dfas[i]->transitions); } bigEnfa->startState = bigEnfa->NewState(); for (vint i = 0; i < dfas.Count(); i++) { bigEnfa->NewEpsilon(bigEnfa->startState, dfas[i]->startState); } // Build a single DFA out of the e-NFA Dictionary nfaStateMap; Group dfaStateMap; Automaton::Ref bigNfa = EpsilonNfaToNfa(bigEnfa, PureEpsilonChecker, 
nfaStateMap); for (vint i = 0; i < nfaStateMap.Keys().Count(); i++) { void* userData = nfaStateMap.Values().Get(i)->userData; nfaStateMap.Keys()[i]->userData = userData; } Automaton::Ref bigDfa = NfaToDfa(bigNfa, dfaStateMap); for (vint i = 0; i < dfaStateMap.Keys().Count(); i++) { void* userData = dfaStateMap.GetByIndex(i).Get(0)->userData; for (vint j = 1; j < dfaStateMap.GetByIndex(i).Count(); j++) { void* newData = dfaStateMap.GetByIndex(i).Get(j)->userData; if (userData > newData) { userData = newData; } } dfaStateMap.Keys()[i]->userData = userData; } // Build state machine pure = new PureInterpretor(bigDfa, subsets); stateTokens.Resize(bigDfa->states.Count()); for (vint i = 0; i < stateTokens.Count(); i++) { void* userData = bigDfa->states[i]->userData; stateTokens[i] = (vint)userData; } } RegexLexer::~RegexLexer() { if (pure)delete pure; } RegexTokens RegexLexer::Parse(const WString& code, vint codeIndex)const { pure->PrepareForRelatedFinalStateTable(); return RegexTokens(pure, stateTokens, code, codeIndex, proc); } RegexLexerWalker RegexLexer::Walk()const { pure->PrepareForRelatedFinalStateTable(); return RegexLexerWalker(pure, stateTokens); } RegexLexerColorizer RegexLexer::Colorize()const { return RegexLexerColorizer(Walk(), proc); } } } /*********************************************************************** .\REGEXAUTOMATON.CPP ***********************************************************************/ /*********************************************************************** Author: Zihan Chen (vczh) Licensed under https://github.com/vczh-libraries/License ***********************************************************************/ namespace vl { namespace regex_internal { using namespace collections; /*********************************************************************** Automaton ***********************************************************************/ Automaton::Automaton() { startState=0; } State* Automaton::NewState() { State* state=new State; 
state->finalState=false; state->userData=0; states.Add(state); return state; } Transition* Automaton::NewTransition(State* start, State* end) { Transition* transition=new Transition; transition->source=start; transition->target=end; start->transitions.Add(transition); end->inputs.Add(transition); transitions.Add(transition); return transition; } Transition* Automaton::NewChars(State* start, State* end, CharRange range) { Transition* transition=NewTransition(start, end); transition->type=Transition::Chars; transition->range=range; return transition; } Transition* Automaton::NewEpsilon(State* start, State* end) { Transition* transition=NewTransition(start, end); transition->type=Transition::Epsilon; return transition; } Transition* Automaton::NewBeginString(State* start, State* end) { Transition* transition=NewTransition(start, end); transition->type=Transition::BeginString; return transition; } Transition* Automaton::NewEndString(State* start, State* end) { Transition* transition=NewTransition(start, end); transition->type=Transition::EndString; return transition; } Transition* Automaton::NewNop(State* start, State* end) { Transition* transition=NewTransition(start, end); transition->type=Transition::Nop; return transition; } Transition* Automaton::NewCapture(State* start, State* end, vint capture) { Transition* transition=NewTransition(start, end); transition->type=Transition::Capture; transition->capture=capture; return transition; } Transition* Automaton::NewMatch(State* start, State* end, vint capture, vint index) { Transition* transition=NewTransition(start, end); transition->type=Transition::Match; transition->capture=capture; transition->index=index; return transition; } Transition* Automaton::NewPositive(State* start, State* end) { Transition* transition=NewTransition(start, end); transition->type=Transition::Positive; return transition; } Transition* Automaton::NewNegative(State* start, State* end) { Transition* transition=NewTransition(start, end); 
transition->type=Transition::Negative; return transition; } Transition* Automaton::NewNegativeFail(State* start, State* end) { Transition* transition=NewTransition(start, end); transition->type=Transition::NegativeFail; return transition; } Transition* Automaton::NewEnd(State* start, State* end) { Transition* transition=NewTransition(start, end); transition->type=Transition::End; return transition; } /*********************************************************************** Helpers ***********************************************************************/ bool PureEpsilonChecker(Transition* transition) { switch(transition->type) { case Transition::Epsilon: case Transition::Nop: case Transition::Capture: case Transition::End: return true; default: return false; } } bool RichEpsilonChecker(Transition* transition) { switch(transition->type) { case Transition::Epsilon: return true; default: return false; } } bool AreEqual(Transition* transA, Transition* transB) { if(transA->type!=transB->type)return false; switch(transA->type) { case Transition::Chars: return transA->range==transB->range; case Transition::Capture: return transA->capture==transB->capture; case Transition::Match: return transA->capture==transB->capture && transA->index==transB->index; default: return true; } } // Collect epsilon states and non-epsilon transitions, their order are maintained to match the e-NFA void CollectEpsilon(State* targetState, State* sourceState, bool(*epsilonChecker)(Transition*), List& epsilonStates, List& transitions) { if(!epsilonStates.Contains(sourceState)) { epsilonStates.Add(sourceState); for(vint i=0;itransitions.Count();i++) { Transition* transition=sourceState->transitions[i]; if(epsilonChecker(transition)) { if(!epsilonStates.Contains(transition->target)) { if(transition->target->finalState) { targetState->finalState=true; } CollectEpsilon(targetState, transition->target, epsilonChecker, epsilonStates, transitions); } } else { transitions.Add(transition); } } } } 
Automaton::Ref EpsilonNfaToNfa(Automaton::Ref source, bool(*epsilonChecker)(Transition*), Dictionary& nfaStateMap) { Automaton::Ref target=new Automaton; Dictionary stateMap; // source->target List epsilonStates; // current epsilon closure List transitions; // current non-epsilon transitions stateMap.Add(source->startState, target->NewState()); nfaStateMap.Add(stateMap[source->startState], source->startState); target->startState=target->states[0].Obj(); CopyFrom(target->captureNames, source->captureNames); for(vint i=0;istates.Count();i++) { // Clear cache State* targetState=target->states[i].Obj(); State* sourceState=nfaStateMap[targetState]; if(sourceState->finalState) { targetState->finalState=true; } epsilonStates.Clear(); transitions.Clear(); // Collect epsilon states and non-epsilon transitions CollectEpsilon(targetState, sourceState, epsilonChecker, epsilonStates, transitions); // Iterate through all non-epsilon transitions for(vint j=0;jtarget)) { stateMap.Add(transition->target, target->NewState()); nfaStateMap.Add(stateMap[transition->target], transition->target); } // Copy transition to connect between two non-epsilon state Transition* newTransition=target->NewTransition(targetState, stateMap[transition->target]); newTransition->capture=transition->capture; newTransition->index=transition->index; newTransition->range=transition->range; newTransition->type=transition->type; } } return target; } Automaton::Ref NfaToDfa(Automaton::Ref source, Group& dfaStateMap) { Automaton::Ref target=new Automaton; Group nfaTransitions; List transitionClasses; // Maintain order for nfaTransitions.Keys CopyFrom(target->captureNames, source->captureNames); State* startState=target->NewState(); target->startState=startState; dfaStateMap.Add(startState, source->startState); SortedList transitionTargets; SortedList relativeStates; transitionTargets.SetLessMemoryMode(false); relativeStates.SetLessMemoryMode(false); for(vint i=0;istates.Count();i++) { State* 
currentState=target->states[i].Obj(); nfaTransitions.Clear(); transitionClasses.Clear(); // Iterate through all NFA states which represent the DFA state const List& nfaStates=dfaStateMap[currentState]; for(vint j=0;jtransitions.Count();k++) { Transition* nfaTransition=nfaState->transitions[k]; // Check if there is any key in nfaTransitions that has the same input as the current transition Transition* transitionClass=0; for(vint l=0;l& transitionSet=nfaTransitions[transitionClasses[j]]; // Sort all target states and keep unique transitionTargets.Clear(); for(vint l=0;ltarget; if(!transitionTargets.Contains(nfaState)) { transitionTargets.Add(nfaState); } } // Check if these NFA states represent a created DFA state State* dfaState=0; for(vint k=0;kNewState(); for(vint k=0;kfinalState) { dfaState->finalState=true; } } } // Create corresponding DFA transition Transition* transitionClass=transitionClasses[j]; Transition* newTransition=target->NewTransition(currentState, dfaState); newTransition->capture=transitionClass->capture; newTransition->index=transitionClass->index; newTransition->range=transitionClass->range; newTransition->type=transitionClass->type; } } return target; } } } /*********************************************************************** .\REGEXDATA.CPP ***********************************************************************/ /*********************************************************************** Author: Zihan Chen (vczh) Licensed under https://github.com/vczh-libraries/License ***********************************************************************/ namespace vl { namespace regex_internal { /*********************************************************************** CharRange ***********************************************************************/ CharRange::CharRange() :begin(L'\0') ,end(L'\0') { } CharRange::CharRange(wchar_t _begin, wchar_t _end) :begin(_begin) ,end(_end) { } bool CharRange::operator<(CharRange item)const { return end(CharRange 
item)const { return item.end=(CharRange item)const { return *this>item || *this==item; } bool CharRange::operator==(CharRange item)const { return begin==item.begin && end==item.end; } bool CharRange::operator!=(CharRange item)const { return begin!=item.begin || item.end!=end; } bool CharRange::operator<(wchar_t item)const { return end(wchar_t item)const { return item=(wchar_t item)const { return item<=end; } bool CharRange::operator==(wchar_t item)const { return begin<=item && item<=end; } bool CharRange::operator!=(wchar_t item)const { return item { public: bool Apply(CharSetExpression* expression, Expression* target) { CharSetExpression* expected=dynamic_cast(target); if(expected) { if(expression->reverse!=expected->reverse)return false; if(expression->ranges.Count()!=expected->ranges.Count())return false; for(vint i=0;iranges.Count();i++) { if(expression->ranges[i]!=expected->ranges[i])return false; } return true; } return false; } bool Apply(LoopExpression* expression, Expression* target) { LoopExpression* expected=dynamic_cast(target); if(expected) { if(expression->min!=expected->min)return false; if(expression->max!=expected->max)return false; if(expression->preferLong!=expected->preferLong)return false; if(!Invoke(expression->expression, expected->expression.Obj()))return false; return true; } return false; } bool Apply(SequenceExpression* expression, Expression* target) { SequenceExpression* expected=dynamic_cast(target); if(expected) { if(!Invoke(expression->left, expected->left.Obj()))return false; if(!Invoke(expression->right, expected->right.Obj()))return false; return true; } return false; } bool Apply(AlternateExpression* expression, Expression* target) { AlternateExpression* expected=dynamic_cast(target); if(expected) { if(!Invoke(expression->left, expected->left.Obj()))return false; if(!Invoke(expression->right, expected->right.Obj()))return false; return true; } return false; } bool Apply(BeginExpression* expression, Expression* target) { 
BeginExpression* expected=dynamic_cast(target); if(expected) { return true; } return false; } bool Apply(EndExpression* expression, Expression* target) { EndExpression* expected=dynamic_cast(target); if(expected) { return true; } return false; } bool Apply(CaptureExpression* expression, Expression* target) { CaptureExpression* expected=dynamic_cast(target); if(expected) { if(expression->name!=expected->name)return false; if(!Invoke(expression->expression, expected->expression.Obj()))return false; return true; } return false; } bool Apply(MatchExpression* expression, Expression* target) { MatchExpression* expected=dynamic_cast(target); if(expected) { if(expression->name!=expected->name)return false; if(expression->index!=expected->index)return false; return true; } return false; } bool Apply(PositiveExpression* expression, Expression* target) { PositiveExpression* expected=dynamic_cast(target); if(expected) { if(!Invoke(expression->expression, expected->expression.Obj()))return false; return true; } return false; } bool Apply(NegativeExpression* expression, Expression* target) { NegativeExpression* expected=dynamic_cast(target); if(expected) { if(!Invoke(expression->expression, expected->expression.Obj()))return false; return true; } return false; } bool Apply(UsingExpression* expression, Expression* target) { UsingExpression* expected=dynamic_cast(target); if(expected) { if(expression->name!=expected->name)return false; return true; } return false; } }; /*********************************************************************** HasNoExtensionAlgorithm ***********************************************************************/ class HasNoExtensionAlgorithm : public RegexExpressionAlgorithm { public: bool Apply(CharSetExpression* expression, void* target) { return true; } bool Apply(LoopExpression* expression, void* target) { return expression->preferLong && Invoke(expression->expression, 0); } bool Apply(SequenceExpression* expression, void* target) { return 
Invoke(expression->left, 0) && Invoke(expression->right, 0); } bool Apply(AlternateExpression* expression, void* target) { return Invoke(expression->left, 0) && Invoke(expression->right, 0); } bool Apply(BeginExpression* expression, void* target) { return false; } bool Apply(EndExpression* expression, void* target) { return false; } bool Apply(CaptureExpression* expression, void* target) { return false; } bool Apply(MatchExpression* expression, void* target) { return false; } bool Apply(PositiveExpression* expression, void* target) { return false; } bool Apply(NegativeExpression* expression, void* target) { return false; } bool Apply(UsingExpression* expression, void* target) { return false; } }; /*********************************************************************** CanTreatAsPureAlgorithm ***********************************************************************/ class CanTreatAsPureAlgorithm : public RegexExpressionAlgorithm { public: bool Apply(CharSetExpression* expression, void* target) { return true; } bool Apply(LoopExpression* expression, void* target) { return expression->preferLong && Invoke(expression->expression, 0); } bool Apply(SequenceExpression* expression, void* target) { return Invoke(expression->left, 0) && Invoke(expression->right, 0); } bool Apply(AlternateExpression* expression, void* target) { return Invoke(expression->left, 0) && Invoke(expression->right, 0); } bool Apply(BeginExpression* expression, void* target) { return false; } bool Apply(EndExpression* expression, void* target) { return false; } bool Apply(CaptureExpression* expression, void* target) { return Invoke(expression->expression, 0); } bool Apply(MatchExpression* expression, void* target) { return false; } bool Apply(PositiveExpression* expression, void* target) { return false; } bool Apply(NegativeExpression* expression, void* target) { return false; } bool Apply(UsingExpression* expression, void* target) { return false; } }; 
/*********************************************************************** CharSetNormalizationAlgorithm ***********************************************************************/ class NormalizedCharSet { public: CharRange::List ranges; }; class CharSetAlgorithm : public RegexExpressionAlgorithm { public: virtual void Process(CharSetExpression* expression, NormalizedCharSet* target, CharRange range)=0; void Loop(CharSetExpression* expression, CharRange::List& ranges, NormalizedCharSet* target) { if(expression->reverse) { wchar_t begin=1; for(vint i=0;ibegin) { Process(expression, target, CharRange(begin, range.begin-1)); } begin=range.end+1; } if(begin<=65535) { Process(expression, target, CharRange(begin, 65535)); } } else { for(vint i=0;iexpression, target); } void Apply(SequenceExpression* expression, NormalizedCharSet* target) { Invoke(expression->left, target); Invoke(expression->right, target); } void Apply(AlternateExpression* expression, NormalizedCharSet* target) { Invoke(expression->left, target); Invoke(expression->right, target); } void Apply(BeginExpression* expression, NormalizedCharSet* target) { } void Apply(EndExpression* expression, NormalizedCharSet* target) { } void Apply(CaptureExpression* expression, NormalizedCharSet* target) { Invoke(expression->expression, target); } void Apply(MatchExpression* expression, NormalizedCharSet* target) { } void Apply(PositiveExpression* expression, NormalizedCharSet* target) { Invoke(expression->expression, target); } void Apply(NegativeExpression* expression, NormalizedCharSet* target) { Invoke(expression->expression, target); } void Apply(UsingExpression* expression, NormalizedCharSet* target) { } }; class BuildNormalizedCharSetAlgorithm : public CharSetAlgorithm { public: void Process(CharSetExpression* expression, NormalizedCharSet* target, CharRange range) { vint index=0; while(indexranges.Count()) { CharRange current=target->ranges[index]; if(currentrange) { index++; } else 
if(current.beginranges.RemoveAt(index); target->ranges.Add(CharRange(current.begin, range.begin-1)); target->ranges.Add(CharRange(range.begin, current.end)); index++; } else if(current.begin>range.begin) { // range : [ ] // current : [ ? target->ranges.Add(CharRange(range.begin, current.begin-1)); range.begin=current.begin; } else if(current.endrange.end) { // range : [ ] // current : [ ] target->ranges.RemoveAt(index); target->ranges.Add(range); target->ranges.Add(CharRange(range.end+1, current.end)); return; } else { // range : [ ] // current : [ ] return; } } target->ranges.Add(range); } void Apply(CharSetExpression* expression, NormalizedCharSet* target) { Loop(expression, expression->ranges, target); } }; class SetNormalizedCharSetAlgorithm : public CharSetAlgorithm { public: void Process(CharSetExpression* expression, NormalizedCharSet* target, CharRange range) { for(vint j=0;jranges.Count();j++) { CharRange targetRange=target->ranges[j]; if(range.begin<=targetRange.begin && targetRange.end<=range.end) { expression->ranges.Add(targetRange); } } } void Apply(CharSetExpression* expression, NormalizedCharSet* target) { CharRange::List source; CopyFrom(source, expression->ranges); expression->ranges.Clear(); Loop(expression, source, target); expression->reverse=false; } }; /*********************************************************************** MergeAlgorithm ***********************************************************************/ class MergeParameter { public: Expression::Map definitions; RegexExpression* regex; }; class MergeAlgorithm : public RegexExpressionAlgorithm { public: Expression::Ref Apply(CharSetExpression* expression, MergeParameter* target) { Ptr result=new CharSetExpression; CopyFrom(result->ranges, expression->ranges); result->reverse=expression->reverse; return result; } Expression::Ref Apply(LoopExpression* expression, MergeParameter* target) { Ptr result=new LoopExpression; result->max=expression->max; result->min=expression->min; 
result->preferLong=expression->preferLong; result->expression=Invoke(expression->expression, target); return result; } Expression::Ref Apply(SequenceExpression* expression, MergeParameter* target) { Ptr result=new SequenceExpression; result->left=Invoke(expression->left, target); result->right=Invoke(expression->right, target); return result; } Expression::Ref Apply(AlternateExpression* expression, MergeParameter* target) { Ptr result=new AlternateExpression; result->left=Invoke(expression->left, target); result->right=Invoke(expression->right, target); return result; } Expression::Ref Apply(BeginExpression* expression, MergeParameter* target) { return new BeginExpression; } Expression::Ref Apply(EndExpression* expression, MergeParameter* target) { return new EndExpression; } Expression::Ref Apply(CaptureExpression* expression, MergeParameter* target) { Ptr result=new CaptureExpression; result->expression=Invoke(expression->expression, target); result->name=expression->name; return result; } Expression::Ref Apply(MatchExpression* expression, MergeParameter* target) { Ptr result=new MatchExpression; result->name=expression->name; result->index=expression->index; return result; } Expression::Ref Apply(PositiveExpression* expression, MergeParameter* target) { Ptr result=new PositiveExpression; result->expression=Invoke(expression->expression, target); return result; } Expression::Ref Apply(NegativeExpression* expression, MergeParameter* target) { Ptr result=new NegativeExpression; result->expression=Invoke(expression->expression, target); return result; } Expression::Ref Apply(UsingExpression* expression, MergeParameter* target) { if(target->definitions.Keys().Contains(expression->name)) { Expression::Ref reference=target->definitions[expression->name]; if(reference) { return reference; } else { throw ArgumentException(L"Regular expression syntax error: Found reference loops in\""+expression->name+L"\".", L"vl::regex_internal::RegexExpression::Merge", L""); } } else 
if(target->regex->definitions.Keys().Contains(expression->name)) { target->definitions.Add(expression->name, 0); Expression::Ref result=Invoke(target->regex->definitions[expression->name], target); target->definitions.Set(expression->name, result); return result; } else { throw ArgumentException(L"Regular expression syntax error: Cannot find sub expression reference\""+expression->name+L"\".", L"vl::regex_internal::RegexExpression::Merge", L""); } } }; /*********************************************************************** EpsilonNfaAlgorithm ***********************************************************************/ class EpsilonNfaInfo { public: Automaton::Ref automaton; }; class EpsilonNfa { public: State* start; State* end; EpsilonNfa() { start=0; end=0; } }; class EpsilonNfaAlgorithm : public RegexExpressionAlgorithm { public: EpsilonNfa Connect(EpsilonNfa a, EpsilonNfa b, Automaton* target) { if(a.start) { target->NewEpsilon(a.end, b.start); a.end=b.end; return a; } else { return b; } } EpsilonNfa Apply(CharSetExpression* expression, Automaton* target) { EpsilonNfa nfa; nfa.start=target->NewState(); nfa.end=target->NewState(); for(vint i=0;iranges.Count();i++) { target->NewChars(nfa.start, nfa.end, expression->ranges[i]); } return nfa; } EpsilonNfa Apply(LoopExpression* expression, Automaton* target) { EpsilonNfa head; for(vint i=0;imin;i++) { EpsilonNfa body=Invoke(expression->expression, target); head=Connect(head, body, target); } if(expression->max==-1) { EpsilonNfa body=Invoke(expression->expression, target); if(!head.start) { head.start=head.end=target->NewState(); } State* loopBegin=head.end; State* loopEnd=target->NewState(); if(expression->preferLong) { target->NewEpsilon(loopBegin, body.start); target->NewEpsilon(body.end, loopBegin); target->NewNop(loopBegin, loopEnd); } else { target->NewNop(loopBegin, loopEnd); target->NewEpsilon(loopBegin, body.start); target->NewEpsilon(body.end, loopBegin); } head.end=loopEnd; } else 
if(expression->max>expression->min) { for(vint i=expression->min;imax;i++) { EpsilonNfa body=Invoke(expression->expression, target); State* start=target->NewState(); State* end=target->NewState(); if(expression->preferLong) { target->NewEpsilon(start, body.start); target->NewEpsilon(body.end, end); target->NewNop(start, end); } else { target->NewNop(start, end); target->NewEpsilon(start, body.start); target->NewEpsilon(body.end, end); } body.start=start; body.end=end; head=Connect(head, body, target); } } return head; } EpsilonNfa Apply(SequenceExpression* expression, Automaton* target) { EpsilonNfa a=Invoke(expression->left, target); EpsilonNfa b=Invoke(expression->right, target); return Connect(a, b, target); } EpsilonNfa Apply(AlternateExpression* expression, Automaton* target) { EpsilonNfa result; result.start=target->NewState(); result.end=target->NewState(); EpsilonNfa a=Invoke(expression->left, target); EpsilonNfa b=Invoke(expression->right, target); target->NewEpsilon(result.start, a.start); target->NewEpsilon(a.end, result.end); target->NewEpsilon(result.start, b.start); target->NewEpsilon(b.end, result.end); return result; } EpsilonNfa Apply(BeginExpression* expression, Automaton* target) { EpsilonNfa result; result.start=target->NewState(); result.end=target->NewState(); target->NewBeginString(result.start, result.end); return result; } EpsilonNfa Apply(EndExpression* expression, Automaton* target) { EpsilonNfa result; result.start=target->NewState(); result.end=target->NewState(); target->NewEndString(result.start, result.end); return result; } EpsilonNfa Apply(CaptureExpression* expression, Automaton* target) { EpsilonNfa result; result.start=target->NewState(); result.end=target->NewState(); vint capture=-1; if(expression->name!=L"") { capture=target->captureNames.IndexOf(expression->name); if(capture==-1) { capture=target->captureNames.Count(); target->captureNames.Add(expression->name); } } EpsilonNfa body=Invoke(expression->expression, target); 
target->NewCapture(result.start, body.start, capture); target->NewEnd(body.end, result.end); return result; } EpsilonNfa Apply(MatchExpression* expression, Automaton* target) { vint capture=-1; if(expression->name!=L"") { capture=target->captureNames.IndexOf(expression->name); if(capture==-1) { capture=target->captureNames.Count(); target->captureNames.Add(expression->name); } } EpsilonNfa result; result.start=target->NewState(); result.end=target->NewState(); target->NewMatch(result.start, result.end, capture, expression->index); return result; } EpsilonNfa Apply(PositiveExpression* expression, Automaton* target) { EpsilonNfa result; result.start=target->NewState(); result.end=target->NewState(); EpsilonNfa body=Invoke(expression->expression, target); target->NewPositive(result.start, body.start); target->NewEnd(body.end, result.end); return result; } EpsilonNfa Apply(NegativeExpression* expression, Automaton* target) { EpsilonNfa result; result.start=target->NewState(); result.end=target->NewState(); EpsilonNfa body=Invoke(expression->expression, target); target->NewNegative(result.start, body.start); target->NewEnd(body.end, result.end); target->NewNegativeFail(result.start, result.end); return result; } EpsilonNfa Apply(UsingExpression* expression, Automaton* target) { CHECK_FAIL(L"RegexExpression::GenerateEpsilonNfa()#UsingExpression cannot create state machine."); } }; /*********************************************************************** Expression ***********************************************************************/ bool Expression::IsEqual(vl::regex_internal::Expression *expression) { return IsEqualAlgorithm().Invoke(this, expression); } bool Expression::HasNoExtension() { return HasNoExtensionAlgorithm().Invoke(this, 0); } bool Expression::CanTreatAsPure() { return CanTreatAsPureAlgorithm().Invoke(this, 0); } void Expression::NormalizeCharSet(CharRange::List& subsets) { NormalizedCharSet normalized; BuildNormalizedCharSetAlgorithm().Invoke(this, 
&normalized); SetNormalizedCharSetAlgorithm().Invoke(this, &normalized); CopyFrom(subsets, normalized.ranges); } void Expression::CollectCharSet(CharRange::List& subsets) { NormalizedCharSet normalized; CopyFrom(normalized.ranges, subsets); BuildNormalizedCharSetAlgorithm().Invoke(this, &normalized); CopyFrom(subsets, normalized.ranges); } void Expression::ApplyCharSet(CharRange::List& subsets) { NormalizedCharSet normalized; CopyFrom(normalized.ranges, subsets); SetNormalizedCharSetAlgorithm().Invoke(this, &normalized); } Automaton::Ref Expression::GenerateEpsilonNfa() { Automaton::Ref automaton=new Automaton; EpsilonNfa result=EpsilonNfaAlgorithm().Invoke(this, automaton.Obj()); automaton->startState=result.start; result.end->finalState=true; return automaton; } /*********************************************************************** CharSetExpression ***********************************************************************/ bool CharSetExpression::AddRangeWithConflict(CharRange range) { if(range.begin>range.end) { wchar_t t=range.begin; range.begin=range.end; range.end=t; } for(vint i=0;iranges[i])) { return false; } } ranges.Add(range); return true; } /*********************************************************************** RegexExpression ***********************************************************************/ Expression::Ref RegexExpression::Merge() { MergeParameter merge; merge.regex=this; return MergeAlgorithm().Invoke(expression, &merge); } /*********************************************************************** Expression::Apply ***********************************************************************/ void CharSetExpression::Apply(IRegexExpressionAlgorithm& algorithm) { algorithm.Visit(this); } void LoopExpression::Apply(IRegexExpressionAlgorithm& algorithm) { algorithm.Visit(this); } void SequenceExpression::Apply(IRegexExpressionAlgorithm& algorithm) { algorithm.Visit(this); } void AlternateExpression::Apply(IRegexExpressionAlgorithm& algorithm) { 
algorithm.Visit(this); } void BeginExpression::Apply(IRegexExpressionAlgorithm& algorithm) { algorithm.Visit(this); } void EndExpression::Apply(IRegexExpressionAlgorithm& algorithm) { algorithm.Visit(this); } void CaptureExpression::Apply(IRegexExpressionAlgorithm& algorithm) { algorithm.Visit(this); } void MatchExpression::Apply(IRegexExpressionAlgorithm& algorithm) { algorithm.Visit(this); } void PositiveExpression::Apply(IRegexExpressionAlgorithm& algorithm) { algorithm.Visit(this); } void NegativeExpression::Apply(IRegexExpressionAlgorithm& algorithm) { algorithm.Visit(this); } void UsingExpression::Apply(IRegexExpressionAlgorithm& algorithm) { algorithm.Visit(this); } } } /*********************************************************************** .\REGEXPARSER.CPP ***********************************************************************/ /*********************************************************************** Author: Zihan Chen (vczh) Licensed under https://github.com/vczh-libraries/License ***********************************************************************/ namespace vl { namespace regex_internal { /*********************************************************************** Helper Functions ***********************************************************************/ bool IsChar(const wchar_t*& input, wchar_t c) { if(*input==c) { input++; return true; } else { return false; } } bool IsChars(const wchar_t*& input, const wchar_t* chars, wchar_t& c) { const wchar_t* position=::wcschr(chars, *input); if(position) { c=*input++; return true; } else { return false; } } bool IsStr(const wchar_t*& input, const wchar_t* str) { size_t len=wcslen(str); if(wcsncmp(input, str, len)==0) { input+=len; return true; } else { return false; } } bool IsChars(const wchar_t*& input, const wchar_t* chars) { wchar_t c; return IsChars(input, chars, c); } bool IsPositiveInteger(const wchar_t*& input, vint& number) { bool readed=false; number=0; while(L'0'<=*input && *input<=L'9') { 
number=number*10+(*input++)-L'0'; readed=true; } return readed; } bool IsName(const wchar_t*& input, WString& name) { const wchar_t* read=input; if((L'A'<=*read && *read<=L'Z') || (L'a'<=*read && *read<=L'z') || *read==L'_') { read++; while((L'A'<=*read && *read<=L'Z') || (L'a'<=*read && *read<=L'z') || (L'0'<=*read && *read<=L'9') || *read==L'_') { read++; } } if(input==read) { return false; } else { name=WString(input, vint(read-input)); input=read; return true; } } Ptr ParseLoop(const wchar_t*& input) { vint min=0; vint max=0; if(!*input) { return 0; } else if(IsChar(input, L'+')) { min=1; max=-1; } else if(IsChar(input, L'*')) { min=0; max=-1; } else if(IsChar(input, L'?')) { min=0; max=1; } else if(IsChar(input, L'{')) { if(IsPositiveInteger(input, min)) { if(IsChar(input, L',')) { if(!IsPositiveInteger(input, max)) { max=-1; } } else { max=min; } if(!IsChar(input, L'}')) { goto THROW_EXCEPTION; } } else { goto THROW_EXCEPTION; } } else { return 0; } { LoopExpression* expression=new LoopExpression; expression->min=min; expression->max=max; expression->preferLong=!IsChar(input, L'?'); return expression; } THROW_EXCEPTION: throw ArgumentException(L"Regular expression syntax error: Illegal loop expression.", L"vl::regex_internal::ParseLoop", L"input"); } Ptr ParseCharSet(const wchar_t*& input) { if(!*input) { return 0; } else if(IsChar(input, L'^')) { return new BeginExpression; } else if(IsChar(input, L'$')) { return new EndExpression; } else if(IsChar(input, L'\\') || IsChar(input, L'/')) { Ptr expression=new CharSetExpression; expression->reverse=false; switch(*input) { case L'.': expression->ranges.Add(CharRange(1, 65535)); break; case L'r': expression->ranges.Add(CharRange(L'\r', L'\r')); break; case L'n': expression->ranges.Add(CharRange(L'\n', L'\n')); break; case L't': expression->ranges.Add(CharRange(L'\t', L'\t')); break; case L'\\':case L'/':case L'(':case L')':case L'+':case L'*':case L'?':case L'|': case L'{':case L'}':case L'[':case L']':case 
L'<':case L'>': case L'^':case L'$':case L'!':case L'=': expression->ranges.Add(CharRange(*input, *input)); break; case L'S': expression->reverse=true; case L's': expression->ranges.Add(CharRange(L' ', L' ')); expression->ranges.Add(CharRange(L'\r', L'\r')); expression->ranges.Add(CharRange(L'\n', L'\n')); expression->ranges.Add(CharRange(L'\t', L'\t')); break; case L'D': expression->reverse=true; case L'd': expression->ranges.Add(CharRange(L'0', L'9')); break; case L'L': expression->reverse=true; case L'l': expression->ranges.Add(CharRange(L'_', L'_')); expression->ranges.Add(CharRange(L'A', L'Z')); expression->ranges.Add(CharRange(L'a', L'z')); break; case L'W': expression->reverse=true; case L'w': expression->ranges.Add(CharRange(L'_', L'_')); expression->ranges.Add(CharRange(L'0', L'9')); expression->ranges.Add(CharRange(L'A', L'Z')); expression->ranges.Add(CharRange(L'a', L'z')); break; default: throw ArgumentException(L"Regular expression syntax error: Illegal character escaping.", L"vl::regex_internal::ParseCharSet", L"input"); } input++; return expression; } else if(IsChar(input, L'[')) { Ptr expression=new CharSetExpression; if(IsChar(input, L'^')) { expression->reverse=true; } else { expression->reverse=false; } bool midState=false; wchar_t a=L'\0'; wchar_t b=L'\0'; while(true) { if(IsChar(input, L'\\') || IsChar(input, L'/')) { wchar_t c=L'\0'; switch(*input) { case L'r': c=L'\r'; break; case L'n': c=L'\n'; break; case L't': c=L'\t'; break; case L'-':case L'[':case L']':case L'\\':case L'/':case L'^':case L'$': c=*input; break; default: throw ArgumentException(L"Regular expression syntax error: Illegal character escaping, only \"rnt-[]\\/\" are legal escaped characters in [].", L"vl::regex_internal::ParseCharSet", L"input"); } input++; midState?b=c:a=c; midState=!midState; } else if(IsChars(input, L"-]")) { goto THROW_EXCEPTION; } else if(*input) { midState?b=*input++:a=*input++; midState=!midState; } else { goto THROW_EXCEPTION; } if(IsChar(input, 
L']')) { if(midState) { b=a; } if(!expression->AddRangeWithConflict(CharRange(a, b))) { goto THROW_EXCEPTION; } break; } else if(IsChar(input, L'-')) { if(!midState) { goto THROW_EXCEPTION; } } else { if(midState) { b=a; } if(expression->AddRangeWithConflict(CharRange(a, b))) { midState=false; } else { goto THROW_EXCEPTION; } } } return expression; THROW_EXCEPTION: throw ArgumentException(L"Regular expression syntax error: Illegal character set definition."); } else if(IsChars(input, L"()+*?{}|")) { input--; return 0; } else { CharSetExpression* expression=new CharSetExpression; expression->reverse=false; expression->ranges.Add(CharRange(*input, *input)); input++; return expression; } } Ptr ParseFunction(const wchar_t*& input) { if(IsStr(input, L"(=")) { Ptr sub=ParseExpression(input); if(!IsChar(input, L')')) { goto NEED_RIGHT_BRACKET; } PositiveExpression* expression=new PositiveExpression; expression->expression=sub; return expression; } else if(IsStr(input, L"(!")) { Ptr sub=ParseExpression(input); if(!IsChar(input, L')')) { goto NEED_RIGHT_BRACKET; } NegativeExpression* expression=new NegativeExpression; expression->expression=sub; return expression; } else if(IsStr(input, L"(<&")) { WString name; if(!IsName(input, name)) { goto NEED_NAME; } if(!IsChar(input, L'>')) { goto NEED_GREATER; } if(!IsChar(input, L')')) { goto NEED_RIGHT_BRACKET; } UsingExpression* expression=new UsingExpression; expression->name=name; return expression; } else if(IsStr(input, L"(<$")) { WString name; vint index=-1; if(IsName(input, name)) { if(IsChar(input, L';')) { if(!IsPositiveInteger(input, index)) { goto NEED_NUMBER; } } } else if(!IsPositiveInteger(input, index)) { goto NEED_NUMBER; } if(!IsChar(input, L'>')) { goto NEED_GREATER; } if(!IsChar(input, L')')) { goto NEED_RIGHT_BRACKET; } MatchExpression* expression=new MatchExpression; expression->name=name; expression->index=index; return expression; } else if(IsStr(input, L"(<")) { WString name; if(!IsName(input, name)) { goto 
NEED_NAME; } if(!IsChar(input, L'>')) { goto NEED_GREATER; } Ptr sub=ParseExpression(input); if(!IsChar(input, L')')) { goto NEED_RIGHT_BRACKET; } CaptureExpression* expression=new CaptureExpression; expression->name=name; expression->expression=sub; return expression; } else if(IsStr(input, L"(?")) { Ptr sub=ParseExpression(input); if(!IsChar(input, L')')) { goto NEED_RIGHT_BRACKET; } CaptureExpression* expression=new CaptureExpression; expression->expression=sub; return expression; } else if(IsChar(input, L'(')) { Ptr sub=ParseExpression(input); if(!IsChar(input, L')')) { goto NEED_RIGHT_BRACKET; } return sub; } else { return 0; } NEED_RIGHT_BRACKET: throw ArgumentException(L"Regular expression syntax error: \")\" expected.", L"vl::regex_internal::ParseFunction", L"input"); NEED_GREATER: throw ArgumentException(L"Regular expression syntax error: \">\" expected.", L"vl::regex_internal::ParseFunction", L"input"); NEED_NAME: throw ArgumentException(L"Regular expression syntax error: Identifier expected.", L"vl::regex_internal::ParseFunction", L"input"); NEED_NUMBER: throw ArgumentException(L"Regular expression syntax error: Number expected.", L"vl::regex_internal::ParseFunction", L"input"); } Ptr ParseUnit(const wchar_t*& input) { Ptr unit=ParseCharSet(input); if(!unit) { unit=ParseFunction(input); } if(!unit) { return 0; } Ptr loop; while((loop=ParseLoop(input))) { loop->expression=unit; unit=loop; } return unit; } Ptr ParseJoin(const wchar_t*& input) { Ptr expression=ParseUnit(input); while(true) { Ptr right=ParseUnit(input); if(right) { SequenceExpression* sequence=new SequenceExpression; sequence->left=expression; sequence->right=right; expression=sequence; } else { break; } } return expression; } Ptr ParseAlt(const wchar_t*& input) { Ptr expression=ParseJoin(input); while(true) { if(IsChar(input, L'|')) { Ptr right=ParseJoin(input); if(right) { AlternateExpression* alternate=new AlternateExpression; alternate->left=expression; alternate->right=right; 
expression=alternate; } else { throw ArgumentException(L"Regular expression syntax error: Expression expected.", L"vl::regex_internal::ParseAlt", L"input"); } } else { break; } } return expression; } Ptr ParseExpression(const wchar_t*& input) { return ParseAlt(input); } RegexExpression::Ref ParseRegexExpression(const WString& code) { RegexExpression::Ref regex=new RegexExpression; const wchar_t* start=code.Buffer(); const wchar_t* input=start; try { while(IsStr(input, L"(<#")) { WString name; if(!IsName(input, name)) { throw ArgumentException(L"Regular expression syntax error: Identifier expected.", L"vl::regex_internal::ParseRegexExpression", L"code"); } if(!IsChar(input, L'>')) { throw ArgumentException(L"Regular expression syntax error: \">\" expected.", L"vl::regex_internal::ParseFunction", L"input"); } Ptr sub=ParseExpression(input); if(!IsChar(input, L')')) { throw ArgumentException(L"Regular expression syntax error: \")\" expected.", L"vl::regex_internal::ParseFunction", L"input"); } if(regex->definitions.Keys().Contains(name)) { throw ArgumentException(L"Regular expression syntax error: Found duplicated sub expression name: \""+name+L"\". 
", L"vl::regex_internal::ParseFunction", L"input"); } else { regex->definitions.Add(name, sub); } } regex->expression=ParseExpression(input); if(!regex->expression) { throw ArgumentException(L"Regular expression syntax error: Expression expected.", L"vl::regex_internal::ParseUnit", L"input"); } if(*input) { throw ArgumentException(L"Regular expression syntax error: Found unnecessary tokens.", L"vl::regex_internal::ParseUnit", L"input"); } return regex; } catch(const ArgumentException& e) { throw ParsingException(e.Message(), code, input-start); } } WString EscapeTextForRegex(const WString& literalString) { WString result; for(vint i=0;i': case L'^':case L'$':case L'!':case L'=': result+=WString(L"\\")+c; break; case L'\r': result+=L"\\r"; break; case L'\n': result+=L"\\n"; break; case L'\t': result+=L"\\t"; break; default: result+=c; } } return result; } WString UnescapeTextForRegex(const WString& escapedText) { WString result; for(vint i=0;i': case L'^':case L'$':case L'!':case L'=': return false; } } } return true; } } } /*********************************************************************** .\REGEXPURE.CPP ***********************************************************************/ /*********************************************************************** Author: Zihan Chen (vczh) Licensed under https://github.com/vczh-libraries/License ***********************************************************************/ namespace vl { namespace regex_internal { /*********************************************************************** PureInterpretor ***********************************************************************/ PureInterpretor::PureInterpretor(Automaton::Ref dfa, CharRange::List& subsets) :transition(0) ,finalState(0) ,relatedFinalState(0) { stateCount=dfa->states.Count(); charSetCount=subsets.Count()+1; startState=dfa->states.IndexOf(dfa->startState); // Map char to input index (equivalent char class) for(vint i=0;istates[i].Obj(); for(vint 
j=0;jtransitions.Count();j++) { Transition* dfaTransition=state->transitions[j]; switch(dfaTransition->type) { case Transition::Chars: { vint index=subsets.IndexOf(dfaTransition->range); if(index==-1) { CHECK_ERROR(false, L"PureInterpretor::PureInterpretor(Automaton::Ref, CharRange::List&)#Specified chars don't appear in the normalized char ranges."); } transition[i][index]=dfa->states.IndexOf(dfaTransition->target); } break; default: CHECK_ERROR(false, L"PureInterpretor::PureInterpretor(Automaton::Ref, CharRange::List&)#PureInterpretor only accepts Transition::Chars transitions."); } } } // Mark final states finalState=new bool[stateCount]; for(vint i=0;istates[i]->finalState; } } PureInterpretor::~PureInterpretor() { if(relatedFinalState) delete[] relatedFinalState; delete[] finalState; for(vint i=0;i=SupportedCharCount)break; #endif vint charIndex=charMap[*read++]; currentState=transition[currentState][charIndex]; } if(result.finalState==-1) { if(terminateLength>0) { result.terminateState=terminateState; } result.length=terminateLength; return false; } else { return true; } } bool PureInterpretor::Match(const wchar_t* input, const wchar_t* start, PureResult& result) { const wchar_t* read=input; while(*read) { if(MatchHead(read, start, result)) { return true; } read++; } return false; } vint PureInterpretor::GetStartState() { return startState; } vint PureInterpretor::Transit(wchar_t input, vint state) { if(0<=state && state struct POD { static const bool Result = true; }; template<> struct POD { static const bool Result = true; }; namespace regex_internal { using namespace collections; void Push(List& elements, vint& available, vint& count, const ExtensionSaver& element) { if(elements.Count()==count) { elements.Add(element); } else { elements[count]=element; } ExtensionSaver& current=elements[count]; current.previous=available; available=count++; } ExtensionSaver Pop(List& elements, vint& available, vint& count) { ExtensionSaver& current=elements[available]; 
available=current.previous; return current; } template void PushNonSaver(List& elements, vint& count, const T& element) { if(elements.Count()==count) { elements.Add(element); } else { elements[count]=element; } count++; } template T PopNonSaver(List& elements, vint& count) { return elements[--count]; } } namespace regex_internal { /*********************************************************************** CaptureRecord ***********************************************************************/ bool CaptureRecord::operator==(const CaptureRecord& record)const { return capture==record.capture && start==record.start && length==record.length; } /*********************************************************************** RichInterpretor ***********************************************************************/ RichInterpretor::RichInterpretor(Automaton::Ref _dfa) :dfa(_dfa) { datas=new UserData[dfa->states.Count()]; for(vint i=0;istates.Count();i++) { State* state=dfa->states[i].Obj(); vint charEdges=0; vint nonCharEdges=0; bool mustSave=false; for(vint j=0;jtransitions.Count();j++) { if(state->transitions[j]->type==Transition::Chars) { charEdges++; } else { if(state->transitions[j]->type==Transition::Negative || state->transitions[j]->type==Transition::Positive) { mustSave=true; } nonCharEdges++; } } datas[i].NeedKeepState=mustSave || nonCharEdges>1 || (nonCharEdges!=0 && charEdges!=0); state->userData=&datas[i]; } } RichInterpretor::~RichInterpretor() { delete[] datas; } bool RichInterpretor::MatchHead(const wchar_t* input, const wchar_t* start, RichResult& result) { List stateSavers; List extensionSavers; StateSaver currentState; currentState.captureCount=0; currentState.currentState=dfa->startState; currentState.extensionSaverAvailable=-1; currentState.extensionSaverCount=0; currentState.minTransition=0; currentState.reading=input; currentState.stateSaverCount=0; currentState.storeType=StateSaver::Other; while (!currentState.currentState->finalState) { bool found = false; // 
true means at least one transition matches the input StateSaver oldState = currentState; // Iterate through all transitions from the current state for (vint i = currentState.minTransition; i < currentState.currentState->transitions.Count(); i++) { Transition* transition = currentState.currentState->transitions[i]; switch (transition->type) { case Transition::Chars: { // match the input if the current character fall into the range CharRange range = transition->range; found = range.begin <= *currentState.reading && range.end >= *currentState.reading; if (found) { currentState.reading++; } } break; case Transition::BeginString: { // match the input if this is the first character, and it is not consumed found = currentState.reading == start; } break; case Transition::EndString: { // match the input if this is after the last character, and it is not consumed found = *currentState.reading == L'\0'; } break; case Transition::Nop: { // match without any condition found = true; } break; case Transition::Capture: { // Push the capture information ExtensionSaver saver; saver.captureListIndex = currentState.captureCount; saver.reading = currentState.reading; saver.transition = transition; Push(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount, saver); // Push the capture record, and it will be written if the input matches the regex CaptureRecord capture; capture.capture = transition->capture; capture.start = currentState.reading - start; capture.length = -1; PushNonSaver(result.captures, currentState.captureCount, capture); found = true; } break; case Transition::Match: { vint index = 0; for (vint j = 0; j < currentState.captureCount; j++) { CaptureRecord& capture = result.captures[j]; // If the capture name matched if (capture.capture == transition->capture) { // If the capture index matched, or it is -1 if (capture.length != -1 && (transition->index == -1 || transition->index == index)) { // If the captured text matched if (wcsncmp(start 
+ capture.start, currentState.reading, capture.length) == 0) { // Consume so much input currentState.reading += capture.length; found = true; break; } } // Fail if f the captured text with the specified name and index doesn't match if (transition->index != -1 && index == transition->index) { break; } else { index++; } } } } break; case Transition::Positive: { // Push the positive lookahead information ExtensionSaver saver; saver.captureListIndex = -1; saver.reading = currentState.reading; saver.transition = transition; Push(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount, saver); // Set found = true so that PushNonSaver(oldState) happens later oldState.storeType = StateSaver::Positive; found = true; } break; case Transition::Negative: { // Push the positive lookahead information ExtensionSaver saver; saver.captureListIndex = -1; saver.reading = currentState.reading; saver.transition = transition; Push(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount, saver); // Set found = true so that PushNonSaver(oldState) happens later oldState.storeType = StateSaver::Negative; found = true; } break; case Transition::NegativeFail: { // NegativeFail will be used when the nagative lookahead failed } break; case Transition::End: { // Find the corresponding extension saver so that we can know how to deal with a matched sub regex that ends here ExtensionSaver extensionSaver = Pop(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount); switch (extensionSaver.transition->type) { case Transition::Capture: { // Write the captured text CaptureRecord& capture = result.captures[extensionSaver.captureListIndex]; capture.length = (currentState.reading - start) - capture.start; found = true; } break; case Transition::Positive: // Find the last positive lookahead state saver for (vint j = currentState.stateSaverCount - 1; j >= 0; j--) { StateSaver& stateSaver = stateSavers[j]; if 
(stateSaver.storeType == StateSaver::Positive) { // restore the parsing state just before matching the positive lookahead, since positive lookahead doesn't consume input oldState.reading = stateSaver.reading; oldState.stateSaverCount = j; currentState.reading = stateSaver.reading; currentState.stateSaverCount = j; break; } } found = true; break; case Transition::Negative: // Find the last negative lookahead state saver for (vint j = currentState.stateSaverCount - 1; j >= 0; j--) { StateSaver& stateSaver = stateSavers[j]; if (stateSaver.storeType == StateSaver::Negative) { // restore the parsing state just before matching the negative lookahead, since positive lookahead doesn't consume input oldState = stateSaver; oldState.storeType = StateSaver::Other; currentState = stateSaver; currentState.storeType = StateSaver::Other; i = currentState.minTransition - 1; break; } } break; default:; } } break; default:; } // Save the parsing state when necessary if (found) { UserData* data = (UserData*)currentState.currentState->userData; if (data->NeedKeepState) { oldState.minTransition = i + 1; PushNonSaver(stateSavers, currentState.stateSaverCount, oldState); } currentState.currentState = transition->target; currentState.minTransition = 0; break; } } // If no transition from the current state can be used if (!found) { // If there is a chance to do backtracking if (currentState.stateSaverCount) { currentState = PopNonSaver(stateSavers, currentState.stateSaverCount); // minTransition - 1 is always valid since the value is stored with adding 1 // So minTransition - 1 record the transition, which is the reason the parsing state is saved if (currentState.currentState->transitions[currentState.minTransition - 1]->type == Transition::Negative) { // Find the next NegativeFail transition // Because when a negative lookahead regex failed to match, it is actually succeeded // Since a negative lookahead means we don't want to match this regex for (vint i = 0; i < 
currentState.currentState->transitions.Count(); i++) { Transition* transition = currentState.currentState->transitions[i]; if (transition->type == Transition::NegativeFail) { // Restore the state to the target of NegativeFail to let the parsing continue currentState.currentState = transition->target; currentState.minTransition = 0; currentState.storeType = StateSaver::Other; break; } } } } else { break; } } } if (currentState.currentState->finalState) { // Keep available captures if succeeded result.start = input - start; result.length = (currentState.reading - start) - result.start; for (vint i = result.captures.Count() - 1; i >= currentState.captureCount; i--) { result.captures.RemoveAt(i); } return true; } else { // Clear captures if failed result.captures.Clear(); return false; } } bool RichInterpretor::Match(const wchar_t* input, const wchar_t* start, RichResult& result) { const wchar_t* read=input; while(*read) { if(MatchHead(read, start, result)) { return true; } read++; } return false; } const List& RichInterpretor::CaptureNames() { return dfa->captureNames; } } } /*********************************************************************** .\REGEXWRITER.CPP ***********************************************************************/ /*********************************************************************** Author: Zihan Chen (vczh) Licensed under https://github.com/vczh-libraries/License ***********************************************************************/ namespace vl { namespace regex { using namespace vl::regex_internal; /*********************************************************************** RegexNode ***********************************************************************/ RegexNode::RegexNode(vl::regex_internal::Expression::Ref _expression) :expression(_expression) { } RegexNode RegexNode::Some()const { return Loop(1, -1); } RegexNode RegexNode::Any()const { return Loop(0, -1); } RegexNode RegexNode::Opt()const { return Loop(0, 1); } RegexNode 
RegexNode::Loop(vint min, vint max)const { LoopExpression* target=new LoopExpression; target->min=min; target->max=max; target->preferLong=true; target->expression=expression; return RegexNode(target); } RegexNode RegexNode::AtLeast(vint min)const { return Loop(min, -1); } RegexNode RegexNode::operator+(const RegexNode& node)const { SequenceExpression* target=new SequenceExpression; target->left=expression; target->right=node.expression; return RegexNode(target); } RegexNode RegexNode::operator|(const RegexNode& node)const { AlternateExpression* target=new AlternateExpression; target->left=expression; target->right=node.expression; return RegexNode(target); } RegexNode RegexNode::operator+()const { PositiveExpression* target=new PositiveExpression; target->expression=expression; return RegexNode(target); } RegexNode RegexNode::operator-()const { NegativeExpression* target=new NegativeExpression; target->expression=expression; return RegexNode(target); } RegexNode RegexNode::operator!()const { CharSetExpression* source=dynamic_cast(expression.Obj()); CHECK_ERROR(source, L"RegexNode::operator!()#operator ! 
can only applies on charset expressions."); Ptr target=new CharSetExpression; CopyFrom(target->ranges, source->ranges); target->reverse=!source->reverse; return RegexNode(target); } RegexNode RegexNode::operator%(const RegexNode& node)const { CharSetExpression* left=dynamic_cast(expression.Obj()); CharSetExpression* right=dynamic_cast(node.expression.Obj()); CHECK_ERROR(left && right && !left->reverse && !right->reverse, L"RegexNode::operator%(const RegexNode&)#operator % only connects non-reverse charset expressions."); Ptr target=new CharSetExpression; target->reverse=false; CopyFrom(target->ranges, left->ranges); for(vint i=0;iranges.Count();i++) { if(!target->AddRangeWithConflict(right->ranges[i])) { CHECK_ERROR(false, L"RegexNode::operator%(const RegexNode&)#Failed to create charset expression from operator %."); } } return RegexNode(target); } /*********************************************************************** Regex Writer ***********************************************************************/ RegexNode rCapture(const WString& name, const RegexNode& node) { CaptureExpression* target=new CaptureExpression; target->name=name; target->expression=node.expression; return RegexNode(target); } RegexNode rUsing(const WString& name) { UsingExpression* target=new UsingExpression; target->name=name; return RegexNode(target); } RegexNode rMatch(const WString& name, vint index) { MatchExpression* target=new MatchExpression; target->name=name; target->index=index; return RegexNode(target); } RegexNode rMatch(vint index) { MatchExpression* target=new MatchExpression; target->index=index; return RegexNode(target); } RegexNode rBegin() { return RegexNode(new BeginExpression); } RegexNode rEnd() { return RegexNode(new EndExpression); } RegexNode rC(wchar_t a, wchar_t b) { if(!b)b=a; CharSetExpression* target=new CharSetExpression; target->reverse=false; target->AddRangeWithConflict(CharRange(a, b)); return RegexNode(target); } RegexNode r_d() { return rC(L'0', L'9'); } 
RegexNode r_l() { return rC(L'a', L'z')%rC(L'A', L'Z')%rC(L'_'); } RegexNode r_w() { return rC(L'0', L'9')%rC(L'a', L'z')%rC(L'A', L'Z')%rC(L'_'); } RegexNode rAnyChar() { return rC(1, 65535); } } }