/*********************************************************************** THIS FILE IS AUTOMATICALLY GENERATED. DO NOT MODIFY DEVELOPER: Zihan Chen(vczh) ***********************************************************************/ #include "VlppRegex.h" /*********************************************************************** .\REGEX.CPP ***********************************************************************/ /*********************************************************************** Author: Zihan Chen (vczh) Licensed under https://github.com/vczh-libraries/License ***********************************************************************/ namespace vl { namespace regex_internal { void ReadInt(stream::IStream& inputStream, vint& value); void ReadInts(stream::IStream& inputStream, vint count, vint* values); void WriteInt(stream::IStream& outputStream, vint value); void WriteInts(stream::IStream& outputStream, vint count, vint* values); } namespace regex { using namespace collections; using namespace regex_internal; /*********************************************************************** String Conversion ***********************************************************************/ template struct U32; template<> struct U32 { static constexpr U32String(*ToU32)(const WString&) = &wtou32; static constexpr WString(*FromU32)(const U32String&) = &u32tow; }; template<> struct U32 { static constexpr U32String(*ToU32)(const U8String&) = &u8tou32; static constexpr U8String(*FromU32)(const U32String&) = &u32tou8; }; template<> struct U32 { static constexpr U32String(*ToU32)(const U16String&) = &u16tou32; static constexpr U16String(*FromU32)(const U32String&) = &u32tou16; }; template<> struct U32 { static U32String ToU32(const U32String& text) { return text; } static U32String FromU32(const U32String& text) { return text; } }; /*********************************************************************** RegexMatch_ ***********************************************************************/ template RegexMatch_::RegexMatch_(const ObjectString& _string, PureResult* _result) :success(true) , result(_string, _result->start, _result->length) { } template RegexMatch_::RegexMatch_(const ObjectString& _string, RichResult* _result) : success(true) , result(_string, _result->start, _result->length) { // TODO: (enumerable) foreach for (vint i = 0; i < _result->captures.Count(); i++) { CaptureRecord& capture = _result->captures[i]; if (capture.capture == -1) { captures.Add(RegexString_(_string, capture.start, capture.length)); } else { groups.Add(capture.capture, RegexString_(_string, capture.start, capture.length)); } } } template RegexMatch_::RegexMatch_(const RegexString_& _result) :success(false) , result(_result) { } template bool RegexMatch_::Success()const { return success; } template const RegexString_& RegexMatch_::Result()const { return result; } template const typename RegexMatch_::CaptureList& RegexMatch_::Captures()const { return captures; } template const typename RegexMatch_::CaptureGroup& RegexMatch_::Groups()const { return groups; } /*********************************************************************** RegexBase_ ***********************************************************************/ template void RegexBase_::Process(const ObjectString& text, bool keepEmpty, bool keepSuccess, bool keepFail, typename RegexMatch_::List& matches)const { if (rich) { const T* start = text.Buffer(); const T* input = start; RichResult result; while (rich->Match(input, start, result)) { vint offset = input - start; if (keepFail) { if (result.start > offset || keepEmpty) { matches.Add(Ptr(new RegexMatch_(RegexString_(text, offset, result.start - offset)))); } } if (keepSuccess) { matches.Add(Ptr(new RegexMatch_(text, &result))); } input = start + result.start + result.length; } if (keepFail) { vint remain = input - start; vint length = text.Length() - remain; if (length || keepEmpty) { matches.Add(Ptr(new RegexMatch_(RegexString_(text, remain, length)))); } } } else { const T* start = text.Buffer(); const T* input = start; PureResult result; while (pure->Match(input, start, result)) { vint offset = input - start; if (keepFail) { if (result.start > offset || keepEmpty) { matches.Add(Ptr(new RegexMatch_(RegexString_(text, offset, result.start - offset)))); } } if (keepSuccess) { matches.Add(Ptr(new RegexMatch_(text, &result))); } input = start + result.start + result.length; } if (keepFail) { vint remain = input - start; vint length = text.Length() - remain; if (length || keepEmpty) { matches.Add(Ptr(new RegexMatch_(RegexString_(text, remain, length)))); } } } } RegexBase_::~RegexBase_() { if (pure) delete pure; if (rich) delete rich; } template typename RegexMatch_::Ref RegexBase_::MatchHead(const ObjectString& text)const { if (rich) { RichResult result; if (rich->MatchHead(text.Buffer(), text.Buffer(), result)) { return Ptr(new RegexMatch_(text, &result)); } else { return nullptr; } } else { PureResult result; if (pure->MatchHead(text.Buffer(), text.Buffer(), result)) { return Ptr(new RegexMatch_(text, &result)); } else { return nullptr; } } } template typename RegexMatch_::Ref RegexBase_::Match(const ObjectString& text)const { if (rich) { RichResult result; if (rich->Match(text.Buffer(), text.Buffer(), result)) { return Ptr(new RegexMatch_(text, &result)); } else { return nullptr; } } else { PureResult result; if (pure->Match(text.Buffer(), text.Buffer(), result)) { return Ptr(new RegexMatch_(text, &result)); } else { return nullptr; } } } template bool RegexBase_::TestHead(const ObjectString& text)const { if (pure) { PureResult result; return pure->MatchHead(text.Buffer(), text.Buffer(), result); } else { RichResult result; return rich->MatchHead(text.Buffer(), text.Buffer(), result); } } template bool RegexBase_::Test(const ObjectString& text)const { if (pure) { PureResult result; return pure->Match(text.Buffer(), text.Buffer(), result); } else { RichResult result; return rich->Match(text.Buffer(), text.Buffer(), result); } } template void RegexBase_::Search(const ObjectString& text, typename RegexMatch_::List& matches)const { Process(text, false, true, false, matches); } template void RegexBase_::Split(const ObjectString& text, bool keepEmptyMatch, typename RegexMatch_::List& matches)const { Process(text, keepEmptyMatch, false, true, matches); } template void RegexBase_::Cut(const ObjectString& text, bool keepEmptyMatch, typename RegexMatch_::List& matches)const { Process(text, keepEmptyMatch, true, true, matches); } /*********************************************************************** Regex_ ***********************************************************************/ template Regex_::Regex_(const ObjectString& code, bool preferPure) { CharRange::List subsets; auto regex = ParseRegexExpression(U32::ToU32(code)); auto expression = regex->Merge(); expression->NormalizeCharSet(subsets); bool pureRequired = false; bool richRequired = false; if (preferPure) { if (expression->HasNoExtension()) { pureRequired = true; } else { if (expression->CanTreatAsPure()) { pureRequired = true; richRequired = true; } else { richRequired = true; } } } else { richRequired = true; } try { if (pureRequired) { Dictionary nfaStateMap; Group dfaStateMap; Ptr eNfa = expression->GenerateEpsilonNfa(); Ptr nfa = EpsilonNfaToNfa(eNfa, PureEpsilonChecker, nfaStateMap); Ptr dfa = NfaToDfa(nfa, dfaStateMap); pure = new PureInterpretor(dfa, subsets); } if (richRequired) { Dictionary nfaStateMap; Group dfaStateMap; Ptr eNfa = expression->GenerateEpsilonNfa(); Ptr nfa = EpsilonNfaToNfa(eNfa, RichEpsilonChecker, nfaStateMap); Ptr dfa = NfaToDfa(nfa, dfaStateMap); rich = new RichInterpretor(dfa); for (auto&& name : rich->CaptureNames()) { captureNames.Add(U32::FromU32(name)); } } } catch (...) { if (pure)delete pure; if (rich)delete rich; throw; } } /*********************************************************************** RegexTokens_ ***********************************************************************/ template class RegexTokenEnumerator : public Object, public IEnumerator> { protected: RegexToken_ token; vint index = -1; PureInterpretor* pure; const Array& stateTokens; const T* start; vint codeIndex; RegexProc_ proc; const T* reading; vint rowStart = 0; vint columnStart = 0; bool cacheAvailable = false; RegexToken_ cacheToken; public: RegexTokenEnumerator(const RegexTokenEnumerator& enumerator) : token(enumerator.token) , index(enumerator.index) , pure(enumerator.pure) , stateTokens(enumerator.stateTokens) , start(enumerator.start) , codeIndex(enumerator.codeIndex) , proc(enumerator.proc) , reading(enumerator.reading) , rowStart(enumerator.rowStart) , columnStart(enumerator.columnStart) , cacheAvailable(enumerator.cacheAvailable) , cacheToken(enumerator.cacheToken) { } RegexTokenEnumerator(PureInterpretor* _pure, const Array& _stateTokens, const T* _start, vint _codeIndex, RegexProc_ _proc) :index(-1) , pure(_pure) , stateTokens(_stateTokens) , start(_start) , codeIndex(_codeIndex) , proc(_proc) , reading(_start) { } IEnumerator>* Clone()const { return new RegexTokenEnumerator(*this); } const RegexToken_& Current()const { return token; } vint Index()const { return index; } bool Next() { if (!cacheAvailable && !*reading) return false; if (cacheAvailable) { token = cacheToken; cacheAvailable = false; } else { token.reading = reading; token.start = 0; token.length = 0; token.token = -2; token.completeToken = true; } token.rowStart = rowStart; token.columnStart = columnStart; token.rowEnd = rowStart; token.columnEnd = columnStart; token.codeIndex = codeIndex; PureResult result; while (*reading) { vint id = -1; bool completeToken = true; if (!pure->MatchHead(reading, start, result)) { result.start = reading - start; if (id == -1 && result.terminateState != -1) { vint state = pure->GetRelatedFinalState(result.terminateState); if (state != -1) { id = stateTokens[state]; } } if (id == -1) { result.length = 1; } else { completeToken = false; } } else { id = stateTokens.Get(result.finalState); } if (id != -1 && proc.extendProc) { RegexProcessingToken token(result.start, result.length, id, completeToken, nullptr); proc.extendProc(proc.argument, reading, -1, true, token); #if _DEBUG CHECK_ERROR(token.interTokenState == nullptr, L"RegexTokenEnumerator::Next()#The extendProc is only allowed to create interTokenState in RegexLexerColorizer."); #endif result.length = token.length; id = token.token; completeToken = token.completeToken; } if (token.token == -2) { token.start = result.start; token.length = result.length; token.token = id; token.completeToken = completeToken; } else if (token.token == id && id == -1) { token.length += result.length; } else { cacheAvailable = true; cacheToken.reading = reading; cacheToken.start = result.start; cacheToken.length = result.length; cacheToken.codeIndex = codeIndex; cacheToken.token = id; cacheToken.completeToken = completeToken; } reading += result.length; if (cacheAvailable) { break; } } index++; for (vint i = 0; i < token.length; i++) { token.rowEnd = rowStart; token.columnEnd = columnStart; if (token.reading[i] == L'\n') { rowStart++; columnStart = 0; } else { columnStart++; } } return true; } void Reset() { index = -1; reading = start; cacheAvailable = false; } void ReadToEnd(List>& tokens, bool(*discard)(vint)) { while (Next()) { if (!discard(token.token)) { tokens.Add(token); } } } }; template RegexTokens_::RegexTokens_(PureInterpretor* _pure, const Array& _stateTokens, const ObjectString& _code, vint _codeIndex, RegexProc_ _proc) :pure(_pure) , stateTokens(_stateTokens) , code(_code) , codeIndex(_codeIndex) , proc(_proc) { } template RegexTokens_::RegexTokens_(const RegexTokens_& tokens) :pure(tokens.pure) , stateTokens(tokens.stateTokens) , code(tokens.code) , codeIndex(tokens.codeIndex) , proc(tokens.proc) { } template IEnumerator>* RegexTokens_::CreateEnumerator() const { return new RegexTokenEnumerator(pure, stateTokens, code.Buffer(), codeIndex, proc); } bool DefaultDiscard(vint token) { return false; } template void RegexTokens_::ReadToEnd(collections::List>& tokens, bool(*discard)(vint))const { if (discard == 0) { discard = &DefaultDiscard; } RegexTokenEnumerator(pure, stateTokens, code.Buffer(), codeIndex, proc).ReadToEnd(tokens, discard); } /*********************************************************************** RegexLexerWalker_ ***********************************************************************/ template RegexLexerWalker_::RegexLexerWalker_(PureInterpretor* _pure, const Array& _stateTokens) :pure(_pure) , stateTokens(_stateTokens) { } template RegexLexerWalker_::RegexLexerWalker_(const RegexLexerWalker_& tokens) : pure(tokens.pure) , stateTokens(tokens.stateTokens) { } template vint RegexLexerWalker_::GetStartState()const { return pure->GetStartState(); } template vint RegexLexerWalker_::GetRelatedToken(vint state)const { vint finalState = state == -1 ? -1 : pure->GetRelatedFinalState(state); return finalState == -1 ? -1 : stateTokens.Get(finalState); } template void RegexLexerWalker_::Walk(T input, vint& state, vint& token, bool& finalState, bool& previousTokenStop)const { vint previousState = state; token = -1; finalState = false; previousTokenStop = false; if (state == -1) { state = pure->GetStartState(); previousTokenStop = true; } state = pure->Transit(input, state); if (state == -1) { previousTokenStop = true; if (previousState == -1) { finalState = true; return; } else if (pure->IsFinalState(previousState)) { state = pure->Transit(input, pure->GetStartState()); } } if (pure->IsFinalState(state)) { token = stateTokens.Get(state); finalState = true; return; } else { finalState = state == -1; return; } } template vint RegexLexerWalker_::Walk(T input, vint state)const { vint token = -1; bool finalState = false; bool previousTokenStop = false; Walk(input, state, token, finalState, previousTokenStop); return state; } template bool RegexLexerWalker_::IsClosedToken(const T* input, vint length)const { vint state = pure->GetStartState(); for (vint i = 0; i < length; i++) { state = pure->Transit(input[i], state); if (state == -1) return true; if (pure->IsDeadState(state)) return true; } return false; } template bool RegexLexerWalker_::IsClosedToken(const ObjectString& input)const { return IsClosedToken(input.Buffer(), input.Length()); } /*********************************************************************** RegexLexerColorizer_ ***********************************************************************/ template RegexLexerColorizer_::RegexLexerColorizer_(const RegexLexerWalker_& _walker, RegexProc_ _proc) :walker(_walker) , proc(_proc) { internalState.currentState = walker.GetStartState(); } template typename RegexLexerColorizer_::InternalState RegexLexerColorizer_::GetInternalState() { return internalState; } template void RegexLexerColorizer_::SetInternalState(InternalState state) { internalState = state; } template void RegexLexerColorizer_::Pass(T input) { WalkOneToken(&input, 1, 0, false); } template vint RegexLexerColorizer_::GetStartState()const { return walker.GetStartState(); } template void RegexLexerColorizer_::CallExtendProcAndColorizeProc(const T* input, vint length, RegexProcessingToken& token, bool colorize) { vint oldTokenLength = token.length; proc.extendProc(proc.argument, input + token.start, length - token.start, false, token); #if _DEBUG { bool pausedAtTheEnd = token.start + token.length == length && !token.completeToken; CHECK_ERROR( token.completeToken || pausedAtTheEnd, L"RegexLexerColorizer::WalkOneToken(const char32_t*, vint, vint, bool)#The extendProc is not allowed pause before the end of the input." ); CHECK_ERROR( token.completeToken || token.token != -1, L"RegexLexerColorizer::WalkOneToken(const char32_t*, vint, vint, bool)#The extendProc is not allowed to pause without a valid token id." ); CHECK_ERROR( oldTokenLength <= token.length, L"RegexLexerColorizer::WalkOneToken(const char32_t*, vint, vint, bool)#The extendProc is not allowed to decrease the token length." ); CHECK_ERROR( (token.interTokenState == nullptr) == !pausedAtTheEnd, L"RegexLexerColorizer::Colorize(const char32_t*, vint, void*)#The extendProc should return an inter token state object if and only if a valid token does not end at the end of the input." ); } #endif if ((internalState.interTokenState = token.interTokenState)) { internalState.interTokenId = token.token; } if (colorize) { proc.colorizeProc(proc.argument, token.start, token.length, token.token); } } template vint RegexLexerColorizer_::WalkOneToken(const T* input, vint length, vint start, bool colorize) { if (internalState.interTokenState) { RegexProcessingToken token(-1, -1, internalState.interTokenId, false, internalState.interTokenState); proc.extendProc(proc.argument, input, length, false, token); #if _DEBUG { bool pausedAtTheEnd = token.length == length && !token.completeToken; CHECK_ERROR( token.completeToken || pausedAtTheEnd, L"RegexLexerColorizer::WalkOneToken(const char32_t*, vint, vint, bool)#The extendProc is not allowed to pause before the end of the input." ); CHECK_ERROR( token.completeToken || token.token == internalState.interTokenId, L"RegexLexerColorizer::WalkOneToken(const char32_t*, vint, vint, bool)#The extendProc is not allowed to continue pausing with a different token id." ); CHECK_ERROR( (token.interTokenState == nullptr) == !pausedAtTheEnd, L"RegexLexerColorizer::Colorize(const char32_t*, vint, void*)#The extendProc should return an inter token state object if and only if a valid token does not end at the end of the input." ); } #endif if (colorize) { proc.colorizeProc(proc.argument, 0, token.length, token.token); } if (!(internalState.interTokenState = token.interTokenState)) { internalState.interTokenId = -1; } return token.length; } vint lastFinalStateLength = 0; vint lastFinalStateToken = -1; vint lastFinalStateState = -1; vint tokenStartState = internalState.currentState; for (vint i = start; i < length; i++) { vint currentToken = -1; bool finalState = false; bool previousTokenStop = false; walker.Walk(input[i], internalState.currentState, currentToken, finalState, previousTokenStop); if (previousTokenStop) { if (proc.extendProc && lastFinalStateToken != -1) { RegexProcessingToken token(start, lastFinalStateLength, lastFinalStateToken, true, nullptr); CallExtendProcAndColorizeProc(input, length, token, colorize); if (token.completeToken) { internalState.currentState = walker.GetStartState(); } return start + token.length; } else if (i == start) { if (tokenStartState == GetStartState()) { if (colorize) { proc.colorizeProc(proc.argument, start, 1, -1); } internalState.currentState = walker.GetStartState(); return i + 1; } } else { if (colorize) { proc.colorizeProc(proc.argument, start, lastFinalStateLength, lastFinalStateToken); } internalState.currentState = lastFinalStateState; return start + lastFinalStateLength; } } if (finalState) { lastFinalStateLength = i + 1 - start; lastFinalStateToken = currentToken; lastFinalStateState = internalState.currentState; } } if (lastFinalStateToken != -1 && start + lastFinalStateLength == length) { if (proc.extendProc) { RegexProcessingToken token(start, lastFinalStateLength, lastFinalStateToken, true, nullptr); CallExtendProcAndColorizeProc(input, length, token, colorize); } else if (colorize) { proc.colorizeProc(proc.argument, start, lastFinalStateLength, lastFinalStateToken); } } else if (colorize) { proc.colorizeProc(proc.argument, start, length - start, walker.GetRelatedToken(internalState.currentState)); } return length; } template void* RegexLexerColorizer_::Colorize(const T* input, vint length) { vint index = 0; while (index != length) { index = WalkOneToken(input, length, index, true); } return internalState.interTokenState; } /*********************************************************************** RegexLexerBase_ ***********************************************************************/ RegexLexerBase_::~RegexLexerBase_() { if (pure) delete pure; } template RegexTokens_ RegexLexerBase_::Parse(const ObjectString& code, RegexProc_ proc, vint codeIndex)const { code.Buffer(); pure->PrepareForRelatedFinalStateTable(); return RegexTokens_(pure, stateTokens, code, codeIndex, proc); } template RegexLexerWalker_ RegexLexerBase_::Walk()const { pure->PrepareForRelatedFinalStateTable(); return RegexLexerWalker_(pure, stateTokens); } RegexLexerWalker_ RegexLexerBase_::Walk()const { pure->PrepareForRelatedFinalStateTable(); return RegexLexerWalker_(pure, stateTokens); } template RegexLexerColorizer_ RegexLexerBase_::Colorize(RegexProc_ proc)const { return RegexLexerColorizer_(Walk(), proc); } /*********************************************************************** RegexLexer_ (Serialization) ***********************************************************************/ template RegexLexer_::RegexLexer_(stream::IStream& inputStream) { pure = new PureInterpretor(inputStream); vint count = 0; ReadInt(inputStream, count); stateTokens.Resize(count); if (count > 0) { ReadInts(inputStream, count, &stateTokens[0]); } } template void RegexLexer_::Serialize(stream::IStream& outputStream) { pure->Serialize(outputStream); WriteInt(outputStream, stateTokens.Count()); if (stateTokens.Count() > 0) { WriteInts(outputStream, stateTokens.Count(), &stateTokens[0]); } } /*********************************************************************** RegexLexer_ ***********************************************************************/ template RegexLexer_::RegexLexer_(const collections::IEnumerable>& tokens) { // Build DFA for all tokens List> expressions; List> dfas; CharRange::List subsets; for (auto&& code : tokens) { auto regex = ParseRegexExpression(U32::ToU32(code)); auto expression = regex->Merge(); expression->CollectCharSet(subsets); expressions.Add(expression); } // TODO: (enumerable) foreach for (vint i = 0; i < expressions.Count(); i++) { Dictionary nfaStateMap; Group dfaStateMap; expressions[i]->ApplyCharSet(subsets); auto eNfa = expressions[i]->GenerateEpsilonNfa(); auto nfa = EpsilonNfaToNfa(eNfa, PureEpsilonChecker, nfaStateMap); auto dfa = NfaToDfa(nfa, dfaStateMap); dfas.Add(dfa); } // Mark all states in DFAs // TODO: (enumerable) foreach for (vint i = 0; i < dfas.Count(); i++) { Ptr dfa = dfas[i]; // TODO: (enumerable) foreach for (vint j = 0; j < dfa->states.Count(); j++) { if (dfa->states[j]->finalState) { dfa->states[j]->userData = (void*)i; } else { dfa->states[j]->userData = (void*)dfas.Count(); } } } // Connect all DFAs to an e-NFA auto bigEnfa = Ptr(new Automaton); // TODO: (enumerable) foreach for (vint i = 0; i < dfas.Count(); i++) { CopyFrom(bigEnfa->states, dfas[i]->states, true); CopyFrom(bigEnfa->transitions, dfas[i]->transitions, true); } bigEnfa->startState = bigEnfa->NewState(); // TODO: (enumerable) foreach for (vint i = 0; i < dfas.Count(); i++) { bigEnfa->NewEpsilon(bigEnfa->startState, dfas[i]->startState); } // Build a single DFA out of the e-NFA Dictionary nfaStateMap; Group dfaStateMap; auto bigNfa = EpsilonNfaToNfa(bigEnfa, PureEpsilonChecker, nfaStateMap); // TODO: (enumerable) foreach on dictionary for (vint i = 0; i < nfaStateMap.Keys().Count(); i++) { void* userData = nfaStateMap.Values().Get(i)->userData; nfaStateMap.Keys()[i]->userData = userData; } auto bigDfa = NfaToDfa(bigNfa, dfaStateMap); // TODO: (enumerable) foreach on group for (vint i = 0; i < dfaStateMap.Keys().Count(); i++) { void* userData = dfaStateMap.GetByIndex(i).Get(0)->userData; for (vint j = 1; j < dfaStateMap.GetByIndex(i).Count(); j++) { void* newData = dfaStateMap.GetByIndex(i).Get(j)->userData; if (userData > newData) { userData = newData; } } dfaStateMap.Keys()[i]->userData = userData; } // Build state machine pure = new PureInterpretor(bigDfa, subsets); stateTokens.Resize(bigDfa->states.Count()); for (vint i = 0; i < stateTokens.Count(); i++) { void* userData = bigDfa->states[i]->userData; stateTokens[i] = (vint)userData; } } /*********************************************************************** Template Instantiation ***********************************************************************/ template class RegexString_; template class RegexString_; template class RegexString_; template class RegexString_; template class RegexMatch_; template class RegexMatch_; template class RegexMatch_; template class RegexMatch_; template RegexMatch_::Ref RegexBase_::MatchHead (const ObjectString& text)const; template RegexMatch_::Ref RegexBase_::Match (const ObjectString& text)const; template bool RegexBase_::TestHead (const ObjectString& text)const; template bool RegexBase_::Test (const ObjectString& text)const; template void RegexBase_::Search (const ObjectString& text, RegexMatch_::List& matches)const; template void RegexBase_::Split (const ObjectString& text, bool keepEmptyMatch, RegexMatch_::List& matches)const; template void RegexBase_::Cut (const ObjectString& text, bool keepEmptyMatch, RegexMatch_::List& matches)const; template RegexMatch_::Ref RegexBase_::MatchHead (const ObjectString& text)const; template RegexMatch_::Ref RegexBase_::Match (const ObjectString& text)const; template bool RegexBase_::TestHead (const ObjectString& text)const; template bool RegexBase_::Test (const ObjectString& text)const; template void RegexBase_::Search (const ObjectString& text, RegexMatch_::List& matches)const; template void RegexBase_::Split (const ObjectString& text, bool keepEmptyMatch, RegexMatch_::List& matches)const; template void RegexBase_::Cut (const ObjectString& text, bool keepEmptyMatch, RegexMatch_::List& matches)const; template RegexMatch_::Ref RegexBase_::MatchHead (const ObjectString& text)const; template RegexMatch_::Ref RegexBase_::Match (const ObjectString& text)const; template bool RegexBase_::TestHead (const ObjectString& text)const; template bool RegexBase_::Test (const ObjectString& text)const; template void RegexBase_::Search (const ObjectString& text, RegexMatch_::List& matches)const; template void RegexBase_::Split (const ObjectString& text, bool keepEmptyMatch, RegexMatch_::List& matches)const; template void RegexBase_::Cut (const ObjectString& text, bool keepEmptyMatch, RegexMatch_::List& matches)const; template RegexMatch_::Ref RegexBase_::MatchHead (const ObjectString& text)const; template RegexMatch_::Ref RegexBase_::Match (const ObjectString& text)const; template bool RegexBase_::TestHead (const ObjectString& text)const; template bool RegexBase_::Test (const ObjectString& text)const; template void RegexBase_::Search (const ObjectString& text, RegexMatch_::List& matches)const; template void RegexBase_::Split (const ObjectString& text, bool keepEmptyMatch, RegexMatch_::List& matches)const; template void RegexBase_::Cut (const ObjectString& text, bool keepEmptyMatch, RegexMatch_::List& matches)const; template class Regex_; template class Regex_; template class Regex_; template class Regex_; template class RegexTokens_; template class RegexTokens_; template class RegexTokens_; template class RegexTokens_; template class RegexLexerWalker_; template class RegexLexerWalker_; template class RegexLexerWalker_; template class RegexLexerWalker_; template class RegexLexerColorizer_; template class RegexLexerColorizer_; template class RegexLexerColorizer_; template class RegexLexerColorizer_; template RegexTokens_ RegexLexerBase_::Parse (const ObjectString& code, RegexProc_ _proc, vint codeIndex)const; template RegexLexerWalker_ RegexLexerBase_::Walk ()const; template RegexLexerColorizer_ RegexLexerBase_::Colorize (RegexProc_ _proc)const; template RegexTokens_ RegexLexerBase_::Parse (const ObjectString& code, RegexProc_ _proc, vint codeIndex)const; template RegexLexerWalker_ RegexLexerBase_::Walk ()const; template RegexLexerColorizer_ RegexLexerBase_::Colorize (RegexProc_ _proc)const; template RegexTokens_ RegexLexerBase_::Parse (const ObjectString& code, RegexProc_ _proc, vint codeIndex)const; template RegexLexerWalker_ RegexLexerBase_::Walk ()const; template RegexLexerColorizer_ RegexLexerBase_::Colorize (RegexProc_ _proc)const; template RegexTokens_ RegexLexerBase_::Parse (const ObjectString& code, RegexProc_ _proc, vint codeIndex)const; template RegexLexerWalker_ RegexLexerBase_::Walk ()const; template RegexLexerColorizer_ RegexLexerBase_::Colorize (RegexProc_ _proc)const; template class RegexLexer_; template class RegexLexer_; template class RegexLexer_; template class RegexLexer_; } } /*********************************************************************** .\REGEXPURE.CPP ***********************************************************************/ /*********************************************************************** Author: Zihan Chen (vczh) Licensed under https://github.com/vczh-libraries/License ***********************************************************************/ namespace vl { namespace regex_internal { using namespace collections; /*********************************************************************** Read ***********************************************************************/ void ReadInt(stream::IStream& inputStream, vint& value) { #ifdef VCZH_64 vint32_t x = 0; CHECK_ERROR( inputStream.Read(&x, sizeof(vint32_t)) == sizeof(vint32_t), L"Failed to deserialize RegexLexer." ); value = (vint)x; #else CHECK_ERROR( inputStream.Read(&value, sizeof(vint32_t)) == sizeof(vint32_t), L"Failed to deserialize RegexLexer." ); #endif } void ReadInts(stream::IStream& inputStream, vint count, vint* values) { #ifdef VCZH_64 Array xs(count); CHECK_ERROR( inputStream.Read(&xs[0], sizeof(vint32_t) * count) == sizeof(vint32_t) * count, L"Failed to deserialize RegexLexer." ); for (vint i = 0; i < count; i++) { values[i] = (vint)xs[i]; } #else CHECK_ERROR( inputStream.Read(values, sizeof(vint32_t) * count) == sizeof(vint32_t) * count, L"Failed to deserialize RegexLexer." ); #endif } void ReadBools(stream::IStream& inputStream, vint count, bool* values) { Array bits((count + 7) / 8); CHECK_ERROR( inputStream.Read(&bits[0], sizeof(vuint8_t) * bits.Count()) == sizeof(vuint8_t) * bits.Count(), L"Failed to deserialize RegexLexer." ); for (vint i = 0; i < count; i++) { vint x = i / 8; vint y = i % 8; values[i] = ((bits[x] >> y) & 1) == 1; } } /*********************************************************************** Write ***********************************************************************/ void WriteInt(stream::IStream& outputStream, vint value) { #ifdef VCZH_64 vint32_t x = (vint32_t)value; CHECK_ERROR( outputStream.Write(&x, sizeof(vint32_t)) == sizeof(vint32_t), L"Failed to serialize RegexLexer." ); #else CHECK_ERROR( outputStream.Write(&value, sizeof(vint32_t)) == sizeof(vint32_t), L"Failed to serialize RegexLexer." ); #endif } void WriteInts(stream::IStream& outputStream, vint count, vint* values) { #ifdef VCZH_64 Array xs(count); for (vint i = 0; i < count; i++) { xs[i] = (vint32_t)values[i]; } CHECK_ERROR( outputStream.Write(&xs[0], sizeof(vint32_t) * count) == sizeof(vint32_t) * count, L"Failed to serialize RegexLexer." ); #else CHECK_ERROR( outputStream.Write(values, sizeof(vint32_t) * count) == sizeof(vint32_t) * count, L"Failed to serialize RegexLexer." ); #endif } void WriteBools(stream::IStream& outputStream, vint count, bool* values) { Array bits((count + 7) / 8); memset(&bits[0], 0, sizeof(vuint8_t) * bits.Count()); for (vint i = 0; i < count; i++) { if (values[i]) { vint x = i / 8; vint y = i % 8; bits[x] |= (vuint8_t)1 << y; } } CHECK_ERROR( outputStream.Write(&bits[0], sizeof(vuint8_t) * bits.Count()) == sizeof(vuint8_t) * bits.Count(), L"Failed to serialize RegexLexer." ); } /*********************************************************************** PureInterpretor (Serialization) ***********************************************************************/ PureInterpretor::PureInterpretor(stream::IStream& inputStream) { ReadInt(inputStream, stateCount); ReadInt(inputStream, charSetCount); ReadInt(inputStream, startState); { vint count = 0; ReadInt(inputStream, count); charRanges.Resize(count); if (count > 0) { vint size = charRanges.Count() * sizeof(CharRange); CHECK_ERROR(inputStream.Read(&charRanges[0], size) == size, L"Failed to serialize RegexLexer."); } ExpandCharRanges(); } transitions = new vint[stateCount * charSetCount]; ReadInts(inputStream, stateCount * charSetCount, transitions); finalState = new bool[stateCount]; ReadBools(inputStream, stateCount, finalState); } void PureInterpretor::Serialize(stream::IStream& outputStream) { WriteInt(outputStream, stateCount); WriteInt(outputStream, charSetCount); WriteInt(outputStream, startState); { WriteInt(outputStream, charRanges.Count()); if (charRanges.Count() > 0) { vint size = charRanges.Count() * sizeof(CharRange); CHECK_ERROR(outputStream.Write(&charRanges[0], size) == size, L"Failed to serialize RegexLexer."); } } WriteInts(outputStream, stateCount * charSetCount, transitions); WriteBools(outputStream, stateCount, finalState); } /*********************************************************************** PureInterpretor ***********************************************************************/ void PureInterpretor::ExpandCharRanges() { for (vint i = 0; i < SupportedCharCount; i++) { charMap[i] = charSetCount - 1; } // TODO: (enumerable) foreach for (vint i = 0; i < charRanges.Count(); i++) { CharRange range = charRanges[i]; for (char32_t j = range.begin; j <= range.end; j++) { if (j > MaxChar32) break; charMap[j] = i; } } } PureInterpretor::PureInterpretor(Ptr dfa, CharRange::List& subsets) { stateCount = dfa->states.Count(); charSetCount = subsets.Count() + 1; startState = dfa->states.IndexOf(dfa->startState); // Map char to input index (equivalent char class) CopyFrom(charRanges, subsets); ExpandCharRanges(); // Create transitions from DFA, using input index to represent input char transitions = new vint[stateCount * charSetCount]; for (vint i = 0; i < stateCount; i++) { for (vint j = 0; j < charSetCount; j++) { transitions[i * charSetCount + j] = -1; } State* state = dfa->states[i].Obj(); // TODO: (enumerable) foreach for (vint j = 0; j < state->transitions.Count(); j++) { Transition* dfaTransition = state->transitions[j]; switch (dfaTransition->type) { case Transition::Chars: { vint index = subsets.IndexOf(dfaTransition->range); if (index == -1) { CHECK_ERROR(false, L"PureInterpretor::PureInterpretor(Ptr, CharRange::List&)#Specified chars don't appear in the normalized char ranges."); } transitions[i * charSetCount + index] = dfa->states.IndexOf(dfaTransition->target); } break; default: CHECK_ERROR(false, L"PureInterpretor::PureInterpretor(Ptr, CharRange::List&)#PureInterpretor only accepts Transition::Chars transitions."); } } } // Mark final states finalState = new bool[stateCount]; for (vint i = 0; i < stateCount; i++) { finalState[i] = dfa->states[i]->finalState; } } PureInterpretor::~PureInterpretor() { if (relatedFinalState) delete[] relatedFinalState; delete[] finalState; delete[] transitions; } template bool PureInterpretor::MatchHead(const TChar* input, const TChar* start, PureResult& result) { CharReader reader(input); vint currentState = startState; vint terminateState = -1; vint terminateLength = -1; result.start = input - start; result.length = -1; result.finalState = -1; result.terminateState = -1; while (currentState != -1) { auto c = reader.Read(); terminateState = currentState; terminateLength = reader.Index(); if (finalState[currentState]) { result.length = terminateLength; result.finalState = currentState; } if (!c) break; if (c >= SupportedCharCount) break; vint charIndex = charMap[c]; currentState = transitions[currentState * charSetCount + charIndex]; } if (result.finalState == -1) { if (terminateLength > 0) { result.terminateState = terminateState; } result.length = terminateLength; return false; } else { return true; } } template bool PureInterpretor::Match(const TChar* input, const TChar* start, PureResult& result) { CharReader reader(input); while (reader.Read()) { if (MatchHead(reader.Reading(), start, result)) { return true; } } return false; } vint PureInterpretor::GetStartState() { return startState; } vint PureInterpretor::Transit(char32_t input, vint state) { if (0 <= state && state < stateCount && 0 <= input && input <= MaxChar32) { vint charIndex = charMap[input]; vint nextState = transitions[state * charSetCount + charIndex]; return nextState; } else { return -1; } } bool PureInterpretor::IsFinalState(vint state) { return 0 <= state && state < stateCount&& finalState[state]; } bool PureInterpretor::IsDeadState(vint state) { if (state == -1) return true; for (vint i = 0; i < charSetCount; i++) { if (transitions[state * charSetCount + i] != -1) { return false; } } return true; } void PureInterpretor::PrepareForRelatedFinalStateTable() { if (!relatedFinalState) { relatedFinalState = new vint[stateCount]; for (vint i = 0; i < stateCount; i++) { relatedFinalState[i] = finalState[i] ? i : -1; } while (true) { vint modifyCount = 0; for (vint i = 0; i < stateCount; i++) { if (relatedFinalState[i] == -1) { vint state = -1; for (vint j = 0; j < charSetCount; j++) { vint nextState = transitions[i * charSetCount + j]; if (nextState != -1) { state = relatedFinalState[nextState]; if (state != -1) { break; } } } if (state != -1) { relatedFinalState[i] = state; modifyCount++; } } } if (modifyCount == 0) { break; } } } } vint PureInterpretor::GetRelatedFinalState(vint state) { return relatedFinalState ? relatedFinalState[state] : -1; } template bool PureInterpretor::MatchHead(const wchar_t* input, const wchar_t* start, PureResult& result); template bool PureInterpretor::MatchHead(const char8_t* input, const char8_t* start, PureResult& result); template bool PureInterpretor::MatchHead(const char16_t* input, const char16_t* start, PureResult& result); template bool PureInterpretor::MatchHead(const char32_t* input, const char32_t* start, PureResult& result); template bool PureInterpretor::Match(const wchar_t* input, const wchar_t* start, PureResult& result); template bool PureInterpretor::Match(const char8_t* input, const char8_t* start, PureResult& result); template bool PureInterpretor::Match(const char16_t* input, const char16_t* start, PureResult& result); template bool PureInterpretor::Match(const char32_t* input, const char32_t* start, PureResult& result); } } /*********************************************************************** .\REGEXRICH.CPP ***********************************************************************/ /*********************************************************************** Author: Zihan Chen (vczh) Licensed under https://github.com/vczh-libraries/License ***********************************************************************/ namespace vl { namespace regex_internal { /*********************************************************************** Data Structures for Backtracking ***********************************************************************/ enum class StateStoreType { Positive, Negative, Other }; template class StateSaver { public: CharReader reader; // Current reading position char32_t ch; // Current character State* currentState; // Current state vint minTransition = 0; // The first transition to backtrack vint captureCount = 0; // Available capture count (the list size may larger than this) vint stateSaverCount = 0; // Available saver count (the list size may larger than this) vint extensionSaverAvailable = -1; // Available extension saver count (the list size may larger than this) vint extensionSaverCount = 0; // Available extension saver count (during executing) StateStoreType storeType = StateStoreType::Other; // Reason to keep this record StateSaver(const TChar* input, State* _currentState) : reader(input) , currentState(_currentState) { ch = reader.Read(); } StateSaver(const StateSaver&) = default; StateSaver& operator=(const StateSaver&) = default; void RestoreReaderTo(StateSaver& saver) { saver.reader = reader; saver.ch = ch; } }; template class ExtensionSaver { public: CharReader reader; // The reading position char32_t ch; // Current character vint previous; // Previous extension saver index vint captureListIndex; // Where to write the captured text Transition* transition; // The extension begin transition (Capture, Positive, Negative) ExtensionSaver(const StateSaver& saver) : reader(saver.reader) , ch(saver.ch) { } ExtensionSaver(const ExtensionSaver&) = default; ExtensionSaver& operator=(const ExtensionSaver&) = default; void RestoreReaderTo(StateSaver& saver) { saver.reader = reader; saver.ch = ch; } }; } namespace regex_internal { using namespace collections; template void Push(List>& elements, vint& available, vint& count, const ExtensionSaver& element) { if (elements.Count() == count) { elements.Add(element); } else { elements[count] = element; } auto& current = elements[count]; current.previous = available; available = count++; } template ExtensionSaver Pop(List>& elements, vint& available, vint& count) { auto& current = elements[available]; available = current.previous; return current; } template void PushNonSaver(List& elements, vint& count, const T& element) { if (elements.Count() == count) { elements.Add(element); } else { elements[count] = element; } count++; } template T PopNonSaver(List& elements, vint& count) { return elements[--count]; } } namespace regex_internal { /*********************************************************************** RichInterpretor ***********************************************************************/ RichInterpretor::RichInterpretor(Ptr _dfa) :dfa(_dfa) { datas = new UserData[dfa->states.Count()]; // TODO: (enumerable) foreach for (vint i = 0; i < dfa->states.Count(); i++) { State* state = dfa->states[i].Obj(); vint charEdges = 0; vint nonCharEdges = 0; bool mustSave = false; // TODO: (enumerable) foreach for (vint j = 0; j < state->transitions.Count(); j++) { if (state->transitions[j]->type == Transition::Chars) { charEdges++; } else { if (state->transitions[j]->type == Transition::Negative || state->transitions[j]->type == Transition::Positive) { mustSave = true; } nonCharEdges++; } } datas[i].NeedKeepState = mustSave || nonCharEdges > 1 || (nonCharEdges != 0 && charEdges != 0); state->userData = &datas[i]; } } RichInterpretor::~RichInterpretor() { delete[] datas; } template bool RichInterpretor::MatchHead(const TChar* input, const TChar* start, RichResult& result) { List> stateSavers; List> extensionSavers; StateSaver currentState(input, dfa->startState); while (!currentState.currentState->finalState) { bool found = false; // true means at least one transition matches the input StateSaver oldState = currentState; // Iterate through all transitions from the current state // TODO: (enumerable) foreach:reversed for (vint i = currentState.minTransition; i < currentState.currentState->transitions.Count(); i++) { Transition* transition = currentState.currentState->transitions[i]; switch (transition->type) { case Transition::Chars: { // match the input if the current character fall into the range CharRange range = transition->range; found = range.begin <= currentState.ch && range.end >= currentState.ch; if (found) { currentState.ch = currentState.reader.Read(); } } break; case Transition::BeginString: { // match the input if this is the first character, and it is not consumed found = currentState.reader.Index() == 0 && input == start; } break; case Transition::EndString: { // match the input if this is after the last character, and it is not consumed found = currentState.ch == 0; } break; case Transition::Nop: { // match without any condition found = true; } break; case Transition::Capture: { // Push the capture information ExtensionSaver saver(currentState); saver.captureListIndex = currentState.captureCount; saver.transition = transition; Push(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount, saver); // Push the capture record, and it will be written if the input matches the regex CaptureRecord capture; capture.capture = transition->capture; capture.start = currentState.reader.Index() + (input - start); capture.length = -1; PushNonSaver(result.captures, currentState.captureCount, capture); found = true; } break; case Transition::Match: { vint index = 0; for (vint j = 0; j < currentState.captureCount; j++) { CaptureRecord& capture = result.captures[j]; // If the capture name matched if (capture.capture == transition->capture) { // If the capture index matched, or it is -1 if (capture.length != -1 && (transition->index == -1 || transition->index == index)) { // If the captured text matched if (memcmp(start + capture.start, input + currentState.reader.Index(), sizeof(TChar) * capture.length) == 0) { // Consume so much input vint targetIndex = currentState.reader.Index() + capture.length; while (currentState.reader.Index() < targetIndex) { currentState.ch = currentState.reader.Read(); } CHECK_ERROR(currentState.reader.Index() == targetIndex, L"vl::regex_internal::RichInterpretor::MatchHead(const TChar*, const TChar*, RichResult&)#Input code could be an incorrect unicode sequence."); found = true; break; } } // Fail if f the captured text with the specified name and index doesn't match if (transition->index != -1 && index == transition->index) { break; } else { index++; } } } } break; case Transition::Positive: { // Push the positive lookahead information ExtensionSaver saver(currentState); saver.captureListIndex = -1; saver.transition = transition; Push(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount, saver); // Set found = true so that PushNonSaver(oldState) happens later oldState.storeType = StateStoreType::Positive; found = true; } break; case Transition::Negative: { // Push the positive lookahead information ExtensionSaver saver(currentState); saver.captureListIndex = -1; saver.transition = transition; Push(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount, saver); // Set found = true so that PushNonSaver(oldState) happens later oldState.storeType = StateStoreType::Negative; found = true; } break; case Transition::NegativeFail: { // NegativeFail will be used when the nagative lookahead failed } break; case Transition::End: { // Find the corresponding extension saver so that we can know how to deal with a matched sub regex that ends here ExtensionSaver extensionSaver = Pop(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount); switch (extensionSaver.transition->type) { case Transition::Capture: { // Write the captured text CaptureRecord& capture = result.captures[extensionSaver.captureListIndex]; capture.length = currentState.reader.Index() + (input - start) - capture.start; found = true; } break; case Transition::Positive: // Find the last positive lookahead state saver for (vint j = currentState.stateSaverCount - 1; j >= 0; j--) { auto& stateSaver = stateSavers[j]; if (stateSaver.storeType == StateStoreType::Positive) { // restore the parsing state just before matching the positive lookahead, since positive lookahead doesn't consume input stateSaver.RestoreReaderTo(oldState); oldState.stateSaverCount = j; stateSaver.RestoreReaderTo(currentState); currentState.stateSaverCount = j; break; } } found = true; break; case Transition::Negative: // Find the last negative lookahead state saver for (vint j = currentState.stateSaverCount - 1; j >= 0; j--) { auto& stateSaver = stateSavers[j]; if (stateSaver.storeType == StateStoreType::Negative) { // restore the parsing state just before matching the negative lookahead, since positive lookahead doesn't consume input oldState = stateSaver; oldState.storeType = StateStoreType::Other; currentState = stateSaver; currentState.storeType = StateStoreType::Other; i = currentState.minTransition - 1; break; } } break; default:; } } break; default:; } // Save the parsing state when necessary if (found) { UserData* data = (UserData*)currentState.currentState->userData; if (data->NeedKeepState) { oldState.minTransition = i + 1; PushNonSaver(stateSavers, currentState.stateSaverCount, oldState); } currentState.currentState = transition->target; currentState.minTransition = 0; break; } } // If no transition from the current state can be used if (!found) { // If there is a chance to do backtracking if (currentState.stateSaverCount) { currentState = PopNonSaver(stateSavers, currentState.stateSaverCount); // minTransition - 1 is always valid since the value is stored with adding 1 // So minTransition - 1 record the transition, which is the reason the parsing state is saved if (currentState.currentState->transitions[currentState.minTransition - 1]->type == Transition::Negative) { // Find the next NegativeFail transition // Because when a negative lookahead regex failed to match, it is actually succeeded // Since a negative lookahead means we don't want to match this regex // TODO: (enumerable) foreach:reversed for (vint i = 0; i < currentState.currentState->transitions.Count(); i++) { Transition* transition = currentState.currentState->transitions[i]; if (transition->type == Transition::NegativeFail) { // Restore the state to the target of NegativeFail to let the parsing continue currentState.currentState = transition->target; currentState.minTransition = 0; currentState.storeType = StateStoreType::Other; break; } } } } else { break; } } } if (currentState.currentState->finalState) { // Keep available captures if succeeded result.start = input - start; result.length = currentState.reader.Index(); for (vint i = result.captures.Count() - 1; i >= currentState.captureCount; i--) { result.captures.RemoveAt(i); } return true; } else { // Clear captures if failed result.captures.Clear(); return false; } } template bool RichInterpretor::Match(const TChar* input, const TChar* start, RichResult& result) { CharReader reader(input); while (reader.Read()) { if (MatchHead(reader.Reading(), start, result)) { return true; } } return false; } const List& RichInterpretor::CaptureNames() { return dfa->captureNames; } template bool RichInterpretor::MatchHead(const wchar_t* input, const wchar_t* start, RichResult& result); template bool RichInterpretor::MatchHead(const char8_t* input, const char8_t* start, RichResult& result); template bool RichInterpretor::MatchHead(const char16_t* input, const char16_t* start, RichResult& result); template bool RichInterpretor::MatchHead(const char32_t* input, const char32_t* start, RichResult& result); template bool RichInterpretor::Match(const wchar_t* input, const wchar_t* start, RichResult& result); template bool RichInterpretor::Match(const char8_t* input, const char8_t* start, RichResult& result); template bool RichInterpretor::Match(const char16_t* input, const char16_t* start, RichResult& result); template bool RichInterpretor::Match(const char32_t* input, const char32_t* start, RichResult& result); } } /*********************************************************************** .\AST\REGEXEXPRESSION.CPP ***********************************************************************/ /*********************************************************************** Author: Zihan Chen (vczh) Licensed under https://github.com/vczh-libraries/License ***********************************************************************/ namespace vl { namespace regex_internal { /*********************************************************************** MergeAlgorithm ***********************************************************************/ class MergeParameter { public: Expression::Map definitions; RegexExpression* regex = nullptr; }; class MergeAlgorithm : public RegexExpressionAlgorithm, MergeParameter*> { public: Ptr Apply(CharSetExpression* expression, MergeParameter* target) override { auto result = Ptr(new CharSetExpression); CopyFrom(result->ranges, expression->ranges); result->reverse = expression->reverse; return result; } Ptr Apply(LoopExpression* expression, MergeParameter* target) override { auto result = Ptr(new LoopExpression); result->max = expression->max; result->min = expression->min; result->preferLong = expression->preferLong; result->expression = Invoke(expression->expression, target); return result; } Ptr Apply(SequenceExpression* expression, MergeParameter* target) override { auto result = Ptr(new SequenceExpression); result->left = Invoke(expression->left, target); result->right = Invoke(expression->right, target); return result; } Ptr Apply(AlternateExpression* expression, MergeParameter* target) override { auto result = Ptr(new AlternateExpression); result->left = Invoke(expression->left, target); result->right = Invoke(expression->right, target); return result; } Ptr Apply(BeginExpression* expression, MergeParameter* target) override { return Ptr(new BeginExpression); } Ptr Apply(EndExpression* expression, MergeParameter* target) override { return Ptr(new EndExpression); } Ptr Apply(CaptureExpression* expression, MergeParameter* target) override { auto result = Ptr(new CaptureExpression); result->expression = Invoke(expression->expression, target); result->name = expression->name; return result; } Ptr Apply(MatchExpression* expression, MergeParameter* target) override { auto result = Ptr(new MatchExpression); result->name = expression->name; result->index = expression->index; return result; } Ptr Apply(PositiveExpression* expression, MergeParameter* target) override { auto result = Ptr(new PositiveExpression); result->expression = Invoke(expression->expression, target); return result; } Ptr Apply(NegativeExpression* expression, MergeParameter* target) override { auto result = Ptr(new NegativeExpression); result->expression = Invoke(expression->expression, target); return result; } Ptr Apply(UsingExpression* expression, MergeParameter* target) override { if (target->definitions.Keys().Contains(expression->name)) { Ptr reference = target->definitions[expression->name]; if (reference) { return reference; } else { throw ArgumentException(L"Regular expression syntax error: Found reference loops in\"" + u32tow(expression->name) + L"\".", L"vl::regex_internal::RegexExpression::Merge", L""); } } else if (target->regex->definitions.Keys().Contains(expression->name)) { target->definitions.Add(expression->name, nullptr); Ptr result = Invoke(target->regex->definitions[expression->name], target); target->definitions.Set(expression->name, result); return result; } else { throw ArgumentException(L"Regular expression syntax error: Cannot find sub expression reference\"" + u32tow(expression->name) + L"\".", L"vl::regex_internal::RegexExpression::Merge", L""); } } }; /*********************************************************************** CharSetExpression ***********************************************************************/ bool CharSetExpression::AddRangeWithConflict(CharRange range) { if (range.begin > range.end) { char32_t t = range.begin; range.begin = range.end; range.end = t; } // TODO: (enumerable) foreach for (vint i = 0; i < ranges.Count(); i++) { if (!(rangeranges[i])) { return false; } } ranges.Add(range); return true; } /*********************************************************************** RegexExpression ***********************************************************************/ Ptr RegexExpression::Merge() { MergeParameter merge; merge.regex = this; return MergeAlgorithm().Invoke(expression, &merge); } /*********************************************************************** Expression::Apply ***********************************************************************/ void CharSetExpression::Apply(IRegexExpressionAlgorithm& algorithm) { algorithm.Visit(this); } void LoopExpression::Apply(IRegexExpressionAlgorithm& algorithm) { algorithm.Visit(this); } void SequenceExpression::Apply(IRegexExpressionAlgorithm& algorithm) { algorithm.Visit(this); } void AlternateExpression::Apply(IRegexExpressionAlgorithm& algorithm) { algorithm.Visit(this); } void BeginExpression::Apply(IRegexExpressionAlgorithm& algorithm) { algorithm.Visit(this); } void EndExpression::Apply(IRegexExpressionAlgorithm& algorithm) { algorithm.Visit(this); } void CaptureExpression::Apply(IRegexExpressionAlgorithm& algorithm) { algorithm.Visit(this); } void MatchExpression::Apply(IRegexExpressionAlgorithm& algorithm) { algorithm.Visit(this); } void PositiveExpression::Apply(IRegexExpressionAlgorithm& algorithm) { algorithm.Visit(this); } void NegativeExpression::Apply(IRegexExpressionAlgorithm& algorithm) { algorithm.Visit(this); } void UsingExpression::Apply(IRegexExpressionAlgorithm& algorithm) { algorithm.Visit(this); } } } /*********************************************************************** .\AST\REGEXEXPRESSION_CANTREATASPURE.CPP ***********************************************************************/ /*********************************************************************** Author: Zihan Chen (vczh) Licensed under https://github.com/vczh-libraries/License ***********************************************************************/ namespace vl { namespace regex_internal { /*********************************************************************** CanTreatAsPureAlgorithm ***********************************************************************/ class CanTreatAsPureAlgorithm : public RegexExpressionAlgorithm { public: bool Apply(CharSetExpression* expression, void* target) override { return true; } bool Apply(LoopExpression* expression, void* target) override { return expression->preferLong && Invoke(expression->expression, 0); } bool Apply(SequenceExpression* expression, void* target) override { return Invoke(expression->left, 0) && Invoke(expression->right, 0); } bool Apply(AlternateExpression* expression, void* target) override { return Invoke(expression->left, 0) && Invoke(expression->right, 0); } bool Apply(BeginExpression* expression, void* target) override { return false; } bool Apply(EndExpression* expression, void* target) override { return false; } bool Apply(CaptureExpression* expression, void* target) override { return Invoke(expression->expression, 0); } bool Apply(MatchExpression* expression, void* target) override { return false; } bool Apply(PositiveExpression* expression, void* target) override { return false; } bool Apply(NegativeExpression* expression, void* target) override { return false; } bool Apply(UsingExpression* expression, void* target) override { return false; } }; /*********************************************************************** Expression ***********************************************************************/ bool Expression::CanTreatAsPure() { return CanTreatAsPureAlgorithm().Invoke(this, 0); } } } /*********************************************************************** .\AST\REGEXEXPRESSION_CHARSET.CPP ***********************************************************************/ /*********************************************************************** Author: Zihan Chen (vczh) Licensed under https://github.com/vczh-libraries/License ***********************************************************************/ namespace vl { namespace regex_internal { class NormalizedCharSet { public: CharRange::List ranges; }; /*********************************************************************** CharSetAlgorithm ***********************************************************************/ class CharSetAlgorithm : public RegexExpressionAlgorithm { public: virtual void Process(CharSetExpression* expression, NormalizedCharSet* target, CharRange range) = 0; void Loop(CharSetExpression* expression, CharRange::List& ranges, NormalizedCharSet* target) { if (expression->reverse) { char32_t begin = 1; // TODO: (enumerable) foreach for (vint i = 0; i < ranges.Count(); i++) { CharRange range = ranges[i]; if (range.begin > begin) { Process(expression, target, CharRange(begin, range.begin - 1)); } begin = range.end + 1; } if (begin <= MaxChar32) { Process(expression, target, CharRange(begin, MaxChar32)); } } else { // TODO: (enumerable) foreach for (vint i = 0; i < ranges.Count(); i++) { Process(expression, target, ranges[i]); } } } void Apply(LoopExpression* expression, NormalizedCharSet* target) override { Invoke(expression->expression, target); } void Apply(SequenceExpression* expression, NormalizedCharSet* target) override { Invoke(expression->left, target); Invoke(expression->right, target); } void Apply(AlternateExpression* expression, NormalizedCharSet* target) override { Invoke(expression->left, target); Invoke(expression->right, target); } void Apply(BeginExpression* expression, NormalizedCharSet* target) override { } void Apply(EndExpression* expression, NormalizedCharSet* target) override { } void Apply(CaptureExpression* expression, NormalizedCharSet* target) override { Invoke(expression->expression, target); } void Apply(MatchExpression* expression, NormalizedCharSet* target) override { } void Apply(PositiveExpression* expression, NormalizedCharSet* target) override { Invoke(expression->expression, target); } void Apply(NegativeExpression* expression, NormalizedCharSet* target) override { Invoke(expression->expression, target); } void Apply(UsingExpression* expression, NormalizedCharSet* target) override { } }; /*********************************************************************** BuildNormalizedCharSetAlgorithm ***********************************************************************/ class BuildNormalizedCharSetAlgorithm : public CharSetAlgorithm { public: void Process(CharSetExpression* expression, NormalizedCharSet* target, CharRange range) { vint index = 0; while (index < target->ranges.Count()) { CharRange current = target->ranges[index]; if (currentrange) { index++; } else if (current.begin < range.begin) { // range : [ ? // current : [ ] target->ranges.RemoveAt(index); target->ranges.Add(CharRange(current.begin, range.begin - 1)); target->ranges.Add(CharRange(range.begin, current.end)); index++; } else if (current.begin > range.begin) { // range : [ ] // current : [ ? target->ranges.Add(CharRange(range.begin, current.begin - 1)); range.begin = current.begin; } else if (current.end < range.end) { // range : [ ] // current : [ ] range.begin = current.end + 1; index++; } else if (current.end > range.end) { // range : [ ] // current : [ ] target->ranges.RemoveAt(index); target->ranges.Add(range); target->ranges.Add(CharRange(range.end + 1, current.end)); return; } else { // range : [ ] // current : [ ] return; } } target->ranges.Add(range); } void Apply(CharSetExpression* expression, NormalizedCharSet* target) { Loop(expression, expression->ranges, target); } }; /*********************************************************************** SetNormalizedCharSetAlgorithm ***********************************************************************/ class SetNormalizedCharSetAlgorithm : public CharSetAlgorithm { public: void Process(CharSetExpression* expression, NormalizedCharSet* target, CharRange range) { // TODO: (enumerable) foreach for (vint j = 0; j < target->ranges.Count(); j++) { CharRange targetRange = target->ranges[j]; if (range.begin <= targetRange.begin && targetRange.end <= range.end) { expression->ranges.Add(targetRange); } } } void Apply(CharSetExpression* expression, NormalizedCharSet* target) { CharRange::List source; CopyFrom(source, expression->ranges); expression->ranges.Clear(); Loop(expression, source, target); expression->reverse = false; } }; /*********************************************************************** Expression ***********************************************************************/ void Expression::NormalizeCharSet(CharRange::List& subsets) { NormalizedCharSet normalized; BuildNormalizedCharSetAlgorithm().Invoke(this, &normalized); SetNormalizedCharSetAlgorithm().Invoke(this, &normalized); CopyFrom(subsets, normalized.ranges); } void Expression::CollectCharSet(CharRange::List& subsets) { NormalizedCharSet normalized; CopyFrom(normalized.ranges, subsets); BuildNormalizedCharSetAlgorithm().Invoke(this, &normalized); CopyFrom(subsets, normalized.ranges); } void Expression::ApplyCharSet(CharRange::List& subsets) { NormalizedCharSet normalized; CopyFrom(normalized.ranges, subsets); SetNormalizedCharSetAlgorithm().Invoke(this, &normalized); } } } /*********************************************************************** .\AST\REGEXEXPRESSION_GENERATEEPSILONNFA.CPP ***********************************************************************/ /*********************************************************************** Author: Zihan Chen (vczh) Licensed under https://github.com/vczh-libraries/License ***********************************************************************/ namespace vl { namespace regex_internal { /*********************************************************************** EpsilonNfaAlgorithm ***********************************************************************/ class EpsilonNfaInfo { public: Ptr automaton; }; class EpsilonNfa { public: State* start; State* end; EpsilonNfa() { start = 0; end = 0; } }; class EpsilonNfaAlgorithm : public RegexExpressionAlgorithm { public: EpsilonNfa Connect(EpsilonNfa a, EpsilonNfa b, Automaton* target) { if (a.start) { target->NewEpsilon(a.end, b.start); a.end = b.end; return a; } else { return b; } } EpsilonNfa Apply(CharSetExpression* expression, Automaton* target) override { EpsilonNfa nfa; nfa.start = target->NewState(); nfa.end = target->NewState(); // TODO: (enumerable) foreach for (vint i = 0; i < expression->ranges.Count(); i++) { target->NewChars(nfa.start, nfa.end, expression->ranges[i]); } return nfa; } EpsilonNfa Apply(LoopExpression* expression, Automaton* target) override { EpsilonNfa head; for (vint i = 0; i < expression->min; i++) { EpsilonNfa body = Invoke(expression->expression, target); head = Connect(head, body, target); } if (expression->max == -1) { EpsilonNfa body = Invoke(expression->expression, target); if (!head.start) { head.start = head.end = target->NewState(); } State* loopBegin = head.end; State* loopEnd = target->NewState(); if (expression->preferLong) { target->NewEpsilon(loopBegin, body.start); target->NewEpsilon(body.end, loopBegin); target->NewNop(loopBegin, loopEnd); } else { target->NewNop(loopBegin, loopEnd); target->NewEpsilon(loopBegin, body.start); target->NewEpsilon(body.end, loopBegin); } head.end = loopEnd; } else if (expression->max > expression->min) { for (vint i = expression->min; i < expression->max; i++) { EpsilonNfa body = Invoke(expression->expression, target); State* start = target->NewState(); State* end = target->NewState(); if (expression->preferLong) { target->NewEpsilon(start, body.start); target->NewEpsilon(body.end, end); target->NewNop(start, end); } else { target->NewNop(start, end); target->NewEpsilon(start, body.start); target->NewEpsilon(body.end, end); } body.start = start; body.end = end; head = Connect(head, body, target); } } return head; } EpsilonNfa Apply(SequenceExpression* expression, Automaton* target) override { EpsilonNfa a = Invoke(expression->left, target); EpsilonNfa b = Invoke(expression->right, target); return Connect(a, b, target); } EpsilonNfa Apply(AlternateExpression* expression, Automaton* target) override { EpsilonNfa result; result.start = target->NewState(); result.end = target->NewState(); EpsilonNfa a = Invoke(expression->left, target); EpsilonNfa b = Invoke(expression->right, target); target->NewEpsilon(result.start, a.start); target->NewEpsilon(a.end, result.end); target->NewEpsilon(result.start, b.start); target->NewEpsilon(b.end, result.end); return result; } EpsilonNfa Apply(BeginExpression* expression, Automaton* target) override { EpsilonNfa result; result.start = target->NewState(); result.end = target->NewState(); target->NewBeginString(result.start, result.end); return result; } EpsilonNfa Apply(EndExpression* expression, Automaton* target) override { EpsilonNfa result; result.start = target->NewState(); result.end = target->NewState(); target->NewEndString(result.start, result.end); return result; } EpsilonNfa Apply(CaptureExpression* expression, Automaton* target) override { EpsilonNfa result; result.start = target->NewState(); result.end = target->NewState(); vint capture = -1; if (expression->name != U32String::Empty) { capture = target->captureNames.IndexOf(expression->name); if (capture == -1) { capture = target->captureNames.Count(); target->captureNames.Add(expression->name); } } EpsilonNfa body = Invoke(expression->expression, target); target->NewCapture(result.start, body.start, capture); target->NewEnd(body.end, result.end); return result; } EpsilonNfa Apply(MatchExpression* expression, Automaton* target) override { vint capture = -1; if (expression->name != U32String::Empty) { capture = target->captureNames.IndexOf(expression->name); if (capture == -1) { capture = target->captureNames.Count(); target->captureNames.Add(expression->name); } } EpsilonNfa result; result.start = target->NewState(); result.end = target->NewState(); target->NewMatch(result.start, result.end, capture, expression->index); return result; } EpsilonNfa Apply(PositiveExpression* expression, Automaton* target) override { EpsilonNfa result; result.start = target->NewState(); result.end = target->NewState(); EpsilonNfa body = Invoke(expression->expression, target); target->NewPositive(result.start, body.start); target->NewEnd(body.end, result.end); return result; } EpsilonNfa Apply(NegativeExpression* expression, Automaton* target) override { EpsilonNfa result; result.start = target->NewState(); result.end = target->NewState(); EpsilonNfa body = Invoke(expression->expression, target); target->NewNegative(result.start, body.start); target->NewEnd(body.end, result.end); target->NewNegativeFail(result.start, result.end); return result; } EpsilonNfa Apply(UsingExpression* expression, Automaton* target) override { CHECK_FAIL(L"RegexExpression::GenerateEpsilonNfa()#UsingExpression cannot create state machine."); } }; /*********************************************************************** Expression ***********************************************************************/ Ptr Expression::GenerateEpsilonNfa() { auto automaton = Ptr(new Automaton); EpsilonNfa result = EpsilonNfaAlgorithm().Invoke(this, automaton.Obj()); automaton->startState = result.start; result.end->finalState = true; return automaton; } } } /*********************************************************************** .\AST\REGEXEXPRESSION_HASNOEXTENSION.CPP ***********************************************************************/ /*********************************************************************** Author: Zihan Chen (vczh) Licensed under https://github.com/vczh-libraries/License ***********************************************************************/ namespace vl { namespace regex_internal { /*********************************************************************** HasNoExtensionAlgorithm ***********************************************************************/ class HasNoExtensionAlgorithm : public RegexExpressionAlgorithm