/*********************************************************************** THIS FILE IS AUTOMATICALLY GENERATED. DO NOT MODIFY DEVELOPER: Zihan Chen(vczh) ***********************************************************************/ #include "VlppRegex.h" /*********************************************************************** .\REGEX.CPP ***********************************************************************/ /*********************************************************************** Author: Zihan Chen (vczh) Licensed under https://github.com/vczh-libraries/License ***********************************************************************/ namespace vl { namespace regex_internal { void ReadInt(stream::IStream& inputStream, vint& value); void ReadInts(stream::IStream& inputStream, vint count, vint* values); void WriteInt(stream::IStream& outputStream, vint value); void WriteInts(stream::IStream& outputStream, vint count, vint* values); } namespace regex { using namespace collections; using namespace regex_internal; /*********************************************************************** String Conversion ***********************************************************************/ template struct U32; template<> struct U32 { static constexpr U32String(*ToU32)(const WString&) = &wtou32; static constexpr WString(*FromU32)(const U32String&) = &u32tow; }; template<> struct U32 { static constexpr U32String(*ToU32)(const U8String&) = &u8tou32; static constexpr U8String(*FromU32)(const U32String&) = &u32tou8; }; template<> struct U32 { static constexpr U32String(*ToU32)(const U16String&) = &u16tou32; static constexpr U16String(*FromU32)(const U32String&) = &u32tou16; }; template<> struct U32 { static U32String ToU32(const U32String& text) { return text; } static U32String FromU32(const U32String& text) { return text; } }; /*********************************************************************** RegexMatch_ ***********************************************************************/ template RegexMatch_::RegexMatch_(const ObjectString& _string, PureResult* _result) :success(true) , result(_string, _result->start, _result->length) { } template RegexMatch_::RegexMatch_(const ObjectString& _string, RichResult* _result) : success(true) , result(_string, _result->start, _result->length) { // TODO: (enumerable) foreach for (vint i = 0; i < _result->captures.Count(); i++) { CaptureRecord& capture = _result->captures[i]; if (capture.capture == -1) { captures.Add(RegexString_(_string, capture.start, capture.length)); } else { groups.Add(capture.capture, RegexString_(_string, capture.start, capture.length)); } } } template RegexMatch_::RegexMatch_(const RegexString_& _result) :success(false) , result(_result) { } template bool RegexMatch_::Success()const { return success; } template const RegexString_& RegexMatch_::Result()const { return result; } template const typename RegexMatch_::CaptureList& RegexMatch_::Captures()const { return captures; } template const typename RegexMatch_::CaptureGroup& RegexMatch_::Groups()const { return groups; } /*********************************************************************** RegexBase_ ***********************************************************************/ template void RegexBase_::Process(const ObjectString& text, bool keepEmpty, bool keepSuccess, bool keepFail, typename RegexMatch_::List& matches)const { if (rich) { const T* start = text.Buffer(); const T* input = start; RichResult result; while (rich->Match(input, start, result)) { vint offset = input - start; if (keepFail) { if (result.start > offset || keepEmpty) { matches.Add(Ptr(new RegexMatch_(RegexString_(text, offset, result.start - offset)))); } } if (keepSuccess) { matches.Add(Ptr(new RegexMatch_(text, &result))); } input = start + result.start + result.length; } if (keepFail) { vint remain = input - start; vint length = text.Length() - remain; if (length || keepEmpty) { matches.Add(Ptr(new RegexMatch_(RegexString_(text, remain, length)))); } } } else { const T* start = text.Buffer(); const T* input = start; PureResult result; while (pure->Match(input, start, result)) { vint offset = input - start; if (keepFail) { if (result.start > offset || keepEmpty) { matches.Add(Ptr(new RegexMatch_(RegexString_(text, offset, result.start - offset)))); } } if (keepSuccess) { matches.Add(Ptr(new RegexMatch_(text, &result))); } input = start + result.start + result.length; } if (keepFail) { vint remain = input - start; vint length = text.Length() - remain; if (length || keepEmpty) { matches.Add(Ptr(new RegexMatch_(RegexString_(text, remain, length)))); } } } } RegexBase_::~RegexBase_() { if (pure) delete pure; if (rich) delete rich; } template typename RegexMatch_::Ref RegexBase_::MatchHead(const ObjectString& text)const { if (rich) { RichResult result; if (rich->MatchHead(text.Buffer(), text.Buffer(), result)) { return Ptr(new RegexMatch_(text, &result)); } else { return nullptr; } } else { PureResult result; if (pure->MatchHead(text.Buffer(), text.Buffer(), result)) { return Ptr(new RegexMatch_(text, &result)); } else { return nullptr; } } } template typename RegexMatch_::Ref RegexBase_::Match(const ObjectString& text)const { if (rich) { RichResult result; if (rich->Match(text.Buffer(), text.Buffer(), result)) { return Ptr(new RegexMatch_(text, &result)); } else { return nullptr; } } else { PureResult result; if (pure->Match(text.Buffer(), text.Buffer(), result)) { return Ptr(new RegexMatch_(text, &result)); } else { return nullptr; } } } template bool RegexBase_::TestHead(const ObjectString& text)const { if (pure) { PureResult result; return pure->MatchHead(text.Buffer(), text.Buffer(), result); } else { RichResult result; return rich->MatchHead(text.Buffer(), text.Buffer(), result); } } template bool RegexBase_::Test(const ObjectString& text)const { if (pure) { PureResult result; return pure->Match(text.Buffer(), text.Buffer(), result); } else { RichResult result; return rich->Match(text.Buffer(), text.Buffer(), result); } } template void RegexBase_::Search(const ObjectString& text, typename RegexMatch_::List& matches)const { Process(text, false, true, false, matches); } template void RegexBase_::Split(const ObjectString& text, bool keepEmptyMatch, typename RegexMatch_::List& matches)const { Process(text, keepEmptyMatch, false, true, matches); } template void RegexBase_::Cut(const ObjectString& text, bool keepEmptyMatch, typename RegexMatch_::List& matches)const { Process(text, keepEmptyMatch, true, true, matches); } /*********************************************************************** Regex_ ***********************************************************************/ template Regex_::Regex_(const ObjectString& code, bool preferPure) { CharRange::List subsets; auto regex = ParseRegexExpression(U32::ToU32(code)); auto expression = regex->Merge(); expression->NormalizeCharSet(subsets); bool pureRequired = false; bool richRequired = false; if (preferPure) { if (expression->HasNoExtension()) { pureRequired = true; } else { if (expression->CanTreatAsPure()) { pureRequired = true; richRequired = true; } else { richRequired = true; } } } else { richRequired = true; } try { if (pureRequired) { Dictionary nfaStateMap; Group dfaStateMap; Ptr eNfa = expression->GenerateEpsilonNfa(); Ptr nfa = EpsilonNfaToNfa(eNfa, PureEpsilonChecker, nfaStateMap); Ptr dfa = NfaToDfa(nfa, dfaStateMap); pure = new PureInterpretor(dfa, subsets); } if (richRequired) { Dictionary nfaStateMap; Group dfaStateMap; Ptr eNfa = expression->GenerateEpsilonNfa(); Ptr nfa = EpsilonNfaToNfa(eNfa, RichEpsilonChecker, nfaStateMap); Ptr dfa = NfaToDfa(nfa, dfaStateMap); rich = new RichInterpretor(dfa); for (auto&& name : rich->CaptureNames()) { captureNames.Add(U32::FromU32(name)); } } } catch (...) { if (pure)delete pure; if (rich)delete rich; throw; } } /*********************************************************************** RegexTokens_ ***********************************************************************/ template class RegexTokenEnumerator : public Object, public IEnumerator> { protected: RegexToken_ token; vint index = -1; PureInterpretor* pure; const Array& stateTokens; const T* start; vint codeIndex; RegexProc_ proc; const T* reading; vint rowStart = 0; vint columnStart = 0; bool cacheAvailable = false; RegexToken_ cacheToken; public: RegexTokenEnumerator(const RegexTokenEnumerator& enumerator) : token(enumerator.token) , index(enumerator.index) , pure(enumerator.pure) , stateTokens(enumerator.stateTokens) , start(enumerator.start) , codeIndex(enumerator.codeIndex) , proc(enumerator.proc) , reading(enumerator.reading) , rowStart(enumerator.rowStart) , columnStart(enumerator.columnStart) , cacheAvailable(enumerator.cacheAvailable) , cacheToken(enumerator.cacheToken) { } RegexTokenEnumerator(PureInterpretor* _pure, const Array& _stateTokens, const T* _start, vint _codeIndex, RegexProc_ _proc) :index(-1) , pure(_pure) , stateTokens(_stateTokens) , start(_start) , codeIndex(_codeIndex) , proc(_proc) , reading(_start) { } IEnumerator>* Clone()const { return new RegexTokenEnumerator(*this); } const RegexToken_& Current()const { return token; } vint Index()const { return index; } bool Next() { if (!cacheAvailable && !*reading) return false; if (cacheAvailable) { token = cacheToken; cacheAvailable = false; } else { token.reading = reading; token.start = 0; token.length = 0; token.token = -2; token.completeToken = true; } token.rowStart = rowStart; token.columnStart = columnStart; token.rowEnd = rowStart; token.columnEnd = columnStart; token.codeIndex = codeIndex; PureResult result; while (*reading) { vint id = -1; bool completeToken = true; if (!pure->MatchHead(reading, start, result)) { result.start = reading - start; if (id == -1 && result.terminateState != -1) { vint state = pure->GetRelatedFinalState(result.terminateState); if (state != -1) { id = stateTokens[state]; } } if (id == -1) { result.length = 1; } else { completeToken = false; } } else { id = stateTokens.Get(result.finalState); } if (id != -1 && proc.extendProc) { RegexProcessingToken token(result.start, result.length, id, completeToken, nullptr); proc.extendProc(proc.argument, reading, -1, true, token); #if _DEBUG CHECK_ERROR(token.interTokenState == nullptr, L"RegexTokenEnumerator::Next()#The extendProc is only allowed to create interTokenState in RegexLexerColorizer."); #endif result.length = token.length; id = token.token; completeToken = token.completeToken; } if (token.token == -2) { token.start = result.start; token.length = result.length; token.token = id; token.completeToken = completeToken; } else if (token.token == id && id == -1) { token.length += result.length; } else { cacheAvailable = true; cacheToken.reading = reading; cacheToken.start = result.start; cacheToken.length = result.length; cacheToken.codeIndex = codeIndex; cacheToken.token = id; cacheToken.completeToken = completeToken; } reading += result.length; if (cacheAvailable) { break; } } index++; for (vint i = 0; i < token.length; i++) { token.rowEnd = rowStart; token.columnEnd = columnStart; if (token.reading[i] == L'\n') { rowStart++; columnStart = 0; } else { columnStart++; } } return true; } void Reset() { index = -1; reading = start; cacheAvailable = false; } void ReadToEnd(List>& tokens, bool(*discard)(vint)) { while (Next()) { if (!discard(token.token)) { tokens.Add(token); } } } }; template RegexTokens_::RegexTokens_(PureInterpretor* _pure, const Array& _stateTokens, const ObjectString& _code, vint _codeIndex, RegexProc_ _proc) :pure(_pure) , stateTokens(_stateTokens) , code(_code) , codeIndex(_codeIndex) , proc(_proc) { } template RegexTokens_::RegexTokens_(const RegexTokens_& tokens) :pure(tokens.pure) , stateTokens(tokens.stateTokens) , code(tokens.code) , codeIndex(tokens.codeIndex) , proc(tokens.proc) { } template IEnumerator>* RegexTokens_::CreateEnumerator() const { return new RegexTokenEnumerator(pure, stateTokens, code.Buffer(), codeIndex, proc); } bool DefaultDiscard(vint token) { return false; } template void RegexTokens_::ReadToEnd(collections::List>& tokens, bool(*discard)(vint))const { if (discard == 0) { discard = &DefaultDiscard; } RegexTokenEnumerator(pure, stateTokens, code.Buffer(), codeIndex, proc).ReadToEnd(tokens, discard); } /*********************************************************************** RegexLexerWalker_ ***********************************************************************/ template RegexLexerWalker_::RegexLexerWalker_(PureInterpretor* _pure, const Array& _stateTokens) :pure(_pure) , stateTokens(_stateTokens) { } template RegexLexerWalker_::RegexLexerWalker_(const RegexLexerWalker_& tokens) : pure(tokens.pure) , stateTokens(tokens.stateTokens) { } template vint RegexLexerWalker_::GetStartState()const { return pure->GetStartState(); } template vint RegexLexerWalker_::GetRelatedToken(vint state)const { vint finalState = state == -1 ? -1 : pure->GetRelatedFinalState(state); return finalState == -1 ? -1 : stateTokens.Get(finalState); } template void RegexLexerWalker_::Walk(T input, vint& state, vint& token, bool& finalState, bool& previousTokenStop)const { vint previousState = state; token = -1; finalState = false; previousTokenStop = false; if (state == -1) { state = pure->GetStartState(); previousTokenStop = true; } state = pure->Transit(input, state); if (state == -1) { previousTokenStop = true; if (previousState == -1) { finalState = true; return; } else if (pure->IsFinalState(previousState)) { state = pure->Transit(input, pure->GetStartState()); } } if (pure->IsFinalState(state)) { token = stateTokens.Get(state); finalState = true; return; } else { finalState = state == -1; return; } } template vint RegexLexerWalker_::Walk(T input, vint state)const { vint token = -1; bool finalState = false; bool previousTokenStop = false; Walk(input, state, token, finalState, previousTokenStop); return state; } template bool RegexLexerWalker_::IsClosedToken(const T* input, vint length)const { vint state = pure->GetStartState(); for (vint i = 0; i < length; i++) { state = pure->Transit(input[i], state); if (state == -1) return true; if (pure->IsDeadState(state)) return true; } return false; } template bool RegexLexerWalker_::IsClosedToken(const ObjectString& input)const { return IsClosedToken(input.Buffer(), input.Length()); } /*********************************************************************** RegexLexerColorizer_ ***********************************************************************/ template RegexLexerColorizer_::RegexLexerColorizer_(const RegexLexerWalker_& _walker, RegexProc_ _proc) :walker(_walker) , proc(_proc) { internalState.currentState = walker.GetStartState(); } template typename RegexLexerColorizer_::InternalState RegexLexerColorizer_::GetInternalState() { return internalState; } template void RegexLexerColorizer_::SetInternalState(InternalState state) { internalState = state; } template void RegexLexerColorizer_::Pass(T input) { WalkOneToken(&input, 1, 0, false); } template vint RegexLexerColorizer_::GetStartState()const { return walker.GetStartState(); } template void RegexLexerColorizer_::CallExtendProcAndColorizeProc(const T* input, vint length, RegexProcessingToken& token, bool colorize) { vint oldTokenLength = token.length; proc.extendProc(proc.argument, input + token.start, length - token.start, false, token); #if _DEBUG { bool pausedAtTheEnd = token.start + token.length == length && !token.completeToken; CHECK_ERROR( token.completeToken || pausedAtTheEnd, L"RegexLexerColorizer::WalkOneToken(const char32_t*, vint, vint, bool)#The extendProc is not allowed pause before the end of the input." ); CHECK_ERROR( token.completeToken || token.token != -1, L"RegexLexerColorizer::WalkOneToken(const char32_t*, vint, vint, bool)#The extendProc is not allowed to pause without a valid token id." ); CHECK_ERROR( oldTokenLength <= token.length, L"RegexLexerColorizer::WalkOneToken(const char32_t*, vint, vint, bool)#The extendProc is not allowed to decrease the token length." ); CHECK_ERROR( (token.interTokenState == nullptr) == !pausedAtTheEnd, L"RegexLexerColorizer::Colorize(const char32_t*, vint, void*)#The extendProc should return an inter token state object if and only if a valid token does not end at the end of the input." ); } #endif if ((internalState.interTokenState = token.interTokenState)) { internalState.interTokenId = token.token; } if (colorize) { proc.colorizeProc(proc.argument, token.start, token.length, token.token); } } template vint RegexLexerColorizer_::WalkOneToken(const T* input, vint length, vint start, bool colorize) { if (internalState.interTokenState) { RegexProcessingToken token(-1, -1, internalState.interTokenId, false, internalState.interTokenState); proc.extendProc(proc.argument, input, length, false, token); #if _DEBUG { bool pausedAtTheEnd = token.length == length && !token.completeToken; CHECK_ERROR( token.completeToken || pausedAtTheEnd, L"RegexLexerColorizer::WalkOneToken(const char32_t*, vint, vint, bool)#The extendProc is not allowed to pause before the end of the input." ); CHECK_ERROR( token.completeToken || token.token == internalState.interTokenId, L"RegexLexerColorizer::WalkOneToken(const char32_t*, vint, vint, bool)#The extendProc is not allowed to continue pausing with a different token id." ); CHECK_ERROR( (token.interTokenState == nullptr) == !pausedAtTheEnd, L"RegexLexerColorizer::Colorize(const char32_t*, vint, void*)#The extendProc should return an inter token state object if and only if a valid token does not end at the end of the input." ); } #endif if (colorize) { proc.colorizeProc(proc.argument, 0, token.length, token.token); } if (!(internalState.interTokenState = token.interTokenState)) { internalState.interTokenId = -1; } return token.length; } vint lastFinalStateLength = 0; vint lastFinalStateToken = -1; vint lastFinalStateState = -1; vint tokenStartState = internalState.currentState; for (vint i = start; i < length; i++) { vint currentToken = -1; bool finalState = false; bool previousTokenStop = false; walker.Walk(input[i], internalState.currentState, currentToken, finalState, previousTokenStop); if (previousTokenStop) { if (proc.extendProc && lastFinalStateToken != -1) { RegexProcessingToken token(start, lastFinalStateLength, lastFinalStateToken, true, nullptr); CallExtendProcAndColorizeProc(input, length, token, colorize); if (token.completeToken) { internalState.currentState = walker.GetStartState(); } return start + token.length; } else if (i == start) { if (tokenStartState == GetStartState()) { if (colorize) { proc.colorizeProc(proc.argument, start, 1, -1); } internalState.currentState = walker.GetStartState(); return i + 1; } } else { if (colorize) { proc.colorizeProc(proc.argument, start, lastFinalStateLength, lastFinalStateToken); } internalState.currentState = lastFinalStateState; return start + lastFinalStateLength; } } if (finalState) { lastFinalStateLength = i + 1 - start; lastFinalStateToken = currentToken; lastFinalStateState = internalState.currentState; } } if (lastFinalStateToken != -1 && start + lastFinalStateLength == length) { if (proc.extendProc) { RegexProcessingToken token(start, lastFinalStateLength, lastFinalStateToken, true, nullptr); CallExtendProcAndColorizeProc(input, length, token, colorize); } else if (colorize) { proc.colorizeProc(proc.argument, start, lastFinalStateLength, lastFinalStateToken); } } else if (colorize) { proc.colorizeProc(proc.argument, start, length - start, walker.GetRelatedToken(internalState.currentState)); } return length; } template void* RegexLexerColorizer_::Colorize(const T* input, vint length) { vint index = 0; while (index != length) { index = WalkOneToken(input, length, index, true); } return internalState.interTokenState; } /*********************************************************************** RegexLexerBase_ ***********************************************************************/ RegexLexerBase_::~RegexLexerBase_() { if (pure) delete pure; } template RegexTokens_ RegexLexerBase_::Parse(const ObjectString& code, RegexProc_ proc, vint codeIndex)const { code.Buffer(); pure->PrepareForRelatedFinalStateTable(); return RegexTokens_(pure, stateTokens, code, codeIndex, proc); } template RegexLexerWalker_ RegexLexerBase_::Walk()const { pure->PrepareForRelatedFinalStateTable(); return RegexLexerWalker_(pure, stateTokens); } RegexLexerWalker_ RegexLexerBase_::Walk()const { pure->PrepareForRelatedFinalStateTable(); return RegexLexerWalker_(pure, stateTokens); } template RegexLexerColorizer_ RegexLexerBase_::Colorize(RegexProc_ proc)const { return RegexLexerColorizer_(Walk(), proc); } /*********************************************************************** RegexLexer_ (Serialization) ***********************************************************************/ template RegexLexer_::RegexLexer_(stream::IStream& inputStream) { pure = new PureInterpretor(inputStream); vint count = 0; ReadInt(inputStream, count); stateTokens.Resize(count); if (count > 0) { ReadInts(inputStream, count, &stateTokens[0]); } } template void RegexLexer_::Serialize(stream::IStream& outputStream) { pure->Serialize(outputStream); WriteInt(outputStream, stateTokens.Count()); if (stateTokens.Count() > 0) { WriteInts(outputStream, stateTokens.Count(), &stateTokens[0]); } } /*********************************************************************** RegexLexer_ ***********************************************************************/ template RegexLexer_::RegexLexer_(const collections::IEnumerable>& tokens) { // Build DFA for all tokens List> expressions; List> dfas; CharRange::List subsets; for (auto&& code : tokens) { auto regex = ParseRegexExpression(U32::ToU32(code)); auto expression = regex->Merge(); expression->CollectCharSet(subsets); expressions.Add(expression); } // TODO: (enumerable) foreach for (vint i = 0; i < expressions.Count(); i++) { Dictionary nfaStateMap; Group dfaStateMap; expressions[i]->ApplyCharSet(subsets); auto eNfa = expressions[i]->GenerateEpsilonNfa(); auto nfa = EpsilonNfaToNfa(eNfa, PureEpsilonChecker, nfaStateMap); auto dfa = NfaToDfa(nfa, dfaStateMap); dfas.Add(dfa); } // Mark all states in DFAs // TODO: (enumerable) foreach for (vint i = 0; i < dfas.Count(); i++) { Ptr dfa = dfas[i]; // TODO: (enumerable) foreach for (vint j = 0; j < dfa->states.Count(); j++) { if (dfa->states[j]->finalState) { dfa->states[j]->userData = (void*)i; } else { dfa->states[j]->userData = (void*)dfas.Count(); } } } // Connect all DFAs to an e-NFA auto bigEnfa = Ptr(new Automaton); // TODO: (enumerable) foreach for (vint i = 0; i < dfas.Count(); i++) { CopyFrom(bigEnfa->states, dfas[i]->states, true); CopyFrom(bigEnfa->transitions, dfas[i]->transitions, true); } bigEnfa->startState = bigEnfa->NewState(); // TODO: (enumerable) foreach for (vint i = 0; i < dfas.Count(); i++) { bigEnfa->NewEpsilon(bigEnfa->startState, dfas[i]->startState); } // Build a single DFA out of the e-NFA Dictionary nfaStateMap; Group dfaStateMap; auto bigNfa = EpsilonNfaToNfa(bigEnfa, PureEpsilonChecker, nfaStateMap); // TODO: (enumerable) foreach on dictionary for (vint i = 0; i < nfaStateMap.Keys().Count(); i++) { void* userData = nfaStateMap.Values().Get(i)->userData; nfaStateMap.Keys()[i]->userData = userData; } auto bigDfa = NfaToDfa(bigNfa, dfaStateMap); // TODO: (enumerable) foreach on group for (vint i = 0; i < dfaStateMap.Keys().Count(); i++) { void* userData = dfaStateMap.GetByIndex(i).Get(0)->userData; for (vint j = 1; j < dfaStateMap.GetByIndex(i).Count(); j++) { void* newData = dfaStateMap.GetByIndex(i).Get(j)->userData; if (userData > newData) { userData = newData; } } dfaStateMap.Keys()[i]->userData = userData; } // Build state machine pure = new PureInterpretor(bigDfa, subsets); stateTokens.Resize(bigDfa->states.Count()); for (vint i = 0; i < stateTokens.Count(); i++) { void* userData = bigDfa->states[i]->userData; stateTokens[i] = (vint)userData; } } /*********************************************************************** Template Instantiation ***********************************************************************/ template class RegexString_; template class RegexString_; template class RegexString_; template class RegexString_; template class RegexMatch_; template class RegexMatch_; template class RegexMatch_; template class RegexMatch_; template RegexMatch_::Ref RegexBase_::MatchHead (const ObjectString& text)const; template RegexMatch_::Ref RegexBase_::Match (const ObjectString& text)const; template bool RegexBase_::TestHead (const ObjectString& text)const; template bool RegexBase_::Test (const ObjectString& text)const; template void RegexBase_::Search (const ObjectString& text, RegexMatch_::List& matches)const; template void RegexBase_::Split (const ObjectString& text, bool keepEmptyMatch, RegexMatch_::List& matches)const; template void RegexBase_::Cut (const ObjectString& text, bool keepEmptyMatch, RegexMatch_::List& matches)const; template RegexMatch_::Ref RegexBase_::MatchHead (const ObjectString& text)const; template RegexMatch_::Ref RegexBase_::Match (const ObjectString& text)const; template bool RegexBase_::TestHead (const ObjectString& text)const; template bool RegexBase_::Test (const ObjectString& text)const; template void RegexBase_::Search (const ObjectString& text, RegexMatch_::List& matches)const; template void RegexBase_::Split (const ObjectString& text, bool keepEmptyMatch, RegexMatch_::List& matches)const; template void RegexBase_::Cut (const ObjectString& text, bool keepEmptyMatch, RegexMatch_::List& matches)const; template RegexMatch_::Ref RegexBase_::MatchHead (const ObjectString& text)const; template RegexMatch_::Ref RegexBase_::Match (const ObjectString& text)const; template bool RegexBase_::TestHead (const ObjectString& text)const; template bool RegexBase_::Test (const ObjectString& text)const; template void RegexBase_::Search (const ObjectString& text, RegexMatch_::List& matches)const; template void RegexBase_::Split (const ObjectString& text, bool keepEmptyMatch, RegexMatch_::List& matches)const; template void RegexBase_::Cut (const ObjectString& text, bool keepEmptyMatch, RegexMatch_::List& matches)const; template RegexMatch_::Ref RegexBase_::MatchHead (const ObjectString& text)const; template RegexMatch_::Ref RegexBase_::Match (const ObjectString& text)const; template bool RegexBase_::TestHead (const ObjectString& text)const; template bool RegexBase_::Test (const ObjectString& text)const; template void RegexBase_::Search (const ObjectString& text, RegexMatch_::List& matches)const; template void RegexBase_::Split (const ObjectString& text, bool keepEmptyMatch, RegexMatch_::List& matches)const; template void RegexBase_::Cut (const ObjectString& text, bool keepEmptyMatch, RegexMatch_::List& matches)const; template class Regex_; template class Regex_; template class Regex_; template class Regex_; template class RegexTokens_; template class RegexTokens_; template class RegexTokens_; template class RegexTokens_; template class RegexLexerWalker_; template class RegexLexerWalker_; template class RegexLexerWalker_; template class RegexLexerWalker_; template class RegexLexerColorizer_; template class RegexLexerColorizer_; template class RegexLexerColorizer_; template class RegexLexerColorizer_; template RegexTokens_ RegexLexerBase_::Parse (const ObjectString& code, RegexProc_ _proc, vint codeIndex)const; template RegexLexerWalker_ RegexLexerBase_::Walk ()const; template RegexLexerColorizer_ RegexLexerBase_::Colorize (RegexProc_ _proc)const; template RegexTokens_ RegexLexerBase_::Parse (const ObjectString& code, RegexProc_ _proc, vint codeIndex)const; template RegexLexerWalker_ RegexLexerBase_::Walk ()const; template RegexLexerColorizer_ RegexLexerBase_::Colorize (RegexProc_ _proc)const; template RegexTokens_ RegexLexerBase_::Parse (const ObjectString& code, RegexProc_ _proc, vint codeIndex)const; template RegexLexerWalker_ RegexLexerBase_::Walk ()const; template RegexLexerColorizer_ RegexLexerBase_::Colorize (RegexProc_ _proc)const; template RegexTokens_ RegexLexerBase_::Parse (const ObjectString& code, RegexProc_ _proc, vint codeIndex)const; template RegexLexerWalker_ RegexLexerBase_::Walk ()const; template RegexLexerColorizer_ RegexLexerBase_::Colorize (RegexProc_ _proc)const; template class RegexLexer_; template class RegexLexer_; template class RegexLexer_; template class RegexLexer_; } } /*********************************************************************** .\REGEXPURE.CPP ***********************************************************************/ /*********************************************************************** Author: Zihan Chen (vczh) Licensed under https://github.com/vczh-libraries/License ***********************************************************************/ namespace vl { namespace regex_internal { using namespace collections; /*********************************************************************** Read ***********************************************************************/ void ReadInt(stream::IStream& inputStream, vint& value) { #ifdef VCZH_64 vint32_t x = 0; CHECK_ERROR( inputStream.Read(&x, sizeof(vint32_t)) == sizeof(vint32_t), L"Failed to deserialize RegexLexer." ); value = (vint)x; #else CHECK_ERROR( inputStream.Read(&value, sizeof(vint32_t)) == sizeof(vint32_t), L"Failed to deserialize RegexLexer." ); #endif } void ReadInts(stream::IStream& inputStream, vint count, vint* values) { #ifdef VCZH_64 Array xs(count); CHECK_ERROR( inputStream.Read(&xs[0], sizeof(vint32_t) * count) == sizeof(vint32_t) * count, L"Failed to deserialize RegexLexer." ); for (vint i = 0; i < count; i++) { values[i] = (vint)xs[i]; } #else CHECK_ERROR( inputStream.Read(values, sizeof(vint32_t) * count) == sizeof(vint32_t) * count, L"Failed to deserialize RegexLexer." ); #endif } void ReadBools(stream::IStream& inputStream, vint count, bool* values) { Array bits((count + 7) / 8); CHECK_ERROR( inputStream.Read(&bits[0], sizeof(vuint8_t) * bits.Count()) == sizeof(vuint8_t) * bits.Count(), L"Failed to deserialize RegexLexer." ); for (vint i = 0; i < count; i++) { vint x = i / 8; vint y = i % 8; values[i] = ((bits[x] >> y) & 1) == 1; } } /*********************************************************************** Write ***********************************************************************/ void WriteInt(stream::IStream& outputStream, vint value) { #ifdef VCZH_64 vint32_t x = (vint32_t)value; CHECK_ERROR( outputStream.Write(&x, sizeof(vint32_t)) == sizeof(vint32_t), L"Failed to serialize RegexLexer." ); #else CHECK_ERROR( outputStream.Write(&value, sizeof(vint32_t)) == sizeof(vint32_t), L"Failed to serialize RegexLexer." ); #endif } void WriteInts(stream::IStream& outputStream, vint count, vint* values) { #ifdef VCZH_64 Array xs(count); for (vint i = 0; i < count; i++) { xs[i] = (vint32_t)values[i]; } CHECK_ERROR( outputStream.Write(&xs[0], sizeof(vint32_t) * count) == sizeof(vint32_t) * count, L"Failed to serialize RegexLexer." ); #else CHECK_ERROR( outputStream.Write(values, sizeof(vint32_t) * count) == sizeof(vint32_t) * count, L"Failed to serialize RegexLexer." ); #endif } void WriteBools(stream::IStream& outputStream, vint count, bool* values) { Array bits((count + 7) / 8); memset(&bits[0], 0, sizeof(vuint8_t) * bits.Count()); for (vint i = 0; i < count; i++) { if (values[i]) { vint x = i / 8; vint y = i % 8; bits[x] |= (vuint8_t)1 << y; } } CHECK_ERROR( outputStream.Write(&bits[0], sizeof(vuint8_t) * bits.Count()) == sizeof(vuint8_t) * bits.Count(), L"Failed to serialize RegexLexer." ); } /*********************************************************************** PureInterpretor (Serialization) ***********************************************************************/ PureInterpretor::PureInterpretor(stream::IStream& inputStream) { ReadInt(inputStream, stateCount); ReadInt(inputStream, charSetCount); ReadInt(inputStream, startState); { vint count = 0; ReadInt(inputStream, count); charRanges.Resize(count); if (count > 0) { vint size = charRanges.Count() * sizeof(CharRange); CHECK_ERROR(inputStream.Read(&charRanges[0], size) == size, L"Failed to serialize RegexLexer."); } ExpandCharRanges(); } transitions = new vint[stateCount * charSetCount]; ReadInts(inputStream, stateCount * charSetCount, transitions); finalState = new bool[stateCount]; ReadBools(inputStream, stateCount, finalState); } void PureInterpretor::Serialize(stream::IStream& outputStream) { WriteInt(outputStream, stateCount); WriteInt(outputStream, charSetCount); WriteInt(outputStream, startState); { WriteInt(outputStream, charRanges.Count()); if (charRanges.Count() > 0) { vint size = charRanges.Count() * sizeof(CharRange); CHECK_ERROR(outputStream.Write(&charRanges[0], size) == size, L"Failed to serialize RegexLexer."); } } WriteInts(outputStream, stateCount * charSetCount, transitions); WriteBools(outputStream, stateCount, finalState); } /*********************************************************************** PureInterpretor ***********************************************************************/ void PureInterpretor::ExpandCharRanges() { for (vint i = 0; i < SupportedCharCount; i++) { charMap[i] = charSetCount - 1; } // TODO: (enumerable) foreach for (vint i = 0; i < charRanges.Count(); i++) { CharRange range = charRanges[i]; for (char32_t j = range.begin; j <= range.end; j++) { if (j > MaxChar32) break; charMap[j] = i; } } } PureInterpretor::PureInterpretor(Ptr dfa, CharRange::List& subsets) { stateCount = dfa->states.Count(); charSetCount = subsets.Count() + 1; startState = dfa->states.IndexOf(dfa->startState); // Map char to input index (equivalent char class) CopyFrom(charRanges, subsets); ExpandCharRanges(); // Create transitions from DFA, using input index to represent input char transitions = new vint[stateCount * charSetCount]; for (vint i = 0; i < stateCount; i++) { for (vint j = 0; j < charSetCount; j++) { transitions[i * charSetCount + j] = -1; } State* state = dfa->states[i].Obj(); // TODO: (enumerable) foreach for (vint j = 0; j < state->transitions.Count(); j++) { Transition* dfaTransition = state->transitions[j]; switch (dfaTransition->type) { case Transition::Chars: { vint index = subsets.IndexOf(dfaTransition->range); if (index == -1) { CHECK_ERROR(false, L"PureInterpretor::PureInterpretor(Ptr, CharRange::List&)#Specified chars don't appear in the normalized char ranges."); } transitions[i * charSetCount + index] = dfa->states.IndexOf(dfaTransition->target); } break; default: CHECK_ERROR(false, L"PureInterpretor::PureInterpretor(Ptr, CharRange::List&)#PureInterpretor only accepts Transition::Chars transitions."); } } } // Mark final states finalState = new bool[stateCount]; for (vint i = 0; i < stateCount; i++) { finalState[i] = dfa->states[i]->finalState; } } PureInterpretor::~PureInterpretor() { if (relatedFinalState) delete[] relatedFinalState; delete[] finalState; delete[] transitions; } template bool PureInterpretor::MatchHead(const TChar* input, const TChar* start, PureResult& result) { CharReader reader(input); vint currentState = startState; vint terminateState = -1; vint terminateLength = -1; result.start = input - start; result.length = -1; result.finalState = -1; result.terminateState = -1; while (currentState != -1) { auto c = reader.Read(); terminateState = currentState; terminateLength = reader.Index(); if (finalState[currentState]) { result.length = terminateLength; result.finalState = currentState; } if (!c) break; if (c >= SupportedCharCount) break; vint charIndex = charMap[c]; currentState = transitions[currentState * charSetCount + charIndex]; } if (result.finalState == -1) { if (terminateLength > 0) { result.terminateState = terminateState; } result.length = terminateLength; return false; } else { return true; } } template bool PureInterpretor::Match(const TChar* input, const TChar* start, PureResult& result) { CharReader reader(input); while (reader.Read()) { if (MatchHead(reader.Reading(), start, result)) { return true; } } return false; } vint PureInterpretor::GetStartState() { return startState; } vint PureInterpretor::Transit(char32_t input, vint state) { if (0 <= state && state < stateCount && 0 <= input && input <= MaxChar32) { vint charIndex = charMap[input]; vint nextState = transitions[state * charSetCount + charIndex]; return nextState; } else { return -1; } } bool PureInterpretor::IsFinalState(vint state) { return 0 <= state && state < stateCount&& finalState[state]; } bool PureInterpretor::IsDeadState(vint state) { if (state == -1) return true; for (vint i = 0; i < charSetCount; i++) { if (transitions[state * charSetCount + i] != -1) { return false; } } return true; } void PureInterpretor::PrepareForRelatedFinalStateTable() { if (!relatedFinalState) { relatedFinalState = new vint[stateCount]; for (vint i = 0; i < stateCount; i++) { relatedFinalState[i] = finalState[i] ? i : -1; } while (true) { vint modifyCount = 0; for (vint i = 0; i < stateCount; i++) { if (relatedFinalState[i] == -1) { vint state = -1; for (vint j = 0; j < charSetCount; j++) { vint nextState = transitions[i * charSetCount + j]; if (nextState != -1) { state = relatedFinalState[nextState]; if (state != -1) { break; } } } if (state != -1) { relatedFinalState[i] = state; modifyCount++; } } } if (modifyCount == 0) { break; } } } } vint PureInterpretor::GetRelatedFinalState(vint state) { return relatedFinalState ? relatedFinalState[state] : -1; } template bool PureInterpretor::MatchHead(const wchar_t* input, const wchar_t* start, PureResult& result); template bool PureInterpretor::MatchHead(const char8_t* input, const char8_t* start, PureResult& result); template bool PureInterpretor::MatchHead(const char16_t* input, const char16_t* start, PureResult& result); template bool PureInterpretor::MatchHead(const char32_t* input, const char32_t* start, PureResult& result); template bool PureInterpretor::Match(const wchar_t* input, const wchar_t* start, PureResult& result); template bool PureInterpretor::Match(const char8_t* input, const char8_t* start, PureResult& result); template bool PureInterpretor::Match(const char16_t* input, const char16_t* start, PureResult& result); template bool PureInterpretor::Match(const char32_t* input, const char32_t* start, PureResult& result); } } /*********************************************************************** .\REGEXRICH.CPP ***********************************************************************/ /*********************************************************************** Author: Zihan Chen (vczh) Licensed under https://github.com/vczh-libraries/License ***********************************************************************/ namespace vl { namespace regex_internal { /*********************************************************************** Data Structures for Backtracking ***********************************************************************/ enum class StateStoreType { Positive, Negative, Other }; template class StateSaver { public: CharReader reader; // Current reading position char32_t ch; // Current character State* currentState; // Current state vint minTransition = 0; // The first transition to backtrack vint captureCount = 0; // Available capture count (the list size may larger than this) vint stateSaverCount = 0; // Available saver count (the list size may larger than this) vint extensionSaverAvailable = -1; // Available extension saver count (the list size may larger than this) vint extensionSaverCount = 0; // Available extension saver count (during executing) StateStoreType storeType = StateStoreType::Other; // Reason to keep this record StateSaver(const TChar* input, State* _currentState) : reader(input) , currentState(_currentState) { ch = reader.Read(); } StateSaver(const StateSaver&) = default; StateSaver& operator=(const StateSaver&) = default; void RestoreReaderTo(StateSaver& saver) { saver.reader = reader; saver.ch = ch; } }; template class ExtensionSaver { public: CharReader reader; // The reading position char32_t ch; // Current character vint previous; // Previous extension saver index vint captureListIndex; // Where to write the captured text Transition* transition; // The extension begin transition (Capture, Positive, Negative) ExtensionSaver(const StateSaver& saver) : reader(saver.reader) , ch(saver.ch) { } ExtensionSaver(const ExtensionSaver&) = default; ExtensionSaver& operator=(const ExtensionSaver&) = default; void RestoreReaderTo(StateSaver& saver) { saver.reader = reader; saver.ch = ch; } }; } namespace regex_internal { using namespace collections; template void Push(List>& elements, vint& available, vint& count, const ExtensionSaver& element) { if (elements.Count() == count) { elements.Add(element); } else { elements[count] = element; } auto& current = elements[count]; current.previous = available; available = count++; } template ExtensionSaver Pop(List>& elements, vint& available, vint& count) { auto& current = elements[available]; available = current.previous; return current; } template void PushNonSaver(List& elements, vint& count, const T& element) { if (elements.Count() == count) { elements.Add(element); } else { elements[count] = element; } count++; } template T PopNonSaver(List& elements, vint& count) { return elements[--count]; } } namespace regex_internal { /*********************************************************************** RichInterpretor ***********************************************************************/ RichInterpretor::RichInterpretor(Ptr _dfa) :dfa(_dfa) { datas = new UserData[dfa->states.Count()]; // TODO: (enumerable) foreach for (vint i = 0; i < dfa->states.Count(); i++) { State* state = dfa->states[i].Obj(); vint charEdges = 0; vint nonCharEdges = 0; bool mustSave = false; // TODO: (enumerable) foreach for (vint j = 0; j < state->transitions.Count(); j++) { if (state->transitions[j]->type == Transition::Chars) { charEdges++; } else { if (state->transitions[j]->type == Transition::Negative || state->transitions[j]->type == Transition::Positive) { mustSave = true; } nonCharEdges++; } } datas[i].NeedKeepState = mustSave || nonCharEdges > 1 || (nonCharEdges != 0 && charEdges != 0); state->userData = &datas[i]; } } RichInterpretor::~RichInterpretor() { delete[] datas; } template bool RichInterpretor::MatchHead(const TChar* input, const TChar* start, RichResult& result) { List> stateSavers; List> extensionSavers; StateSaver currentState(input, dfa->startState); while (!currentState.currentState->finalState) { bool found = false; // true means at least one transition matches the input StateSaver oldState = currentState; // Iterate through all transitions from the current state // TODO: (enumerable) foreach:reversed for (vint i = currentState.minTransition; i < currentState.currentState->transitions.Count(); i++) { Transition* transition = currentState.currentState->transitions[i]; switch (transition->type) { case Transition::Chars: { // match the input if the current character fall into the range CharRange range = transition->range; found = range.begin <= currentState.ch && range.end >= currentState.ch; if (found) { currentState.ch = currentState.reader.Read(); } } break; case Transition::BeginString: { // match the input if this is the first character, and it is not consumed found = currentState.reader.Index() == 0 && input == start; } break; case Transition::EndString: { // match the input if this is after the last character, and it is not consumed found = currentState.ch == 0; } break; case Transition::Nop: { // match without any condition found = true; } break; case Transition::Capture: { // Push the capture information ExtensionSaver saver(currentState); saver.captureListIndex = currentState.captureCount; saver.transition = transition; Push(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount, saver); // Push the capture record, and it will be written if the input matches the regex CaptureRecord capture; capture.capture = transition->capture; capture.start = currentState.reader.Index() + (input - start); capture.length = -1; PushNonSaver(result.captures, currentState.captureCount, capture); found = true; } break; case Transition::Match: { vint index = 0; for (vint j = 0; j < currentState.captureCount; j++) { CaptureRecord& capture = result.captures[j]; // If the capture name matched if (capture.capture == transition->capture) { // If the capture index matched, or it is -1 if (capture.length != -1 && (transition->index == -1 || transition->index == index)) { // If the captured text matched if (memcmp(start + capture.start, input + currentState.reader.Index(), sizeof(TChar) * capture.length) == 0) { // Consume so much input vint targetIndex = currentState.reader.Index() + capture.length; while (currentState.reader.Index() < targetIndex) { currentState.ch = currentState.reader.Read(); } CHECK_ERROR(currentState.reader.Index() == targetIndex, L"vl::regex_internal::RichInterpretor::MatchHead(const TChar*, const TChar*, RichResult&)#Input code could be an incorrect unicode sequence."); found = true; break; } } // Fail if f the captured text with the specified name and index doesn't match if (transition->index != -1 && index == transition->index) { break; } else { index++; } } } } break; case Transition::Positive: { // Push the positive lookahead information ExtensionSaver saver(currentState); saver.captureListIndex = -1; saver.transition = transition; Push(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount, saver); // Set found = true so that PushNonSaver(oldState) happens later oldState.storeType = StateStoreType::Positive; found = true; } break; case Transition::Negative: { // Push the positive lookahead information ExtensionSaver saver(currentState); saver.captureListIndex = -1; saver.transition = transition; Push(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount, saver); // Set found = true so that PushNonSaver(oldState) happens later oldState.storeType = StateStoreType::Negative; found = true; } break; case Transition::NegativeFail: { // NegativeFail will be used when the nagative lookahead failed } break; case Transition::End: { // Find the corresponding extension saver so that we can know how to deal with a matched sub regex that ends here ExtensionSaver extensionSaver = Pop(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount); switch (extensionSaver.transition->type) { case Transition::Capture: { // Write the captured text CaptureRecord& capture = result.captures[extensionSaver.captureListIndex]; capture.length = currentState.reader.Index() + (input - start) - capture.start; found = true; } break; case Transition::Positive: // Find the last positive lookahead state saver for (vint j = currentState.stateSaverCount - 1; j >= 0; j--) { auto& stateSaver = stateSavers[j]; if (stateSaver.storeType == StateStoreType::Positive) { // restore the parsing state just before matching the positive lookahead, since positive lookahead doesn't consume input stateSaver.RestoreReaderTo(oldState); oldState.stateSaverCount = j; stateSaver.RestoreReaderTo(currentState); currentState.stateSaverCount = j; break; } } found = true; break; case Transition::Negative: // Find the last negative lookahead state saver for (vint j = currentState.stateSaverCount - 1; j >= 0; j--) { auto& stateSaver = stateSavers[j]; if (stateSaver.storeType == StateStoreType::Negative) { // restore the parsing state just before matching the negative lookahead, since positive lookahead doesn't consume input oldState = stateSaver; oldState.storeType = StateStoreType::Other; currentState = stateSaver; currentState.storeType = StateStoreType::Other; i = currentState.minTransition - 1; break; } } break; default:; } } break; default:; } // Save the parsing state when necessary if (found) { UserData* data = (UserData*)currentState.currentState->userData; if (data->NeedKeepState) { oldState.minTransition = i + 1; PushNonSaver(stateSavers, currentState.stateSaverCount, oldState); } currentState.currentState = transition->target; currentState.minTransition = 0; break; } } // If no transition from the current state can be used if (!found) { // If there is a chance to do backtracking if (currentState.stateSaverCount) { currentState = PopNonSaver(stateSavers, currentState.stateSaverCount); // minTransition - 1 is always valid since the value is stored with adding 1 // So minTransition - 1 record the transition, which is the reason the parsing state is saved if (currentState.currentState->transitions[currentState.minTransition - 1]->type == Transition::Negative) { // Find the next NegativeFail transition // Because when a negative lookahead regex failed to match, it is actually succeeded // Since a negative lookahead means we don't want to match this regex // TODO: (enumerable) foreach:reversed for (vint i = 0; i < currentState.currentState->transitions.Count(); i++) { Transition* transition = currentState.currentState->transitions[i]; if (transition->type == Transition::NegativeFail) { // Restore the state to the target of NegativeFail to let the parsing continue currentState.currentState = transition->target; currentState.minTransition = 0; currentState.storeType = StateStoreType::Other; break; } } } } else { break; } } } if (currentState.currentState->finalState) { // Keep available captures if succeeded result.start = input - start; result.length = currentState.reader.Index(); for (vint i = result.captures.Count() - 1; i >= currentState.captureCount; i--) { result.captures.RemoveAt(i); } return true; } else { // Clear captures if failed result.captures.Clear(); return false; } } template bool RichInterpretor::Match(const TChar* input, const TChar* start, RichResult& result) { CharReader reader(input); while (reader.Read()) { if (MatchHead(reader.Reading(), start, result)) { return true; } } return false; } const List& RichInterpretor::CaptureNames() { return dfa->captureNames; } template bool RichInterpretor::MatchHead(const wchar_t* input, const wchar_t* start, RichResult& result); template bool RichInterpretor::MatchHead(const char8_t* input, const char8_t* start, RichResult& result); template bool RichInterpretor::MatchHead(const char16_t* input, const char16_t* start, RichResult& result); template bool RichInterpretor::MatchHead(const char32_t* input, const char32_t* start, RichResult& result); template bool RichInterpretor::Match(const wchar_t* input, const wchar_t* start, RichResult& result); template bool RichInterpretor::Match(const char8_t* input, const char8_t* start, RichResult& result); template bool RichInterpretor::Match(const char16_t* input, const char16_t* start, RichResult& result); template bool RichInterpretor::Match(const char32_t* input, const char32_t* start, RichResult& result); } } /*********************************************************************** .\AST\REGEXEXPRESSION.CPP ***********************************************************************/ /*********************************************************************** Author: Zihan Chen (vczh) Licensed under https://github.com/vczh-libraries/License ***********************************************************************/ namespace vl { namespace regex_internal { /*********************************************************************** MergeAlgorithm ***********************************************************************/ class MergeParameter { public: Expression::Map definitions; RegexExpression* regex = nullptr; }; class MergeAlgorithm : public RegexExpressionAlgorithm, MergeParameter*> { public: Ptr Apply(CharSetExpression* expression, MergeParameter* target) override { auto result = Ptr(new CharSetExpression); CopyFrom(result->ranges, expression->ranges); result->reverse = expression->reverse; return result; } Ptr Apply(LoopExpression* expression, MergeParameter* target) override { auto result = Ptr(new LoopExpression); result->max = expression->max; result->min = expression->min; result->preferLong = expression->preferLong; result->expression = Invoke(expression->expression, target); return result; } Ptr Apply(SequenceExpression* expression, MergeParameter* target) override { auto result = Ptr(new SequenceExpression); result->left = Invoke(expression->left, target); result->right = Invoke(expression->right, target); return result; } Ptr Apply(AlternateExpression* expression, MergeParameter* target) override { auto result = Ptr(new AlternateExpression); result->left = Invoke(expression->left, target); result->right = Invoke(expression->right, target); return result; } Ptr Apply(BeginExpression* expression, MergeParameter* target) override { return Ptr(new BeginExpression); } Ptr Apply(EndExpression* expression, MergeParameter* target) override { return Ptr(new EndExpression); } Ptr Apply(CaptureExpression* expression, MergeParameter* target) override { auto result = Ptr(new CaptureExpression); result->expression = Invoke(expression->expression, target); result->name = expression->name; return result; } Ptr Apply(MatchExpression* expression, MergeParameter* target) override { auto result = Ptr(new MatchExpression); result->name = expression->name; result->index = expression->index; return result; } Ptr Apply(PositiveExpression* expression, MergeParameter* target) override { auto result = Ptr(new PositiveExpression); result->expression = Invoke(expression->expression, target); return result; } Ptr Apply(NegativeExpression* expression, MergeParameter* target) override { auto result = Ptr(new NegativeExpression); result->expression = Invoke(expression->expression, target); return result; } Ptr Apply(UsingExpression* expression, MergeParameter* target) override { if (target->definitions.Keys().Contains(expression->name)) { Ptr reference = target->definitions[expression->name]; if (reference) { return reference; } else { throw ArgumentException(L"Regular expression syntax error: Found reference loops in\"" + u32tow(expression->name) + L"\".", L"vl::regex_internal::RegexExpression::Merge", L""); } } else if (target->regex->definitions.Keys().Contains(expression->name)) { target->definitions.Add(expression->name, nullptr); Ptr result = Invoke(target->regex->definitions[expression->name], target); target->definitions.Set(expression->name, result); return result; } else { throw ArgumentException(L"Regular expression syntax error: Cannot find sub expression reference\"" + u32tow(expression->name) + L"\".", L"vl::regex_internal::RegexExpression::Merge", L""); } } }; /*********************************************************************** CharSetExpression ***********************************************************************/ bool CharSetExpression::AddRangeWithConflict(CharRange range) { if (range.begin > range.end) { char32_t t = range.begin; range.begin = range.end; range.end = t; } // TODO: (enumerable) foreach for (vint i = 0; i < ranges.Count(); i++) { if (!(rangeranges[i])) { return false; } } ranges.Add(range); return true; } /*********************************************************************** RegexExpression ***********************************************************************/ Ptr RegexExpression::Merge() { MergeParameter merge; merge.regex = this; return MergeAlgorithm().Invoke(expression, &merge); } /*********************************************************************** Expression::Apply ***********************************************************************/ void CharSetExpression::Apply(IRegexExpressionAlgorithm& algorithm) { algorithm.Visit(this); } void LoopExpression::Apply(IRegexExpressionAlgorithm& algorithm) { algorithm.Visit(this); } void SequenceExpression::Apply(IRegexExpressionAlgorithm& algorithm) { algorithm.Visit(this); } void AlternateExpression::Apply(IRegexExpressionAlgorithm& algorithm) { algorithm.Visit(this); } void BeginExpression::Apply(IRegexExpressionAlgorithm& algorithm) { algorithm.Visit(this); } void EndExpression::Apply(IRegexExpressionAlgorithm& algorithm) { algorithm.Visit(this); } void CaptureExpression::Apply(IRegexExpressionAlgorithm& algorithm) { algorithm.Visit(this); } void MatchExpression::Apply(IRegexExpressionAlgorithm& algorithm) { algorithm.Visit(this); } void PositiveExpression::Apply(IRegexExpressionAlgorithm& algorithm) { algorithm.Visit(this); } void NegativeExpression::Apply(IRegexExpressionAlgorithm& algorithm) { algorithm.Visit(this); } void UsingExpression::Apply(IRegexExpressionAlgorithm& algorithm) { algorithm.Visit(this); } } } /*********************************************************************** .\AST\REGEXEXPRESSION_CANTREATASPURE.CPP ***********************************************************************/ /*********************************************************************** Author: Zihan Chen (vczh) Licensed under https://github.com/vczh-libraries/License ***********************************************************************/ namespace vl { namespace regex_internal { /*********************************************************************** CanTreatAsPureAlgorithm ***********************************************************************/ class CanTreatAsPureAlgorithm : public RegexExpressionAlgorithm { public: bool Apply(CharSetExpression* expression, void* target) override { return true; } bool Apply(LoopExpression* expression, void* target) override { return expression->preferLong && Invoke(expression->expression, 0); } bool Apply(SequenceExpression* expression, void* target) override { return Invoke(expression->left, 0) && Invoke(expression->right, 0); } bool Apply(AlternateExpression* expression, void* target) override { return Invoke(expression->left, 0) && Invoke(expression->right, 0); } bool Apply(BeginExpression* expression, void* target) override { return false; } bool Apply(EndExpression* expression, void* target) override { return false; } bool Apply(CaptureExpression* expression, void* target) override { return Invoke(expression->expression, 0); } bool Apply(MatchExpression* expression, void* target) override { return false; } bool Apply(PositiveExpression* expression, void* target) override { return false; } bool Apply(NegativeExpression* expression, void* target) override { return false; } bool Apply(UsingExpression* expression, void* target) override { return false; } }; /*********************************************************************** Expression ***********************************************************************/ bool Expression::CanTreatAsPure() { return CanTreatAsPureAlgorithm().Invoke(this, 0); } } } /*********************************************************************** .\AST\REGEXEXPRESSION_CHARSET.CPP ***********************************************************************/ /*********************************************************************** Author: Zihan Chen (vczh) Licensed under https://github.com/vczh-libraries/License ***********************************************************************/ namespace vl { namespace regex_internal { class NormalizedCharSet { public: CharRange::List ranges; }; /*********************************************************************** CharSetAlgorithm ***********************************************************************/ class CharSetAlgorithm : public RegexExpressionAlgorithm { public: virtual void Process(CharSetExpression* expression, NormalizedCharSet* target, CharRange range) = 0; void Loop(CharSetExpression* expression, CharRange::List& ranges, NormalizedCharSet* target) { if (expression->reverse) { char32_t begin = 1; // TODO: (enumerable) foreach for (vint i = 0; i < ranges.Count(); i++) { CharRange range = ranges[i]; if (range.begin > begin) { Process(expression, target, CharRange(begin, range.begin - 1)); } begin = range.end + 1; } if (begin <= MaxChar32) { Process(expression, target, CharRange(begin, MaxChar32)); } } else { // TODO: (enumerable) foreach for (vint i = 0; i < ranges.Count(); i++) { Process(expression, target, ranges[i]); } } } void Apply(LoopExpression* expression, NormalizedCharSet* target) override { Invoke(expression->expression, target); } void Apply(SequenceExpression* expression, NormalizedCharSet* target) override { Invoke(expression->left, target); Invoke(expression->right, target); } void Apply(AlternateExpression* expression, NormalizedCharSet* target) override { Invoke(expression->left, target); Invoke(expression->right, target); } void Apply(BeginExpression* expression, NormalizedCharSet* target) override { } void Apply(EndExpression* expression, NormalizedCharSet* target) override { } void Apply(CaptureExpression* expression, NormalizedCharSet* target) override { Invoke(expression->expression, target); } void Apply(MatchExpression* expression, NormalizedCharSet* target) override { } void Apply(PositiveExpression* expression, NormalizedCharSet* target) override { Invoke(expression->expression, target); } void Apply(NegativeExpression* expression, NormalizedCharSet* target) override { Invoke(expression->expression, target); } void Apply(UsingExpression* expression, NormalizedCharSet* target) override { } }; /*********************************************************************** BuildNormalizedCharSetAlgorithm ***********************************************************************/ class BuildNormalizedCharSetAlgorithm : public CharSetAlgorithm { public: void Process(CharSetExpression* expression, NormalizedCharSet* target, CharRange range) { vint index = 0; while (index < target->ranges.Count()) { CharRange current = target->ranges[index]; if (currentrange) { index++; } else if (current.begin < range.begin) { // range : [ ? // current : [ ] target->ranges.RemoveAt(index); target->ranges.Add(CharRange(current.begin, range.begin - 1)); target->ranges.Add(CharRange(range.begin, current.end)); index++; } else if (current.begin > range.begin) { // range : [ ] // current : [ ? target->ranges.Add(CharRange(range.begin, current.begin - 1)); range.begin = current.begin; } else if (current.end < range.end) { // range : [ ] // current : [ ] range.begin = current.end + 1; index++; } else if (current.end > range.end) { // range : [ ] // current : [ ] target->ranges.RemoveAt(index); target->ranges.Add(range); target->ranges.Add(CharRange(range.end + 1, current.end)); return; } else { // range : [ ] // current : [ ] return; } } target->ranges.Add(range); } void Apply(CharSetExpression* expression, NormalizedCharSet* target) { Loop(expression, expression->ranges, target); } }; /*********************************************************************** SetNormalizedCharSetAlgorithm ***********************************************************************/ class SetNormalizedCharSetAlgorithm : public CharSetAlgorithm { public: void Process(CharSetExpression* expression, NormalizedCharSet* target, CharRange range) { // TODO: (enumerable) foreach for (vint j = 0; j < target->ranges.Count(); j++) { CharRange targetRange = target->ranges[j]; if (range.begin <= targetRange.begin && targetRange.end <= range.end) { expression->ranges.Add(targetRange); } } } void Apply(CharSetExpression* expression, NormalizedCharSet* target) { CharRange::List source; CopyFrom(source, expression->ranges); expression->ranges.Clear(); Loop(expression, source, target); expression->reverse = false; } }; /*********************************************************************** Expression ***********************************************************************/ void Expression::NormalizeCharSet(CharRange::List& subsets) { NormalizedCharSet normalized; BuildNormalizedCharSetAlgorithm().Invoke(this, &normalized); SetNormalizedCharSetAlgorithm().Invoke(this, &normalized); CopyFrom(subsets, normalized.ranges); } void Expression::CollectCharSet(CharRange::List& subsets) { NormalizedCharSet normalized; CopyFrom(normalized.ranges, subsets); BuildNormalizedCharSetAlgorithm().Invoke(this, &normalized); CopyFrom(subsets, normalized.ranges); } void Expression::ApplyCharSet(CharRange::List& subsets) { NormalizedCharSet normalized; CopyFrom(normalized.ranges, subsets); SetNormalizedCharSetAlgorithm().Invoke(this, &normalized); } } } /*********************************************************************** .\AST\REGEXEXPRESSION_GENERATEEPSILONNFA.CPP ***********************************************************************/ /*********************************************************************** Author: Zihan Chen (vczh) Licensed under https://github.com/vczh-libraries/License ***********************************************************************/ namespace vl { namespace regex_internal { /*********************************************************************** EpsilonNfaAlgorithm ***********************************************************************/ class EpsilonNfaInfo { public: Ptr automaton; }; class EpsilonNfa { public: State* start; State* end; EpsilonNfa() { start = 0; end = 0; } }; class EpsilonNfaAlgorithm : public RegexExpressionAlgorithm { public: EpsilonNfa Connect(EpsilonNfa a, EpsilonNfa b, Automaton* target) { if (a.start) { target->NewEpsilon(a.end, b.start); a.end = b.end; return a; } else { return b; } } EpsilonNfa Apply(CharSetExpression* expression, Automaton* target) override { EpsilonNfa nfa; nfa.start = target->NewState(); nfa.end = target->NewState(); // TODO: (enumerable) foreach for (vint i = 0; i < expression->ranges.Count(); i++) { target->NewChars(nfa.start, nfa.end, expression->ranges[i]); } return nfa; } EpsilonNfa Apply(LoopExpression* expression, Automaton* target) override { EpsilonNfa head; for (vint i = 0; i < expression->min; i++) { EpsilonNfa body = Invoke(expression->expression, target); head = Connect(head, body, target); } if (expression->max == -1) { EpsilonNfa body = Invoke(expression->expression, target); if (!head.start) { head.start = head.end = target->NewState(); } State* loopBegin = head.end; State* loopEnd = target->NewState(); if (expression->preferLong) { target->NewEpsilon(loopBegin, body.start); target->NewEpsilon(body.end, loopBegin); target->NewNop(loopBegin, loopEnd); } else { target->NewNop(loopBegin, loopEnd); target->NewEpsilon(loopBegin, body.start); target->NewEpsilon(body.end, loopBegin); } head.end = loopEnd; } else if (expression->max > expression->min) { for (vint i = expression->min; i < expression->max; i++) { EpsilonNfa body = Invoke(expression->expression, target); State* start = target->NewState(); State* end = target->NewState(); if (expression->preferLong) { target->NewEpsilon(start, body.start); target->NewEpsilon(body.end, end); target->NewNop(start, end); } else { target->NewNop(start, end); target->NewEpsilon(start, body.start); target->NewEpsilon(body.end, end); } body.start = start; body.end = end; head = Connect(head, body, target); } } return head; } EpsilonNfa Apply(SequenceExpression* expression, Automaton* target) override { EpsilonNfa a = Invoke(expression->left, target); EpsilonNfa b = Invoke(expression->right, target); return Connect(a, b, target); } EpsilonNfa Apply(AlternateExpression* expression, Automaton* target) override { EpsilonNfa result; result.start = target->NewState(); result.end = target->NewState(); EpsilonNfa a = Invoke(expression->left, target); EpsilonNfa b = Invoke(expression->right, target); target->NewEpsilon(result.start, a.start); target->NewEpsilon(a.end, result.end); target->NewEpsilon(result.start, b.start); target->NewEpsilon(b.end, result.end); return result; } EpsilonNfa Apply(BeginExpression* expression, Automaton* target) override { EpsilonNfa result; result.start = target->NewState(); result.end = target->NewState(); target->NewBeginString(result.start, result.end); return result; } EpsilonNfa Apply(EndExpression* expression, Automaton* target) override { EpsilonNfa result; result.start = target->NewState(); result.end = target->NewState(); target->NewEndString(result.start, result.end); return result; } EpsilonNfa Apply(CaptureExpression* expression, Automaton* target) override { EpsilonNfa result; result.start = target->NewState(); result.end = target->NewState(); vint capture = -1; if (expression->name != U32String::Empty) { capture = target->captureNames.IndexOf(expression->name); if (capture == -1) { capture = target->captureNames.Count(); target->captureNames.Add(expression->name); } } EpsilonNfa body = Invoke(expression->expression, target); target->NewCapture(result.start, body.start, capture); target->NewEnd(body.end, result.end); return result; } EpsilonNfa Apply(MatchExpression* expression, Automaton* target) override { vint capture = -1; if (expression->name != U32String::Empty) { capture = target->captureNames.IndexOf(expression->name); if (capture == -1) { capture = target->captureNames.Count(); target->captureNames.Add(expression->name); } } EpsilonNfa result; result.start = target->NewState(); result.end = target->NewState(); target->NewMatch(result.start, result.end, capture, expression->index); return result; } EpsilonNfa Apply(PositiveExpression* expression, Automaton* target) override { EpsilonNfa result; result.start = target->NewState(); result.end = target->NewState(); EpsilonNfa body = Invoke(expression->expression, target); target->NewPositive(result.start, body.start); target->NewEnd(body.end, result.end); return result; } EpsilonNfa Apply(NegativeExpression* expression, Automaton* target) override { EpsilonNfa result; result.start = target->NewState(); result.end = target->NewState(); EpsilonNfa body = Invoke(expression->expression, target); target->NewNegative(result.start, body.start); target->NewEnd(body.end, result.end); target->NewNegativeFail(result.start, result.end); return result; } EpsilonNfa Apply(UsingExpression* expression, Automaton* target) override { CHECK_FAIL(L"RegexExpression::GenerateEpsilonNfa()#UsingExpression cannot create state machine."); } }; /*********************************************************************** Expression ***********************************************************************/ Ptr Expression::GenerateEpsilonNfa() { auto automaton = Ptr(new Automaton); EpsilonNfa result = EpsilonNfaAlgorithm().Invoke(this, automaton.Obj()); automaton->startState = result.start; result.end->finalState = true; return automaton; } } } /*********************************************************************** .\AST\REGEXEXPRESSION_HASNOEXTENSION.CPP ***********************************************************************/ /*********************************************************************** Author: Zihan Chen (vczh) Licensed under https://github.com/vczh-libraries/License ***********************************************************************/ namespace vl { namespace regex_internal { /*********************************************************************** HasNoExtensionAlgorithm ***********************************************************************/ class HasNoExtensionAlgorithm : public RegexExpressionAlgorithm { public: bool Apply(CharSetExpression* expression, void* target) override { return true; } bool Apply(LoopExpression* expression, void* target) override { return expression->preferLong && Invoke(expression->expression, 0); } bool Apply(SequenceExpression* expression, void* target) override { return Invoke(expression->left, 0) && Invoke(expression->right, 0); } bool Apply(AlternateExpression* expression, void* target) override { return Invoke(expression->left, 0) && Invoke(expression->right, 0); } bool Apply(BeginExpression* expression, void* target) override { return false; } bool Apply(EndExpression* expression, void* target) override { return false; } bool Apply(CaptureExpression* expression, void* target) override { return false; } bool Apply(MatchExpression* expression, void* target) override { return false; } bool Apply(PositiveExpression* expression, void* target) override { return false; } bool Apply(NegativeExpression* expression, void* target) override { return false; } bool Apply(UsingExpression* expression, void* target) override { return false; } }; /*********************************************************************** Expression ***********************************************************************/ bool Expression::HasNoExtension() { return HasNoExtensionAlgorithm().Invoke(this, 0); } } } /*********************************************************************** .\AST\REGEXEXPRESSION_ISEQUAL.CPP ***********************************************************************/ /*********************************************************************** Author: Zihan Chen (vczh) Licensed under https://github.com/vczh-libraries/License ***********************************************************************/ namespace vl { namespace regex_internal { /*********************************************************************** IsEqualAlgorithm ***********************************************************************/ class IsEqualAlgorithm : public RegexExpressionAlgorithm { public: bool Apply(CharSetExpression* expression, Expression* target) override { CharSetExpression* expected = dynamic_cast(target); if (expected) { if (expression->reverse != expected->reverse)return false; if (expression->ranges.Count() != expected->ranges.Count())return false; // TODO: (enumerable) foreach:indexed for (vint i = 0; i < expression->ranges.Count(); i++) { if (expression->ranges[i] != expected->ranges[i])return false; } return true; } return false; } bool Apply(LoopExpression* expression, Expression* target) override { LoopExpression* expected = dynamic_cast(target); if (expected) { if (expression->min != expected->min)return false; if (expression->max != expected->max)return false; if (expression->preferLong != expected->preferLong)return false; if (!Invoke(expression->expression, expected->expression.Obj()))return false; return true; } return false; } bool Apply(SequenceExpression* expression, Expression* target) override { SequenceExpression* expected = dynamic_cast(target); if (expected) { if (!Invoke(expression->left, expected->left.Obj()))return false; if (!Invoke(expression->right, expected->right.Obj()))return false; return true; } return false; } bool Apply(AlternateExpression* expression, Expression* target) override { AlternateExpression* expected = dynamic_cast(target); if (expected) { if (!Invoke(expression->left, expected->left.Obj()))return false; if (!Invoke(expression->right, expected->right.Obj()))return false; return true; } return false; } bool Apply(BeginExpression* expression, Expression* target) override { BeginExpression* expected = dynamic_cast(target); if (expected) { return true; } return false; } bool Apply(EndExpression* expression, Expression* target) override { EndExpression* expected = dynamic_cast(target); if (expected) { return true; } return false; } bool Apply(CaptureExpression* expression, Expression* target) override { CaptureExpression* expected = dynamic_cast(target); if (expected) { if (expression->name != expected->name)return false; if (!Invoke(expression->expression, expected->expression.Obj()))return false; return true; } return false; } bool Apply(MatchExpression* expression, Expression* target) override { MatchExpression* expected = dynamic_cast(target); if (expected) { if (expression->name != expected->name)return false; if (expression->index != expected->index)return false; return true; } return false; } bool Apply(PositiveExpression* expression, Expression* target) override { PositiveExpression* expected = dynamic_cast(target); if (expected) { if (!Invoke(expression->expression, expected->expression.Obj()))return false; return true; } return false; } bool Apply(NegativeExpression* expression, Expression* target) override { NegativeExpression* expected = dynamic_cast(target); if (expected) { if (!Invoke(expression->expression, expected->expression.Obj()))return false; return true; } return false; } bool Apply(UsingExpression* expression, Expression* target) override { UsingExpression* expected = dynamic_cast(target); if (expected) { if (expression->name != expected->name)return false; return true; } return false; } }; /*********************************************************************** Expression ***********************************************************************/ bool Expression::IsEqual(vl::regex_internal::Expression* expression) { return IsEqualAlgorithm().Invoke(this, expression); } } } /*********************************************************************** .\AST\REGEXPARSER.CPP ***********************************************************************/ /*********************************************************************** Author: Zihan Chen (vczh) Licensed under https://github.com/vczh-libraries/License ***********************************************************************/ namespace vl { namespace regex_internal { /*********************************************************************** Helper Functions ***********************************************************************/ bool IsChar(const char32_t*& input, char32_t c) { if (*input == c) { input++; return true; } else { return false; } } template bool IsChars(const char32_t*& input, const char32_t(&chars)[Size]) { for (char32_t c : chars) { if (*input == c) { input++; return true; } } return false; } template bool IsStr(const char32_t*& input, const char32_t(&str)[Size]) { for (vint i = 0; i < Size - 1; i++) { if (input[i] != str[i]) return false; } input += Size - 1; return true; } bool IsPositiveInteger(const char32_t*& input, vint& number) { bool readed = false; number = 0; while (U'0' <= *input && *input <= U'9') { number = number * 10 + (*input++) - U'0'; readed = true; } return readed; } bool IsName(const char32_t*& input, U32String& name) { const char32_t* read = input; if ((U'A' <= *read && *read <= U'Z') || (U'a' <= *read && *read <= U'z') || *read == U'_') { read++; while ((U'A' <= *read && *read <= U'Z') || (U'a' <= *read && *read <= U'z') || (U'0' <= *read && *read <= U'9') || *read == U'_') { read++; } } if (input == read) { return false; } else { name = U32String::CopyFrom(input, vint(read - input)); input = read; return true; } } Ptr ParseLoop(const char32_t*& input) { vint min = 0; vint max = 0; if (!*input) { return 0; } else if (IsChar(input, U'+')) { min = 1; max = -1; } else if (IsChar(input, U'*')) { min = 0; max = -1; } else if (IsChar(input, U'?')) { min = 0; max = 1; } else if (IsChar(input, U'{')) { if (IsPositiveInteger(input, min)) { if (IsChar(input, U',')) { if (!IsPositiveInteger(input, max)) { max = -1; } } else { max = min; } if (!IsChar(input, U'}')) { goto THROW_EXCEPTION; } } else { goto THROW_EXCEPTION; } } else { return 0; } { auto expression = Ptr(new LoopExpression); expression->min = min; expression->max = max; expression->preferLong = !IsChar(input, U'?'); return expression; } THROW_EXCEPTION: throw ArgumentException(L"Regular expression syntax error: Illegal loop expression.", L"vl::regex_internal::ParseLoop", L"input"); } Ptr ParseCharSet(const char32_t*& input) { if (!*input) { return 0; } else if (IsChar(input, U'^')) { return Ptr(new BeginExpression); } else if (IsChar(input, U'$')) { return Ptr(new EndExpression); } else if (IsChar(input, U'\\') || IsChar(input, U'/')) { auto expression = Ptr(new CharSetExpression); expression->reverse = false; switch (*input) { case U'.': expression->ranges.Add(CharRange(1, MaxChar32)); break; case U'r': expression->ranges.Add(CharRange(U'\r', U'\r')); break; case U'n': expression->ranges.Add(CharRange(U'\n', U'\n')); break; case U't': expression->ranges.Add(CharRange(U'\t', U'\t')); break; case U'\\':case U'/':case U'(':case U')':case U'+':case U'*':case U'?':case U'|': case U'{':case U'}':case U'[':case U']':case U'<':case U'>': case U'^':case U'$':case U'!':case U'=': expression->ranges.Add(CharRange(*input, *input)); break; case U'S': expression->reverse = true; case U's': expression->ranges.Add(CharRange(U' ', U' ')); expression->ranges.Add(CharRange(U'\r', U'\r')); expression->ranges.Add(CharRange(U'\n', U'\n')); expression->ranges.Add(CharRange(U'\t', U'\t')); break; case U'D': expression->reverse = true; case U'd': expression->ranges.Add(CharRange(U'0', U'9')); break; case U'L': expression->reverse = true; case U'l': expression->ranges.Add(CharRange(U'_', U'_')); expression->ranges.Add(CharRange(U'A', U'Z')); expression->ranges.Add(CharRange(U'a', U'z')); break; case U'W': expression->reverse = true; case U'w': expression->ranges.Add(CharRange(U'_', U'_')); expression->ranges.Add(CharRange(U'0', U'9')); expression->ranges.Add(CharRange(U'A', U'Z')); expression->ranges.Add(CharRange(U'a', U'z')); break; default: throw ArgumentException(L"Regular expression syntax error: Illegal character escaping.", L"vl::regex_internal::ParseCharSet", L"input"); } input++; return expression; } else if (IsChar(input, U'[')) { auto expression = Ptr(new CharSetExpression); if (IsChar(input, U'^')) { expression->reverse = true; } else { expression->reverse = false; } bool midState = false; char32_t a = U'\0'; char32_t b = U'\0'; while (true) { if (IsChar(input, U'\\') || IsChar(input, U'/')) { char32_t c = U'\0'; switch (*input) { case U'r': c = U'\r'; break; case U'n': c = U'\n'; break; case U't': c = U'\t'; break; case U'-':case U'[':case U']':case U'\\':case U'/':case U'^':case U'$': c = *input; break; default: throw ArgumentException(L"Regular expression syntax error: Illegal character escaping, only \"rnt-[]\\/\" are legal escaped characters in [].", L"vl::regex_internal::ParseCharSet", L"input"); } input++; midState ? b = c : a = c; midState = !midState; } else if (IsChars(input, U"-]")) { goto THROW_EXCEPTION; } else if (*input) { midState ? b = *input++ : a = *input++; midState = !midState; } else { goto THROW_EXCEPTION; } if (IsChar(input, U']')) { if (midState) { b = a; } if (!expression->AddRangeWithConflict(CharRange(a, b))) { goto THROW_EXCEPTION; } break; } else if (IsChar(input, U'-')) { if (!midState) { goto THROW_EXCEPTION; } } else { if (midState) { b = a; } if (expression->AddRangeWithConflict(CharRange(a, b))) { midState = false; } else { goto THROW_EXCEPTION; } } } return expression; THROW_EXCEPTION: throw ArgumentException(L"Regular expression syntax error: Illegal character set definition."); } else if (IsChars(input, U"()+*?{}|")) { input--; return 0; } else { auto expression = Ptr(new CharSetExpression); expression->reverse = false; expression->ranges.Add(CharRange(*input, *input)); input++; return expression; } } Ptr ParseFunction(const char32_t*& input) { if (IsStr(input, U"(=")) { Ptr sub = ParseExpression(input); if (!IsChar(input, U')')) { goto NEED_RIGHT_BRACKET; } auto expression = Ptr(new PositiveExpression); expression->expression = sub; return expression; } else if (IsStr(input, U"(!")) { Ptr sub = ParseExpression(input); if (!IsChar(input, U')')) { goto NEED_RIGHT_BRACKET; } auto expression = Ptr(new NegativeExpression); expression->expression = sub; return expression; } else if (IsStr(input, U"(<&")) { U32String name; if (!IsName(input, name)) { goto NEED_NAME; } if (!IsChar(input, U'>')) { goto NEED_GREATER; } if (!IsChar(input, U')')) { goto NEED_RIGHT_BRACKET; } auto expression = Ptr(new UsingExpression); expression->name = name; return expression; } else if (IsStr(input, U"(<$")) { U32String name; vint index = -1; if (IsName(input, name)) { if (IsChar(input, U';')) { if (!IsPositiveInteger(input, index)) { goto NEED_NUMBER; } } } else if (!IsPositiveInteger(input, index)) { goto NEED_NUMBER; } if (!IsChar(input, U'>')) { goto NEED_GREATER; } if (!IsChar(input, U')')) { goto NEED_RIGHT_BRACKET; } auto expression = Ptr(new MatchExpression); expression->name = name; expression->index = index; return expression; } else if (IsStr(input, U"(<")) { U32String name; if (!IsName(input, name)) { goto NEED_NAME; } if (!IsChar(input, U'>')) { goto NEED_GREATER; } auto sub = ParseExpression(input); if (!IsChar(input, U')')) { goto NEED_RIGHT_BRACKET; } auto expression = Ptr(new CaptureExpression); expression->name = name; expression->expression = sub; return expression; } else if (IsStr(input, U"(?")) { auto sub = ParseExpression(input); if (!IsChar(input, U')')) { goto NEED_RIGHT_BRACKET; } auto expression = Ptr(new CaptureExpression); expression->expression = sub; return expression; } else if (IsChar(input, U'(')) { auto sub = ParseExpression(input); if (!IsChar(input, U')')) { goto NEED_RIGHT_BRACKET; } return sub; } else { return 0; } NEED_RIGHT_BRACKET: throw ArgumentException(L"Regular expression syntax error: \")\" expected.", L"vl::regex_internal::ParseFunction", L"input"); NEED_GREATER: throw ArgumentException(L"Regular expression syntax error: \">\" expected.", L"vl::regex_internal::ParseFunction", L"input"); NEED_NAME: throw ArgumentException(L"Regular expression syntax error: Identifier expected.", L"vl::regex_internal::ParseFunction", L"input"); NEED_NUMBER: throw ArgumentException(L"Regular expression syntax error: Number expected.", L"vl::regex_internal::ParseFunction", L"input"); } Ptr ParseUnit(const char32_t*& input) { Ptr unit = ParseCharSet(input); if (!unit) { unit = ParseFunction(input); } if (!unit) { return 0; } Ptr loop; while ((loop = ParseLoop(input))) { loop->expression = unit; unit = loop; } return unit; } Ptr ParseJoin(const char32_t*& input) { auto expression = ParseUnit(input); while (true) { auto right = ParseUnit(input); if (right) { auto sequence = Ptr(new SequenceExpression); sequence->left = expression; sequence->right = right; expression = sequence; } else { break; } } return expression; } Ptr ParseAlt(const char32_t*& input) { auto expression = ParseJoin(input); while (true) { if (IsChar(input, U'|')) { auto right = ParseJoin(input); if (right) { auto alternate = Ptr(new AlternateExpression); alternate->left = expression; alternate->right = right; expression = alternate; } else { throw ArgumentException(L"Regular expression syntax error: Expression expected.", L"vl::regex_internal::ParseAlt", L"input"); } } else { break; } } return expression; } Ptr ParseExpression(const char32_t*& input) { return ParseAlt(input); } Ptr ParseRegexExpression(const U32String& code) { auto regex = Ptr(new RegexExpression); const char32_t* start = code.Buffer(); const char32_t* input = start; try { while (IsStr(input, U"(<#")) { U32String name; if (!IsName(input, name)) { throw ArgumentException(L"Regular expression syntax error: Identifier expected.", L"vl::regex_internal::ParseRegexExpression", L"code"); } if (!IsChar(input, U'>')) { throw ArgumentException(L"Regular expression syntax error: \">\" expected.", L"vl::regex_internal::ParseFunction", L"input"); } Ptr sub = ParseExpression(input); if (!IsChar(input, U')')) { throw ArgumentException(L"Regular expression syntax error: \")\" expected.", L"vl::regex_internal::ParseFunction", L"input"); } if (regex->definitions.Keys().Contains(name)) { throw ArgumentException(L"Regular expression syntax error: Found duplicated sub expression name: \"" + u32tow(name) + L"\". ", L"vl::regex_internal::ParseFunction", L"input"); } else { regex->definitions.Add(name, sub); } } regex->expression = ParseExpression(input); if (!regex->expression) { throw ArgumentException(L"Regular expression syntax error: Expression expected.", L"vl::regex_internal::ParseUnit", L"input"); } if (*input) { throw ArgumentException(L"Regular expression syntax error: Found unnecessary tokens.", L"vl::regex_internal::ParseUnit", L"input"); } return regex; } catch (const ArgumentException& e) { throw RegexException(e.Message(), code, input - start); } } U32String EscapeTextForRegex(const U32String& literalString) { U32String result; for (vint i = 0; i < literalString.Length(); i++) { char32_t c = literalString[i]; switch (c) { case U'\\':case U'/':case U'(':case U')':case U'+':case U'*':case U'?':case U'|': case U'{':case U'}':case U'[':case U']':case U'<':case U'>': case U'^':case U'$':case U'!':case U'=': result += U32String(U"\\") + U32String::FromChar(c); break; case U'\r': result += U"\\r"; break; case U'\n': result += U"\\n"; break; case U'\t': result += U"\\t"; break; default: result += U32String::FromChar(c); } } return result; } U32String UnescapeTextForRegex(const U32String& escapedText) { U32String result; for (vint i = 0; i < escapedText.Length(); i++) { char32_t c = escapedText[i]; if (c == U'\\' || c == U'/') { if (i < escapedText.Length() - 1) { i++; c = escapedText[i]; switch (c) { case U'r': result += U"\r"; break; case U'n': result += U"\n"; break; case U't': result += U"\t"; break; default: result += U32String::FromChar(c); } continue; } } result += U32String::FromChar(c); } return result; } U32String NormalizeEscapedTextForRegex(const U32String& escapedText) { U32String result; for (vint i = 0; i < escapedText.Length(); i++) { char32_t c = escapedText[i]; if (c == U'\\' || c == U'/') { if (i < escapedText.Length() - 1) { i++; c = escapedText[i]; result += U32String(U"\\") + U32String::FromChar(c); continue; } } result += U32String::FromChar(c); } return result; } bool IsRegexEscapedLiteralString(const U32String& regex) { for (vint i = 0; i < regex.Length(); i++) { char32_t c = regex[i]; if (c == U'\\' || c == U'/') { i++; } else { switch (c) { case U'\\':case U'/':case U'(':case U')':case U'+':case U'*':case U'?':case U'|': case U'{':case U'}':case U'[':case U']':case U'<':case U'>': case U'^':case U'$':case U'!':case U'=': return false; } } } return true; } } } /*********************************************************************** .\AST\REGEXWRITER.CPP ***********************************************************************/ /*********************************************************************** Author: Zihan Chen (vczh) Licensed under https://github.com/vczh-libraries/License ***********************************************************************/ namespace vl { namespace regex { using namespace vl::regex_internal; /*********************************************************************** RegexNode ***********************************************************************/ RegexNode::RegexNode(Ptr _expression) :expression(_expression) { } RegexNode RegexNode::Some()const { return Loop(1, -1); } RegexNode RegexNode::Any()const { return Loop(0, -1); } RegexNode RegexNode::Opt()const { return Loop(0, 1); } RegexNode RegexNode::Loop(vint min, vint max)const { auto target = Ptr(new LoopExpression); target->min = min; target->max = max; target->preferLong = true; target->expression = expression; return RegexNode(target); } RegexNode RegexNode::AtLeast(vint min)const { return Loop(min, -1); } RegexNode RegexNode::operator+(const RegexNode& node)const { auto target = Ptr(new SequenceExpression); target->left = expression; target->right = node.expression; return RegexNode(target); } RegexNode RegexNode::operator|(const RegexNode& node)const { auto target = Ptr(new AlternateExpression); target->left = expression; target->right = node.expression; return RegexNode(target); } RegexNode RegexNode::operator+()const { auto target = Ptr(new PositiveExpression); target->expression = expression; return RegexNode(target); } RegexNode RegexNode::operator-()const { auto target = Ptr(new NegativeExpression); target->expression = expression; return RegexNode(target); } RegexNode RegexNode::operator!()const { auto source = dynamic_cast(expression.Obj()); CHECK_ERROR(source, L"RegexNode::operator!()#operator ! can only applies on charset expressions."); auto target = Ptr(new CharSetExpression); CopyFrom(target->ranges, source->ranges); target->reverse = !source->reverse; return RegexNode(target); } RegexNode RegexNode::operator%(const RegexNode& node)const { auto left = dynamic_cast(expression.Obj()); auto right = dynamic_cast(node.expression.Obj()); CHECK_ERROR(left && right && !left->reverse && !right->reverse, L"RegexNode::operator%(const RegexNode&)#operator % only connects non-reverse charset expressions."); auto target = Ptr(new CharSetExpression); target->reverse = false; CopyFrom(target->ranges, left->ranges); // TODO: (enumerable) foreach for (vint i = 0; i < right->ranges.Count(); i++) { if (!target->AddRangeWithConflict(right->ranges[i])) { CHECK_ERROR(false, L"RegexNode::operator%(const RegexNode&)#Failed to create charset expression from operator %."); } } return RegexNode(target); } /*********************************************************************** Regex Writer ***********************************************************************/ RegexNode rCapture(const U32String& name, const RegexNode& node) { auto target = Ptr(new CaptureExpression); target->name = name; target->expression = node.expression; return RegexNode(target); } RegexNode rUsing(const U32String& name) { auto target = Ptr(new UsingExpression); target->name = name; return RegexNode(target); } RegexNode rMatch(const U32String& name, vint index) { auto target = Ptr(new MatchExpression); target->name = name; target->index = index; return RegexNode(target); } RegexNode rMatch(vint index) { auto target = Ptr(new MatchExpression); target->index = index; return RegexNode(target); } RegexNode rBegin() { return RegexNode(Ptr(new BeginExpression)); } RegexNode rEnd() { return RegexNode(Ptr(new EndExpression)); } RegexNode rC(char32_t a, char32_t b) { if (!b)b = a; auto target = Ptr(new CharSetExpression); target->reverse = false; target->AddRangeWithConflict(CharRange(a, b)); return RegexNode(target); } RegexNode r_d() { return rC(U'0', U'9'); } RegexNode r_l() { return rC(U'a', U'z') % rC(U'A', U'Z') % rC(U'_'); } RegexNode r_w() { return rC(U'0', U'9') % rC(U'a', U'z') % rC(U'A', U'Z') % rC(U'_'); } RegexNode rAnyChar() { return rC(1, MaxChar32); } } } /*********************************************************************** .\AUTOMATON\REGEXAUTOMATON.CPP ***********************************************************************/ /*********************************************************************** Author: Zihan Chen (vczh) Licensed under https://github.com/vczh-libraries/License ***********************************************************************/ namespace vl { namespace regex_internal { using namespace collections; /*********************************************************************** Automaton ***********************************************************************/ Automaton::Automaton() { startState = 0; } State* Automaton::NewState() { auto state = Ptr(new State); state->finalState = false; state->userData = 0; states.Add(state); return state.Obj(); } Transition* Automaton::NewTransition(State* start, State* end) { auto transition = Ptr(new Transition); transition->source = start; transition->target = end; start->transitions.Add(transition.Obj()); end->inputs.Add(transition.Obj()); transitions.Add(transition); return transition.Obj(); } Transition* Automaton::NewChars(State* start, State* end, CharRange range) { auto transition = NewTransition(start, end); transition->type = Transition::Chars; transition->range = range; return transition; } Transition* Automaton::NewEpsilon(State* start, State* end) { auto transition = NewTransition(start, end); transition->type = Transition::Epsilon; return transition; } Transition* Automaton::NewBeginString(State* start, State* end) { auto transition = NewTransition(start, end); transition->type = Transition::BeginString; return transition; } Transition* Automaton::NewEndString(State* start, State* end) { auto transition = NewTransition(start, end); transition->type = Transition::EndString; return transition; } Transition* Automaton::NewNop(State* start, State* end) { auto transition = NewTransition(start, end); transition->type = Transition::Nop; return transition; } Transition* Automaton::NewCapture(State* start, State* end, vint capture) { auto transition = NewTransition(start, end); transition->type = Transition::Capture; transition->capture = capture; return transition; } Transition* Automaton::NewMatch(State* start, State* end, vint capture, vint index) { auto transition = NewTransition(start, end); transition->type = Transition::Match; transition->capture = capture; transition->index = index; return transition; } Transition* Automaton::NewPositive(State* start, State* end) { auto transition = NewTransition(start, end); transition->type = Transition::Positive; return transition; } Transition* Automaton::NewNegative(State* start, State* end) { auto transition = NewTransition(start, end); transition->type = Transition::Negative; return transition; } Transition* Automaton::NewNegativeFail(State* start, State* end) { auto transition = NewTransition(start, end); transition->type = Transition::NegativeFail; return transition; } Transition* Automaton::NewEnd(State* start, State* end) { auto transition = NewTransition(start, end); transition->type = Transition::End; return transition; } /*********************************************************************** Helpers ***********************************************************************/ bool PureEpsilonChecker(Transition* transition) { switch (transition->type) { case Transition::Epsilon: case Transition::Nop: case Transition::Capture: case Transition::End: return true; default: return false; } } bool RichEpsilonChecker(Transition* transition) { switch (transition->type) { case Transition::Epsilon: return true; default: return false; } } bool AreEqual(Transition* transA, Transition* transB) { if (transA->type != transB->type)return false; switch (transA->type) { case Transition::Chars: return transA->range == transB->range; case Transition::Capture: return transA->capture == transB->capture; case Transition::Match: return transA->capture == transB->capture && transA->index == transB->index; default: return true; } } // Collect epsilon states and non-epsilon transitions, their order are maintained to match the e-NFA void CollectEpsilon(State* targetState, State* sourceState, bool(*epsilonChecker)(Transition*), List& epsilonStates, List& transitions) { if (!epsilonStates.Contains(sourceState)) { epsilonStates.Add(sourceState); // TODO: (enumerable) foreach:alterable for (vint i = 0; i < sourceState->transitions.Count(); i++) { Transition* transition = sourceState->transitions[i]; if (epsilonChecker(transition)) { if (!epsilonStates.Contains(transition->target)) { if (transition->target->finalState) { targetState->finalState = true; } CollectEpsilon(targetState, transition->target, epsilonChecker, epsilonStates, transitions); } } else { transitions.Add(transition); } } } } Ptr EpsilonNfaToNfa(Ptr source, bool(*epsilonChecker)(Transition*), Dictionary& nfaStateMap) { auto target = Ptr(new Automaton); Dictionary stateMap; // source->target List epsilonStates; // current epsilon closure List transitions; // current non-epsilon transitions stateMap.Add(source->startState, target->NewState()); nfaStateMap.Add(stateMap[source->startState], source->startState); target->startState = target->states[0].Obj(); CopyFrom(target->captureNames, source->captureNames); // TODO: (enumerable) foreach for (vint i = 0; i < target->states.Count(); i++) { // Clear cache State* targetState = target->states[i].Obj(); State* sourceState = nfaStateMap[targetState]; if (sourceState->finalState) { targetState->finalState = true; } epsilonStates.Clear(); transitions.Clear(); // Collect epsilon states and non-epsilon transitions CollectEpsilon(targetState, sourceState, epsilonChecker, epsilonStates, transitions); // Iterate through all non-epsilon transitions // TODO: (enumerable) foreach for (vint j = 0; j < transitions.Count(); j++) { Transition* transition = transitions[j]; // Create and map a new target state if a new non-epsilon state is found in the e-NFA if (!stateMap.Keys().Contains(transition->target)) { stateMap.Add(transition->target, target->NewState()); nfaStateMap.Add(stateMap[transition->target], transition->target); } // Copy transition to connect between two non-epsilon state Transition* newTransition = target->NewTransition(targetState, stateMap[transition->target]); newTransition->capture = transition->capture; newTransition->index = transition->index; newTransition->range = transition->range; newTransition->type = transition->type; } } return target; } Ptr NfaToDfa(Ptr source, Group& dfaStateMap) { auto target = Ptr(new Automaton); CopyFrom(target->captureNames, source->captureNames); State* startState = target->NewState(); target->startState = startState; dfaStateMap.Add(startState, source->startState); for (auto currentState_ : target->states) { Group nfaClassToTransitions; Dictionary nfaTransitionToClass; List orderedTransitionClasses; State* currentState = currentState_.Obj(); // Iterate through all NFA states which represent the DFA state for (auto nfaState : dfaStateMap[currentState]) { // Iterate through all transitions from those NFA states for (auto nfaTransition : nfaState->transitions) { Transition* transitionClass = nullptr; // Check if there is any key in nfaTransitions that has the same input as the current transition { vint index = nfaTransitionToClass.Keys().IndexOf(nfaTransition); if (index != -1) transitionClass = nfaTransitionToClass.Values()[index]; } if (transitionClass == nullptr) { // TODO: (enumerable) foreach for (vint l = 0; l < orderedTransitionClasses.Count(); l++) { Transition* key = orderedTransitionClasses[l]; if (AreEqual(key, nfaTransition)) { transitionClass = key; break; } } } // Create a new key if not if (transitionClass == nullptr) { transitionClass = nfaTransition; orderedTransitionClasses.Add(transitionClass); } // Group the transition nfaClassToTransitions.Add(transitionClass, nfaTransition); nfaTransitionToClass.Add(nfaTransition, transitionClass); } } // Iterate through all key transition that represent all existing transition inputs from the same state for (auto transitionClass : orderedTransitionClasses) { auto&& equivalentTransitions = nfaClassToTransitions[transitionClass]; // Sort all target states and keep unique List transitionTargets; CopyFrom( transitionTargets, From(equivalentTransitions) .Select([](auto t) { return t->target; }) .Distinct() ); // Check if these NFA states represent a created DFA state State* dfaState = 0; // TODO: (enumerable) foreach on dictionary for (vint k = 0; k < dfaStateMap.Count(); k++) { // Compare two NFA states set if (CompareEnumerable(transitionTargets, dfaStateMap.GetByIndex(k)) == 0) { dfaState = dfaStateMap.Keys()[k]; } } // Create a new DFA state if there is not if (!dfaState) { dfaState = target->NewState(); // TODO: (enumerable) foreach for (vint k = 0; k < transitionTargets.Count(); k++) { dfaStateMap.Add(dfaState, transitionTargets[k]); if (transitionTargets[k]->finalState) { dfaState->finalState = true; } } } // Create corresponding DFA transition Transition* newTransition = target->NewTransition(currentState, dfaState); newTransition->capture = transitionClass->capture; newTransition->index = transitionClass->index; newTransition->range = transitionClass->range; newTransition->type = transitionClass->type; } } return target; } } }