Files
GacUI/Import/VlppRegex.cpp
2023-07-06 01:50:42 -07:00

4347 lines
119 KiB
C++

/***********************************************************************
THIS FILE IS AUTOMATICALLY GENERATED. DO NOT MODIFY
DEVELOPER: Zihan Chen(vczh)
***********************************************************************/
#include "VlppRegex.h"
/***********************************************************************
.\REGEX.CPP
***********************************************************************/
/***********************************************************************
Author: Zihan Chen (vczh)
Licensed under https://github.com/vczh-libraries/License
***********************************************************************/
namespace vl
{
namespace regex_internal
{
// Forward declarations of the stream helpers used by the lexer serialization code below.
// Their definitions appear later in this file and transfer each integer as a 32-bit value.

// Reads a single integer from inputStream into value.
void ReadInt(stream::IStream& inputStream, vint& value);
// Reads count integers from inputStream into the array pointed to by values.
void ReadInts(stream::IStream& inputStream, vint count, vint* values);
// Writes a single integer to outputStream.
void WriteInt(stream::IStream& outputStream, vint value);
// Writes count integers from the array pointed to by values to outputStream.
void WriteInts(stream::IStream& outputStream, vint count, vint* values);
}
namespace regex
{
using namespace collections;
using namespace regex_internal;
/***********************************************************************
String Conversion
***********************************************************************/
template<typename T>
struct U32;
template<>
struct U32<wchar_t>
{
static constexpr U32String(*ToU32)(const WString&) = &wtou32;
static constexpr WString(*FromU32)(const U32String&) = &u32tow;
};
template<>
struct U32<char8_t>
{
static constexpr U32String(*ToU32)(const U8String&) = &u8tou32;
static constexpr U8String(*FromU32)(const U32String&) = &u32tou8;
};
template<>
struct U32<char16_t>
{
static constexpr U32String(*ToU32)(const U16String&) = &u16tou32;
static constexpr U16String(*FromU32)(const U32String&) = &u32tou16;
};
template<>
struct U32<char32_t>
{
static U32String ToU32(const U32String& text) { return text; }
static U32String FromU32(const U32String& text) { return text; }
};
/***********************************************************************
RegexMatch_<T>
***********************************************************************/
// Creates a successful match from a PureResult: only the matched range
// [_result->start, _result->start + _result->length) of _string is recorded.
template<typename T>
RegexMatch_<T>::RegexMatch_(const ObjectString<T>& _string, PureResult* _result)
    : success(true)
    , result(_string, _result->start, _result->length)
{
}
// Creates a successful match from a RichResult. Besides the matched range,
// every capture record is stored: records whose capture id is -1 go to the
// anonymous capture list, all others go to the group keyed by that id.
template<typename T>
RegexMatch_<T>::RegexMatch_(const ObjectString<T>& _string, RichResult* _result)
    : success(true)
    , result(_string, _result->start, _result->length)
{
    // TODO: (enumerable) foreach
    const vint recordCount = _result->captures.Count();
    for (vint recordIndex = 0; recordIndex < recordCount; recordIndex++)
    {
        const CaptureRecord& record = _result->captures[recordIndex];
        if (record.capture != -1)
        {
            groups.Add(record.capture, RegexString_<T>(_string, record.start, record.length));
        }
        else
        {
            captures.Add(RegexString_<T>(_string, record.start, record.length));
        }
    }
}
// Creates a non-successful match object that wraps the given fragment.
// RegexBase_::Process uses this for the text lying between successful matches.
template<typename T>
RegexMatch_<T>::RegexMatch_(const RegexString_<T>& _result)
    : success(false)
    , result(_result)
{
}
// Returns the success flag that was fixed by the constructor used to build this object.
template<typename T>
bool RegexMatch_<T>::Success() const
{
    return success;
}
// Returns the fragment of the input text described by this match object.
template<typename T>
const RegexString_<T>& RegexMatch_<T>::Result() const
{
    return result;
}
// Returns the list of captures that were recorded with capture id -1.
template<typename T>
const typename RegexMatch_<T>::CaptureList& RegexMatch_<T>::Captures() const
{
    return captures;
}
// Returns the captures that were recorded with a capture id other than -1, grouped by that id.
template<typename T>
const typename RegexMatch_<T>::CaptureGroup& RegexMatch_<T>::Groups() const
{
    return groups;
}
/***********************************************************************
RegexBase_
***********************************************************************/
template<typename T>
void RegexBase_::Process(const ObjectString<T>& text, bool keepEmpty, bool keepSuccess, bool keepFail, typename RegexMatch_<T>::List& matches)const
{
if (rich)
{
const T* start = text.Buffer();
const T* input = start;
RichResult result;
while (rich->Match(input, start, result))
{
vint offset = input - start;
if (keepFail)
{
if (result.start > offset || keepEmpty)
{
matches.Add(Ptr(new RegexMatch_<T>(RegexString_<T>(text, offset, result.start - offset))));
}
}
if (keepSuccess)
{
matches.Add(Ptr(new RegexMatch_<T>(text, &result)));
}
input = start + result.start + result.length;
}
if (keepFail)
{
vint remain = input - start;
vint length = text.Length() - remain;
if (length || keepEmpty)
{
matches.Add(Ptr(new RegexMatch_<T>(RegexString_<T>(text, remain, length))));
}
}
}
else
{
const T* start = text.Buffer();
const T* input = start;
PureResult result;
while (pure->Match(input, start, result))
{
vint offset = input - start;
if (keepFail)
{
if (result.start > offset || keepEmpty)
{
matches.Add(Ptr(new RegexMatch_<T>(RegexString_<T>(text, offset, result.start - offset))));
}
}
if (keepSuccess)
{
matches.Add(Ptr(new RegexMatch_<T>(text, &result)));
}
input = start + result.start + result.length;
}
if (keepFail)
{
vint remain = input - start;
vint length = text.Length() - remain;
if (length || keepEmpty)
{
matches.Add(Ptr(new RegexMatch_<T>(RegexString_<T>(text, remain, length))));
}
}
}
}
// Releases both interpreters; deleting a null pointer is a no-op,
// so no explicit null checks are required.
RegexBase_::~RegexBase_()
{
    delete pure;
    delete rich;
}
// Calls MatchHead on the interpreter with the start of text as the input position.
// Returns a match object on success and nullptr otherwise.
// The rich interpreter is preferred when it exists.
template<typename T>
typename RegexMatch_<T>::Ref RegexBase_::MatchHead(const ObjectString<T>& text)const
{
    const T* buffer = text.Buffer();
    if (rich)
    {
        RichResult richResult;
        if (!rich->MatchHead(buffer, buffer, richResult))
        {
            return nullptr;
        }
        return Ptr(new RegexMatch_<T>(text, &richResult));
    }

    PureResult pureResult;
    if (!pure->MatchHead(buffer, buffer, pureResult))
    {
        return nullptr;
    }
    return Ptr(new RegexMatch_<T>(text, &pureResult));
}
// Calls Match on the interpreter with the start of text as the input position.
// Returns a match object on success and nullptr otherwise.
// The rich interpreter is preferred when it exists.
template<typename T>
typename RegexMatch_<T>::Ref RegexBase_::Match(const ObjectString<T>& text)const
{
    const T* buffer = text.Buffer();
    if (rich)
    {
        RichResult richResult;
        if (!rich->Match(buffer, buffer, richResult))
        {
            return nullptr;
        }
        return Ptr(new RegexMatch_<T>(text, &richResult));
    }

    PureResult pureResult;
    if (!pure->Match(buffer, buffer, pureResult))
    {
        return nullptr;
    }
    return Ptr(new RegexMatch_<T>(text, &pureResult));
}
template<typename T>
bool RegexBase_::TestHead(const ObjectString<T>& text)const
{
if (pure)
{
PureResult result;
return pure->MatchHead(text.Buffer(), text.Buffer(), result);
}
else
{
RichResult result;
return rich->MatchHead(text.Buffer(), text.Buffer(), result);
}
}
template<typename T>
bool RegexBase_::Test(const ObjectString<T>& text)const
{
if (pure)
{
PureResult result;
return pure->Match(text.Buffer(), text.Buffer(), result);
}
else
{
RichResult result;
return rich->Match(text.Buffer(), text.Buffer(), result);
}
}
// Collects every successful match in text; unmatched text is dropped.
template<typename T>
void RegexBase_::Search(const ObjectString<T>& text, typename RegexMatch_<T>::List& matches)const
{
    // keepEmpty is only consulted by Process for unmatched pieces, which are not kept here.
    const bool keepEmpty = false;
    const bool keepSuccess = true;
    const bool keepFail = false;
    Process(text, keepEmpty, keepSuccess, keepFail, matches);
}
// Collects only the text between matches; keepEmptyMatch decides whether
// zero-length pieces are included.
template<typename T>
void RegexBase_::Split(const ObjectString<T>& text, bool keepEmptyMatch, typename RegexMatch_<T>::List& matches)const
{
    const bool keepSuccess = false;
    const bool keepFail = true;
    Process(text, keepEmptyMatch, keepSuccess, keepFail, matches);
}
// Collects both the matches and the text between them, in order of appearance;
// keepEmptyMatch decides whether zero-length unmatched pieces are included.
template<typename T>
void RegexBase_::Cut(const ObjectString<T>& text, bool keepEmptyMatch, typename RegexMatch_<T>::List& matches)const
{
    const bool keepSuccess = true;
    const bool keepFail = true;
    Process(text, keepEmptyMatch, keepSuccess, keepFail, matches);
}
/***********************************************************************
Regex_<T>
***********************************************************************/
// Compiles a regular expression.
//   code       : the expression source; it is converted to UTF-32 before parsing.
//   preferPure : when true, a PureInterpretor is built if the expression allows it
//                (HasNoExtension, or CanTreatAsPure in which case both interpreters
//                are built); when false only a RichInterpretor is built.
// Any exception raised while building the automatons is rethrown after the
// interpreters created so far have been released.
template<typename T>
Regex_<T>::Regex_(const ObjectString<T>& code, bool preferPure)
{
    CharRange::List subsets;
    auto regex = ParseRegexExpression(U32<T>::ToU32(code));
    auto expression = regex->Merge();
    expression->NormalizeCharSet(subsets);

    // Decide which interpreters are needed.
    bool pureRequired = false;
    bool richRequired = false;
    if (preferPure)
    {
        if (expression->HasNoExtension())
        {
            pureRequired = true;
        }
        else
        {
            if (expression->CanTreatAsPure())
            {
                pureRequired = true;
                richRequired = true;
            }
            else
            {
                richRequired = true;
            }
        }
    }
    else
    {
        richRequired = true;
    }

    try
    {
        if (pureRequired)
        {
            Dictionary<State*, State*> nfaStateMap;
            Group<State*, State*> dfaStateMap;
            Ptr<Automaton> eNfa = expression->GenerateEpsilonNfa();
            Ptr<Automaton> nfa = EpsilonNfaToNfa(eNfa, PureEpsilonChecker, nfaStateMap);
            Ptr<Automaton> dfa = NfaToDfa(nfa, dfaStateMap);
            pure = new PureInterpretor(dfa, subsets);
        }
        if (richRequired)
        {
            Dictionary<State*, State*> nfaStateMap;
            Group<State*, State*> dfaStateMap;
            Ptr<Automaton> eNfa = expression->GenerateEpsilonNfa();
            Ptr<Automaton> nfa = EpsilonNfaToNfa(eNfa, RichEpsilonChecker, nfaStateMap);
            Ptr<Automaton> dfa = NfaToDfa(nfa, dfaStateMap);
            rich = new RichInterpretor(dfa);
            for (auto&& name : rich->CaptureNames())
            {
                captureNames.Add(U32<T>::FromU32(name));
            }
        }
    }
    catch (...)
    {
        // Clear the pointers after deleting them: ~RegexBase_ deletes pure and rich
        // as well, so leaving them dangling would free the same objects twice if that
        // destructor runs while the exception propagates.
        // NOTE(review): assumes Regex_ derives from RegexBase_ - confirm in the header.
        delete pure;
        pure = nullptr;
        delete rich;
        rich = nullptr;
        throw;
    }
}
/***********************************************************************
RegexTokens_<T>
***********************************************************************/
// Enumerator that cuts a zero-terminated text into RegexToken_<T> values using a
// PureInterpretor. stateTokens maps interpreter states to token ids.
// Consecutive pieces with token id -1 are merged into one token, and the
// row/column range of every token is tracked while enumerating.
template<typename T>
class RegexTokenEnumerator : public Object, public IEnumerator<RegexToken_<T>>
{
protected:
    RegexToken_<T>          token;                  // token returned by Current()
    vint                    index = -1;             // index of the current token, -1 before the first Next()
    PureInterpretor*        pure;                   // not owned by the enumerator (never deleted here)
    const Array<vint>&      stateTokens;
    const T*                start;                  // beginning of the text
    vint                    codeIndex;
    RegexProc_<T>           proc;
    const T*                reading;                // next character to be consumed
    vint                    rowStart = 0;           // row of the next token's first character
    vint                    columnStart = 0;        // column of the next token's first character
    bool                    cacheAvailable = false; // true when cacheToken holds an already recognized token
    RegexToken_<T>          cacheToken;

public:
    // Copies the complete enumeration state, so the copy continues from the same position.
    RegexTokenEnumerator(const RegexTokenEnumerator& enumerator)
        : token(enumerator.token)
        , index(enumerator.index)
        , pure(enumerator.pure)
        , stateTokens(enumerator.stateTokens)
        , start(enumerator.start)
        , codeIndex(enumerator.codeIndex)
        , proc(enumerator.proc)
        , reading(enumerator.reading)
        , rowStart(enumerator.rowStart)
        , columnStart(enumerator.columnStart)
        , cacheAvailable(enumerator.cacheAvailable)
        , cacheToken(enumerator.cacheToken)
    {
    }

    // Creates an enumerator positioned before the first token of the text at _start.
    RegexTokenEnumerator(PureInterpretor* _pure, const Array<vint>& _stateTokens, const T* _start, vint _codeIndex, RegexProc_<T> _proc)
        : index(-1)
        , pure(_pure)
        , stateTokens(_stateTokens)
        , start(_start)
        , codeIndex(_codeIndex)
        , proc(_proc)
        , reading(_start)
    {
    }

    IEnumerator<RegexToken_<T>>* Clone()const
    {
        return new RegexTokenEnumerator<T>(*this);
    }

    const RegexToken_<T>& Current()const
    {
        return token;
    }

    vint Index()const
    {
        return index;
    }

    // Advances to the next token. Returns false when there is neither a cached
    // token nor any character left to read.
    bool Next()
    {
        if (!cacheAvailable && !*reading) return false;
        if (cacheAvailable)
        {
            // A token recognized during the previous call is waiting to be delivered.
            token = cacheToken;
            cacheAvailable = false;
        }
        else
        {
            // -2 marks "no piece has been assigned to this token yet".
            token.reading = reading;
            token.start = 0;
            token.length = 0;
            token.token = -2;
            token.completeToken = true;
        }
        token.rowStart = rowStart;
        token.columnStart = columnStart;
        token.rowEnd = rowStart;
        token.columnEnd = columnStart;
        token.codeIndex = codeIndex;

        PureResult result;
        while (*reading)
        {
            vint id = -1;
            bool completeToken = true;
            if (!pure->MatchHead(reading, start, result))
            {
                result.start = reading - start;
                if (result.terminateState != -1)
                {
                    // The match stopped in the middle of something; look for a related final state.
                    vint state = pure->GetRelatedFinalState(result.terminateState);
                    if (state != -1)
                    {
                        id = stateTokens[state];
                    }
                }
                if (id == -1)
                {
                    // Nothing recognizable: consume one character as an error piece.
                    result.length = 1;
                }
                else
                {
                    completeToken = false;
                }
            }
            else
            {
                id = stateTokens.Get(result.finalState);
            }

            if (id != -1 && proc.extendProc)
            {
                // Let the user callback adjust the length, id and completeness of the piece.
                RegexProcessingToken processing(result.start, result.length, id, completeToken, nullptr);
                proc.extendProc(proc.argument, reading, -1, true, processing);
#if _DEBUG
                CHECK_ERROR(processing.interTokenState == nullptr, L"RegexTokenEnumerator::Next()#The extendProc is only allowed to create interTokenState in RegexLexerColorizer.");
#endif
                result.length = processing.length;
                id = processing.token;
                completeToken = processing.completeToken;
            }

            if (token.token == -2)
            {
                // First piece of the current token.
                token.start = result.start;
                token.length = result.length;
                token.token = id;
                token.completeToken = completeToken;
            }
            else if (token.token == id && id == -1)
            {
                // Merge consecutive error pieces into one token.
                token.length += result.length;
            }
            else
            {
                // The piece belongs to the next token: keep it for the next call.
                cacheAvailable = true;
                cacheToken.reading = reading;
                cacheToken.start = result.start;
                cacheToken.length = result.length;
                cacheToken.codeIndex = codeIndex;
                cacheToken.token = id;
                cacheToken.completeToken = completeToken;
            }
            reading += result.length;
            if (cacheAvailable)
            {
                break;
            }
        }

        index++;

        // Walk through the token to compute its end position and the start
        // position of the following token.
        for (vint i = 0; i < token.length; i++)
        {
            token.rowEnd = rowStart;
            token.columnEnd = columnStart;
            if (token.reading[i] == L'\n')
            {
                rowStart++;
                columnStart = 0;
            }
            else
            {
                columnStart++;
            }
        }
        return true;
    }

    // Rewinds the enumerator to the beginning of the text.
    void Reset()
    {
        index = -1;
        reading = start;
        cacheAvailable = false;
        // The position tracking must restart as well; otherwise tokens produced after
        // a reset would report rows and columns continuing from the previous pass.
        rowStart = 0;
        columnStart = 0;
    }

    // Enumerates all remaining tokens and appends those for which discard returns false.
    // discard must not be null.
    void ReadToEnd(List<RegexToken_<T>>& tokens, bool(*discard)(vint))
    {
        while (Next())
        {
            if (!discard(token.token))
            {
                tokens.Add(token);
            }
        }
    }
};
// Stores everything required to create token enumerators over _code later on.
template<typename T>
RegexTokens_<T>::RegexTokens_(PureInterpretor* _pure, const Array<vint>& _stateTokens, const ObjectString<T>& _code, vint _codeIndex, RegexProc_<T> _proc)
    : pure(_pure)
    , stateTokens(_stateTokens)
    , code(_code)
    , codeIndex(_codeIndex)
    , proc(_proc)
{
}
// Copy constructor: duplicates the interpreter pointer, the state table,
// the text, the code index and the callbacks of another token collection.
template<typename T>
RegexTokens_<T>::RegexTokens_(const RegexTokens_<T>& tokens)
    : pure(tokens.pure)
    , stateTokens(tokens.stateTokens)
    , code(tokens.code)
    , codeIndex(tokens.codeIndex)
    , proc(tokens.proc)
{
}
// Creates a new enumerator positioned before the first token of the stored text.
// The returned object is allocated with new.
template<typename T>
IEnumerator<RegexToken_<T>>* RegexTokens_<T>::CreateEnumerator() const
{
    const T* textBegin = code.Buffer();
    return new RegexTokenEnumerator<T>(pure, stateTokens, textBegin, codeIndex, proc);
}
// Discard predicate used by RegexTokens_::ReadToEnd when the caller supplies none:
// it keeps every token, whatever its id.
bool DefaultDiscard(vint token)
{
    const bool discardThisToken = false;
    return discardThisToken;
}
// Tokenizes the whole stored text and appends to tokens every token for which
// discard returns false. A null discard means that all tokens are kept.
template<typename T>
void RegexTokens_<T>::ReadToEnd(collections::List<RegexToken_<T>>& tokens, bool(*discard)(vint))const
{
    if (!discard)
    {
        discard = &DefaultDiscard;
    }
    RegexTokenEnumerator<T> enumerator(pure, stateTokens, code.Buffer(), codeIndex, proc);
    enumerator.ReadToEnd(tokens, discard);
}
/***********************************************************************
RegexLexerWalker_<T>
***********************************************************************/
// Creates a walker over the given interpreter and state-to-token table.
template<typename T>
RegexLexerWalker_<T>::RegexLexerWalker_(PureInterpretor* _pure, const Array<vint>& _stateTokens)
    : pure(_pure)
    , stateTokens(_stateTokens)
{
}
// Copy constructor: the copy uses the same interpreter and state-to-token table.
template<typename T>
RegexLexerWalker_<T>::RegexLexerWalker_(const RegexLexerWalker_<T>& tokens)
    : pure(tokens.pure)
    , stateTokens(tokens.stateTokens)
{
}
// Returns the start state reported by the underlying interpreter.
template<typename T>
vint RegexLexerWalker_<T>::GetStartState() const
{
    const vint startState = pure->GetStartState();
    return startState;
}
// Returns the token id of the final state related to state, or -1 when state
// is -1 or the interpreter reports no related final state.
template<typename T>
vint RegexLexerWalker_<T>::GetRelatedToken(vint state)const
{
    if (state == -1)
    {
        return -1;
    }
    const vint relatedFinalState = pure->GetRelatedFinalState(state);
    if (relatedFinalState == -1)
    {
        return -1;
    }
    return stateTokens.Get(relatedFinalState);
}
// Feeds one character to the state machine.
//   input             : the character to consume.
//   state             : in: current state (-1 means "start a new token"); out: state after the step.
//   token             : out: token id of the new state when it is final, otherwise -1.
//   finalState        : out: true when the new state is final, when the new state is -1,
//                       or when the walk started from -1 and the character cannot be consumed.
//   previousTokenStop : out: true when the step began from -1 or the transition failed.
template<typename T>
void RegexLexerWalker_<T>::Walk(T input, vint& state, vint& token, bool& finalState, bool& previousTokenStop)const
{
    const vint enteringState = state;
    token = -1;
    finalState = false;
    previousTokenStop = (enteringState == -1);
    if (enteringState == -1)
    {
        state = pure->GetStartState();
    }

    state = pure->Transit(input, state);
    if (state == -1)
    {
        previousTokenStop = true;
        if (enteringState == -1)
        {
            // Even a fresh token cannot begin with this character.
            finalState = true;
            return;
        }
        if (pure->IsFinalState(enteringState))
        {
            // The previous token was complete: retry the character as the start of a new one.
            state = pure->Transit(input, pure->GetStartState());
        }
    }

    if (pure->IsFinalState(state))
    {
        token = stateTokens.Get(state);
        finalState = true;
    }
    else
    {
        finalState = (state == -1);
    }
}
// Convenience overload: performs one step from state and returns only the resulting state.
template<typename T>
vint RegexLexerWalker_<T>::Walk(T input, vint state)const
{
    vint ignoredToken = -1;
    bool ignoredFinalState = false;
    bool ignoredPreviousTokenStop = false;
    Walk(input, state, ignoredToken, ignoredFinalState, ignoredPreviousTokenStop);
    return state;
}
// Runs the first length characters of input through the state machine from the
// start state. Returns true as soon as a transition fails (-1) or the interpreter
// reports a dead state; returns false when all characters are consumed otherwise.
template<typename T>
bool RegexLexerWalker_<T>::IsClosedToken(const T* input, vint length)const
{
    vint current = pure->GetStartState();
    for (const T* reading = input; reading != input + length && length > 0; reading++)
    {
        current = pure->Transit(*reading, current);
        if (current == -1 || pure->IsDeadState(current))
        {
            return true;
        }
    }
    return false;
}
// String overload: forwards the buffer and length of input to the pointer-based overload.
template<typename T>
bool RegexLexerWalker_<T>::IsClosedToken(const ObjectString<T>& input)const
{
    const T* buffer = input.Buffer();
    const vint length = input.Length();
    return IsClosedToken(buffer, length);
}
/***********************************************************************
RegexLexerColorizer_<T>
***********************************************************************/
// Creates a colorizer over a walker and a set of callbacks.
// The current state is initialized to the walker's start state.
template<typename T>
RegexLexerColorizer_<T>::RegexLexerColorizer_(const RegexLexerWalker_<T>& _walker, RegexProc_<T> _proc)
    : walker(_walker)
    , proc(_proc)
{
    const vint startState = walker.GetStartState();
    internalState.currentState = startState;
}
// Returns a copy of the colorizer's internal state.
template<typename T>
typename RegexLexerColorizer_<T>::InternalState RegexLexerColorizer_<T>::GetInternalState()
{
    InternalState snapshot = internalState;
    return snapshot;
}
// Replaces the colorizer's internal state with the given one.
template<typename T>
void RegexLexerColorizer_<T>::SetInternalState(InternalState state)
{
    this->internalState = state;
}
// Feeds a single character to the colorizer without invoking the colorize callback.
template<typename T>
void RegexLexerColorizer_<T>::Pass(T input)
{
    const vint length = 1;
    const vint start = 0;
    const bool colorize = false;
    WalkOneToken(&input, length, start, colorize);
}
// Returns the start state of the underlying walker.
template<typename T>
vint RegexLexerColorizer_<T>::GetStartState() const
{
    const vint startState = walker.GetStartState();
    return startState;
}
// Lets proc.extendProc adjust a freshly recognized token, records the inter-token
// state it returns, and optionally reports the token through proc.colorizeProc.
//   input, length : the text being colorized.
//   token         : in: the recognized token; out: the token as adjusted by extendProc.
//   colorize      : whether proc.colorizeProc is called for the resulting token.
// In _DEBUG builds the result of extendProc is validated with CHECK_ERROR.
template<typename T>
void RegexLexerColorizer_<T>::CallExtendProcAndColorizeProc(const T* input, vint length, RegexProcessingToken& token, bool colorize)
{
    vint lengthBeforeExtend = token.length;
    proc.extendProc(proc.argument, input + token.start, length - token.start, false, token);
#if _DEBUG
    {
        bool pausedAtEnd = token.start + token.length == length && !token.completeToken;
        CHECK_ERROR(
            token.completeToken || pausedAtEnd,
            L"RegexLexerColorizer::WalkOneToken(const char32_t*, vint, vint, bool)#The extendProc is not allowed pause before the end of the input."
            );
        CHECK_ERROR(
            token.completeToken || token.token != -1,
            L"RegexLexerColorizer::WalkOneToken(const char32_t*, vint, vint, bool)#The extendProc is not allowed to pause without a valid token id."
            );
        CHECK_ERROR(
            lengthBeforeExtend <= token.length,
            L"RegexLexerColorizer::WalkOneToken(const char32_t*, vint, vint, bool)#The extendProc is not allowed to decrease the token length."
            );
        CHECK_ERROR(
            (token.interTokenState == nullptr) == !pausedAtEnd,
            L"RegexLexerColorizer::Colorize(const char32_t*, vint, void*)#The extendProc should return an inter token state object if and only if a valid token does not end at the end of the input."
            );
    }
#endif
    // Always store the returned inter-token state; remember the token id only when one exists.
    internalState.interTokenState = token.interTokenState;
    if (internalState.interTokenState)
    {
        internalState.interTokenId = token.token;
    }

    if (colorize)
    {
        proc.colorizeProc(proc.argument, token.start, token.length, token.token);
    }
}
// Processes one token of input beginning at index start and returns the index at
// which the next call should begin.
//   input, length : the text being processed.
//   start         : index of the first character of the token to recognize.
//   colorize      : whether proc.colorizeProc is called for the recognized range.
// When an inter-token state is pending, the call resumes the paused token through
// proc.extendProc instead of running the state machine.
template<typename T>
vint RegexLexerColorizer_<T>::WalkOneToken(const T* input, vint length, vint start, bool colorize)
{
// A token was paused by extendProc earlier: hand the whole input to extendProc to continue it.
if (internalState.interTokenState)
{
RegexProcessingToken token(-1, -1, internalState.interTokenId, false, internalState.interTokenState);
proc.extendProc(proc.argument, input, length, false, token);
#if _DEBUG
{
bool pausedAtTheEnd = token.length == length && !token.completeToken;
CHECK_ERROR(
token.completeToken || pausedAtTheEnd,
L"RegexLexerColorizer::WalkOneToken(const char32_t*, vint, vint, bool)#The extendProc is not allowed to pause before the end of the input."
);
CHECK_ERROR(
token.completeToken || token.token == internalState.interTokenId,
L"RegexLexerColorizer::WalkOneToken(const char32_t*, vint, vint, bool)#The extendProc is not allowed to continue pausing with a different token id."
);
CHECK_ERROR(
(token.interTokenState == nullptr) == !pausedAtTheEnd,
L"RegexLexerColorizer::Colorize(const char32_t*, vint, void*)#The extendProc should return an inter token state object if and only if a valid token does not end at the end of the input."
);
}
#endif
// The resumed token is reported from index 0 of this input.
if (colorize)
{
proc.colorizeProc(proc.argument, 0, token.length, token.token);
}
// Store the returned state; when it is null the pending token id is cleared too.
if (!(internalState.interTokenState = token.interTokenState))
{
internalState.interTokenId = -1;
}
return token.length;
}
// Bookkeeping for the most recent position at which the walker reported finalState.
vint lastFinalStateLength = 0;
vint lastFinalStateToken = -1;
vint lastFinalStateState = -1;
// State in which this call began; compared against the start state below.
vint tokenStartState = internalState.currentState;
for (vint i = start; i < length; i++)
{
vint currentToken = -1;
bool finalState = false;
bool previousTokenStop = false;
walker.Walk(input[i], internalState.currentState, currentToken, finalState, previousTokenStop);
if (previousTokenStop)
{
// A token with a valid id was seen and extendProc exists: let extendProc adjust it.
if (proc.extendProc && lastFinalStateToken != -1)
{
RegexProcessingToken token(start, lastFinalStateLength, lastFinalStateToken, true, nullptr);
CallExtendProcAndColorizeProc(input, length, token, colorize);
if (token.completeToken)
{
internalState.currentState = walker.GetStartState();
}
return start + token.length;
}
// The walker stopped on the very first character of this call.
else if (i == start)
{
// Only when the call began in the start state is the character reported alone with token id -1;
// otherwise control falls through to the finalState bookkeeping below.
if (tokenStartState == GetStartState())
{
if (colorize)
{
proc.colorizeProc(proc.argument, start, 1, -1);
}
internalState.currentState = walker.GetStartState();
return i + 1;
}
}
// Otherwise report the range up to the last recorded final state and continue from that state.
else
{
if (colorize)
{
proc.colorizeProc(proc.argument, start, lastFinalStateLength, lastFinalStateToken);
}
internalState.currentState = lastFinalStateState;
return start + lastFinalStateLength;
}
}
if (finalState)
{
lastFinalStateLength = i + 1 - start;
lastFinalStateToken = currentToken;
lastFinalStateState = internalState.currentState;
}
}
// The input ended without the walker stopping.
// Case 1: the last recorded final state with a valid token id ends exactly at the end of the input.
if (lastFinalStateToken != -1 && start + lastFinalStateLength == length)
{
if (proc.extendProc)
{
RegexProcessingToken token(start, lastFinalStateLength, lastFinalStateToken, true, nullptr);
CallExtendProcAndColorizeProc(input, length, token, colorize);
}
else if (colorize)
{
proc.colorizeProc(proc.argument, start, lastFinalStateLength, lastFinalStateToken);
}
}
// Case 2: report the rest of the input with the token related to the current state.
else if (colorize)
{
proc.colorizeProc(proc.argument, start, length - start, walker.GetRelatedToken(internalState.currentState));
}
return length;
}
// Colorizes the whole input by calling WalkOneToken until the returned index
// equals length, then returns the inter-token state left in the internal state.
template<typename T>
void* RegexLexerColorizer_<T>::Colorize(const T* input, vint length)
{
    for (vint position = 0; position != length;)
    {
        position = WalkOneToken(input, length, position, true);
    }
    return internalState.interTokenState;
}
/***********************************************************************
RegexLexerBase_
***********************************************************************/
// Releases the interpreter; deleting a null pointer is a no-op,
// so no explicit null check is required.
RegexLexerBase_::~RegexLexerBase_()
{
    delete pure;
}
// Creates a token collection for code. The tokens are produced lazily by the
// enumerators of the returned object.
template<typename T>
RegexTokens_<T> RegexLexerBase_::Parse(const ObjectString<T>& code, RegexProc_<T> proc, vint codeIndex)const
{
    // The result of Buffer() is intentionally unused here.
    // NOTE(review): presumably this call is kept for a side effect on the string's buffer - confirm.
    code.Buffer();

    pure->PrepareForRelatedFinalStateTable();

    return RegexTokens_<T>(pure, stateTokens, code, codeIndex, proc);
}
// Creates a walker for character type T over this lexer's interpreter and
// state-to-token table, after asking the interpreter to prepare its related-final-state table.
template<typename T>
RegexLexerWalker_<T> RegexLexerBase_::Walk() const
{
    pure->PrepareForRelatedFinalStateTable();

    return RegexLexerWalker_<T>(pure, stateTokens);
}
// Non-template overload of Walk for wchar_t; it performs the same steps as the template version.
RegexLexerWalker_<wchar_t> RegexLexerBase_::Walk() const
{
    pure->PrepareForRelatedFinalStateTable();

    return RegexLexerWalker_<wchar_t>(pure, stateTokens);
}
// Creates a colorizer that drives a fresh walker of this lexer with the given callbacks.
template<typename T>
RegexLexerColorizer_<T> RegexLexerBase_::Colorize(RegexProc_<T> proc)const
{
    RegexLexerWalker_<T> walker = Walk<T>();
    return RegexLexerColorizer_<T>(walker, proc);
}
/***********************************************************************
RegexLexer_<T> (Serialization)
***********************************************************************/
// Deserializes a lexer: first the interpreter, then the number of entries in the
// state-to-token table, then the entries themselves (in the order Serialize writes them).
template<typename T>
RegexLexer_<T>::RegexLexer_(stream::IStream& inputStream)
{
    pure = new PureInterpretor(inputStream);

    vint tokenCount = 0;
    ReadInt(inputStream, tokenCount);
    stateTokens.Resize(tokenCount);

    // Element 0 is only addressed when the table is not empty.
    if (tokenCount > 0)
    {
        vint* firstEntry = &stateTokens[0];
        ReadInts(inputStream, tokenCount, firstEntry);
    }
}
// Serializes the lexer: first the interpreter, then the number of entries in the
// state-to-token table, then the entries themselves.
template<typename T>
void RegexLexer_<T>::Serialize(stream::IStream& outputStream)
{
    pure->Serialize(outputStream);

    WriteInt(outputStream, stateTokens.Count());

    // Element 0 is only addressed when the table is not empty.
    if (stateTokens.Count() > 0)
    {
        vint* firstEntry = &stateTokens[0];
        WriteInts(outputStream, stateTokens.Count(), firstEntry);
    }
}
/***********************************************************************
RegexLexer_<T>
***********************************************************************/
// Builds a lexer from a list of token regular expressions.
// Each expression is compiled to its own DFA; the DFAs are joined into one
// epsilon-NFA, which is converted into a single DFA. The userData field of each
// state carries a token index through these conversions, and ends up in stateTokens.
template<typename T>
RegexLexer_<T>::RegexLexer_(const collections::IEnumerable<ObjectString<T>>& tokens)
{
// Build DFA for all tokens
List<Ptr<Expression>> expressions;
List<Ptr<Automaton>> dfas;
CharRange::List subsets;
// Parse every expression and collect all character sets into subsets,
// so that the same subsets can be applied to all expressions afterwards.
for (auto&& code : tokens)
{
auto regex = ParseRegexExpression(U32<T>::ToU32(code));
auto expression = regex->Merge();
expression->CollectCharSet(subsets);
expressions.Add(expression);
}
// TODO: (enumerable) foreach
for (vint i = 0; i < expressions.Count(); i++)
{
Dictionary<State*, State*> nfaStateMap;
Group<State*, State*> dfaStateMap;
expressions[i]->ApplyCharSet(subsets);
auto eNfa = expressions[i]->GenerateEpsilonNfa();
auto nfa = EpsilonNfaToNfa(eNfa, PureEpsilonChecker, nfaStateMap);
auto dfa = NfaToDfa(nfa, dfaStateMap);
dfas.Add(dfa);
}
// Mark all states in DFAs
// A final state stores the index of its token; any other state stores dfas.Count(),
// which is greater than every token index.
// TODO: (enumerable) foreach
for (vint i = 0; i < dfas.Count(); i++)
{
Ptr<Automaton> dfa = dfas[i];
// TODO: (enumerable) foreach
for (vint j = 0; j < dfa->states.Count(); j++)
{
if (dfa->states[j]->finalState)
{
dfa->states[j]->userData = (void*)i;
}
else
{
dfa->states[j]->userData = (void*)dfas.Count();
}
}
}
// Connect all DFAs to an e-NFA
auto bigEnfa = Ptr(new Automaton);
// TODO: (enumerable) foreach
for (vint i = 0; i < dfas.Count(); i++)
{
CopyFrom(bigEnfa->states, dfas[i]->states, true);
CopyFrom(bigEnfa->transitions, dfas[i]->transitions, true);
}
// A new start state reaches the start state of every DFA through an epsilon transition.
bigEnfa->startState = bigEnfa->NewState();
// TODO: (enumerable) foreach
for (vint i = 0; i < dfas.Count(); i++)
{
bigEnfa->NewEpsilon(bigEnfa->startState, dfas[i]->startState);
}
// Build a single DFA out of the e-NFA
Dictionary<State*, State*> nfaStateMap;
Group<State*, State*> dfaStateMap;
auto bigNfa = EpsilonNfaToNfa(bigEnfa, PureEpsilonChecker, nfaStateMap);
// Copy userData from each mapped value state to its key state in nfaStateMap.
// TODO: (enumerable) foreach on dictionary
for (vint i = 0; i < nfaStateMap.Keys().Count(); i++)
{
void* userData = nfaStateMap.Values().Get(i)->userData;
nfaStateMap.Keys()[i]->userData = userData;
}
auto bigDfa = NfaToDfa(bigNfa, dfaStateMap);
// Each key state in dfaStateMap receives the smallest userData among the states grouped under it.
// TODO: (enumerable) foreach on group
for (vint i = 0; i < dfaStateMap.Keys().Count(); i++)
{
void* userData = dfaStateMap.GetByIndex(i).Get(0)->userData;
for (vint j = 1; j < dfaStateMap.GetByIndex(i).Count(); j++)
{
void* newData = dfaStateMap.GetByIndex(i).Get(j)->userData;
if (userData > newData)
{
userData = newData;
}
}
dfaStateMap.Keys()[i]->userData = userData;
}
// Build state machine
pure = new PureInterpretor(bigDfa, subsets);
// stateTokens[i] is the userData of state i of the final DFA, converted back to an integer.
stateTokens.Resize(bigDfa->states.Count());
for (vint i = 0; i < stateTokens.Count(); i++)
{
void* userData = bigDfa->states[i]->userData;
stateTokens[i] = (vint)userData;
}
}
/***********************************************************************
Template Instantiation
***********************************************************************/
// Explicit instantiations for the four supported character types:
// wchar_t, char8_t, char16_t and char32_t.

// String fragments and match objects.
template class RegexString_<wchar_t>;
template class RegexString_<char8_t>;
template class RegexString_<char16_t>;
template class RegexString_<char32_t>;
template class RegexMatch_<wchar_t>;
template class RegexMatch_<char8_t>;
template class RegexMatch_<char16_t>;
template class RegexMatch_<char32_t>;
// Member function templates of RegexBase_, one group per character type.
template RegexMatch_<wchar_t>::Ref RegexBase_::MatchHead<wchar_t> (const ObjectString<wchar_t>& text)const;
template RegexMatch_<wchar_t>::Ref RegexBase_::Match<wchar_t> (const ObjectString<wchar_t>& text)const;
template bool RegexBase_::TestHead<wchar_t> (const ObjectString<wchar_t>& text)const;
template bool RegexBase_::Test<wchar_t> (const ObjectString<wchar_t>& text)const;
template void RegexBase_::Search<wchar_t> (const ObjectString<wchar_t>& text, RegexMatch_<wchar_t>::List& matches)const;
template void RegexBase_::Split<wchar_t> (const ObjectString<wchar_t>& text, bool keepEmptyMatch, RegexMatch_<wchar_t>::List& matches)const;
template void RegexBase_::Cut<wchar_t> (const ObjectString<wchar_t>& text, bool keepEmptyMatch, RegexMatch_<wchar_t>::List& matches)const;
template RegexMatch_<char8_t>::Ref RegexBase_::MatchHead<char8_t> (const ObjectString<char8_t>& text)const;
template RegexMatch_<char8_t>::Ref RegexBase_::Match<char8_t> (const ObjectString<char8_t>& text)const;
template bool RegexBase_::TestHead<char8_t> (const ObjectString<char8_t>& text)const;
template bool RegexBase_::Test<char8_t> (const ObjectString<char8_t>& text)const;
template void RegexBase_::Search<char8_t> (const ObjectString<char8_t>& text, RegexMatch_<char8_t>::List& matches)const;
template void RegexBase_::Split<char8_t> (const ObjectString<char8_t>& text, bool keepEmptyMatch, RegexMatch_<char8_t>::List& matches)const;
template void RegexBase_::Cut<char8_t> (const ObjectString<char8_t>& text, bool keepEmptyMatch, RegexMatch_<char8_t>::List& matches)const;
template RegexMatch_<char16_t>::Ref RegexBase_::MatchHead<char16_t> (const ObjectString<char16_t>& text)const;
template RegexMatch_<char16_t>::Ref RegexBase_::Match<char16_t> (const ObjectString<char16_t>& text)const;
template bool RegexBase_::TestHead<char16_t> (const ObjectString<char16_t>& text)const;
template bool RegexBase_::Test<char16_t> (const ObjectString<char16_t>& text)const;
template void RegexBase_::Search<char16_t> (const ObjectString<char16_t>& text, RegexMatch_<char16_t>::List& matches)const;
template void RegexBase_::Split<char16_t> (const ObjectString<char16_t>& text, bool keepEmptyMatch, RegexMatch_<char16_t>::List& matches)const;
template void RegexBase_::Cut<char16_t> (const ObjectString<char16_t>& text, bool keepEmptyMatch, RegexMatch_<char16_t>::List& matches)const;
template RegexMatch_<char32_t>::Ref RegexBase_::MatchHead<char32_t> (const ObjectString<char32_t>& text)const;
template RegexMatch_<char32_t>::Ref RegexBase_::Match<char32_t> (const ObjectString<char32_t>& text)const;
template bool RegexBase_::TestHead<char32_t> (const ObjectString<char32_t>& text)const;
template bool RegexBase_::Test<char32_t> (const ObjectString<char32_t>& text)const;
template void RegexBase_::Search<char32_t> (const ObjectString<char32_t>& text, RegexMatch_<char32_t>::List& matches)const;
template void RegexBase_::Split<char32_t> (const ObjectString<char32_t>& text, bool keepEmptyMatch, RegexMatch_<char32_t>::List& matches)const;
template void RegexBase_::Cut<char32_t> (const ObjectString<char32_t>& text, bool keepEmptyMatch, RegexMatch_<char32_t>::List& matches)const;
// Regular expression, token collection, walker and colorizer classes.
template class Regex_<wchar_t>;
template class Regex_<char8_t>;
template class Regex_<char16_t>;
template class Regex_<char32_t>;
template class RegexTokens_<wchar_t>;
template class RegexTokens_<char8_t>;
template class RegexTokens_<char16_t>;
template class RegexTokens_<char32_t>;
template class RegexLexerWalker_<wchar_t>;
template class RegexLexerWalker_<char8_t>;
template class RegexLexerWalker_<char16_t>;
template class RegexLexerWalker_<char32_t>;
template class RegexLexerColorizer_<wchar_t>;
template class RegexLexerColorizer_<char8_t>;
template class RegexLexerColorizer_<char16_t>;
template class RegexLexerColorizer_<char32_t>;
// Member function templates of RegexLexerBase_, one group per character type.
template RegexTokens_<wchar_t> RegexLexerBase_::Parse<wchar_t> (const ObjectString<wchar_t>& code, RegexProc_<wchar_t> _proc, vint codeIndex)const;
template RegexLexerWalker_<wchar_t> RegexLexerBase_::Walk<wchar_t> ()const;
template RegexLexerColorizer_<wchar_t> RegexLexerBase_::Colorize<wchar_t> (RegexProc_<wchar_t> _proc)const;
template RegexTokens_<char8_t> RegexLexerBase_::Parse<char8_t> (const ObjectString<char8_t>& code, RegexProc_<char8_t> _proc, vint codeIndex)const;
template RegexLexerWalker_<char8_t> RegexLexerBase_::Walk<char8_t> ()const;
template RegexLexerColorizer_<char8_t> RegexLexerBase_::Colorize<char8_t> (RegexProc_<char8_t> _proc)const;
template RegexTokens_<char16_t> RegexLexerBase_::Parse<char16_t> (const ObjectString<char16_t>& code, RegexProc_<char16_t> _proc, vint codeIndex)const;
template RegexLexerWalker_<char16_t> RegexLexerBase_::Walk<char16_t> ()const;
template RegexLexerColorizer_<char16_t> RegexLexerBase_::Colorize<char16_t> (RegexProc_<char16_t> _proc)const;
template RegexTokens_<char32_t> RegexLexerBase_::Parse<char32_t> (const ObjectString<char32_t>& code, RegexProc_<char32_t> _proc, vint codeIndex)const;
template RegexLexerWalker_<char32_t> RegexLexerBase_::Walk<char32_t> ()const;
template RegexLexerColorizer_<char32_t> RegexLexerBase_::Colorize<char32_t> (RegexProc_<char32_t> _proc)const;
// Lexer classes.
template class RegexLexer_<wchar_t>;
template class RegexLexer_<char8_t>;
template class RegexLexer_<char16_t>;
template class RegexLexer_<char32_t>;
}
}
/***********************************************************************
.\REGEXPURE.CPP
***********************************************************************/
/***********************************************************************
Author: Zihan Chen (vczh)
Licensed under https://github.com/vczh-libraries/License
***********************************************************************/
namespace vl
{
namespace regex_internal
{
using namespace collections;
/***********************************************************************
Read
***********************************************************************/
// Reads one integer that is stored in the stream as a 32-bit value.
// When VCZH_64 is defined the value is read into a 32-bit temporary and then
// converted to vint; otherwise it is read directly into value.
// CHECK_ERROR fires when fewer than sizeof(vint32_t) bytes could be read.
void ReadInt(stream::IStream& inputStream, vint& value)
{
#ifdef VCZH_64
    vint32_t stored = 0;
    CHECK_ERROR(
        inputStream.Read(&stored, sizeof(vint32_t)) == sizeof(vint32_t),
        L"Failed to deserialize RegexLexer."
        );
    value = (vint)stored;
#else
    CHECK_ERROR(
        inputStream.Read(&value, sizeof(vint32_t)) == sizeof(vint32_t),
        L"Failed to deserialize RegexLexer."
        );
#endif
}
// Reads count integers, each stored in the stream as a 32-bit value, into values.
//   inputStream : the stream to read from.
//   count       : number of integers to read; nothing is read when it is 0.
//   values      : destination array with room for at least count elements.
// CHECK_ERROR fires when fewer than count * sizeof(vint32_t) bytes could be read.
void ReadInts(stream::IStream& inputStream, vint count, vint* values)
{
    // Nothing to transfer. This also keeps the VCZH_64 path from addressing
    // element 0 of an empty temporary array.
    if (count == 0) return;
#ifdef VCZH_64
    // Read the 32-bit values into a temporary array, then convert each one to vint.
    Array<vint32_t> xs(count);
    CHECK_ERROR(
        inputStream.Read(&xs[0], sizeof(vint32_t) * count) == sizeof(vint32_t) * count,
        L"Failed to deserialize RegexLexer."
        );
    for (vint i = 0; i < count; i++)
    {
        values[i] = (vint)xs[i];
    }
#else
    CHECK_ERROR(
        inputStream.Read(values, sizeof(vint32_t) * count) == sizeof(vint32_t) * count,
        L"Failed to deserialize RegexLexer."
        );
#endif
}
// Reads count booleans from inputStream into values[0 .. count-1].
// The booleans are packed 8 per byte, lowest bit first, so (count + 7) / 8 bytes are consumed.
// A short read is reported through CHECK_ERROR. A count of 0 reads nothing and succeeds.
void ReadBools(stream::IStream& inputStream, vint count, bool* values)
{
	// Nothing to read. This also avoids taking &bits[0] on an empty buffer.
	if (count == 0) return;
	Array<vuint8_t> bits((count + 7) / 8);
	CHECK_ERROR(
		inputStream.Read(&bits[0], sizeof(vuint8_t) * bits.Count()) == sizeof(vuint8_t) * bits.Count(),
		L"Failed to deserialize RegexLexer."
		);
	for (vint i = 0; i < count; i++)
	{
		vint x = i / 8;	// byte holding bit i
		vint y = i % 8;	// position of bit i inside that byte
		values[i] = ((bits[x] >> y) & 1) == 1;
	}
}
/***********************************************************************
Write
***********************************************************************/
// Writes value to outputStream using sizeof(vint32_t) bytes.
// A short write is reported through CHECK_ERROR.
void WriteInt(stream::IStream& outputStream, vint value)
{
#ifdef VCZH_64
	// Convert to the 32-bit stored form before writing.
	vint32_t stored = (vint32_t)value;
	const auto bytesWritten = outputStream.Write(&stored, sizeof(vint32_t));
	CHECK_ERROR(
		bytesWritten == sizeof(vint32_t),
		L"Failed to serialize RegexLexer."
		);
#else
	// Write the caller's value directly.
	const auto bytesWritten = outputStream.Write(&value, sizeof(vint32_t));
	CHECK_ERROR(
		bytesWritten == sizeof(vint32_t),
		L"Failed to serialize RegexLexer."
		);
#endif
}
// Writes values[0 .. count-1] to outputStream, sizeof(vint32_t) bytes per integer.
// A short write is reported through CHECK_ERROR. A count of 0 writes nothing and succeeds.
void WriteInts(stream::IStream& outputStream, vint count, vint* values)
{
	// Nothing to write. This also keeps the VCZH_64 branch from taking &xs[0] on an empty buffer.
	if (count == 0) return;
#ifdef VCZH_64
	// Convert every value to its 32-bit stored form, then write the whole buffer at once.
	Array<vint32_t> xs(count);
	for (vint i = 0; i < count; i++)
	{
		xs[i] = (vint32_t)values[i];
	}
	CHECK_ERROR(
		outputStream.Write(&xs[0], sizeof(vint32_t) * count) == sizeof(vint32_t) * count,
		L"Failed to serialize RegexLexer."
		);
#else
	// Write the caller's buffer directly.
	CHECK_ERROR(
		outputStream.Write(values, sizeof(vint32_t) * count) == sizeof(vint32_t) * count,
		L"Failed to serialize RegexLexer."
		);
#endif
}
// Writes values[0 .. count-1] to outputStream, packed 8 booleans per byte, lowest bit first.
// (count + 7) / 8 bytes are produced; unused bits in the last byte are 0.
// A short write is reported through CHECK_ERROR. A count of 0 writes nothing and succeeds.
void WriteBools(stream::IStream& outputStream, vint count, bool* values)
{
	// Nothing to write. This also avoids taking &bits[0] on an empty buffer.
	if (count == 0) return;
	Array<vuint8_t> bits((count + 7) / 8);
	memset(&bits[0], 0, sizeof(vuint8_t) * bits.Count());
	for (vint i = 0; i < count; i++)
	{
		if (values[i])
		{
			vint x = i / 8;	// byte holding bit i
			vint y = i % 8;	// position of bit i inside that byte
			bits[x] |= (vuint8_t)1 << y;
		}
	}
	CHECK_ERROR(
		outputStream.Write(&bits[0], sizeof(vuint8_t) * bits.Count()) == sizeof(vuint8_t) * bits.Count(),
		L"Failed to serialize RegexLexer."
		);
}
/***********************************************************************
PureInterpretor (Serialization)
***********************************************************************/
// Deserializing constructor: rebuilds the interpretor from the layout produced by Serialize().
// Layout: stateCount, charSetCount, startState, number of char ranges, raw CharRange array,
// stateCount * charSetCount transitions, then stateCount final-state flags packed as bits.
// Any short read is reported through CHECK_ERROR.
PureInterpretor::PureInterpretor(stream::IStream& inputStream)
{
	ReadInt(inputStream, stateCount);
	ReadInt(inputStream, charSetCount);
	ReadInt(inputStream, startState);
	{
		vint count = 0;
		ReadInt(inputStream, count);
		charRanges.Resize(count);
		if (count > 0)
		{
			// The ranges are stored as the in-memory bytes of the CharRange array.
			vint size = charRanges.Count() * sizeof(CharRange);
			CHECK_ERROR(inputStream.Read(&charRanges[0], size) == size, L"Failed to deserialize RegexLexer.");
		}
		// charMap is not stored in the stream; rebuild it from the ranges.
		ExpandCharRanges();
	}
	// NOTE(review): if one of the reads below throws, the arrays allocated here are not released
	// because the destructor does not run for a partially constructed object.
	transitions = new vint[stateCount * charSetCount];
	ReadInts(inputStream, stateCount * charSetCount, transitions);
	finalState = new bool[stateCount];
	ReadBools(inputStream, stateCount, finalState);
}
// Writes the interpretor to outputStream in the layout read back by the stream constructor:
// header integers, the char ranges (count followed by raw CharRange bytes),
// the transition table and finally the final-state flags.
void PureInterpretor::Serialize(stream::IStream& outputStream)
{
	// Header
	WriteInt(outputStream, stateCount);
	WriteInt(outputStream, charSetCount);
	WriteInt(outputStream, startState);

	// Char ranges; the payload is skipped entirely when there are none.
	WriteInt(outputStream, charRanges.Count());
	if (charRanges.Count() > 0)
	{
		vint byteCount = charRanges.Count() * sizeof(CharRange);
		CHECK_ERROR(outputStream.Write(&charRanges[0], byteCount) == byteCount, L"Failed to serialize RegexLexer.");
	}

	// Tables
	WriteInts(outputStream, stateCount * charSetCount, transitions);
	WriteBools(outputStream, stateCount, finalState);
}
/***********************************************************************
PureInterpretor
***********************************************************************/
void PureInterpretor::ExpandCharRanges()
{
for (vint i = 0; i < SupportedCharCount; i++)
{
charMap[i] = charSetCount - 1;
}
// TODO: (enumerable) foreach
for (vint i = 0; i < charRanges.Count(); i++)
{
CharRange range = charRanges[i];
for (char32_t j = range.begin; j <= range.end; j++)
{
if (j > MaxChar32) break;
charMap[j] = i;
}
}
}
// Builds the table-driven interpretor from a DFA and its normalized char ranges.
// subsets[k] becomes input index k; one extra input index (subsets.Count()) is reserved
// for characters outside every range. Only Transition::Chars edges are accepted.
PureInterpretor::PureInterpretor(Ptr<Automaton> dfa, CharRange::List& subsets)
{
	stateCount = dfa->states.Count();
	charSetCount = subsets.Count() + 1;
	startState = dfa->states.IndexOf(dfa->startState);

	// Map char to input index (equivalent char class)
	CopyFrom(charRanges, subsets);
	ExpandCharRanges();

	// Create transitions from DFA, using input index to represent input char
	transitions = new vint[stateCount * charSetCount];
	for (vint stateIndex = 0; stateIndex < stateCount; stateIndex++)
	{
		// Every cell starts as -1 (no transition).
		for (vint inputIndex = 0; inputIndex < charSetCount; inputIndex++)
		{
			transitions[stateIndex * charSetCount + inputIndex] = -1;
		}

		State* source = dfa->states[stateIndex].Obj();
		// TODO: (enumerable) foreach
		for (vint edgeIndex = 0; edgeIndex < source->transitions.Count(); edgeIndex++)
		{
			Transition* edge = source->transitions[edgeIndex];
			if (edge->type == Transition::Chars)
			{
				vint index = subsets.IndexOf(edge->range);
				CHECK_ERROR(index != -1, L"PureInterpretor::PureInterpretor(Ptr<Automaton>, CharRange::List&)#Specified chars don't appear in the normalized char ranges.");
				transitions[stateIndex * charSetCount + index] = dfa->states.IndexOf(edge->target);
			}
			else
			{
				CHECK_ERROR(false, L"PureInterpretor::PureInterpretor(Ptr<Automaton>, CharRange::List&)#PureInterpretor only accepts Transition::Chars transitions.");
			}
		}
	}

	// Mark final states
	finalState = new bool[stateCount];
	for (vint stateIndex = 0; stateIndex < stateCount; stateIndex++)
	{
		finalState[stateIndex] = dfa->states[stateIndex]->finalState;
	}
}
// Releases the tables owned by the interpretor.
PureInterpretor::~PureInterpretor()
{
	// relatedFinalState is only allocated by PrepareForRelatedFinalStateTable();
	// delete[] on a null pointer is a no-op, so no explicit check is required.
	delete[] relatedFinalState;
	delete[] finalState;
	delete[] transitions;
}
template<typename TChar>
bool PureInterpretor::MatchHead(const TChar* input, const TChar* start, PureResult& result)
{
CharReader<TChar> reader(input);
vint currentState = startState;
vint terminateState = -1;
vint terminateLength = -1;
result.start = input - start;
result.length = -1;
result.finalState = -1;
result.terminateState = -1;
while (currentState != -1)
{
auto c = reader.Read();
terminateState = currentState;
terminateLength = reader.Index();
if (finalState[currentState])
{
result.length = terminateLength;
result.finalState = currentState;
}
if (!c) break;
if (c >= SupportedCharCount) break;
vint charIndex = charMap[c];
currentState = transitions[currentState * charSetCount + charIndex];
}
if (result.finalState == -1)
{
if (terminateLength > 0)
{
result.terminateState = terminateState;
}
result.length = terminateLength;
return false;
}
else
{
return true;
}
}
// Searches for the first position at which MatchHead succeeds.
// The reader is advanced one character at a time; MatchHead is attempted at reader.Reading()
// after each successful read. Returns false once Read() yields 0.
template<typename TChar>
bool PureInterpretor::Match(const TChar* input, const TChar* start, PureResult& result)
{
	CharReader<TChar> reader(input);
	for (;;)
	{
		if (!reader.Read())
		{
			return false;
		}
		if (MatchHead(reader.Reading(), start, result))
		{
			return true;
		}
	}
}
// Returns the index of the start state.
vint PureInterpretor::GetStartState()
{
return startState;
}
// Returns the state reached from state on character input,
// or -1 when state is out of range, input is above MaxChar32, or the table holds no transition.
vint PureInterpretor::Transit(char32_t input, vint state)
{
	// char32_t is unsigned, so only the upper bound of input needs checking.
	if (state < 0 || state >= stateCount || input > MaxChar32)
	{
		return -1;
	}
	vint charIndex = charMap[input];
	return transitions[state * charSetCount + charIndex];
}
// Returns true when state is a valid state index and that state is final.
bool PureInterpretor::IsFinalState(vint state)
{
	if (state < 0 || state >= stateCount)
	{
		return false;
	}
	return finalState[state];
}
// Returns true when state is -1 or when no input index leads out of state.
// Values other than -1 are used to index the transition table without a range check.
bool PureInterpretor::IsDeadState(vint state)
{
	if (state == -1) return true;

	const vint* row = transitions + state * charSetCount;
	for (vint inputIndex = 0; inputIndex < charSetCount; inputIndex++)
	{
		if (row[inputIndex] != -1)
		{
			return false;
		}
	}
	return true;
}
void PureInterpretor::PrepareForRelatedFinalStateTable()
{
if (!relatedFinalState)
{
relatedFinalState = new vint[stateCount];
for (vint i = 0; i < stateCount; i++)
{
relatedFinalState[i] = finalState[i] ? i : -1;
}
while (true)
{
vint modifyCount = 0;
for (vint i = 0; i < stateCount; i++)
{
if (relatedFinalState[i] == -1)
{
vint state = -1;
for (vint j = 0; j < charSetCount; j++)
{
vint nextState = transitions[i * charSetCount + j];
if (nextState != -1)
{
state = relatedFinalState[nextState];
if (state != -1)
{
break;
}
}
}
if (state != -1)
{
relatedFinalState[i] = state;
modifyCount++;
}
}
}
if (modifyCount == 0)
{
break;
}
}
}
}
// Returns the entry of the related-final-state table for state,
// or -1 when the table has not been built by PrepareForRelatedFinalStateTable().
vint PureInterpretor::GetRelatedFinalState(vint state)
{
	if (!relatedFinalState)
	{
		return -1;
	}
	return relatedFinalState[state];
}
// Explicit instantiations of PureInterpretor::MatchHead and PureInterpretor::Match
// for wchar_t, char8_t, char16_t and char32_t.
template bool PureInterpretor::MatchHead<wchar_t>(const wchar_t* input, const wchar_t* start, PureResult& result);
template bool PureInterpretor::MatchHead<char8_t>(const char8_t* input, const char8_t* start, PureResult& result);
template bool PureInterpretor::MatchHead<char16_t>(const char16_t* input, const char16_t* start, PureResult& result);
template bool PureInterpretor::MatchHead<char32_t>(const char32_t* input, const char32_t* start, PureResult& result);
template bool PureInterpretor::Match<wchar_t>(const wchar_t* input, const wchar_t* start, PureResult& result);
template bool PureInterpretor::Match<char8_t>(const char8_t* input, const char8_t* start, PureResult& result);
template bool PureInterpretor::Match<char16_t>(const char16_t* input, const char16_t* start, PureResult& result);
template bool PureInterpretor::Match<char32_t>(const char32_t* input, const char32_t* start, PureResult& result);
}
}
/***********************************************************************
.\REGEXRICH.CPP
***********************************************************************/
/***********************************************************************
Author: Zihan Chen (vczh)
Licensed under https://github.com/vczh-libraries/License
***********************************************************************/
namespace vl
{
namespace regex_internal
{
/***********************************************************************
Data Structures for Backtracking
***********************************************************************/
// Reason a StateSaver record is kept on the backtracking stack.
enum class StateStoreType
{
// Saved when a Transition::Positive (positive lookahead) edge is taken
Positive,
// Saved when a Transition::Negative (negative lookahead) edge is taken
Negative,
// Default value for every other saved state
Other
};
// Snapshot of the rich interpretor's matching state, used both as the live state and as a backtracking record.
template<typename TChar>
class StateSaver
{
public:
	CharReader<TChar>		reader;											// Current reading position
	char32_t				ch;												// Character most recently read from reader
	State*					currentState;									// Current automaton state
	vint					minTransition = 0;								// Index of the first transition to try when resumed
	vint					captureCount = 0;								// Number of valid capture records (the list may be longer)
	vint					stateSaverCount = 0;							// Number of valid state savers (the list may be longer)
	vint					extensionSaverAvailable = -1;					// Index of the innermost open extension saver, -1 when none
	vint					extensionSaverCount = 0;						// Number of extension savers pushed so far (the list may be longer)
	StateStoreType			storeType = StateStoreType::Other;				// Reason to keep this record

	// Starts reading at input in state _currentState; the first character is read immediately.
	StateSaver(const TChar* input, State* _currentState)
		: reader(input)
		, ch(reader.Read())
		, currentState(_currentState)
	{
	}

	StateSaver(const StateSaver&) = default;
	StateSaver& operator=(const StateSaver&) = default;

	// Copies this record's reading position and current character into saver.
	void RestoreReaderTo(StateSaver<TChar>& saver)
	{
		saver.ch = ch;
		saver.reader = reader;
	}
};
// Record pushed when an extension (Capture, Positive or Negative transition) begins.
// It remembers where the extension started so the matching End transition can be handled.
// Records are chained through `previous` to form a stack inside a shared list.
template<typename TChar>
class ExtensionSaver
{
public:
	CharReader<TChar>		reader;						// The reading position when the extension began
	char32_t				ch;							// The current character when the extension began
	// The fields below are filled in by the caller and by Push(); they get defined defaults
	// so that copying a freshly constructed record never reads an indeterminate value.
	vint					previous = -1;				// Previous extension saver index
	vint					captureListIndex = -1;		// Where to write the captured text
	Transition*				transition = nullptr;		// The extension begin transition (Capture, Positive, Negative)

	// Captures the reading position and current character of saver.
	ExtensionSaver(const StateSaver<TChar>& saver)
		: reader(saver.reader)
		, ch(saver.ch)
	{
	}

	ExtensionSaver(const ExtensionSaver&) = default;
	ExtensionSaver& operator=(const ExtensionSaver&) = default;

	// Copies the stored reading position and character into saver.
	void RestoreReaderTo(StateSaver<TChar>& saver)
	{
		saver.reader = reader;
		saver.ch = ch;
	}
};
}
namespace regex_internal
{
using namespace collections;
template<typename TChar>
void Push(List<ExtensionSaver<TChar>>& elements, vint& available, vint& count, const ExtensionSaver<TChar>& element)
{
if (elements.Count() == count)
{
elements.Add(element);
}
else
{
elements[count] = element;
}
auto& current = elements[count];
current.previous = available;
available = count++;
}
// Returns a copy of the top record (elements[available]) and moves `available` to its predecessor.
// count is not modified, so the slot itself stays in the list.
template<typename TChar>
ExtensionSaver<TChar> Pop(List<ExtensionSaver<TChar>>& elements, vint& available, vint& count)
{
	const vint top = available;
	available = elements[top].previous;
	return elements[top];
}
// Stores element at slot `count` of elements (appending when the list is exactly that long)
// and increments count. Slots beyond count are kept for reuse.
template<typename T>
void PushNonSaver(List<T>& elements, vint& count, const T& element)
{
	if (elements.Count() != count)
	{
		elements[count] = element;
	}
	else
	{
		elements.Add(element);
	}
	count += 1;
}
// Decrements count and returns a copy of the element at the new count.
// The list itself is not shrunk.
template<typename T>
T PopNonSaver(List<T>& elements, vint& count)
{
	count -= 1;
	return elements[count];
}
}
namespace regex_internal
{
/***********************************************************************
RichInterpretor
***********************************************************************/
// Attaches one UserData record to every state of the automaton and decides whether
// the matcher has to save its state before leaving that state (NeedKeepState).
RichInterpretor::RichInterpretor(Ptr<Automaton> _dfa)
	:dfa(_dfa)
{
	datas = new UserData[dfa->states.Count()];
	// TODO: (enumerable) foreach
	for (vint stateIndex = 0; stateIndex < dfa->states.Count(); stateIndex++)
	{
		State* state = dfa->states[stateIndex].Obj();
		vint charEdges = 0;
		vint nonCharEdges = 0;
		bool hasLookahead = false;
		// TODO: (enumerable) foreach
		for (vint edgeIndex = 0; edgeIndex < state->transitions.Count(); edgeIndex++)
		{
			auto type = state->transitions[edgeIndex]->type;
			if (type == Transition::Chars)
			{
				charEdges++;
				continue;
			}

			nonCharEdges++;
			if (type == Transition::Negative || type == Transition::Positive)
			{
				hasLookahead = true;
			}
		}

		// State is kept when a lookahead starts here, when more than one non-char edge exists,
		// or when char and non-char edges are mixed.
		const bool mixedEdges = nonCharEdges != 0 && charEdges != 0;
		datas[stateIndex].NeedKeepState = hasLookahead || nonCharEdges > 1 || mixedEdges;
		state->userData = &datas[stateIndex];
	}
}
// Releases the UserData array allocated by the constructor.
RichInterpretor::~RichInterpretor()
{
delete[] datas;
}
// Tries to match the regex at the position input, using backtracking over the automaton.
// input  : where matching begins
// start  : beginning of the whole text; used for BeginString and to compute absolute offsets
// result : on success receives start (input - start), length and the captures;
//          on failure its captures are cleared
// Returns true when a final state is reached.
template<typename TChar>
bool RichInterpretor::MatchHead(const TChar* input, const TChar* start, RichResult& result)
{
// stateSavers is the backtracking stack; extensionSavers records open Capture/Positive/Negative extensions
List<StateSaver<TChar>> stateSavers;
List<ExtensionSaver<TChar>> extensionSavers;
StateSaver<TChar> currentState(input, dfa->startState);
while (!currentState.currentState->finalState)
{
bool found = false; // true means at least one transition matches the input
// Snapshot taken before trying any transition; this is what gets pushed for backtracking
StateSaver<TChar> oldState = currentState;
// Iterate through all transitions from the current state
// TODO: (enumerable) foreach:reversed
for (vint i = currentState.minTransition; i < currentState.currentState->transitions.Count(); i++)
{
Transition* transition = currentState.currentState->transitions[i];
switch (transition->type)
{
case Transition::Chars:
{
// match the input if the current character falls into the range
CharRange range = transition->range;
found =
range.begin <= currentState.ch &&
range.end >= currentState.ch;
if (found)
{
currentState.ch = currentState.reader.Read();
}
}
break;
case Transition::BeginString:
{
// match the input if this is the first character, and it is not consumed
found = currentState.reader.Index() == 0 && input == start;
}
break;
case Transition::EndString:
{
// match the input if this is after the last character, and it is not consumed
found = currentState.ch == 0;
}
break;
case Transition::Nop:
{
// match without any condition
found = true;
}
break;
case Transition::Capture:
{
// Push the capture information
ExtensionSaver<TChar> saver(currentState);
saver.captureListIndex = currentState.captureCount;
saver.transition = transition;
Push(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount, saver);
// Push the capture record, and it will be written if the input matches the regex
CaptureRecord capture;
capture.capture = transition->capture;
capture.start = currentState.reader.Index() + (input - start);
capture.length = -1;
PushNonSaver(result.captures, currentState.captureCount, capture);
found = true;
}
break;
case Transition::Match:
{
// index counts the captures seen so far that carry the requested name
vint index = 0;
for (vint j = 0; j < currentState.captureCount; j++)
{
CaptureRecord& capture = result.captures[j];
// If the capture name matched
if (capture.capture == transition->capture)
{
// If the capture index matched, or it is -1
if (capture.length != -1 && (transition->index == -1 || transition->index == index))
{
// If the captured text matched
if (memcmp(start + capture.start, input + currentState.reader.Index(), sizeof(TChar) * capture.length) == 0)
{
// Consume so much input
vint targetIndex = currentState.reader.Index() + capture.length;
while (currentState.reader.Index() < targetIndex)
{
currentState.ch = currentState.reader.Read();
}
CHECK_ERROR(currentState.reader.Index() == targetIndex, L"vl::regex_internal::RichInterpretor::MatchHead<TChar>(const TChar*, const TChar*, RichResult&)#Input code could be an incorrect unicode sequence.");
found = true;
break;
}
}
// Fail if the captured text with the specified name and index doesn't match
if (transition->index != -1 && index == transition->index)
{
break;
}
else
{
index++;
}
}
}
}
break;
case Transition::Positive:
{
// Push the positive lookahead information
ExtensionSaver<TChar> saver(currentState);
saver.captureListIndex = -1;
saver.transition = transition;
Push(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount, saver);
// Set found = true so that PushNonSaver(oldState) happens later
oldState.storeType = StateStoreType::Positive;
found = true;
}
break;
case Transition::Negative:
{
// Push the negative lookahead information
ExtensionSaver<TChar> saver(currentState);
saver.captureListIndex = -1;
saver.transition = transition;
Push(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount, saver);
// Set found = true so that PushNonSaver(oldState) happens later
oldState.storeType = StateStoreType::Negative;
found = true;
}
break;
case Transition::NegativeFail:
{
// NegativeFail will be used when the negative lookahead failed
}
break;
case Transition::End:
{
// Find the corresponding extension saver so that we can know how to deal with a matched sub regex that ends here
ExtensionSaver extensionSaver = Pop(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount);
switch (extensionSaver.transition->type)
{
case Transition::Capture:
{
// Write the captured text
CaptureRecord& capture = result.captures[extensionSaver.captureListIndex];
capture.length = currentState.reader.Index() + (input - start) - capture.start;
found = true;
}
break;
case Transition::Positive:
// Find the last positive lookahead state saver
for (vint j = currentState.stateSaverCount - 1; j >= 0; j--)
{
auto& stateSaver = stateSavers[j];
if (stateSaver.storeType == StateStoreType::Positive)
{
// restore the parsing state just before matching the positive lookahead, since positive lookahead doesn't consume input
stateSaver.RestoreReaderTo(oldState);
oldState.stateSaverCount = j;
stateSaver.RestoreReaderTo(currentState);
currentState.stateSaverCount = j;
break;
}
}
found = true;
break;
case Transition::Negative:
// Find the last negative lookahead state saver
for (vint j = currentState.stateSaverCount - 1; j >= 0; j--)
{
auto& stateSaver = stateSavers[j];
if (stateSaver.storeType == StateStoreType::Negative)
{
// restore the parsing state just before matching the negative lookahead, since negative lookahead doesn't consume input
oldState = stateSaver;
oldState.storeType = StateStoreType::Other;
currentState = stateSaver;
currentState.storeType = StateStoreType::Other;
// found is not set here, so the transition loop continues; after i++ it resumes at the restored minTransition
i = currentState.minTransition - 1;
break;
}
}
break;
default:;
}
}
break;
default:;
}
// Save the parsing state when necessary
if (found)
{
UserData* data = (UserData*)currentState.currentState->userData;
if (data->NeedKeepState)
{
// Remember which transition to try next if this path has to be abandoned
oldState.minTransition = i + 1;
PushNonSaver(stateSavers, currentState.stateSaverCount, oldState);
}
currentState.currentState = transition->target;
currentState.minTransition = 0;
break;
}
}
// If no transition from the current state can be used
if (!found)
{
// If there is a chance to do backtracking
if (currentState.stateSaverCount)
{
currentState = PopNonSaver(stateSavers, currentState.stateSaverCount);
// minTransition - 1 is always valid since the value is stored with adding 1
// So minTransition - 1 record the transition, which is the reason the parsing state is saved
if (currentState.currentState->transitions[currentState.minTransition - 1]->type == Transition::Negative)
{
// Find the next NegativeFail transition
// Because when a negative lookahead regex failed to match, it is actually succeeded
// Since a negative lookahead means we don't want to match this regex
// TODO: (enumerable) foreach:reversed
for (vint i = 0; i < currentState.currentState->transitions.Count(); i++)
{
Transition* transition = currentState.currentState->transitions[i];
if (transition->type == Transition::NegativeFail)
{
// Restore the state to the target of NegativeFail to let the parsing continue
currentState.currentState = transition->target;
currentState.minTransition = 0;
currentState.storeType = StateStoreType::Other;
break;
}
}
}
}
else
{
// Nothing left to backtrack to: the match fails
break;
}
}
}
if (currentState.currentState->finalState)
{
// Keep available captures if succeeded
result.start = input - start;
result.length = currentState.reader.Index();
// Drop capture slots beyond captureCount; they belong to abandoned paths
for (vint i = result.captures.Count() - 1; i >= currentState.captureCount; i--)
{
result.captures.RemoveAt(i);
}
return true;
}
else
{
// Clear captures if failed
result.captures.Clear();
return false;
}
}
// Searches for the first position at which MatchHead succeeds.
// The reader is advanced one character at a time; MatchHead is attempted at reader.Reading()
// after each successful read. Returns false once Read() yields 0.
template<typename TChar>
bool RichInterpretor::Match(const TChar* input, const TChar* start, RichResult& result)
{
	CharReader<TChar> reader(input);
	for (;;)
	{
		if (!reader.Read())
		{
			return false;
		}
		if (MatchHead(reader.Reading(), start, result))
		{
			return true;
		}
	}
}
// Returns the capture name list stored in the automaton.
const List<U32String>& RichInterpretor::CaptureNames()
{
return dfa->captureNames;
}
// Explicit instantiations of RichInterpretor::MatchHead and RichInterpretor::Match
// for wchar_t, char8_t, char16_t and char32_t.
template bool RichInterpretor::MatchHead<wchar_t>(const wchar_t* input, const wchar_t* start, RichResult& result);
template bool RichInterpretor::MatchHead<char8_t>(const char8_t* input, const char8_t* start, RichResult& result);
template bool RichInterpretor::MatchHead<char16_t>(const char16_t* input, const char16_t* start, RichResult& result);
template bool RichInterpretor::MatchHead<char32_t>(const char32_t* input, const char32_t* start, RichResult& result);
template bool RichInterpretor::Match<wchar_t>(const wchar_t* input, const wchar_t* start, RichResult& result);
template bool RichInterpretor::Match<char8_t>(const char8_t* input, const char8_t* start, RichResult& result);
template bool RichInterpretor::Match<char16_t>(const char16_t* input, const char16_t* start, RichResult& result);
template bool RichInterpretor::Match<char32_t>(const char32_t* input, const char32_t* start, RichResult& result);
}
}
/***********************************************************************
.\AST\REGEXEXPRESSION.CPP
***********************************************************************/
/***********************************************************************
Author: Zihan Chen (vczh)
Licensed under https://github.com/vczh-libraries/License
***********************************************************************/
namespace vl
{
namespace regex_internal
{
/***********************************************************************
MergeAlgorithm
***********************************************************************/
// State shared by one MergeAlgorithm run.
class MergeParameter
{
public:
// Sub expressions already expanded, by name; a null entry marks a name whose expansion is in progress
Expression::Map definitions;
// The regex whose named definitions are looked up when a reference is expanded
RegexExpression* regex = nullptr;
};
// Produces a copy of an expression tree in which every UsingExpression is replaced
// by the (merged) sub expression it names. All other nodes are cloned field by field.
class MergeAlgorithm : public RegexExpressionAlgorithm<Ptr<Expression>, MergeParameter*>
{
public:
	// Clones the char set: ranges and the reverse flag.
	Ptr<Expression> Apply(CharSetExpression* expression, MergeParameter* target) override
	{
		auto copy = Ptr(new CharSetExpression);
		CopyFrom(copy->ranges, expression->ranges);
		copy->reverse = expression->reverse;
		return copy;
	}

	// Clones the loop settings and merges the loop body.
	Ptr<Expression> Apply(LoopExpression* expression, MergeParameter* target) override
	{
		auto copy = Ptr(new LoopExpression);
		copy->max = expression->max;
		copy->min = expression->min;
		copy->preferLong = expression->preferLong;
		copy->expression = Invoke(expression->expression, target);
		return copy;
	}

	// Merges the left operand, then the right operand.
	Ptr<Expression> Apply(SequenceExpression* expression, MergeParameter* target) override
	{
		auto copy = Ptr(new SequenceExpression);
		copy->left = Invoke(expression->left, target);
		copy->right = Invoke(expression->right, target);
		return copy;
	}

	// Merges the left operand, then the right operand.
	Ptr<Expression> Apply(AlternateExpression* expression, MergeParameter* target) override
	{
		auto copy = Ptr(new AlternateExpression);
		copy->left = Invoke(expression->left, target);
		copy->right = Invoke(expression->right, target);
		return copy;
	}

	// BeginExpression has nothing to copy.
	Ptr<Expression> Apply(BeginExpression* expression, MergeParameter* target) override
	{
		return Ptr(new BeginExpression);
	}

	// EndExpression has nothing to copy.
	Ptr<Expression> Apply(EndExpression* expression, MergeParameter* target) override
	{
		return Ptr(new EndExpression);
	}

	// Merges the captured sub expression and copies the capture name.
	Ptr<Expression> Apply(CaptureExpression* expression, MergeParameter* target) override
	{
		auto copy = Ptr(new CaptureExpression);
		copy->expression = Invoke(expression->expression, target);
		copy->name = expression->name;
		return copy;
	}

	// Clones the name and index of the back reference.
	Ptr<Expression> Apply(MatchExpression* expression, MergeParameter* target) override
	{
		auto copy = Ptr(new MatchExpression);
		copy->name = expression->name;
		copy->index = expression->index;
		return copy;
	}

	// Merges the lookahead body.
	Ptr<Expression> Apply(PositiveExpression* expression, MergeParameter* target) override
	{
		auto copy = Ptr(new PositiveExpression);
		copy->expression = Invoke(expression->expression, target);
		return copy;
	}

	// Merges the lookahead body.
	Ptr<Expression> Apply(NegativeExpression* expression, MergeParameter* target) override
	{
		auto copy = Ptr(new NegativeExpression);
		copy->expression = Invoke(expression->expression, target);
		return copy;
	}

	// Resolves a named reference.
	// An already merged definition is returned as is. A null entry means the name is still being
	// expanded further up the call chain, which is reported as a reference loop.
	// Unknown names are reported as an error.
	Ptr<Expression> Apply(UsingExpression* expression, MergeParameter* target) override
	{
		if (target->definitions.Keys().Contains(expression->name))
		{
			Ptr<Expression> resolved = target->definitions[expression->name];
			if (!resolved)
			{
				throw ArgumentException(L"Regular expression syntax error: Found reference loops in\"" + u32tow(expression->name) + L"\".", L"vl::regex_internal::RegexExpression::Merge", L"");
			}
			return resolved;
		}

		if (!target->regex->definitions.Keys().Contains(expression->name))
		{
			throw ArgumentException(L"Regular expression syntax error: Cannot find sub expression reference\"" + u32tow(expression->name) + L"\".", L"vl::regex_internal::RegexExpression::Merge", L"");
		}

		// Register a null placeholder first so that a recursive reference is detected above.
		target->definitions.Add(expression->name, nullptr);
		Ptr<Expression> merged = Invoke(target->regex->definitions[expression->name], target);
		target->definitions.Set(expression->name, merged);
		return merged;
	}
};
/***********************************************************************
CharSetExpression
***********************************************************************/
// Adds range to the char set unless it conflicts with a range already present.
// The bounds are swapped first when begin > end.
// Returns false (and adds nothing) when an existing range is neither < nor > the new one.
bool CharSetExpression::AddRangeWithConflict(CharRange range)
{
	// Normalize so that begin <= end.
	if (range.begin > range.end)
	{
		char32_t oldBegin = range.begin;
		range.begin = range.end;
		range.end = oldBegin;
	}

	// TODO: (enumerable) foreach
	for (vint index = 0; index < ranges.Count(); index++)
	{
		if (range < ranges[index]) continue;
		if (range > ranges[index]) continue;
		return false;
	}

	ranges.Add(range);
	return true;
}
/***********************************************************************
RegexExpression
***********************************************************************/
// Returns a copy of the main expression with every named reference expanded from this regex's definitions.
Ptr<Expression> RegexExpression::Merge()
{
	MergeParameter parameter;
	parameter.regex = this;

	MergeAlgorithm algorithm;
	return algorithm.Invoke(expression, &parameter);
}
/***********************************************************************
Expression::Apply
***********************************************************************/
// Each Apply below passes the node itself to algorithm.Visit, so the call is resolved
// against the node's concrete pointer type.

// Passes this CharSetExpression to the algorithm.
void CharSetExpression::Apply(IRegexExpressionAlgorithm& algorithm)
{
algorithm.Visit(this);
}
// Passes this LoopExpression to the algorithm.
void LoopExpression::Apply(IRegexExpressionAlgorithm& algorithm)
{
algorithm.Visit(this);
}
// Passes this SequenceExpression to the algorithm.
void SequenceExpression::Apply(IRegexExpressionAlgorithm& algorithm)
{
algorithm.Visit(this);
}
// Passes this AlternateExpression to the algorithm.
void AlternateExpression::Apply(IRegexExpressionAlgorithm& algorithm)
{
algorithm.Visit(this);
}
// Passes this BeginExpression to the algorithm.
void BeginExpression::Apply(IRegexExpressionAlgorithm& algorithm)
{
algorithm.Visit(this);
}
// Passes this EndExpression to the algorithm.
void EndExpression::Apply(IRegexExpressionAlgorithm& algorithm)
{
algorithm.Visit(this);
}
// Passes this CaptureExpression to the algorithm.
void CaptureExpression::Apply(IRegexExpressionAlgorithm& algorithm)
{
algorithm.Visit(this);
}
// Passes this MatchExpression to the algorithm.
void MatchExpression::Apply(IRegexExpressionAlgorithm& algorithm)
{
algorithm.Visit(this);
}
// Passes this PositiveExpression to the algorithm.
void PositiveExpression::Apply(IRegexExpressionAlgorithm& algorithm)
{
algorithm.Visit(this);
}
// Passes this NegativeExpression to the algorithm.
void NegativeExpression::Apply(IRegexExpressionAlgorithm& algorithm)
{
algorithm.Visit(this);
}
// Passes this UsingExpression to the algorithm.
void UsingExpression::Apply(IRegexExpressionAlgorithm& algorithm)
{
algorithm.Visit(this);
}
}
}
/***********************************************************************
.\AST\REGEXEXPRESSION_CANTREATASPURE.CPP
***********************************************************************/
/***********************************************************************
Author: Zihan Chen (vczh)
Licensed under https://github.com/vczh-libraries/License
***********************************************************************/
namespace vl
{
namespace regex_internal
{
/***********************************************************************
CanTreatAsPureAlgorithm
***********************************************************************/
// Decides whether an expression tree only uses constructs this algorithm treats as pure:
// char sets, greedy loops, sequences, alternations and captures.
// The void* parameter is unused; a null pointer is always passed down.
class CanTreatAsPureAlgorithm : public RegexExpressionAlgorithm<bool, void*>
{
public:
	// A char set is always acceptable.
	bool Apply(CharSetExpression* expression, void* target) override
	{
		return true;
	}

	// A loop is acceptable only when it prefers the longest match and its body is acceptable.
	bool Apply(LoopExpression* expression, void* target) override
	{
		if (!expression->preferLong)
		{
			return false;
		}
		return Invoke(expression->expression, nullptr);
	}

	// Both operands must be acceptable; the right one is not examined when the left one fails.
	bool Apply(SequenceExpression* expression, void* target) override
	{
		if (!Invoke(expression->left, nullptr))
		{
			return false;
		}
		return Invoke(expression->right, nullptr);
	}

	// Both operands must be acceptable; the right one is not examined when the left one fails.
	bool Apply(AlternateExpression* expression, void* target) override
	{
		if (!Invoke(expression->left, nullptr))
		{
			return false;
		}
		return Invoke(expression->right, nullptr);
	}

	// Begin-of-string anchors are rejected.
	bool Apply(BeginExpression* expression, void* target) override
	{
		return false;
	}

	// End-of-string anchors are rejected.
	bool Apply(EndExpression* expression, void* target) override
	{
		return false;
	}

	// A capture is acceptable when its body is.
	bool Apply(CaptureExpression* expression, void* target) override
	{
		return Invoke(expression->expression, nullptr);
	}

	// Back references are rejected.
	bool Apply(MatchExpression* expression, void* target) override
	{
		return false;
	}

	// Positive lookaheads are rejected.
	bool Apply(PositiveExpression* expression, void* target) override
	{
		return false;
	}

	// Negative lookaheads are rejected.
	bool Apply(NegativeExpression* expression, void* target) override
	{
		return false;
	}

	// Named references are rejected.
	bool Apply(UsingExpression* expression, void* target) override
	{
		return false;
	}
};
/***********************************************************************
Expression
***********************************************************************/
bool Expression::CanTreatAsPure()
{
return CanTreatAsPureAlgorithm().Invoke(this, 0);
}
}
}
/***********************************************************************
.\AST\REGEXEXPRESSION_CHARSET.CPP
***********************************************************************/
/***********************************************************************
Author: Zihan Chen (vczh)
Licensed under https://github.com/vczh-libraries/License
***********************************************************************/
namespace vl
{
namespace regex_internal
{
// Holds the char ranges that CharSetAlgorithm implementations receive as their target.
class NormalizedCharSet
{
public:
// The ranges collected so far
CharRange::List ranges;
};
/***********************************************************************
CharSetAlgorithm
***********************************************************************/
// Base class for algorithms that walk an expression tree and handle every char range found in it.
// Derived classes implement Process() and call Loop() for each CharSetExpression.
class CharSetAlgorithm : public RegexExpressionAlgorithm<void, NormalizedCharSet*>
{
public:
	// Called once for every effective range of a char set.
	virtual void Process(CharSetExpression* expression, NormalizedCharSet* target, CharRange range) = 0;

	// Feeds the effective ranges of expression to Process().
	// For a normal char set these are the given ranges. For a reversed char set they are the gaps
	// between the given ranges, starting at code point 1 and ending at MaxChar32; the gap computation
	// walks the ranges in list order.
	void Loop(CharSetExpression* expression, CharRange::List& ranges, NormalizedCharSet* target)
	{
		if (!expression->reverse)
		{
			// TODO: (enumerable) foreach
			for (vint index = 0; index < ranges.Count(); index++)
			{
				Process(expression, target, ranges[index]);
			}
			return;
		}

		char32_t gapBegin = 1;
		// TODO: (enumerable) foreach
		for (vint index = 0; index < ranges.Count(); index++)
		{
			CharRange excluded = ranges[index];
			if (excluded.begin > gapBegin)
			{
				Process(expression, target, CharRange(gapBegin, excluded.begin - 1));
			}
			gapBegin = excluded.end + 1;
		}
		// Whatever remains after the last excluded range.
		if (gapBegin <= MaxChar32)
		{
			Process(expression, target, CharRange(gapBegin, MaxChar32));
		}
	}

	// Visits the loop body.
	void Apply(LoopExpression* expression, NormalizedCharSet* target) override
	{
		Invoke(expression->expression, target);
	}

	// Visits the left operand, then the right operand.
	void Apply(SequenceExpression* expression, NormalizedCharSet* target) override
	{
		Invoke(expression->left, target);
		Invoke(expression->right, target);
	}

	// Visits the left operand, then the right operand.
	void Apply(AlternateExpression* expression, NormalizedCharSet* target) override
	{
		Invoke(expression->left, target);
		Invoke(expression->right, target);
	}

	// Contains no char set.
	void Apply(BeginExpression* expression, NormalizedCharSet* target) override
	{
	}

	// Contains no char set.
	void Apply(EndExpression* expression, NormalizedCharSet* target) override
	{
	}

	// Visits the captured sub expression.
	void Apply(CaptureExpression* expression, NormalizedCharSet* target) override
	{
		Invoke(expression->expression, target);
	}

	// Nothing is visited for a back reference.
	void Apply(MatchExpression* expression, NormalizedCharSet* target) override
	{
	}

	// Visits the lookahead body.
	void Apply(PositiveExpression* expression, NormalizedCharSet* target) override
	{
		Invoke(expression->expression, target);
	}

	// Visits the lookahead body.
	void Apply(NegativeExpression* expression, NormalizedCharSet* target) override
	{
		Invoke(expression->expression, target);
	}

	// Nothing is visited for a named reference.
	void Apply(UsingExpression* expression, NormalizedCharSet* target) override
	{
	}
};
/***********************************************************************
BuildNormalizedCharSetAlgorithm
***********************************************************************/
// Pass that merges every range used by the char-set expressions of a tree into
// target->ranges, splitting ranges wherever an incoming range partially
// overlaps an existing one.
// NOTE(review): the index arithmetic below appears to rely on CharRange::List
// keeping its items ordered on Add() -- confirm against the container type.
class BuildNormalizedCharSetAlgorithm : public CharSetAlgorithm
{
public:
// Inserts `range` into target->ranges. Existing ranges that partially overlap
// it are split at the overlap boundaries, and the parts of `range` that are
// not covered yet are added.
void Process(CharSetExpression* expression, NormalizedCharSet* target, CharRange range)
{
vint index = 0;
while (index < target->ranges.Count())
{
CharRange current = target->ranges[index];
// Presumably "current lies entirely before/after range" (no overlap): skip it.
if (current<range || current>range)
{
index++;
}
else if (current.begin < range.begin)
{
// current starts before range and overlaps it:
// split current at range.begin into [current.begin, range.begin-1]
// and [range.begin, current.end], then move on.
target->ranges.RemoveAt(index);
target->ranges.Add(CharRange(current.begin, range.begin - 1));
target->ranges.Add(CharRange(range.begin, current.end));
index++;
}
else if (current.begin > range.begin)
{
// range starts before current and overlaps it:
// store the uncovered head [range.begin, current.begin-1] and
// continue with the remainder of range, which now starts at current.begin.
target->ranges.Add(CharRange(range.begin, current.begin - 1));
range.begin = current.begin;
}
else if (current.end < range.end)
{
// Same start, current ends first: current is already stored,
// continue with the part of range that follows it.
range.begin = current.end + 1;
index++;
}
else if (current.end > range.end)
{
// Same start, range ends first: replace current by range and the
// remaining tail [range.end+1, current.end]. Nothing is left to insert.
target->ranges.RemoveAt(index);
target->ranges.Add(range);
target->ranges.Add(CharRange(range.end + 1, current.end));
return;
}
else
{
// range is identical to current: already present.
return;
}
}
// Whatever is left of range did not overlap any stored range.
target->ranges.Add(range);
}
// Feeds all (possibly reversed) ranges of a char-set expression through Process().
void Apply(CharSetExpression* expression, NormalizedCharSet* target)
{
Loop(expression, expression->ranges, target);
}
};
/***********************************************************************
SetNormalizedCharSetAlgorithm
***********************************************************************/
class SetNormalizedCharSetAlgorithm : public CharSetAlgorithm
{
public:
void Process(CharSetExpression* expression, NormalizedCharSet* target, CharRange range)
{
// TODO: (enumerable) foreach
for (vint j = 0; j < target->ranges.Count(); j++)
{
CharRange targetRange = target->ranges[j];
if (range.begin <= targetRange.begin && targetRange.end <= range.end)
{
expression->ranges.Add(targetRange);
}
}
}
void Apply(CharSetExpression* expression, NormalizedCharSet* target)
{
CharRange::List source;
CopyFrom(source, expression->ranges);
expression->ranges.Clear();
Loop(expression, source, target);
expression->reverse = false;
}
};
/***********************************************************************
Expression
***********************************************************************/
// Collects the ranges used by this expression tree, rewrites the tree's
// char-set expressions in terms of them, and copies the collected ranges
// into `subsets`.
void Expression::NormalizeCharSet(CharRange::List& subsets)
{
    NormalizedCharSet collected;

    BuildNormalizedCharSetAlgorithm collector;
    collector.Invoke(this, &collected);

    SetNormalizedCharSetAlgorithm rewriter;
    rewriter.Invoke(this, &collected);

    CopyFrom(subsets, collected.ranges);
}
// Merges the ranges used by this expression tree into `subsets`
// (in/out parameter); the tree itself is only read by the build pass.
void Expression::CollectCharSet(CharRange::List& subsets)
{
    NormalizedCharSet accumulated;
    CopyFrom(accumulated.ranges, subsets);

    BuildNormalizedCharSetAlgorithm collector;
    collector.Invoke(this, &accumulated);

    CopyFrom(subsets, accumulated.ranges);
}
// Rewrites the char-set expressions of this tree in terms of the ranges in
// `subsets`; `subsets` is copied and not written back.
void Expression::ApplyCharSet(CharRange::List& subsets)
{
    NormalizedCharSet provided;
    CopyFrom(provided.ranges, subsets);

    SetNormalizedCharSetAlgorithm rewriter;
    rewriter.Invoke(this, &provided);
}
}
}
/***********************************************************************
.\AST\REGEXEXPRESSION_GENERATEEPSILONNFA.CPP
***********************************************************************/
/***********************************************************************
Author: Zihan Chen (vczh)
Licensed under https://github.com/vczh-libraries/License
***********************************************************************/
namespace vl
{
namespace regex_internal
{
/***********************************************************************
EpsilonNfaAlgorithm
***********************************************************************/
// Holder for a shared pointer to an Automaton.
// NOTE(review): not referenced by the other code visible in this section.
class EpsilonNfaInfo
{
public:
// The automaton kept alive by this object.
Ptr<Automaton> automaton;
};
class EpsilonNfa
{
public:
State* start;
State* end;
EpsilonNfa()
{
start = 0;
end = 0;
}
};
// Visitor that translates an expression tree into an epsilon-NFA. Each Apply()
// creates the states and transitions of one node inside `target` and returns
// the fragment (entry state, exit state) representing that node.
class EpsilonNfaAlgorithm : public RegexExpressionAlgorithm<EpsilonNfa, Automaton*>
{
public:
    // Chains fragment `b` behind fragment `a` with an epsilon transition and
    // returns the combined fragment. An empty `a` (null start) yields `b` unchanged.
    EpsilonNfa Connect(EpsilonNfa a, EpsilonNfa b, Automaton* target)
    {
        if (a.start)
        {
            target->NewEpsilon(a.end, b.start);
            a.end = b.end;
            return a;
        }
        else
        {
            return b;
        }
    }

    // Character set: two states connected by one character transition per range.
    EpsilonNfa Apply(CharSetExpression* expression, Automaton* target) override
    {
        EpsilonNfa nfa;
        nfa.start = target->NewState();
        nfa.end = target->NewState();
        // TODO: (enumerable) foreach
        for (vint i = 0; i < expression->ranges.Count(); i++)
        {
            target->NewChars(nfa.start, nfa.end, expression->ranges[i]);
        }
        return nfa;
    }

    // Loop: `min` mandatory copies of the body, followed by either an unbounded
    // loop (max == -1) or (max - min) optional copies. The order in which the
    // "skip" (Nop) and "enter" (Epsilon) transitions are created differs between
    // preferLong and non-preferLong loops and must be preserved.
    EpsilonNfa Apply(LoopExpression* expression, Automaton* target) override
    {
        EpsilonNfa head;
        // Mandatory repetitions.
        for (vint i = 0; i < expression->min; i++)
        {
            EpsilonNfa body = Invoke(expression->expression, target);
            head = Connect(head, body, target);
        }
        if (expression->max == -1)
        {
            // Unbounded tail: loopBegin -> body -> loopBegin, loopBegin -> loopEnd.
            EpsilonNfa body = Invoke(expression->expression, target);
            if (!head.start)
            {
                head.start = head.end = target->NewState();
            }
            State* loopBegin = head.end;
            State* loopEnd = target->NewState();
            if (expression->preferLong)
            {
                target->NewEpsilon(loopBegin, body.start);
                target->NewEpsilon(body.end, loopBegin);
                target->NewNop(loopBegin, loopEnd);
            }
            else
            {
                target->NewNop(loopBegin, loopEnd);
                target->NewEpsilon(loopBegin, body.start);
                target->NewEpsilon(body.end, loopBegin);
            }
            head.end = loopEnd;
        }
        else if (expression->max > expression->min)
        {
            // Bounded tail: each extra repetition is an optional copy of the body.
            for (vint i = expression->min; i < expression->max; i++)
            {
                EpsilonNfa body = Invoke(expression->expression, target);
                State* start = target->NewState();
                State* end = target->NewState();
                if (expression->preferLong)
                {
                    target->NewEpsilon(start, body.start);
                    target->NewEpsilon(body.end, end);
                    target->NewNop(start, end);
                }
                else
                {
                    target->NewNop(start, end);
                    target->NewEpsilon(start, body.start);
                    target->NewEpsilon(body.end, end);
                }
                body.start = start;
                body.end = end;
                head = Connect(head, body, target);
            }
        }
        if (!head.start)
        {
            // A loop that repeats zero times (e.g. "{0}" or "{0,0}") produced no
            // states at all. Callers use the returned states unconditionally
            // (Connect's second argument, alternation, GenerateEpsilonNfa), so
            // return a single state that matches the empty string instead of
            // a fragment with null states.
            head.start = head.end = target->NewState();
        }
        return head;
    }

    // Sequence: left fragment followed by right fragment.
    EpsilonNfa Apply(SequenceExpression* expression, Automaton* target) override
    {
        EpsilonNfa a = Invoke(expression->left, target);
        EpsilonNfa b = Invoke(expression->right, target);
        return Connect(a, b, target);
    }

    // Alternation: a new entry/exit pair wrapped around both branches with
    // epsilon transitions (left branch first).
    EpsilonNfa Apply(AlternateExpression* expression, Automaton* target) override
    {
        EpsilonNfa result;
        result.start = target->NewState();
        result.end = target->NewState();
        EpsilonNfa a = Invoke(expression->left, target);
        EpsilonNfa b = Invoke(expression->right, target);
        target->NewEpsilon(result.start, a.start);
        target->NewEpsilon(a.end, result.end);
        target->NewEpsilon(result.start, b.start);
        target->NewEpsilon(b.end, result.end);
        return result;
    }

    // "^": two states connected by a begin-of-string transition.
    EpsilonNfa Apply(BeginExpression* expression, Automaton* target) override
    {
        EpsilonNfa result;
        result.start = target->NewState();
        result.end = target->NewState();
        target->NewBeginString(result.start, result.end);
        return result;
    }

    // "$": two states connected by an end-of-string transition.
    EpsilonNfa Apply(EndExpression* expression, Automaton* target) override
    {
        EpsilonNfa result;
        result.start = target->NewState();
        result.end = target->NewState();
        target->NewEndString(result.start, result.end);
        return result;
    }

    // Capture: the body is wrapped in a capture/end transition pair. A named
    // capture is registered in target->captureNames (reusing an existing index);
    // an anonymous capture uses index -1.
    EpsilonNfa Apply(CaptureExpression* expression, Automaton* target) override
    {
        EpsilonNfa result;
        result.start = target->NewState();
        result.end = target->NewState();
        vint capture = -1;
        if (expression->name != U32String::Empty)
        {
            capture = target->captureNames.IndexOf(expression->name);
            if (capture == -1)
            {
                capture = target->captureNames.Count();
                target->captureNames.Add(expression->name);
            }
        }
        EpsilonNfa body = Invoke(expression->expression, target);
        target->NewCapture(result.start, body.start, capture);
        target->NewEnd(body.end, result.end);
        return result;
    }

    // Match (back reference): two states connected by a match transition that
    // carries the capture index (-1 when unnamed) and expression->index.
    EpsilonNfa Apply(MatchExpression* expression, Automaton* target) override
    {
        vint capture = -1;
        if (expression->name != U32String::Empty)
        {
            capture = target->captureNames.IndexOf(expression->name);
            if (capture == -1)
            {
                capture = target->captureNames.Count();
                target->captureNames.Add(expression->name);
            }
        }
        EpsilonNfa result;
        result.start = target->NewState();
        result.end = target->NewState();
        target->NewMatch(result.start, result.end, capture, expression->index);
        return result;
    }

    // Positive look-ahead: body wrapped in a positive/end transition pair.
    EpsilonNfa Apply(PositiveExpression* expression, Automaton* target) override
    {
        EpsilonNfa result;
        result.start = target->NewState();
        result.end = target->NewState();
        EpsilonNfa body = Invoke(expression->expression, target);
        target->NewPositive(result.start, body.start);
        target->NewEnd(body.end, result.end);
        return result;
    }

    // Negative look-ahead: like the positive form, plus a negative-fail
    // transition that connects entry and exit directly.
    EpsilonNfa Apply(NegativeExpression* expression, Automaton* target) override
    {
        EpsilonNfa result;
        result.start = target->NewState();
        result.end = target->NewState();
        EpsilonNfa body = Invoke(expression->expression, target);
        target->NewNegative(result.start, body.start);
        target->NewEnd(body.end, result.end);
        target->NewNegativeFail(result.start, result.end);
        return result;
    }

    // Using-expressions cannot be translated into states; reaching this is an error.
    EpsilonNfa Apply(UsingExpression* expression, Automaton* target) override
    {
        CHECK_FAIL(L"RegexExpression::GenerateEpsilonNfa()#UsingExpression cannot create state machine.");
    }
};
/***********************************************************************
Expression
***********************************************************************/
// Builds a new automaton for this expression: the fragment's entry state
// becomes the automaton's start state and its exit state is marked final.
Ptr<Automaton> Expression::GenerateEpsilonNfa()
{
    auto automaton = Ptr(new Automaton);

    EpsilonNfaAlgorithm builder;
    EpsilonNfa machine = builder.Invoke(this, automaton.Obj());

    machine.end->finalState = true;
    automaton->startState = machine.start;
    return automaton;
}
}
}
/***********************************************************************
.\AST\REGEXEXPRESSION_HASNOEXTENSION.CPP
***********************************************************************/
/***********************************************************************
Author: Zihan Chen (vczh)
Licensed under https://github.com/vczh-libraries/License
***********************************************************************/
namespace vl
{
namespace regex_internal
{
/***********************************************************************
HasNoExtensionAlgorithm
***********************************************************************/
// Visitor answering whether a tree consists solely of char sets, preferLong
// loops, sequences and alternations. Any other node kind makes it false.
// The `target` parameter is unused; null is passed down on recursion.
class HasNoExtensionAlgorithm : public RegexExpressionAlgorithm<bool, void*>
{
public:
    bool Apply(CharSetExpression* expression, void* target) override
    {
        return true;
    }
    bool Apply(LoopExpression* expression, void* target) override
    {
        // A loop that is not preferLong disqualifies the tree.
        if (!expression->preferLong)
        {
            return false;
        }
        return Invoke(expression->expression, nullptr);
    }
    bool Apply(SequenceExpression* expression, void* target) override
    {
        if (!Invoke(expression->left, nullptr))
        {
            return false;
        }
        return Invoke(expression->right, nullptr);
    }
    bool Apply(AlternateExpression* expression, void* target) override
    {
        if (!Invoke(expression->left, nullptr))
        {
            return false;
        }
        return Invoke(expression->right, nullptr);
    }

    // Every remaining node kind disqualifies the tree.
    bool Apply(BeginExpression* expression, void* target) override
    {
        return false;
    }
    bool Apply(EndExpression* expression, void* target) override
    {
        return false;
    }
    bool Apply(CaptureExpression* expression, void* target) override
    {
        return false;
    }
    bool Apply(MatchExpression* expression, void* target) override
    {
        return false;
    }
    bool Apply(PositiveExpression* expression, void* target) override
    {
        return false;
    }
    bool Apply(NegativeExpression* expression, void* target) override
    {
        return false;
    }
    bool Apply(UsingExpression* expression, void* target) override
    {
        return false;
    }
};
/***********************************************************************
Expression
***********************************************************************/
bool Expression::HasNoExtension()
{
return HasNoExtensionAlgorithm().Invoke(this, 0);
}
}
}
/***********************************************************************
.\AST\REGEXEXPRESSION_ISEQUAL.CPP
***********************************************************************/
/***********************************************************************
Author: Zihan Chen (vczh)
Licensed under https://github.com/vczh-libraries/License
***********************************************************************/
namespace vl
{
namespace regex_internal
{
/***********************************************************************
IsEqualAlgorithm
***********************************************************************/
// Visitor performing a structural comparison of two expression trees.
// Each Apply() returns true only when `target` has the same node type as
// `expression`, all compared fields are equal, and all children compare equal.
class IsEqualAlgorithm : public RegexExpressionAlgorithm<bool, Expression*>
{
public:
    bool Apply(CharSetExpression* expression, Expression* target) override
    {
        auto other = dynamic_cast<CharSetExpression*>(target);
        if (!other)
        {
            return false;
        }
        if (expression->reverse != other->reverse)
        {
            return false;
        }
        if (expression->ranges.Count() != other->ranges.Count())
        {
            return false;
        }
        // Ranges are compared position by position.
        // TODO: (enumerable) foreach:indexed
        for (vint index = 0; index < expression->ranges.Count(); index++)
        {
            if (expression->ranges[index] != other->ranges[index])
            {
                return false;
            }
        }
        return true;
    }
    bool Apply(LoopExpression* expression, Expression* target) override
    {
        auto other = dynamic_cast<LoopExpression*>(target);
        return other
            && expression->min == other->min
            && expression->max == other->max
            && expression->preferLong == other->preferLong
            && Invoke(expression->expression, other->expression.Obj());
    }
    bool Apply(SequenceExpression* expression, Expression* target) override
    {
        auto other = dynamic_cast<SequenceExpression*>(target);
        return other
            && Invoke(expression->left, other->left.Obj())
            && Invoke(expression->right, other->right.Obj());
    }
    bool Apply(AlternateExpression* expression, Expression* target) override
    {
        auto other = dynamic_cast<AlternateExpression*>(target);
        return other
            && Invoke(expression->left, other->left.Obj())
            && Invoke(expression->right, other->right.Obj());
    }
    bool Apply(BeginExpression* expression, Expression* target) override
    {
        // No fields to compare: matching node type is enough.
        return dynamic_cast<BeginExpression*>(target) != nullptr;
    }
    bool Apply(EndExpression* expression, Expression* target) override
    {
        // No fields to compare: matching node type is enough.
        return dynamic_cast<EndExpression*>(target) != nullptr;
    }
    bool Apply(CaptureExpression* expression, Expression* target) override
    {
        auto other = dynamic_cast<CaptureExpression*>(target);
        if (!other)
        {
            return false;
        }
        if (expression->name != other->name)
        {
            return false;
        }
        return Invoke(expression->expression, other->expression.Obj());
    }
    bool Apply(MatchExpression* expression, Expression* target) override
    {
        auto other = dynamic_cast<MatchExpression*>(target);
        if (!other)
        {
            return false;
        }
        if (expression->name != other->name)
        {
            return false;
        }
        return expression->index == other->index;
    }
    bool Apply(PositiveExpression* expression, Expression* target) override
    {
        auto other = dynamic_cast<PositiveExpression*>(target);
        return other && Invoke(expression->expression, other->expression.Obj());
    }
    bool Apply(NegativeExpression* expression, Expression* target) override
    {
        auto other = dynamic_cast<NegativeExpression*>(target);
        return other && Invoke(expression->expression, other->expression.Obj());
    }
    bool Apply(UsingExpression* expression, Expression* target) override
    {
        auto other = dynamic_cast<UsingExpression*>(target);
        if (!other)
        {
            return false;
        }
        return !(expression->name != other->name);
    }
};
/***********************************************************************
Expression
***********************************************************************/
// Structurally compares this expression tree with `expression`
// using IsEqualAlgorithm.
bool Expression::IsEqual(vl::regex_internal::Expression* expression)
{
    IsEqualAlgorithm comparer;
    return comparer.Invoke(this, expression);
}
}
}
/***********************************************************************
.\AST\REGEXPARSER.CPP
***********************************************************************/
/***********************************************************************
Author: Zihan Chen (vczh)
Licensed under https://github.com/vczh-libraries/License
***********************************************************************/
namespace vl
{
namespace regex_internal
{
/***********************************************************************
Helper Functions
***********************************************************************/
// Consumes one character from `input` if it equals `c`.
// Returns true and advances `input` on a match; otherwise leaves `input`
// untouched and returns false.
bool IsChar(const char32_t*& input, char32_t c)
{
    if (*input != c)
    {
        return false;
    }
    input++;
    return true;
}
template<vint Size>
bool IsChars(const char32_t*& input, const char32_t(&chars)[Size])
{
for (char32_t c : chars)
{
if (*input == c)
{
input++;
return true;
}
}
return false;
}
template<vint Size>
bool IsStr(const char32_t*& input, const char32_t(&str)[Size])
{
for (vint i = 0; i < Size - 1; i++)
{
if (input[i] != str[i]) return false;
}
input += Size - 1;
return true;
}
// Consumes a run of decimal digits from `input` and stores its value in
// `number` (which is reset to 0 first, even when no digit is found).
// Returns true when at least one digit was consumed.
bool IsPositiveInteger(const char32_t*& input, vint& number)
{
    const char32_t* const first = input;
    number = 0;
    for (; U'0' <= *input && *input <= U'9'; input++)
    {
        number = number * 10 + (*input) - U'0';
    }
    return input != first;
}
// Consumes an identifier ([A-Za-z_][A-Za-z0-9_]*) from `input`.
// On success the identifier is stored in `name`, `input` is advanced past it
// and true is returned; otherwise nothing is changed and false is returned.
bool IsName(const char32_t*& input, U32String& name)
{
    auto isNameStart = [](char32_t c)
    {
        return (U'A' <= c && c <= U'Z') || (U'a' <= c && c <= U'z') || c == U'_';
    };
    auto isNamePart = [](char32_t c)
    {
        return (U'A' <= c && c <= U'Z') || (U'a' <= c && c <= U'z') || (U'0' <= c && c <= U'9') || c == U'_';
    };

    if (!isNameStart(*input))
    {
        return false;
    }
    const char32_t* cursor = input + 1;
    while (isNamePart(*cursor))
    {
        cursor++;
    }
    name = U32String::CopyFrom(input, vint(cursor - input));
    input = cursor;
    return true;
}
// Parses one loop suffix at `input`: "+", "*", "?", "{n}", "{n,}" or "{n,m}",
// each optionally followed by "?" which turns preferLong off.
// Returns a LoopExpression without a body (the caller attaches it), or null
// when `input` does not start with a loop suffix.
// Throws ArgumentException for a malformed "{...}" form.
Ptr<LoopExpression> ParseLoop(const char32_t*& input)
{
    vint min = 0;
    vint max = 0;   // -1 means "unbounded"

    if (!*input)
    {
        return nullptr;
    }

    if (IsChar(input, U'+'))
    {
        min = 1;
        max = -1;
    }
    else if (IsChar(input, U'*'))
    {
        min = 0;
        max = -1;
    }
    else if (IsChar(input, U'?'))
    {
        min = 0;
        max = 1;
    }
    else if (IsChar(input, U'{'))
    {
        bool wellFormed = IsPositiveInteger(input, min);
        if (wellFormed)
        {
            if (!IsChar(input, U','))
            {
                // "{n}": exactly n times.
                max = min;
            }
            else if (!IsPositiveInteger(input, max))
            {
                // "{n,}": no upper bound.
                max = -1;
            }
            wellFormed = IsChar(input, U'}');
        }
        if (!wellFormed)
        {
            throw ArgumentException(L"Regular expression syntax error: Illegal loop expression.", L"vl::regex_internal::ParseLoop", L"input");
        }
    }
    else
    {
        return nullptr;
    }

    auto loop = Ptr(new LoopExpression);
    loop->min = min;
    loop->max = max;
    loop->preferLong = !IsChar(input, U'?');
    return loop;
}
// Parses one "character level" unit at `input`:
//   - "^" / "$"                     -> BeginExpression / EndExpression
//   - an escape started by "\" or "/" -> CharSetExpression for the escaped
//                                        character or predefined class
//   - "[...]" / "[^...]"            -> CharSetExpression built from the listed
//                                        characters and a-b ranges
//   - any other ordinary character  -> CharSetExpression for that character
// Returns null (leaving `input` where it was) at the end of the input or when
// the next character is one of "()+*?{}|".
// Throws ArgumentException for an illegal escape or a malformed "[...]".
Ptr<Expression> ParseCharSet(const char32_t*& input)
{
if (!*input)
{
return 0;
}
else if (IsChar(input, U'^'))
{
return Ptr(new BeginExpression);
}
else if (IsChar(input, U'$'))
{
return Ptr(new EndExpression);
}
else if (IsChar(input, U'\\') || IsChar(input, U'/'))
{
// Escape sequence; *input is now the character following the escape prefix.
auto expression = Ptr(new CharSetExpression);
expression->reverse = false;
switch (*input)
{
case U'.':
// Escaped dot: every code point from 1 to MaxChar32.
expression->ranges.Add(CharRange(1, MaxChar32));
break;
case U'r':
expression->ranges.Add(CharRange(U'\r', U'\r'));
break;
case U'n':
expression->ranges.Add(CharRange(U'\n', U'\n'));
break;
case U't':
expression->ranges.Add(CharRange(U'\t', U'\t'));
break;
// Escaped syntax characters stand for themselves.
case U'\\':case U'/':case U'(':case U')':case U'+':case U'*':case U'?':case U'|':
case U'{':case U'}':case U'[':case U']':case U'<':case U'>':
case U'^':case U'$':case U'!':case U'=':
expression->ranges.Add(CharRange(*input, *input));
break;
// Each upper-case class sets `reverse` and then intentionally falls through
// into its lower-case counterpart to share the range list.
case U'S':
expression->reverse = true;
case U's':
expression->ranges.Add(CharRange(U' ', U' '));
expression->ranges.Add(CharRange(U'\r', U'\r'));
expression->ranges.Add(CharRange(U'\n', U'\n'));
expression->ranges.Add(CharRange(U'\t', U'\t'));
break;
case U'D':
expression->reverse = true;
case U'd':
expression->ranges.Add(CharRange(U'0', U'9'));
break;
case U'L':
expression->reverse = true;
case U'l':
expression->ranges.Add(CharRange(U'_', U'_'));
expression->ranges.Add(CharRange(U'A', U'Z'));
expression->ranges.Add(CharRange(U'a', U'z'));
break;
case U'W':
expression->reverse = true;
case U'w':
expression->ranges.Add(CharRange(U'_', U'_'));
expression->ranges.Add(CharRange(U'0', U'9'));
expression->ranges.Add(CharRange(U'A', U'Z'));
expression->ranges.Add(CharRange(U'a', U'z'));
break;
default:
// Also reached when the escape prefix is the last character of the input.
throw ArgumentException(L"Regular expression syntax error: Illegal character escaping.", L"vl::regex_internal::ParseCharSet", L"input");
}
input++;
return expression;
}
else if (IsChar(input, U'['))
{
auto expression = Ptr(new CharSetExpression);
// A leading "^" inverts the set.
if (IsChar(input, U'^'))
{
expression->reverse = true;
}
else
{
expression->reverse = false;
}
// `a` and `b` are the endpoints of the range being assembled.
// midState == true means `a` has been read and `b` is still pending;
// every character read toggles it.
bool midState = false;
char32_t a = U'\0';
char32_t b = U'\0';
while (true)
{
// Step 1: read one (possibly escaped) character into `a` or `b`.
if (IsChar(input, U'\\') || IsChar(input, U'/'))
{
char32_t c = U'\0';
switch (*input)
{
case U'r':
c = U'\r';
break;
case U'n':
c = U'\n';
break;
case U't':
c = U'\t';
break;
case U'-':case U'[':case U']':case U'\\':case U'/':case U'^':case U'$':
c = *input;
break;
default:
throw ArgumentException(L"Regular expression syntax error: Illegal character escaping, only \"rnt-[]\\/\" are legal escaped characters in [].", L"vl::regex_internal::ParseCharSet", L"input");
}
input++;
midState ? b = c : a = c;
midState = !midState;
}
else if (IsChars(input, U"-]"))
{
// "-" or "]" where a character is required.
goto THROW_EXCEPTION;
}
else if (*input)
{
midState ? b = *input++ : a = *input++;
midState = !midState;
}
else
{
// Input ended before the closing "]".
goto THROW_EXCEPTION;
}
// Step 2: decide what follows the character just read.
if (IsChar(input, U']'))
{
// End of the set. If only `a` was read, the range is the single character a.
if (midState)
{
b = a;
}
if (!expression->AddRangeWithConflict(CharRange(a, b)))
{
goto THROW_EXCEPTION;
}
break;
}
else if (IsChar(input, U'-'))
{
// "-" is only legal directly after the first endpoint; midState stays
// true so that the next character read becomes `b`.
if (!midState)
{
goto THROW_EXCEPTION;
}
}
else
{
// Another character follows: store the current range (a single character
// when `b` is still pending) and start assembling a new one.
if (midState)
{
b = a;
}
if (expression->AddRangeWithConflict(CharRange(a, b)))
{
midState = false;
}
else
{
goto THROW_EXCEPTION;
}
}
}
return expression;
THROW_EXCEPTION:
throw ArgumentException(L"Regular expression syntax error: Illegal character set definition.");
}
else if (IsChars(input, U"()+*?{}|"))
{
// Not a character unit: undo the character consumed by IsChars and let the
// caller try something else.
input--;
return 0;
}
else
{
// Ordinary character: matches itself.
auto expression = Ptr(new CharSetExpression);
expression->reverse = false;
expression->ranges.Add(CharRange(*input, *input));
input++;
return expression;
}
}
// Parses one parenthesized construct at `input`:
//   (=expr)            positive look-ahead
//   (!expr)            negative look-ahead
//   (<&name>)          using a named sub expression
//   (<$name>) (<$name;index>) (<$index>)   match a previous capture
//   (<name>expr)       named capture
//   (?expr)            anonymous capture
//   (expr)             grouping only
// Returns null when `input` does not start with "(".
// Throws ArgumentException when a required token is missing.
Ptr<Expression> ParseFunction(const char32_t*& input)
{
    // Small helpers: consume a required token or throw the matching error.
    auto requireRightBracket = [&input]()
    {
        if (!IsChar(input, U')'))
        {
            throw ArgumentException(L"Regular expression syntax error: \")\" expected.", L"vl::regex_internal::ParseFunction", L"input");
        }
    };
    auto requireGreater = [&input]()
    {
        if (!IsChar(input, U'>'))
        {
            throw ArgumentException(L"Regular expression syntax error: \">\" expected.", L"vl::regex_internal::ParseFunction", L"input");
        }
    };
    auto requireName = [&input](U32String& name)
    {
        if (!IsName(input, name))
        {
            throw ArgumentException(L"Regular expression syntax error: Identifier expected.", L"vl::regex_internal::ParseFunction", L"input");
        }
    };
    auto requireNumber = [&input](vint& number)
    {
        if (!IsPositiveInteger(input, number))
        {
            throw ArgumentException(L"Regular expression syntax error: Number expected.", L"vl::regex_internal::ParseFunction", L"input");
        }
    };

    // The prefixes are tested from the most specific to the least specific.
    if (IsStr(input, U"(="))
    {
        Ptr<Expression> body = ParseExpression(input);
        requireRightBracket();
        auto positive = Ptr(new PositiveExpression);
        positive->expression = body;
        return positive;
    }
    if (IsStr(input, U"(!"))
    {
        Ptr<Expression> body = ParseExpression(input);
        requireRightBracket();
        auto negative = Ptr(new NegativeExpression);
        negative->expression = body;
        return negative;
    }
    if (IsStr(input, U"(<&"))
    {
        U32String name;
        requireName(name);
        requireGreater();
        requireRightBracket();
        auto usingNode = Ptr(new UsingExpression);
        usingNode->name = name;
        return usingNode;
    }
    if (IsStr(input, U"(<$"))
    {
        U32String name;
        vint index = -1;
        if (IsName(input, name))
        {
            // The index is optional after a name, but mandatory after ";".
            if (IsChar(input, U';'))
            {
                requireNumber(index);
            }
        }
        else
        {
            // Without a name an index is mandatory.
            requireNumber(index);
        }
        requireGreater();
        requireRightBracket();
        auto match = Ptr(new MatchExpression);
        match->name = name;
        match->index = index;
        return match;
    }
    if (IsStr(input, U"(<"))
    {
        U32String name;
        requireName(name);
        requireGreater();
        auto body = ParseExpression(input);
        requireRightBracket();
        auto capture = Ptr(new CaptureExpression);
        capture->name = name;
        capture->expression = body;
        return capture;
    }
    if (IsStr(input, U"(?"))
    {
        auto body = ParseExpression(input);
        requireRightBracket();
        auto capture = Ptr(new CaptureExpression);
        capture->expression = body;
        return capture;
    }
    if (IsChar(input, U'('))
    {
        auto body = ParseExpression(input);
        requireRightBracket();
        return body;
    }
    return nullptr;
}
// Parses one unit: a character-level item or a parenthesized construct,
// followed by any number of loop suffixes. Each suffix wraps everything parsed
// so far. Returns null when no unit starts at `input`.
Ptr<Expression> ParseUnit(const char32_t*& input)
{
    Ptr<Expression> unit = ParseCharSet(input);
    if (!unit)
    {
        unit = ParseFunction(input);
        if (!unit)
        {
            return nullptr;
        }
    }

    for (Ptr<LoopExpression> loop = ParseLoop(input); loop; loop = ParseLoop(input))
    {
        loop->expression = unit;
        unit = loop;
    }
    return unit;
}
// Parses a run of consecutive units and folds them, left to right, into nested
// SequenceExpression nodes. Returns the single unit when there is only one,
// or null when no unit starts at `input`.
Ptr<Expression> ParseJoin(const char32_t*& input)
{
    auto expression = ParseUnit(input);
    for (auto next = ParseUnit(input); next; next = ParseUnit(input))
    {
        auto sequence = Ptr(new SequenceExpression);
        sequence->left = expression;
        sequence->right = next;
        expression = sequence;
    }
    return expression;
}
// Parses alternatives separated by "|" and folds them, left to right, into
// nested AlternateExpression nodes.
// Returns the single alternative when there is no "|", or null when nothing
// parsable starts at `input`.
// Throws ArgumentException when either side of a "|" is missing.
Ptr<Expression> ParseAlt(const char32_t*& input)
{
    auto expression = ParseJoin(input);
    while (IsChar(input, U'|'))
    {
        // A "|" with nothing on its left (e.g. "|a" or "(|a)") would otherwise
        // create an alternation whose left operand is null, which the
        // tree-walking algorithms dereference unconditionally.
        if (!expression)
        {
            throw ArgumentException(L"Regular expression syntax error: Expression expected.", L"vl::regex_internal::ParseAlt", L"input");
        }
        auto right = ParseJoin(input);
        if (!right)
        {
            throw ArgumentException(L"Regular expression syntax error: Expression expected.", L"vl::regex_internal::ParseAlt", L"input");
        }
        auto alternate = Ptr(new AlternateExpression);
        alternate->left = expression;
        alternate->right = right;
        expression = alternate;
    }
    return expression;
}
// Entry point for parsing a (sub) expression; the alternation level is the
// outermost level of the grammar.
Ptr<Expression> ParseExpression(const char32_t*& input)
{
    Ptr<Expression> parsed = ParseAlt(input);
    return parsed;
}
// Parses a complete regular expression: zero or more "(<#name>expr)"
// definitions followed by the main expression, which must consume the whole
// text. Every ArgumentException raised while parsing is converted into a
// RegexException carrying the message, the source text and the offset of the
// parser position at the time of the error.
Ptr<RegexExpression> ParseRegexExpression(const U32String& code)
{
    auto regex = Ptr(new RegexExpression);
    const char32_t* start = code.Buffer();
    const char32_t* input = start;
    try
    {
        // Leading named sub expression definitions.
        while (IsStr(input, U"(<#"))
        {
            U32String definitionName;
            if (!IsName(input, definitionName))
            {
                throw ArgumentException(L"Regular expression syntax error: Identifier expected.", L"vl::regex_internal::ParseRegexExpression", L"code");
            }
            if (!IsChar(input, U'>'))
            {
                throw ArgumentException(L"Regular expression syntax error: \">\" expected.", L"vl::regex_internal::ParseFunction", L"input");
            }

            Ptr<Expression> definitionBody = ParseExpression(input);
            if (!IsChar(input, U')'))
            {
                throw ArgumentException(L"Regular expression syntax error: \")\" expected.", L"vl::regex_internal::ParseFunction", L"input");
            }

            // Each definition name may be used only once.
            if (regex->definitions.Keys().Contains(definitionName))
            {
                throw ArgumentException(L"Regular expression syntax error: Found duplicated sub expression name: \"" + u32tow(definitionName) + L"\". ", L"vl::regex_internal::ParseFunction", L"input");
            }
            regex->definitions.Add(definitionName, definitionBody);
        }

        // The main expression.
        regex->expression = ParseExpression(input);
        if (!regex->expression)
        {
            throw ArgumentException(L"Regular expression syntax error: Expression expected.", L"vl::regex_internal::ParseUnit", L"input");
        }
        // Anything left over means the text is not a single valid expression.
        if (*input)
        {
            throw ArgumentException(L"Regular expression syntax error: Found unnecessary tokens.", L"vl::regex_internal::ParseUnit", L"input");
        }
        return regex;
    }
    catch (const ArgumentException& e)
    {
        throw RegexException(e.Message(), code, input - start);
    }
}
// Converts a literal string into regex text matching exactly that string:
// syntax characters get a "\" prefix, and CR/LF/TAB become "\r", "\n", "\t".
U32String EscapeTextForRegex(const U32String& literalString)
{
    U32String escaped;
    for (vint index = 0; index < literalString.Length(); index++)
    {
        const char32_t ch = literalString[index];
        if (ch == U'\r')
        {
            escaped += U"\\r";
        }
        else if (ch == U'\n')
        {
            escaped += U"\\n";
        }
        else if (ch == U'\t')
        {
            escaped += U"\\t";
        }
        else
        {
            switch (ch)
            {
            case U'\\':case U'/':case U'(':case U')':case U'+':case U'*':case U'?':case U'|':
            case U'{':case U'}':case U'[':case U']':case U'<':case U'>':
            case U'^':case U'$':case U'!':case U'=':
                // Syntax character: prefix with a backslash.
                escaped += U32String(U"\\") + U32String::FromChar(ch);
                break;
            default:
                escaped += U32String::FromChar(ch);
            }
        }
    }
    return escaped;
}
// Converts escaped regex text back into the literal string: an escape prefix
// ("\" or "/") followed by r/n/t yields CR/LF/TAB, followed by anything else
// yields that character. An escape prefix at the very end is copied as is.
U32String UnescapeTextForRegex(const U32String& escapedText)
{
    U32String unescaped;
    for (vint index = 0; index < escapedText.Length(); index++)
    {
        char32_t ch = escapedText[index];
        const bool isEscapePrefix = (ch == U'\\' || ch == U'/');
        if (!isEscapePrefix || !(index < escapedText.Length() - 1))
        {
            unescaped += U32String::FromChar(ch);
            continue;
        }

        // Consume the escaped character as well.
        index++;
        ch = escapedText[index];
        switch (ch)
        {
        case U'r':
            unescaped += U"\r";
            break;
        case U'n':
            unescaped += U"\n";
            break;
        case U't':
            unescaped += U"\t";
            break;
        default:
            unescaped += U32String::FromChar(ch);
        }
    }
    return unescaped;
}
// Rewrites escaped regex text so that every escape uses "\" as its prefix
// (a "/" prefix is replaced by "\"). The escaped character is kept unchanged;
// an escape prefix at the very end of the text is copied as is.
U32String NormalizeEscapedTextForRegex(const U32String& escapedText)
{
    U32String normalized;
    const vint lastIndex = escapedText.Length() - 1;
    vint index = 0;
    while (index <= lastIndex)
    {
        const char32_t ch = escapedText[index++];
        const bool isEscapePrefix = (ch == U'\\' || ch == U'/');
        if (isEscapePrefix && index <= lastIndex)
        {
            // Emit the normalized prefix together with the escaped character.
            const char32_t escapedChar = escapedText[index++];
            normalized += U32String(U"\\") + U32String::FromChar(escapedChar);
        }
        else
        {
            normalized += U32String::FromChar(ch);
        }
    }
    return normalized;
}
bool IsRegexEscapedLiteralString(const U32String& regex)
{
for (vint i = 0; i < regex.Length(); i++)
{
char32_t c = regex[i];
if (c == U'\\' || c == U'/')
{
i++;
}
else
{
switch (c)
{
case U'\\':case U'/':case U'(':case U')':case U'+':case U'*':case U'?':case U'|':
case U'{':case U'}':case U'[':case U']':case U'<':case U'>':
case U'^':case U'$':case U'!':case U'=':
return false;
}
}
}
return true;
}
}
}
/***********************************************************************
.\AST\REGEXWRITER.CPP
***********************************************************************/
/***********************************************************************
Author: Zihan Chen (vczh)
Licensed under https://github.com/vczh-libraries/License
***********************************************************************/
namespace vl
{
namespace regex
{
using namespace vl::regex_internal;
/***********************************************************************
RegexNode
***********************************************************************/
// Wraps an expression tree in a RegexNode; the node stores the given pointer.
RegexNode::RegexNode(Ptr<vl::regex_internal::Expression> _expression)
:expression(_expression)
{
}
// One or more repetitions of this node (max -1 means unbounded).
RegexNode RegexNode::Some()const
{
    const vint minimum = 1;
    const vint unbounded = -1;
    return Loop(minimum, unbounded);
}
// Zero or more repetitions of this node (max -1 means unbounded).
RegexNode RegexNode::Any()const
{
    const vint minimum = 0;
    const vint unbounded = -1;
    return Loop(minimum, unbounded);
}
// Zero or one occurrence of this node.
RegexNode RegexNode::Opt()const
{
    const vint minimum = 0;
    const vint maximum = 1;
    return Loop(minimum, maximum);
}
// Wraps this node in a preferLong loop repeating between `min` and `max` times.
RegexNode RegexNode::Loop(vint min, vint max)const
{
    auto loop = Ptr(new LoopExpression);
    loop->expression = expression;
    loop->preferLong = true;
    loop->min = min;
    loop->max = max;
    return RegexNode(loop);
}
// At least `min` repetitions of this node (max -1 means unbounded).
RegexNode RegexNode::AtLeast(vint min)const
{
    const vint unbounded = -1;
    return Loop(min, unbounded);
}
// Sequence: this node followed by `node`.
RegexNode RegexNode::operator+(const RegexNode& node)const
{
    auto sequence = Ptr(new SequenceExpression);
    sequence->right = node.expression;
    sequence->left = expression;
    return RegexNode(sequence);
}
// Alternation: this node or `node`.
RegexNode RegexNode::operator|(const RegexNode& node)const
{
    auto alternate = Ptr(new AlternateExpression);
    alternate->right = node.expression;
    alternate->left = expression;
    return RegexNode(alternate);
}
// Unary plus: wraps this node in a PositiveExpression.
RegexNode RegexNode::operator+()const
{
    auto positive = Ptr(new PositiveExpression);
    positive->expression = expression;
    RegexNode wrapped(positive);
    return wrapped;
}
// Unary minus: wraps this node in a NegativeExpression.
RegexNode RegexNode::operator-()const
{
    auto negative = Ptr(new NegativeExpression);
    negative->expression = expression;
    RegexNode wrapped(negative);
    return wrapped;
}
// Logical not: produces a copy of this char-set node with its reverse flag
// toggled. Only valid on char-set expressions; anything else is reported
// through CHECK_ERROR.
RegexNode RegexNode::operator!()const
{
    auto original = dynamic_cast<CharSetExpression*>(expression.Obj());
    CHECK_ERROR(original, L"RegexNode::operator!()#operator ! can only applies on charset expressions.");

    auto inverted = Ptr(new CharSetExpression);
    inverted->reverse = !original->reverse;
    CopyFrom(inverted->ranges, original->ranges);
    return RegexNode(inverted);
}
// Union of two non-reversed char-set nodes. Both operands must be char-set
// expressions with reverse == false; a range of the right operand that cannot
// be added to the result is reported through CHECK_ERROR.
RegexNode RegexNode::operator%(const RegexNode& node)const
{
    auto lhs = dynamic_cast<CharSetExpression*>(expression.Obj());
    auto rhs = dynamic_cast<CharSetExpression*>(node.expression.Obj());
    CHECK_ERROR(lhs && rhs && !lhs->reverse && !rhs->reverse, L"RegexNode::operator%(const RegexNode&)#operator % only connects non-reverse charset expressions.");

    auto merged = Ptr(new CharSetExpression);
    merged->reverse = false;
    CopyFrom(merged->ranges, lhs->ranges);
    // TODO: (enumerable) foreach
    for (vint index = 0; index < rhs->ranges.Count(); index++)
    {
        const bool added = merged->AddRangeWithConflict(rhs->ranges[index]);
        if (!added)
        {
            CHECK_ERROR(false, L"RegexNode::operator%(const RegexNode&)#Failed to create charset expression from operator %.");
        }
    }
    return RegexNode(merged);
}
/***********************************************************************
Regex Writer
***********************************************************************/
// Builds a CaptureExpression that carries `name` and wraps the expression of `node`.
RegexNode rCapture(const U32String& name, const RegexNode& node)
{
    auto capture = Ptr(new CaptureExpression);
    capture->expression = node.expression;
    capture->name = name;
    return RegexNode(capture);
}
// Builds a UsingExpression that refers to `name`.
// NOTE(review): presumably this references a named sub-expression defined elsewhere -- confirm.
RegexNode rUsing(const U32String& name)
{
    auto usingNode = Ptr(new UsingExpression);
    usingNode->name = name;
    RegexNode wrapped(usingNode);
    return wrapped;
}
// Builds a MatchExpression that carries both a capture `name` and an `index`.
RegexNode rMatch(const U32String& name, vint index)
{
    auto match = Ptr(new MatchExpression);
    match->index = index;
    match->name = name;
    return RegexNode(match);
}
// Builds a MatchExpression that carries only an `index`; its `name` is left as constructed.
RegexNode rMatch(vint index)
{
    auto match = Ptr(new MatchExpression);
    match->index = index;
    RegexNode wrapped(match);
    return wrapped;
}
// Builds a node that holds a fresh BeginExpression.
RegexNode rBegin()
{
    auto begin = Ptr(new BeginExpression);
    return RegexNode(begin);
}
// Builds a node that holds a fresh EndExpression.
RegexNode rEnd()
{
    auto end = Ptr(new EndExpression);
    return RegexNode(end);
}
// Builds a non-reversed charset node that contains the single range [a, b].
// When `b` is zero the range collapses to the single character `a`.
// The result of AddRangeWithConflict is not inspected here.
RegexNode rC(char32_t a, char32_t b)
{
    if (b == 0)
    {
        b = a;
    }

    auto charset = Ptr(new CharSetExpression);
    charset->reverse = false;
    charset->AddRangeWithConflict(CharRange(a, b));
    return RegexNode(charset);
}
// Charset node for the decimal digits '0' through '9'.
RegexNode r_d()
{
    const char32_t first = U'0';
    const char32_t last = U'9';
    return rC(first, last);
}
// Charset node for 'a'-'z', 'A'-'Z' and the underscore (no digits).
RegexNode r_l()
{
    const RegexNode lower = rC(U'a', U'z');
    const RegexNode upper = rC(U'A', U'Z');
    const RegexNode underscore = rC(U'_');
    return lower % upper % underscore;
}
// Charset node for '0'-'9', 'a'-'z', 'A'-'Z' and the underscore.
RegexNode r_w()
{
    const RegexNode digits = rC(U'0', U'9');
    const RegexNode lower = rC(U'a', U'z');
    const RegexNode upper = rC(U'A', U'Z');
    const RegexNode underscore = rC(U'_');
    return digits % lower % upper % underscore;
}
// Charset node covering code points 1 through MaxChar32.
// The range starts at 1, so code point 0 is not included.
RegexNode rAnyChar()
{
    const char32_t first = 1;
    return rC(first, MaxChar32);
}
}
}
/***********************************************************************
.\AUTOMATON\REGEXAUTOMATON.CPP
***********************************************************************/
/***********************************************************************
Author: Zihan Chen (vczh)
Licensed under https://github.com/vczh-libraries/License
***********************************************************************/
namespace vl
{
namespace regex_internal
{
using namespace collections;
/***********************************************************************
Automaton
***********************************************************************/
// Creates an automaton whose start state pointer is null.
Automaton::Automaton()
    : startState(nullptr)
{
}
// Allocates a new state, registers it in `states` and returns a raw pointer to it.
// The new state is non-final and its userData is zeroed.
State* Automaton::NewState()
{
    auto created = Ptr(new State);
    State* const raw = created.Obj();
    raw->finalState = false;
    raw->userData = 0;
    states.Add(created);
    return raw;
}
// Allocates a transition from `start` to `end` and wires it into the graph:
// it is appended to start->transitions, to end->inputs and to this automaton's `transitions` list.
// Only `source` and `target` are assigned here; the remaining fields are left to the caller.
Transition* Automaton::NewTransition(State* start, State* end)
{
    auto created = Ptr(new Transition);
    Transition* const raw = created.Obj();
    raw->source = start;
    raw->target = end;
    start->transitions.Add(raw);
    end->inputs.Add(raw);
    transitions.Add(created);
    return raw;
}
// Adds a Transition::Chars edge from `start` to `end` that carries the given character range.
Transition* Automaton::NewChars(State* start, State* end, CharRange range)
{
    Transition* const edge = NewTransition(start, end);
    edge->range = range;
    edge->type = Transition::Chars;
    return edge;
}
// Adds a Transition::Epsilon edge from `start` to `end`.
Transition* Automaton::NewEpsilon(State* start, State* end)
{
    Transition* const edge = NewTransition(start, end);
    edge->type = Transition::Epsilon;
    return edge;
}
// Adds a Transition::BeginString edge from `start` to `end`.
Transition* Automaton::NewBeginString(State* start, State* end)
{
    Transition* const edge = NewTransition(start, end);
    edge->type = Transition::BeginString;
    return edge;
}
// Adds a Transition::EndString edge from `start` to `end`.
Transition* Automaton::NewEndString(State* start, State* end)
{
    Transition* const edge = NewTransition(start, end);
    edge->type = Transition::EndString;
    return edge;
}
// Adds a Transition::Nop edge from `start` to `end`.
Transition* Automaton::NewNop(State* start, State* end)
{
    Transition* const edge = NewTransition(start, end);
    edge->type = Transition::Nop;
    return edge;
}
// Adds a Transition::Capture edge from `start` to `end` that carries the given capture value.
Transition* Automaton::NewCapture(State* start, State* end, vint capture)
{
    Transition* const edge = NewTransition(start, end);
    edge->capture = capture;
    edge->type = Transition::Capture;
    return edge;
}
// Adds a Transition::Match edge from `start` to `end` that carries the given capture and index values.
Transition* Automaton::NewMatch(State* start, State* end, vint capture, vint index)
{
    Transition* const edge = NewTransition(start, end);
    edge->index = index;
    edge->capture = capture;
    edge->type = Transition::Match;
    return edge;
}
// Adds a Transition::Positive edge from `start` to `end`.
Transition* Automaton::NewPositive(State* start, State* end)
{
    Transition* const edge = NewTransition(start, end);
    edge->type = Transition::Positive;
    return edge;
}
// Adds a Transition::Negative edge from `start` to `end`.
Transition* Automaton::NewNegative(State* start, State* end)
{
    Transition* const edge = NewTransition(start, end);
    edge->type = Transition::Negative;
    return edge;
}
// Adds a Transition::NegativeFail edge from `start` to `end`.
Transition* Automaton::NewNegativeFail(State* start, State* end)
{
    Transition* const edge = NewTransition(start, end);
    edge->type = Transition::NegativeFail;
    return edge;
}
// Adds a Transition::End edge from `start` to `end`.
Transition* Automaton::NewEnd(State* start, State* end)
{
    Transition* const edge = NewTransition(start, end);
    edge->type = Transition::End;
    return edge;
}
/***********************************************************************
Helpers
***********************************************************************/
// Epsilon predicate that accepts Epsilon, Nop, Capture and End transitions.
// Every other transition type is rejected.
bool PureEpsilonChecker(Transition* transition)
{
    const auto kind = transition->type;
    return kind == Transition::Epsilon
        || kind == Transition::Nop
        || kind == Transition::Capture
        || kind == Transition::End;
}
// Epsilon predicate that accepts only Transition::Epsilon.
// Nop, Capture, End and all other types are rejected.
bool RichEpsilonChecker(Transition* transition)
{
    return transition->type == Transition::Epsilon;
}
// Tells whether two transitions carry the same input.
// The types must match. Beyond that, Chars compares the range, Capture compares the capture value,
// and Match compares both capture and index. All other types are equal once the types match.
// The source and target states are not compared.
bool AreEqual(Transition* transA, Transition* transB)
{
    const auto kind = transA->type;
    if (kind != transB->type)
    {
        return false;
    }
    if (kind == Transition::Chars)
    {
        return transA->range == transB->range;
    }
    if (kind == Transition::Capture)
    {
        return transA->capture == transB->capture;
    }
    if (kind == Transition::Match)
    {
        return transA->capture == transB->capture && transA->index == transB->index;
    }
    return true;
}
// Walks depth-first from `sourceState` along transitions accepted by `epsilonChecker`.
//   epsilonStates: receives every state reached this way, in visiting order; it doubles as the visited set.
//   transitions:   receives every transition rejected by `epsilonChecker`, in the order it is encountered.
//   targetState:   is marked final when a final state is reached through an accepted transition.
// The finality of `sourceState` itself is not examined here.
void CollectEpsilon(State* targetState, State* sourceState, bool(*epsilonChecker)(Transition*), List<State*>& epsilonStates, List<Transition*>& transitions)
{
    // Already visited: nothing more to collect from this state.
    if (epsilonStates.Contains(sourceState))
    {
        return;
    }
    epsilonStates.Add(sourceState);

    // TODO: (enumerable) foreach:alterable
    for (vint index = 0; index < sourceState->transitions.Count(); index++)
    {
        Transition* const current = sourceState->transitions[index];
        if (!epsilonChecker(current))
        {
            transitions.Add(current);
            continue;
        }

        State* const next = current->target;
        if (epsilonStates.Contains(next))
        {
            continue;
        }
        if (next->finalState)
        {
            targetState->finalState = true;
        }
        CollectEpsilon(targetState, next, epsilonChecker, epsilonStates, transitions);
    }
}
// Builds a new automaton from `source` in which every transition accepted by `epsilonChecker`
// has been folded away.
//   source:         the automaton to convert; it is only read.
//   epsilonChecker: decides which transitions are treated as epsilon.
//   nfaStateMap:    output, filled with (new state -> state of `source` it was created for).
// Returns the new automaton; capture names are copied over from `source`.
// New states are created lazily: the outer loop keeps running while states are appended to the result.
Ptr<Automaton> EpsilonNfaToNfa(Ptr<Automaton> source, bool(*epsilonChecker)(Transition*), Dictionary<State*, State*>& nfaStateMap)
{
    auto result = Ptr(new Automaton);
    Dictionary<State*, State*> sourceToResult;  // state of source -> state of result
    List<State*> closure;                       // epsilon closure of the state being processed
    List<Transition*> outgoing;                 // non-epsilon transitions leaving that closure

    // Seed the result with the counterpart of the source start state.
    State* const firstState = result->NewState();
    sourceToResult.Add(source->startState, firstState);
    nfaStateMap.Add(firstState, source->startState);
    result->startState = result->states[0].Obj();
    CopyFrom(result->captureNames, source->captureNames);

    // TODO: (enumerable) foreach
    for (vint stateIndex = 0; stateIndex < result->states.Count(); stateIndex++)
    {
        State* const resultState = result->states[stateIndex].Obj();
        State* const originState = nfaStateMap[resultState];
        if (originState->finalState)
        {
            resultState->finalState = true;
        }

        // Reset the scratch lists, then gather the closure and its non-epsilon transitions.
        closure.Clear();
        outgoing.Clear();
        CollectEpsilon(resultState, originState, epsilonChecker, closure, outgoing);

        // TODO: (enumerable) foreach
        for (vint edgeIndex = 0; edgeIndex < outgoing.Count(); edgeIndex++)
        {
            Transition* const original = outgoing[edgeIndex];
            State* const originalTarget = original->target;

            // First time this source state is the target of a kept transition: create its counterpart.
            if (!sourceToResult.Keys().Contains(originalTarget))
            {
                State* const created = result->NewState();
                sourceToResult.Add(originalTarget, created);
                nfaStateMap.Add(created, originalTarget);
            }

            // Re-create the transition between the two counterpart states and copy its payload.
            Transition* const copy = result->NewTransition(resultState, sourceToResult[originalTarget]);
            copy->capture = original->capture;
            copy->index = original->index;
            copy->range = original->range;
            copy->type = original->type;
        }
    }
    return result;
}
// Converts `source` into a new automaton in which each state stands for a set of `source` states
// (subset-construction style).
//   source:      the automaton to convert; it is only read.
//   dfaStateMap: output, filled with (new state -> every source state it represents).
//                NOTE(review): assumed to be empty on entry -- confirm against callers.
// Returns the new automaton; capture names are copied over from `source`.
// Transitions leaving the same new state are merged when AreEqual considers them to carry the same input.
// The order of transition classes and of target states follows the order found in `source`.
Ptr<Automaton> NfaToDfa(Ptr<Automaton> source, Group<State*, State*>& dfaStateMap)
{
    auto target = Ptr(new Automaton);
    CopyFrom(target->captureNames, source->captureNames);
    State* startState = target->NewState();
    target->startState = startState;
    dfaStateMap.Add(startState, source->startState);
    // target->NewState() below appends to target->states while this loop is running.
    // NOTE(review): this relies on the collection's range-for tolerating growth during iteration;
    // the element is deliberately taken by value.
    for (auto currentState_ : target->states)
    {
        Group<Transition*, Transition*> nfaClassToTransitions;
        Dictionary<Transition*, Transition*> nfaTransitionToClass;
        List<Transition*> orderedTransitionClasses;
        State* currentState = currentState_.Obj();
        // Iterate through all NFA states which represent the DFA state
        for (auto nfaState : dfaStateMap[currentState])
        {
            // Iterate through all transitions from those NFA states
            for (auto nfaTransition : nfaState->transitions)
            {
                Transition* transitionClass = nullptr;
                // Check if this exact transition has already been assigned to a class
                {
                    vint index = nfaTransitionToClass.Keys().IndexOf(nfaTransition);
                    if (index != -1) transitionClass = nfaTransitionToClass.Values()[index];
                }
                // Otherwise look for an existing class that has the same input as the current transition
                if (transitionClass == nullptr)
                {
                    // TODO: (enumerable) foreach
                    for (vint l = 0; l < orderedTransitionClasses.Count(); l++)
                    {
                        Transition* key = orderedTransitionClasses[l];
                        if (AreEqual(key, nfaTransition))
                        {
                            transitionClass = key;
                            break;
                        }
                    }
                }
                // Create a new class represented by this transition if there is none
                if (transitionClass == nullptr)
                {
                    transitionClass = nfaTransition;
                    orderedTransitionClasses.Add(transitionClass);
                }
                // Group the transition
                nfaClassToTransitions.Add(transitionClass, nfaTransition);
                nfaTransitionToClass.Add(nfaTransition, transitionClass);
            }
        }
        // Iterate through all key transition that represent all existing transition inputs from the same state
        for (auto transitionClass : orderedTransitionClasses)
        {
            auto&& equivalentTransitions = nfaClassToTransitions[transitionClass];
            // Collect the distinct target states, keeping the order in which they are encountered (no sorting)
            List<State*> transitionTargets;
            CopyFrom(
                transitionTargets,
                From(equivalentTransitions)
                    .Select([](auto t) { return t->target; })
                    .Distinct()
                );
            // Check if these NFA states represent a created DFA state
            State* dfaState = nullptr;
            // TODO: (enumerable) foreach on dictionary
            for (vint k = 0; k < dfaStateMap.Count(); k++)
            {
                // Compare two NFA states set
                if (CompareEnumerable(transitionTargets, dfaStateMap.GetByIndex(k)) == 0)
                {
                    dfaState = dfaStateMap.Keys()[k];
                    // A DFA state is only created when no existing one matches, so each set is
                    // registered at most once and the first match is the only one.
                    break;
                }
            }
            // Create a new DFA state if there is not
            if (!dfaState)
            {
                dfaState = target->NewState();
                // TODO: (enumerable) foreach
                for (vint k = 0; k < transitionTargets.Count(); k++)
                {
                    dfaStateMap.Add(dfaState, transitionTargets[k]);
                    if (transitionTargets[k]->finalState)
                    {
                        dfaState->finalState = true;
                    }
                }
            }
            // Create corresponding DFA transition
            Transition* newTransition = target->NewTransition(currentState, dfaState);
            newTransition->capture = transitionClass->capture;
            newTransition->index = transitionClass->index;
            newTransition->range = transitionClass->range;
            newTransition->type = transitionClass->type;
        }
    }
    return target;
}
}
}