mirror of
https://github.com/vczh-libraries/Release.git
synced 2026-02-06 03:42:11 +08:00
4347 lines
119 KiB
C++
4347 lines
119 KiB
C++
/***********************************************************************
|
|
THIS FILE IS AUTOMATICALLY GENERATED. DO NOT MODIFY
|
|
DEVELOPER: Zihan Chen(vczh)
|
|
***********************************************************************/
|
|
#include "VlppRegex.h"
|
|
|
|
/***********************************************************************
|
|
.\REGEX.CPP
|
|
***********************************************************************/
|
|
/***********************************************************************
|
|
Author: Zihan Chen (vczh)
|
|
Licensed under https://github.com/vczh-libraries/License
|
|
***********************************************************************/
|
|
|
|
|
|
namespace vl
|
|
{
|
|
namespace regex_internal
|
|
{
|
|
void ReadInt(stream::IStream& inputStream, vint& value);
|
|
void ReadInts(stream::IStream& inputStream, vint count, vint* values);
|
|
void WriteInt(stream::IStream& outputStream, vint value);
|
|
void WriteInts(stream::IStream& outputStream, vint count, vint* values);
|
|
}
|
|
|
|
namespace regex
|
|
{
|
|
using namespace collections;
|
|
using namespace regex_internal;
|
|
|
|
/***********************************************************************
|
|
String Conversion
|
|
***********************************************************************/
|
|
|
|
// Conversion trait between strings of character type T and U32String.
// Only the explicit specializations are defined; the primary template is
// intentionally left incomplete.
template<typename T>
struct U32;
|
|
|
|
template<>
|
|
struct U32<wchar_t>
|
|
{
|
|
static constexpr U32String(*ToU32)(const WString&) = &wtou32;
|
|
static constexpr WString(*FromU32)(const U32String&) = &u32tow;
|
|
};
|
|
|
|
template<>
|
|
struct U32<char8_t>
|
|
{
|
|
static constexpr U32String(*ToU32)(const U8String&) = &u8tou32;
|
|
static constexpr U8String(*FromU32)(const U32String&) = &u32tou8;
|
|
};
|
|
|
|
template<>
|
|
struct U32<char16_t>
|
|
{
|
|
static constexpr U32String(*ToU32)(const U16String&) = &u16tou32;
|
|
static constexpr U16String(*FromU32)(const U32String&) = &u32tou16;
|
|
};
|
|
|
|
template<>
|
|
struct U32<char32_t>
|
|
{
|
|
static U32String ToU32(const U32String& text) { return text; }
|
|
static U32String FromU32(const U32String& text) { return text; }
|
|
};
|
|
|
|
/***********************************************************************
|
|
RegexMatch_<T>
|
|
***********************************************************************/
|
|
|
|
template<typename T>
|
|
RegexMatch_<T>::RegexMatch_(const ObjectString<T>& _string, PureResult* _result)
|
|
:success(true)
|
|
, result(_string, _result->start, _result->length)
|
|
{
|
|
}
|
|
|
|
template<typename T>
|
|
RegexMatch_<T>::RegexMatch_(const ObjectString<T>& _string, RichResult* _result)
|
|
: success(true)
|
|
, result(_string, _result->start, _result->length)
|
|
{
|
|
// TODO: (enumerable) foreach
|
|
for (vint i = 0; i < _result->captures.Count(); i++)
|
|
{
|
|
CaptureRecord& capture = _result->captures[i];
|
|
if (capture.capture == -1)
|
|
{
|
|
captures.Add(RegexString_<T>(_string, capture.start, capture.length));
|
|
}
|
|
else
|
|
{
|
|
groups.Add(capture.capture, RegexString_<T>(_string, capture.start, capture.length));
|
|
}
|
|
}
|
|
}
|
|
|
|
template<typename T>
|
|
RegexMatch_<T>::RegexMatch_(const RegexString_<T>& _result)
|
|
:success(false)
|
|
, result(_result)
|
|
{
|
|
}
|
|
|
|
template<typename T>
|
|
bool RegexMatch_<T>::Success()const
|
|
{
|
|
return success;
|
|
}
|
|
|
|
template<typename T>
|
|
const RegexString_<T>& RegexMatch_<T>::Result()const
|
|
{
|
|
return result;
|
|
}
|
|
|
|
template<typename T>
|
|
const typename RegexMatch_<T>::CaptureList& RegexMatch_<T>::Captures()const
|
|
{
|
|
return captures;
|
|
}
|
|
|
|
template<typename T>
|
|
const typename RegexMatch_<T>::CaptureGroup& RegexMatch_<T>::Groups()const
|
|
{
|
|
return groups;
|
|
}
|
|
|
|
/***********************************************************************
|
|
RegexBase_
|
|
***********************************************************************/
|
|
|
|
template<typename T>
|
|
void RegexBase_::Process(const ObjectString<T>& text, bool keepEmpty, bool keepSuccess, bool keepFail, typename RegexMatch_<T>::List& matches)const
|
|
{
|
|
if (rich)
|
|
{
|
|
const T* start = text.Buffer();
|
|
const T* input = start;
|
|
RichResult result;
|
|
while (rich->Match(input, start, result))
|
|
{
|
|
vint offset = input - start;
|
|
if (keepFail)
|
|
{
|
|
if (result.start > offset || keepEmpty)
|
|
{
|
|
matches.Add(Ptr(new RegexMatch_<T>(RegexString_<T>(text, offset, result.start - offset))));
|
|
}
|
|
}
|
|
if (keepSuccess)
|
|
{
|
|
matches.Add(Ptr(new RegexMatch_<T>(text, &result)));
|
|
}
|
|
input = start + result.start + result.length;
|
|
}
|
|
if (keepFail)
|
|
{
|
|
vint remain = input - start;
|
|
vint length = text.Length() - remain;
|
|
if (length || keepEmpty)
|
|
{
|
|
matches.Add(Ptr(new RegexMatch_<T>(RegexString_<T>(text, remain, length))));
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
const T* start = text.Buffer();
|
|
const T* input = start;
|
|
PureResult result;
|
|
while (pure->Match(input, start, result))
|
|
{
|
|
vint offset = input - start;
|
|
if (keepFail)
|
|
{
|
|
if (result.start > offset || keepEmpty)
|
|
{
|
|
matches.Add(Ptr(new RegexMatch_<T>(RegexString_<T>(text, offset, result.start - offset))));
|
|
}
|
|
}
|
|
if (keepSuccess)
|
|
{
|
|
matches.Add(Ptr(new RegexMatch_<T>(text, &result)));
|
|
}
|
|
input = start + result.start + result.length;
|
|
}
|
|
if (keepFail)
|
|
{
|
|
vint remain = input - start;
|
|
vint length = text.Length() - remain;
|
|
if (length || keepEmpty)
|
|
{
|
|
matches.Add(Ptr(new RegexMatch_<T>(RegexString_<T>(text, remain, length))));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Releases both interpreters. Deleting a null pointer is a no-op, so no
// explicit null checks are needed.
RegexBase_::~RegexBase_()
{
    delete pure;
    delete rich;
}
|
|
|
|
template<typename T>
|
|
typename RegexMatch_<T>::Ref RegexBase_::MatchHead(const ObjectString<T>& text)const
|
|
{
|
|
if (rich)
|
|
{
|
|
RichResult result;
|
|
if (rich->MatchHead(text.Buffer(), text.Buffer(), result))
|
|
{
|
|
return Ptr(new RegexMatch_<T>(text, &result));
|
|
}
|
|
else
|
|
{
|
|
return nullptr;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
PureResult result;
|
|
if (pure->MatchHead(text.Buffer(), text.Buffer(), result))
|
|
{
|
|
return Ptr(new RegexMatch_<T>(text, &result));
|
|
}
|
|
else
|
|
{
|
|
return nullptr;
|
|
}
|
|
}
|
|
}
|
|
|
|
template<typename T>
|
|
typename RegexMatch_<T>::Ref RegexBase_::Match(const ObjectString<T>& text)const
|
|
{
|
|
if (rich)
|
|
{
|
|
RichResult result;
|
|
if (rich->Match(text.Buffer(), text.Buffer(), result))
|
|
{
|
|
return Ptr(new RegexMatch_<T>(text, &result));
|
|
}
|
|
else
|
|
{
|
|
return nullptr;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
PureResult result;
|
|
if (pure->Match(text.Buffer(), text.Buffer(), result))
|
|
{
|
|
return Ptr(new RegexMatch_<T>(text, &result));
|
|
}
|
|
else
|
|
{
|
|
return nullptr;
|
|
}
|
|
}
|
|
}
|
|
|
|
template<typename T>
|
|
bool RegexBase_::TestHead(const ObjectString<T>& text)const
|
|
{
|
|
if (pure)
|
|
{
|
|
PureResult result;
|
|
return pure->MatchHead(text.Buffer(), text.Buffer(), result);
|
|
}
|
|
else
|
|
{
|
|
RichResult result;
|
|
return rich->MatchHead(text.Buffer(), text.Buffer(), result);
|
|
}
|
|
}
|
|
|
|
template<typename T>
|
|
bool RegexBase_::Test(const ObjectString<T>& text)const
|
|
{
|
|
if (pure)
|
|
{
|
|
PureResult result;
|
|
return pure->Match(text.Buffer(), text.Buffer(), result);
|
|
}
|
|
else
|
|
{
|
|
RichResult result;
|
|
return rich->Match(text.Buffer(), text.Buffer(), result);
|
|
}
|
|
}
|
|
|
|
template<typename T>
|
|
void RegexBase_::Search(const ObjectString<T>& text, typename RegexMatch_<T>::List& matches)const
|
|
{
|
|
Process(text, false, true, false, matches);
|
|
}
|
|
|
|
template<typename T>
|
|
void RegexBase_::Split(const ObjectString<T>& text, bool keepEmptyMatch, typename RegexMatch_<T>::List& matches)const
|
|
{
|
|
Process(text, keepEmptyMatch, false, true, matches);
|
|
}
|
|
|
|
template<typename T>
|
|
void RegexBase_::Cut(const ObjectString<T>& text, bool keepEmptyMatch, typename RegexMatch_<T>::List& matches)const
|
|
{
|
|
Process(text, keepEmptyMatch, true, true, matches);
|
|
}
|
|
|
|
/***********************************************************************
|
|
Regex_<T>
|
|
***********************************************************************/
|
|
|
|
template<typename T>
|
|
Regex_<T>::Regex_(const ObjectString<T>& code, bool preferPure)
|
|
{
|
|
CharRange::List subsets;
|
|
auto regex = ParseRegexExpression(U32<T>::ToU32(code));
|
|
auto expression = regex->Merge();
|
|
expression->NormalizeCharSet(subsets);
|
|
|
|
bool pureRequired = false;
|
|
bool richRequired = false;
|
|
if (preferPure)
|
|
{
|
|
if (expression->HasNoExtension())
|
|
{
|
|
pureRequired = true;
|
|
}
|
|
else
|
|
{
|
|
if (expression->CanTreatAsPure())
|
|
{
|
|
pureRequired = true;
|
|
richRequired = true;
|
|
}
|
|
else
|
|
{
|
|
richRequired = true;
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
richRequired = true;
|
|
}
|
|
|
|
try
|
|
{
|
|
if (pureRequired)
|
|
{
|
|
Dictionary<State*, State*> nfaStateMap;
|
|
Group<State*, State*> dfaStateMap;
|
|
Ptr<Automaton> eNfa = expression->GenerateEpsilonNfa();
|
|
Ptr<Automaton> nfa = EpsilonNfaToNfa(eNfa, PureEpsilonChecker, nfaStateMap);
|
|
Ptr<Automaton> dfa = NfaToDfa(nfa, dfaStateMap);
|
|
pure = new PureInterpretor(dfa, subsets);
|
|
}
|
|
if (richRequired)
|
|
{
|
|
Dictionary<State*, State*> nfaStateMap;
|
|
Group<State*, State*> dfaStateMap;
|
|
Ptr<Automaton> eNfa = expression->GenerateEpsilonNfa();
|
|
Ptr<Automaton> nfa = EpsilonNfaToNfa(eNfa, RichEpsilonChecker, nfaStateMap);
|
|
Ptr<Automaton> dfa = NfaToDfa(nfa, dfaStateMap);
|
|
rich = new RichInterpretor(dfa);
|
|
|
|
for (auto&& name : rich->CaptureNames())
|
|
{
|
|
captureNames.Add(U32<T>::FromU32(name));
|
|
}
|
|
}
|
|
}
|
|
catch (...)
|
|
{
|
|
if (pure)delete pure;
|
|
if (rich)delete rich;
|
|
throw;
|
|
}
|
|
}
|
|
|
|
/***********************************************************************
|
|
RegexTokens_<T>
|
|
***********************************************************************/
|
|
|
|
template<typename T>
|
|
class RegexTokenEnumerator : public Object, public IEnumerator<RegexToken_<T>>
|
|
{
|
|
protected:
|
|
RegexToken_<T> token;
|
|
vint index = -1;
|
|
|
|
PureInterpretor* pure;
|
|
const Array<vint>& stateTokens;
|
|
const T* start;
|
|
vint codeIndex;
|
|
RegexProc_<T> proc;
|
|
|
|
const T* reading;
|
|
vint rowStart = 0;
|
|
vint columnStart = 0;
|
|
bool cacheAvailable = false;
|
|
RegexToken_<T> cacheToken;
|
|
|
|
public:
|
|
RegexTokenEnumerator(const RegexTokenEnumerator& enumerator)
|
|
: token(enumerator.token)
|
|
, index(enumerator.index)
|
|
, pure(enumerator.pure)
|
|
, stateTokens(enumerator.stateTokens)
|
|
, start(enumerator.start)
|
|
, codeIndex(enumerator.codeIndex)
|
|
, proc(enumerator.proc)
|
|
, reading(enumerator.reading)
|
|
, rowStart(enumerator.rowStart)
|
|
, columnStart(enumerator.columnStart)
|
|
, cacheAvailable(enumerator.cacheAvailable)
|
|
, cacheToken(enumerator.cacheToken)
|
|
{
|
|
}
|
|
|
|
RegexTokenEnumerator(PureInterpretor* _pure, const Array<vint>& _stateTokens, const T* _start, vint _codeIndex, RegexProc_<T> _proc)
|
|
:index(-1)
|
|
, pure(_pure)
|
|
, stateTokens(_stateTokens)
|
|
, start(_start)
|
|
, codeIndex(_codeIndex)
|
|
, proc(_proc)
|
|
, reading(_start)
|
|
{
|
|
}
|
|
|
|
IEnumerator<RegexToken_<T>>* Clone()const
|
|
{
|
|
return new RegexTokenEnumerator<T>(*this);
|
|
}
|
|
|
|
const RegexToken_<T>& Current()const
|
|
{
|
|
return token;
|
|
}
|
|
|
|
vint Index()const
|
|
{
|
|
return index;
|
|
}
|
|
|
|
bool Next()
|
|
{
|
|
if (!cacheAvailable && !*reading) return false;
|
|
if (cacheAvailable)
|
|
{
|
|
token = cacheToken;
|
|
cacheAvailable = false;
|
|
}
|
|
else
|
|
{
|
|
token.reading = reading;
|
|
token.start = 0;
|
|
token.length = 0;
|
|
token.token = -2;
|
|
token.completeToken = true;
|
|
}
|
|
|
|
token.rowStart = rowStart;
|
|
token.columnStart = columnStart;
|
|
token.rowEnd = rowStart;
|
|
token.columnEnd = columnStart;
|
|
token.codeIndex = codeIndex;
|
|
|
|
PureResult result;
|
|
while (*reading)
|
|
{
|
|
vint id = -1;
|
|
bool completeToken = true;
|
|
if (!pure->MatchHead(reading, start, result))
|
|
{
|
|
result.start = reading - start;
|
|
|
|
if (id == -1 && result.terminateState != -1)
|
|
{
|
|
vint state = pure->GetRelatedFinalState(result.terminateState);
|
|
if (state != -1)
|
|
{
|
|
id = stateTokens[state];
|
|
}
|
|
}
|
|
|
|
if (id == -1)
|
|
{
|
|
result.length = 1;
|
|
}
|
|
else
|
|
{
|
|
completeToken = false;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
id = stateTokens.Get(result.finalState);
|
|
}
|
|
|
|
if (id != -1 && proc.extendProc)
|
|
{
|
|
RegexProcessingToken token(result.start, result.length, id, completeToken, nullptr);
|
|
proc.extendProc(proc.argument, reading, -1, true, token);
|
|
#if _DEBUG
|
|
CHECK_ERROR(token.interTokenState == nullptr, L"RegexTokenEnumerator::Next()#The extendProc is only allowed to create interTokenState in RegexLexerColorizer.");
|
|
#endif
|
|
result.length = token.length;
|
|
id = token.token;
|
|
completeToken = token.completeToken;
|
|
}
|
|
|
|
if (token.token == -2)
|
|
{
|
|
token.start = result.start;
|
|
token.length = result.length;
|
|
token.token = id;
|
|
token.completeToken = completeToken;
|
|
}
|
|
else if (token.token == id && id == -1)
|
|
{
|
|
token.length += result.length;
|
|
}
|
|
else
|
|
{
|
|
cacheAvailable = true;
|
|
cacheToken.reading = reading;
|
|
cacheToken.start = result.start;
|
|
cacheToken.length = result.length;
|
|
cacheToken.codeIndex = codeIndex;
|
|
cacheToken.token = id;
|
|
cacheToken.completeToken = completeToken;
|
|
}
|
|
reading += result.length;
|
|
|
|
if (cacheAvailable)
|
|
{
|
|
break;
|
|
}
|
|
}
|
|
|
|
index++;
|
|
|
|
for (vint i = 0; i < token.length; i++)
|
|
{
|
|
token.rowEnd = rowStart;
|
|
token.columnEnd = columnStart;
|
|
if (token.reading[i] == L'\n')
|
|
{
|
|
rowStart++;
|
|
columnStart = 0;
|
|
}
|
|
else
|
|
{
|
|
columnStart++;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
void Reset()
|
|
{
|
|
index = -1;
|
|
reading = start;
|
|
cacheAvailable = false;
|
|
}
|
|
|
|
void ReadToEnd(List<RegexToken_<T>>& tokens, bool(*discard)(vint))
|
|
{
|
|
while (Next())
|
|
{
|
|
if (!discard(token.token))
|
|
{
|
|
tokens.Add(token);
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
template<typename T>
|
|
RegexTokens_<T>::RegexTokens_(PureInterpretor* _pure, const Array<vint>& _stateTokens, const ObjectString<T>& _code, vint _codeIndex, RegexProc_<T> _proc)
|
|
:pure(_pure)
|
|
, stateTokens(_stateTokens)
|
|
, code(_code)
|
|
, codeIndex(_codeIndex)
|
|
, proc(_proc)
|
|
{
|
|
}
|
|
|
|
template<typename T>
|
|
RegexTokens_<T>::RegexTokens_(const RegexTokens_<T>& tokens)
|
|
:pure(tokens.pure)
|
|
, stateTokens(tokens.stateTokens)
|
|
, code(tokens.code)
|
|
, codeIndex(tokens.codeIndex)
|
|
, proc(tokens.proc)
|
|
{
|
|
}
|
|
|
|
template<typename T>
|
|
IEnumerator<RegexToken_<T>>* RegexTokens_<T>::CreateEnumerator() const
|
|
{
|
|
return new RegexTokenEnumerator<T>(pure, stateTokens, code.Buffer(), codeIndex, proc);
|
|
}
|
|
|
|
// Default filter for RegexTokens_<T>::ReadToEnd: keeps every token,
// regardless of its id.
bool DefaultDiscard(vint token)
{
    return false;
}
|
|
|
|
template<typename T>
|
|
void RegexTokens_<T>::ReadToEnd(collections::List<RegexToken_<T>>& tokens, bool(*discard)(vint))const
|
|
{
|
|
if (discard == 0)
|
|
{
|
|
discard = &DefaultDiscard;
|
|
}
|
|
RegexTokenEnumerator<T>(pure, stateTokens, code.Buffer(), codeIndex, proc).ReadToEnd(tokens, discard);
|
|
}
|
|
|
|
/***********************************************************************
|
|
RegexLexerWalker_<T>
|
|
***********************************************************************/
|
|
|
|
template<typename T>
|
|
RegexLexerWalker_<T>::RegexLexerWalker_(PureInterpretor* _pure, const Array<vint>& _stateTokens)
|
|
:pure(_pure)
|
|
, stateTokens(_stateTokens)
|
|
{
|
|
}
|
|
|
|
template<typename T>
|
|
RegexLexerWalker_<T>::RegexLexerWalker_(const RegexLexerWalker_<T>& tokens)
|
|
: pure(tokens.pure)
|
|
, stateTokens(tokens.stateTokens)
|
|
{
|
|
}
|
|
|
|
template<typename T>
|
|
vint RegexLexerWalker_<T>::GetStartState()const
|
|
{
|
|
return pure->GetStartState();
|
|
}
|
|
|
|
template<typename T>
|
|
vint RegexLexerWalker_<T>::GetRelatedToken(vint state)const
|
|
{
|
|
vint finalState = state == -1 ? -1 : pure->GetRelatedFinalState(state);
|
|
return finalState == -1 ? -1 : stateTokens.Get(finalState);
|
|
}
|
|
|
|
template<typename T>
|
|
void RegexLexerWalker_<T>::Walk(T input, vint& state, vint& token, bool& finalState, bool& previousTokenStop)const
|
|
{
|
|
vint previousState = state;
|
|
token = -1;
|
|
finalState = false;
|
|
previousTokenStop = false;
|
|
if (state == -1)
|
|
{
|
|
state = pure->GetStartState();
|
|
previousTokenStop = true;
|
|
}
|
|
|
|
state = pure->Transit(input, state);
|
|
if (state == -1)
|
|
{
|
|
previousTokenStop = true;
|
|
if (previousState == -1)
|
|
{
|
|
finalState = true;
|
|
return;
|
|
}
|
|
else if (pure->IsFinalState(previousState))
|
|
{
|
|
state = pure->Transit(input, pure->GetStartState());
|
|
}
|
|
}
|
|
if (pure->IsFinalState(state))
|
|
{
|
|
token = stateTokens.Get(state);
|
|
finalState = true;
|
|
return;
|
|
}
|
|
else
|
|
{
|
|
finalState = state == -1;
|
|
return;
|
|
}
|
|
}
|
|
|
|
template<typename T>
|
|
vint RegexLexerWalker_<T>::Walk(T input, vint state)const
|
|
{
|
|
vint token = -1;
|
|
bool finalState = false;
|
|
bool previousTokenStop = false;
|
|
Walk(input, state, token, finalState, previousTokenStop);
|
|
return state;
|
|
}
|
|
|
|
template<typename T>
|
|
bool RegexLexerWalker_<T>::IsClosedToken(const T* input, vint length)const
|
|
{
|
|
vint state = pure->GetStartState();
|
|
for (vint i = 0; i < length; i++)
|
|
{
|
|
state = pure->Transit(input[i], state);
|
|
if (state == -1) return true;
|
|
if (pure->IsDeadState(state)) return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
template<typename T>
|
|
bool RegexLexerWalker_<T>::IsClosedToken(const ObjectString<T>& input)const
|
|
{
|
|
return IsClosedToken(input.Buffer(), input.Length());
|
|
}
|
|
|
|
/***********************************************************************
|
|
RegexLexerColorizer_<T>
|
|
***********************************************************************/
|
|
|
|
template<typename T>
|
|
RegexLexerColorizer_<T>::RegexLexerColorizer_(const RegexLexerWalker_<T>& _walker, RegexProc_<T> _proc)
|
|
:walker(_walker)
|
|
, proc(_proc)
|
|
{
|
|
internalState.currentState = walker.GetStartState();
|
|
}
|
|
|
|
template<typename T>
|
|
typename RegexLexerColorizer_<T>::InternalState RegexLexerColorizer_<T>::GetInternalState()
|
|
{
|
|
return internalState;
|
|
}
|
|
|
|
template<typename T>
|
|
void RegexLexerColorizer_<T>::SetInternalState(InternalState state)
|
|
{
|
|
internalState = state;
|
|
}
|
|
|
|
template<typename T>
|
|
void RegexLexerColorizer_<T>::Pass(T input)
|
|
{
|
|
WalkOneToken(&input, 1, 0, false);
|
|
}
|
|
|
|
template<typename T>
|
|
vint RegexLexerColorizer_<T>::GetStartState()const
|
|
{
|
|
return walker.GetStartState();
|
|
}
|
|
|
|
template<typename T>
|
|
void RegexLexerColorizer_<T>::CallExtendProcAndColorizeProc(const T* input, vint length, RegexProcessingToken& token, bool colorize)
|
|
{
|
|
vint oldTokenLength = token.length;
|
|
proc.extendProc(proc.argument, input + token.start, length - token.start, false, token);
|
|
#if _DEBUG
|
|
{
|
|
bool pausedAtTheEnd = token.start + token.length == length && !token.completeToken;
|
|
CHECK_ERROR(
|
|
token.completeToken || pausedAtTheEnd,
|
|
L"RegexLexerColorizer::WalkOneToken(const char32_t*, vint, vint, bool)#The extendProc is not allowed pause before the end of the input."
|
|
);
|
|
CHECK_ERROR(
|
|
token.completeToken || token.token != -1,
|
|
L"RegexLexerColorizer::WalkOneToken(const char32_t*, vint, vint, bool)#The extendProc is not allowed to pause without a valid token id."
|
|
);
|
|
CHECK_ERROR(
|
|
oldTokenLength <= token.length,
|
|
L"RegexLexerColorizer::WalkOneToken(const char32_t*, vint, vint, bool)#The extendProc is not allowed to decrease the token length."
|
|
);
|
|
CHECK_ERROR(
|
|
(token.interTokenState == nullptr) == !pausedAtTheEnd,
|
|
L"RegexLexerColorizer::Colorize(const char32_t*, vint, void*)#The extendProc should return an inter token state object if and only if a valid token does not end at the end of the input."
|
|
);
|
|
}
|
|
#endif
|
|
if ((internalState.interTokenState = token.interTokenState))
|
|
{
|
|
internalState.interTokenId = token.token;
|
|
}
|
|
if (colorize)
|
|
{
|
|
proc.colorizeProc(proc.argument, token.start, token.length, token.token);
|
|
}
|
|
}
|
|
|
|
template<typename T>
|
|
vint RegexLexerColorizer_<T>::WalkOneToken(const T* input, vint length, vint start, bool colorize)
|
|
{
|
|
if (internalState.interTokenState)
|
|
{
|
|
RegexProcessingToken token(-1, -1, internalState.interTokenId, false, internalState.interTokenState);
|
|
proc.extendProc(proc.argument, input, length, false, token);
|
|
#if _DEBUG
|
|
{
|
|
bool pausedAtTheEnd = token.length == length && !token.completeToken;
|
|
CHECK_ERROR(
|
|
token.completeToken || pausedAtTheEnd,
|
|
L"RegexLexerColorizer::WalkOneToken(const char32_t*, vint, vint, bool)#The extendProc is not allowed to pause before the end of the input."
|
|
);
|
|
CHECK_ERROR(
|
|
token.completeToken || token.token == internalState.interTokenId,
|
|
L"RegexLexerColorizer::WalkOneToken(const char32_t*, vint, vint, bool)#The extendProc is not allowed to continue pausing with a different token id."
|
|
);
|
|
CHECK_ERROR(
|
|
(token.interTokenState == nullptr) == !pausedAtTheEnd,
|
|
L"RegexLexerColorizer::Colorize(const char32_t*, vint, void*)#The extendProc should return an inter token state object if and only if a valid token does not end at the end of the input."
|
|
);
|
|
}
|
|
#endif
|
|
if (colorize)
|
|
{
|
|
proc.colorizeProc(proc.argument, 0, token.length, token.token);
|
|
}
|
|
if (!(internalState.interTokenState = token.interTokenState))
|
|
{
|
|
internalState.interTokenId = -1;
|
|
}
|
|
return token.length;
|
|
}
|
|
|
|
vint lastFinalStateLength = 0;
|
|
vint lastFinalStateToken = -1;
|
|
vint lastFinalStateState = -1;
|
|
|
|
vint tokenStartState = internalState.currentState;
|
|
for (vint i = start; i < length; i++)
|
|
{
|
|
vint currentToken = -1;
|
|
bool finalState = false;
|
|
bool previousTokenStop = false;
|
|
walker.Walk(input[i], internalState.currentState, currentToken, finalState, previousTokenStop);
|
|
|
|
if (previousTokenStop)
|
|
{
|
|
if (proc.extendProc && lastFinalStateToken != -1)
|
|
{
|
|
RegexProcessingToken token(start, lastFinalStateLength, lastFinalStateToken, true, nullptr);
|
|
CallExtendProcAndColorizeProc(input, length, token, colorize);
|
|
if (token.completeToken)
|
|
{
|
|
internalState.currentState = walker.GetStartState();
|
|
}
|
|
return start + token.length;
|
|
}
|
|
else if (i == start)
|
|
{
|
|
if (tokenStartState == GetStartState())
|
|
{
|
|
if (colorize)
|
|
{
|
|
proc.colorizeProc(proc.argument, start, 1, -1);
|
|
}
|
|
internalState.currentState = walker.GetStartState();
|
|
return i + 1;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if (colorize)
|
|
{
|
|
proc.colorizeProc(proc.argument, start, lastFinalStateLength, lastFinalStateToken);
|
|
}
|
|
internalState.currentState = lastFinalStateState;
|
|
return start + lastFinalStateLength;
|
|
}
|
|
}
|
|
|
|
if (finalState)
|
|
{
|
|
lastFinalStateLength = i + 1 - start;
|
|
lastFinalStateToken = currentToken;
|
|
lastFinalStateState = internalState.currentState;
|
|
}
|
|
}
|
|
|
|
if (lastFinalStateToken != -1 && start + lastFinalStateLength == length)
|
|
{
|
|
if (proc.extendProc)
|
|
{
|
|
RegexProcessingToken token(start, lastFinalStateLength, lastFinalStateToken, true, nullptr);
|
|
CallExtendProcAndColorizeProc(input, length, token, colorize);
|
|
}
|
|
else if (colorize)
|
|
{
|
|
proc.colorizeProc(proc.argument, start, lastFinalStateLength, lastFinalStateToken);
|
|
}
|
|
}
|
|
else if (colorize)
|
|
{
|
|
proc.colorizeProc(proc.argument, start, length - start, walker.GetRelatedToken(internalState.currentState));
|
|
}
|
|
return length;
|
|
}
|
|
|
|
template<typename T>
|
|
void* RegexLexerColorizer_<T>::Colorize(const T* input, vint length)
|
|
{
|
|
vint index = 0;
|
|
while (index != length)
|
|
{
|
|
index = WalkOneToken(input, length, index, true);
|
|
}
|
|
return internalState.interTokenState;
|
|
}
|
|
|
|
/***********************************************************************
|
|
RegexLexerBase_
|
|
***********************************************************************/
|
|
|
|
// Releases the interpreter. Deleting a null pointer is a no-op, so no
// explicit null check is needed.
RegexLexerBase_::~RegexLexerBase_()
{
    delete pure;
}
|
|
|
|
template<typename T>
|
|
RegexTokens_<T> RegexLexerBase_::Parse(const ObjectString<T>& code, RegexProc_<T> proc, vint codeIndex)const
|
|
{
|
|
code.Buffer();
|
|
pure->PrepareForRelatedFinalStateTable();
|
|
return RegexTokens_<T>(pure, stateTokens, code, codeIndex, proc);
|
|
}
|
|
|
|
template<typename T>
|
|
RegexLexerWalker_<T> RegexLexerBase_::Walk()const
|
|
{
|
|
pure->PrepareForRelatedFinalStateTable();
|
|
return RegexLexerWalker_<T>(pure, stateTokens);
|
|
}
|
|
|
|
// Non-template overload: same as Walk<T>() but fixed to wchar_t.
RegexLexerWalker_<wchar_t> RegexLexerBase_::Walk()const
{
    pure->PrepareForRelatedFinalStateTable();
    RegexLexerWalker_<wchar_t> walker(pure, stateTokens);
    return walker;
}
|
|
|
|
template<typename T>
|
|
RegexLexerColorizer_<T> RegexLexerBase_::Colorize(RegexProc_<T> proc)const
|
|
{
|
|
return RegexLexerColorizer_<T>(Walk<T>(), proc);
|
|
}
|
|
|
|
/***********************************************************************
|
|
RegexLexer_<T> (Serialization)
|
|
***********************************************************************/
|
|
|
|
template<typename T>
|
|
RegexLexer_<T>::RegexLexer_(stream::IStream& inputStream)
|
|
{
|
|
pure = new PureInterpretor(inputStream);
|
|
vint count = 0;
|
|
ReadInt(inputStream, count);
|
|
stateTokens.Resize(count);
|
|
if (count > 0)
|
|
{
|
|
ReadInts(inputStream, count, &stateTokens[0]);
|
|
}
|
|
}
|
|
|
|
template<typename T>
|
|
void RegexLexer_<T>::Serialize(stream::IStream& outputStream)
|
|
{
|
|
pure->Serialize(outputStream);
|
|
WriteInt(outputStream, stateTokens.Count());
|
|
if (stateTokens.Count() > 0)
|
|
{
|
|
WriteInts(outputStream, stateTokens.Count(), &stateTokens[0]);
|
|
}
|
|
}
|
|
|
|
/***********************************************************************
|
|
RegexLexer_<T>
|
|
***********************************************************************/
|
|
|
|
template<typename T>
|
|
RegexLexer_<T>::RegexLexer_(const collections::IEnumerable<ObjectString<T>>& tokens)
|
|
{
|
|
// Build DFA for all tokens
|
|
List<Ptr<Expression>> expressions;
|
|
List<Ptr<Automaton>> dfas;
|
|
CharRange::List subsets;
|
|
for (auto&& code : tokens)
|
|
{
|
|
auto regex = ParseRegexExpression(U32<T>::ToU32(code));
|
|
auto expression = regex->Merge();
|
|
expression->CollectCharSet(subsets);
|
|
expressions.Add(expression);
|
|
}
|
|
// TODO: (enumerable) foreach
|
|
for (vint i = 0; i < expressions.Count(); i++)
|
|
{
|
|
Dictionary<State*, State*> nfaStateMap;
|
|
Group<State*, State*> dfaStateMap;
|
|
expressions[i]->ApplyCharSet(subsets);
|
|
auto eNfa = expressions[i]->GenerateEpsilonNfa();
|
|
auto nfa = EpsilonNfaToNfa(eNfa, PureEpsilonChecker, nfaStateMap);
|
|
auto dfa = NfaToDfa(nfa, dfaStateMap);
|
|
dfas.Add(dfa);
|
|
}
|
|
|
|
// Mark all states in DFAs
|
|
// TODO: (enumerable) foreach
|
|
for (vint i = 0; i < dfas.Count(); i++)
|
|
{
|
|
Ptr<Automaton> dfa = dfas[i];
|
|
// TODO: (enumerable) foreach
|
|
for (vint j = 0; j < dfa->states.Count(); j++)
|
|
{
|
|
if (dfa->states[j]->finalState)
|
|
{
|
|
dfa->states[j]->userData = (void*)i;
|
|
}
|
|
else
|
|
{
|
|
dfa->states[j]->userData = (void*)dfas.Count();
|
|
}
|
|
}
|
|
}
|
|
|
|
// Connect all DFAs to an e-NFA
|
|
auto bigEnfa = Ptr(new Automaton);
|
|
// TODO: (enumerable) foreach
|
|
for (vint i = 0; i < dfas.Count(); i++)
|
|
{
|
|
CopyFrom(bigEnfa->states, dfas[i]->states, true);
|
|
CopyFrom(bigEnfa->transitions, dfas[i]->transitions, true);
|
|
}
|
|
bigEnfa->startState = bigEnfa->NewState();
|
|
// TODO: (enumerable) foreach
|
|
for (vint i = 0; i < dfas.Count(); i++)
|
|
{
|
|
bigEnfa->NewEpsilon(bigEnfa->startState, dfas[i]->startState);
|
|
}
|
|
|
|
// Build a single DFA out of the e-NFA
|
|
Dictionary<State*, State*> nfaStateMap;
|
|
Group<State*, State*> dfaStateMap;
|
|
auto bigNfa = EpsilonNfaToNfa(bigEnfa, PureEpsilonChecker, nfaStateMap);
|
|
// TODO: (enumerable) foreach on dictionary
|
|
for (vint i = 0; i < nfaStateMap.Keys().Count(); i++)
|
|
{
|
|
void* userData = nfaStateMap.Values().Get(i)->userData;
|
|
nfaStateMap.Keys()[i]->userData = userData;
|
|
}
|
|
auto bigDfa = NfaToDfa(bigNfa, dfaStateMap);
|
|
// TODO: (enumerable) foreach on group
|
|
for (vint i = 0; i < dfaStateMap.Keys().Count(); i++)
|
|
{
|
|
void* userData = dfaStateMap.GetByIndex(i).Get(0)->userData;
|
|
for (vint j = 1; j < dfaStateMap.GetByIndex(i).Count(); j++)
|
|
{
|
|
void* newData = dfaStateMap.GetByIndex(i).Get(j)->userData;
|
|
if (userData > newData)
|
|
{
|
|
userData = newData;
|
|
}
|
|
}
|
|
dfaStateMap.Keys()[i]->userData = userData;
|
|
}
|
|
|
|
// Build state machine
|
|
pure = new PureInterpretor(bigDfa, subsets);
|
|
stateTokens.Resize(bigDfa->states.Count());
|
|
for (vint i = 0; i < stateTokens.Count(); i++)
|
|
{
|
|
void* userData = bigDfa->states[i]->userData;
|
|
stateTokens[i] = (vint)userData;
|
|
}
|
|
}
|
|
|
|
/***********************************************************************
|
|
Template Instantiation
|
|
***********************************************************************/
|
|
|
|
template class RegexString_<wchar_t>;
|
|
template class RegexString_<char8_t>;
|
|
template class RegexString_<char16_t>;
|
|
template class RegexString_<char32_t>;
|
|
|
|
template class RegexMatch_<wchar_t>;
|
|
template class RegexMatch_<char8_t>;
|
|
template class RegexMatch_<char16_t>;
|
|
template class RegexMatch_<char32_t>;
|
|
|
|
template RegexMatch_<wchar_t>::Ref RegexBase_::MatchHead<wchar_t> (const ObjectString<wchar_t>& text)const;
|
|
template RegexMatch_<wchar_t>::Ref RegexBase_::Match<wchar_t> (const ObjectString<wchar_t>& text)const;
|
|
template bool RegexBase_::TestHead<wchar_t> (const ObjectString<wchar_t>& text)const;
|
|
template bool RegexBase_::Test<wchar_t> (const ObjectString<wchar_t>& text)const;
|
|
template void RegexBase_::Search<wchar_t> (const ObjectString<wchar_t>& text, RegexMatch_<wchar_t>::List& matches)const;
|
|
template void RegexBase_::Split<wchar_t> (const ObjectString<wchar_t>& text, bool keepEmptyMatch, RegexMatch_<wchar_t>::List& matches)const;
|
|
template void RegexBase_::Cut<wchar_t> (const ObjectString<wchar_t>& text, bool keepEmptyMatch, RegexMatch_<wchar_t>::List& matches)const;
|
|
|
|
template RegexMatch_<char8_t>::Ref RegexBase_::MatchHead<char8_t> (const ObjectString<char8_t>& text)const;
|
|
template RegexMatch_<char8_t>::Ref RegexBase_::Match<char8_t> (const ObjectString<char8_t>& text)const;
|
|
template bool RegexBase_::TestHead<char8_t> (const ObjectString<char8_t>& text)const;
|
|
template bool RegexBase_::Test<char8_t> (const ObjectString<char8_t>& text)const;
|
|
template void RegexBase_::Search<char8_t> (const ObjectString<char8_t>& text, RegexMatch_<char8_t>::List& matches)const;
|
|
template void RegexBase_::Split<char8_t> (const ObjectString<char8_t>& text, bool keepEmptyMatch, RegexMatch_<char8_t>::List& matches)const;
|
|
template void RegexBase_::Cut<char8_t> (const ObjectString<char8_t>& text, bool keepEmptyMatch, RegexMatch_<char8_t>::List& matches)const;
|
|
|
|
template RegexMatch_<char16_t>::Ref RegexBase_::MatchHead<char16_t> (const ObjectString<char16_t>& text)const;
|
|
template RegexMatch_<char16_t>::Ref RegexBase_::Match<char16_t> (const ObjectString<char16_t>& text)const;
|
|
template bool RegexBase_::TestHead<char16_t> (const ObjectString<char16_t>& text)const;
|
|
template bool RegexBase_::Test<char16_t> (const ObjectString<char16_t>& text)const;
|
|
template void RegexBase_::Search<char16_t> (const ObjectString<char16_t>& text, RegexMatch_<char16_t>::List& matches)const;
|
|
template void RegexBase_::Split<char16_t> (const ObjectString<char16_t>& text, bool keepEmptyMatch, RegexMatch_<char16_t>::List& matches)const;
|
|
template void RegexBase_::Cut<char16_t> (const ObjectString<char16_t>& text, bool keepEmptyMatch, RegexMatch_<char16_t>::List& matches)const;
|
|
|
|
template RegexMatch_<char32_t>::Ref RegexBase_::MatchHead<char32_t> (const ObjectString<char32_t>& text)const;
|
|
template RegexMatch_<char32_t>::Ref RegexBase_::Match<char32_t> (const ObjectString<char32_t>& text)const;
|
|
template bool RegexBase_::TestHead<char32_t> (const ObjectString<char32_t>& text)const;
|
|
template bool RegexBase_::Test<char32_t> (const ObjectString<char32_t>& text)const;
|
|
template void RegexBase_::Search<char32_t> (const ObjectString<char32_t>& text, RegexMatch_<char32_t>::List& matches)const;
|
|
template void RegexBase_::Split<char32_t> (const ObjectString<char32_t>& text, bool keepEmptyMatch, RegexMatch_<char32_t>::List& matches)const;
|
|
template void RegexBase_::Cut<char32_t> (const ObjectString<char32_t>& text, bool keepEmptyMatch, RegexMatch_<char32_t>::List& matches)const;
|
|
|
|
template class Regex_<wchar_t>;
|
|
template class Regex_<char8_t>;
|
|
template class Regex_<char16_t>;
|
|
template class Regex_<char32_t>;
|
|
|
|
template class RegexTokens_<wchar_t>;
|
|
template class RegexTokens_<char8_t>;
|
|
template class RegexTokens_<char16_t>;
|
|
template class RegexTokens_<char32_t>;
|
|
|
|
template class RegexLexerWalker_<wchar_t>;
|
|
template class RegexLexerWalker_<char8_t>;
|
|
template class RegexLexerWalker_<char16_t>;
|
|
template class RegexLexerWalker_<char32_t>;
|
|
|
|
template class RegexLexerColorizer_<wchar_t>;
|
|
template class RegexLexerColorizer_<char8_t>;
|
|
template class RegexLexerColorizer_<char16_t>;
|
|
template class RegexLexerColorizer_<char32_t>;
|
|
|
|
template RegexTokens_<wchar_t> RegexLexerBase_::Parse<wchar_t> (const ObjectString<wchar_t>& code, RegexProc_<wchar_t> _proc, vint codeIndex)const;
|
|
template RegexLexerWalker_<wchar_t> RegexLexerBase_::Walk<wchar_t> ()const;
|
|
template RegexLexerColorizer_<wchar_t> RegexLexerBase_::Colorize<wchar_t> (RegexProc_<wchar_t> _proc)const;
|
|
|
|
template RegexTokens_<char8_t> RegexLexerBase_::Parse<char8_t> (const ObjectString<char8_t>& code, RegexProc_<char8_t> _proc, vint codeIndex)const;
|
|
template RegexLexerWalker_<char8_t> RegexLexerBase_::Walk<char8_t> ()const;
|
|
template RegexLexerColorizer_<char8_t> RegexLexerBase_::Colorize<char8_t> (RegexProc_<char8_t> _proc)const;
|
|
|
|
template RegexTokens_<char16_t> RegexLexerBase_::Parse<char16_t> (const ObjectString<char16_t>& code, RegexProc_<char16_t> _proc, vint codeIndex)const;
|
|
template RegexLexerWalker_<char16_t> RegexLexerBase_::Walk<char16_t> ()const;
|
|
template RegexLexerColorizer_<char16_t> RegexLexerBase_::Colorize<char16_t> (RegexProc_<char16_t> _proc)const;
|
|
|
|
template RegexTokens_<char32_t> RegexLexerBase_::Parse<char32_t> (const ObjectString<char32_t>& code, RegexProc_<char32_t> _proc, vint codeIndex)const;
|
|
template RegexLexerWalker_<char32_t> RegexLexerBase_::Walk<char32_t> ()const;
|
|
template RegexLexerColorizer_<char32_t> RegexLexerBase_::Colorize<char32_t> (RegexProc_<char32_t> _proc)const;
|
|
|
|
template class RegexLexer_<wchar_t>;
|
|
template class RegexLexer_<char8_t>;
|
|
template class RegexLexer_<char16_t>;
|
|
template class RegexLexer_<char32_t>;
|
|
}
|
|
}
|
|
|
|
/***********************************************************************
|
|
.\REGEXPURE.CPP
|
|
***********************************************************************/
|
|
/***********************************************************************
|
|
Author: Zihan Chen (vczh)
|
|
Licensed under https://github.com/vczh-libraries/License
|
|
***********************************************************************/
|
|
|
|
|
|
namespace vl
|
|
{
|
|
namespace regex_internal
|
|
{
|
|
using namespace collections;
|
|
|
|
/***********************************************************************
|
|
Read
|
|
***********************************************************************/
|
|
|
|
// Read one integer that was serialized as a 32-bit value and store it in `value`.
// Raises through CHECK_ERROR when the stream does not deliver sizeof(vint32_t) bytes.
void ReadInt(stream::IStream& inputStream, vint& value)
{
#ifdef VCZH_64
	// The stored format is 32-bit, so go through a 32-bit temporary and widen afterwards.
	vint32_t stored = 0;
	const auto bytesRead = inputStream.Read(&stored, sizeof(vint32_t));
	CHECK_ERROR(bytesRead == sizeof(vint32_t), L"Failed to deserialize RegexLexer.");
	value = (vint)stored;
#else
	// Read straight into the caller's variable.
	const auto bytesRead = inputStream.Read(&value, sizeof(vint32_t));
	CHECK_ERROR(bytesRead == sizeof(vint32_t), L"Failed to deserialize RegexLexer.");
#endif
}
|
|
|
|
// Read `count` integers that were serialized as 32-bit values and store them in `values`.
// `values` must have room for `count` elements.
// Raises through CHECK_ERROR when the stream does not deliver all requested bytes.
void ReadInts(stream::IStream& inputStream, vint count, vint* values)
{
	// Nothing to read. Returning here also keeps the 64-bit path from
	// taking the address of element 0 of an empty Array.
	if (count == 0) return;

#ifdef VCZH_64
	// The stored format is 32-bit: read into a temporary buffer, then widen element by element.
	Array<vint32_t> xs(count);
	CHECK_ERROR(
		inputStream.Read(&xs[0], sizeof(vint32_t) * count) == sizeof(vint32_t) * count,
		L"Failed to deserialize RegexLexer."
		);
	for (vint i = 0; i < count; i++)
	{
		values[i] = (vint)xs[i];
	}
#else
	// Read straight into the caller's buffer.
	CHECK_ERROR(
		inputStream.Read(values, sizeof(vint32_t) * count) == sizeof(vint32_t) * count,
		L"Failed to deserialize RegexLexer."
		);
#endif
}
|
|
|
|
// Read `count` booleans that were serialized as a bit field (8 flags per byte,
// flag i stored in bit i % 8 of byte i / 8) and store them in `values`.
// Raises through CHECK_ERROR when the stream does not deliver all packed bytes.
void ReadBools(stream::IStream& inputStream, vint count, bool* values)
{
	// Nothing to read. Returning here also avoids taking the address of
	// element 0 of an empty Array.
	if (count == 0) return;

	// Round up to whole bytes.
	Array<vuint8_t> bits((count + 7) / 8);
	CHECK_ERROR(
		inputStream.Read(&bits[0], sizeof(vuint8_t) * bits.Count()) == sizeof(vuint8_t) * bits.Count(),
		L"Failed to deserialize RegexLexer."
		);

	for (vint i = 0; i < count; i++)
	{
		vint x = i / 8;	// byte index
		vint y = i % 8;	// bit index inside that byte
		values[i] = ((bits[x] >> y) & 1) == 1;
	}
}
|
|
|
|
/***********************************************************************
|
|
Write
|
|
***********************************************************************/
|
|
|
|
// Write `value` to the stream as a 32-bit integer.
// Raises through CHECK_ERROR when the stream does not accept sizeof(vint32_t) bytes.
void WriteInt(stream::IStream& outputStream, vint value)
{
#ifdef VCZH_64
	// Narrow to the 32-bit stored format before writing.
	vint32_t narrowed = (vint32_t)value;
	const auto bytesWritten = outputStream.Write(&narrowed, sizeof(vint32_t));
#else
	const auto bytesWritten = outputStream.Write(&value, sizeof(vint32_t));
#endif
	CHECK_ERROR(bytesWritten == sizeof(vint32_t), L"Failed to serialize RegexLexer.");
}
|
|
|
|
// Write `count` integers from `values` to the stream, each as a 32-bit value.
// Raises through CHECK_ERROR when the stream does not accept all bytes.
void WriteInts(stream::IStream& outputStream, vint count, vint* values)
{
	// Nothing to write. Returning here also keeps the 64-bit path from
	// taking the address of element 0 of an empty Array.
	if (count == 0) return;

#ifdef VCZH_64
	// Narrow every element into a temporary 32-bit buffer, then write it in one call.
	Array<vint32_t> xs(count);
	for (vint i = 0; i < count; i++)
	{
		xs[i] = (vint32_t)values[i];
	}
	CHECK_ERROR(
		outputStream.Write(&xs[0], sizeof(vint32_t) * count) == sizeof(vint32_t) * count,
		L"Failed to serialize RegexLexer."
		);
#else
	// Write straight from the caller's buffer.
	CHECK_ERROR(
		outputStream.Write(values, sizeof(vint32_t) * count) == sizeof(vint32_t) * count,
		L"Failed to serialize RegexLexer."
		);
#endif
}
|
|
|
|
// Write `count` booleans from `values` to the stream as a bit field
// (8 flags per byte, flag i stored in bit i % 8 of byte i / 8).
// Raises through CHECK_ERROR when the stream does not accept all packed bytes.
void WriteBools(stream::IStream& outputStream, vint count, bool* values)
{
	// Nothing to write. Returning here also avoids taking the address of
	// element 0 of an empty Array in memset/Write below.
	if (count == 0) return;

	// Round up to whole bytes and start with all bits cleared.
	Array<vuint8_t> bits((count + 7) / 8);
	memset(&bits[0], 0, sizeof(vuint8_t) * bits.Count());

	for (vint i = 0; i < count; i++)
	{
		if (values[i])
		{
			vint x = i / 8;	// byte index
			vint y = i % 8;	// bit index inside that byte
			bits[x] |= (vuint8_t)1 << y;
		}
	}

	CHECK_ERROR(
		outputStream.Write(&bits[0], sizeof(vuint8_t) * bits.Count()) == sizeof(vuint8_t) * bits.Count(),
		L"Failed to serialize RegexLexer."
		);
}
|
|
|
|
/***********************************************************************
|
|
PureInterpretor (Serialization)
|
|
***********************************************************************/
|
|
|
|
// Rebuild an interpretor from the layout produced by PureInterpretor::Serialize:
// stateCount, charSetCount, startState, the number of char ranges, the raw
// CharRange array, the transition table and finally the packed final-state flags.
// Every failed read raises through CHECK_ERROR.
PureInterpretor::PureInterpretor(stream::IStream& inputStream)
{
	ReadInt(inputStream, stateCount);
	ReadInt(inputStream, charSetCount);
	ReadInt(inputStream, startState);
	{
		vint count = 0;
		ReadInt(inputStream, count);
		charRanges.Resize(count);
		if (count > 0)
		{
			// The ranges are stored as a raw memory image of the CharRange array.
			vint size = charRanges.Count() * sizeof(CharRange);
			CHECK_ERROR(inputStream.Read(&charRanges[0], size) == size, L"Failed to deserialize RegexLexer.");
		}
		// charMap is not stored in the stream; it is rebuilt from charRanges.
		ExpandCharRanges();
	}

	transitions = new vint[stateCount * charSetCount];
	ReadInts(inputStream, stateCount * charSetCount, transitions);

	finalState = new bool[stateCount];
	ReadBools(inputStream, stateCount, finalState);
}
|
|
|
|
// Write this interpretor to the stream in the order the deserializing constructor reads it:
// the three scalar fields, the char range count plus the raw CharRange array,
// the transition table, and the packed final-state flags.
void PureInterpretor::Serialize(stream::IStream& outputStream)
{
	WriteInt(outputStream, stateCount);
	WriteInt(outputStream, charSetCount);
	WriteInt(outputStream, startState);

	// The range count is always written; the payload only when there is one.
	WriteInt(outputStream, charRanges.Count());
	if (charRanges.Count() > 0)
	{
		vint byteCount = charRanges.Count() * sizeof(CharRange);
		CHECK_ERROR(outputStream.Write(&charRanges[0], byteCount) == byteCount, L"Failed to serialize RegexLexer.");
	}

	WriteInts(outputStream, stateCount * charSetCount, transitions);
	WriteBools(outputStream, stateCount, finalState);
}
|
|
|
|
/***********************************************************************
|
|
PureInterpretor
|
|
***********************************************************************/
|
|
|
|
void PureInterpretor::ExpandCharRanges()
|
|
{
|
|
for (vint i = 0; i < SupportedCharCount; i++)
|
|
{
|
|
charMap[i] = charSetCount - 1;
|
|
}
|
|
// TODO: (enumerable) foreach
|
|
for (vint i = 0; i < charRanges.Count(); i++)
|
|
{
|
|
CharRange range = charRanges[i];
|
|
for (char32_t j = range.begin; j <= range.end; j++)
|
|
{
|
|
if (j > MaxChar32) break;
|
|
charMap[j] = i;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Build the table-driven interpretor from a DFA and its normalized char ranges.
// The transition table has one row per state and one column per char set;
// an entry is the index of the target state, or -1 when there is no transition.
// Raises through CHECK_ERROR when a transition is not a Chars transition or
// uses a range that is not in `subsets`.
PureInterpretor::PureInterpretor(Ptr<Automaton> dfa, CharRange::List& subsets)
{
	stateCount = dfa->states.Count();
	charSetCount = subsets.Count() + 1;	// one extra column for characters outside all ranges
	startState = dfa->states.IndexOf(dfa->startState);

	// Map char to input index (equivalent char class)
	CopyFrom(charRanges, subsets);
	ExpandCharRanges();

	// Create transitions from DFA, using input index to represent input char
	transitions = new vint[stateCount * charSetCount];
	for (vint stateIndex = 0; stateIndex < stateCount; stateIndex++)
	{
		vint* row = transitions + stateIndex * charSetCount;
		for (vint column = 0; column < charSetCount; column++)
		{
			row[column] = -1;
		}

		State* state = dfa->states[stateIndex].Obj();
		// TODO: (enumerable) foreach
		for (vint edgeIndex = 0; edgeIndex < state->transitions.Count(); edgeIndex++)
		{
			Transition* dfaTransition = state->transitions[edgeIndex];
			if (dfaTransition->type != Transition::Chars)
			{
				CHECK_ERROR(false, L"PureInterpretor::PureInterpretor(Ptr<Automaton>, CharRange::List&)#PureInterpretor only accepts Transition::Chars transitions.");
				continue;
			}

			vint column = subsets.IndexOf(dfaTransition->range);
			CHECK_ERROR(column != -1, L"PureInterpretor::PureInterpretor(Ptr<Automaton>, CharRange::List&)#Specified chars don't appear in the normalized char ranges.");
			row[column] = dfa->states.IndexOf(dfaTransition->target);
		}
	}

	// Mark final states
	finalState = new bool[stateCount];
	for (vint stateIndex = 0; stateIndex < stateCount; stateIndex++)
	{
		finalState[stateIndex] = dfa->states[stateIndex]->finalState;
	}
}
|
|
|
|
// Release the tables owned by this interpretor.
PureInterpretor::~PureInterpretor()
{
	// relatedFinalState only exists after PrepareForRelatedFinalStateTable();
	// delete[] on a null pointer is a no-op, so no explicit test is needed.
	delete[] relatedFinalState;
	delete[] finalState;
	delete[] transitions;
}
|
|
|
|
template<typename TChar>
|
|
bool PureInterpretor::MatchHead(const TChar* input, const TChar* start, PureResult& result)
|
|
{
|
|
CharReader<TChar> reader(input);
|
|
vint currentState = startState;
|
|
vint terminateState = -1;
|
|
vint terminateLength = -1;
|
|
|
|
result.start = input - start;
|
|
result.length = -1;
|
|
result.finalState = -1;
|
|
result.terminateState = -1;
|
|
|
|
while (currentState != -1)
|
|
{
|
|
auto c = reader.Read();
|
|
|
|
terminateState = currentState;
|
|
terminateLength = reader.Index();
|
|
if (finalState[currentState])
|
|
{
|
|
result.length = terminateLength;
|
|
result.finalState = currentState;
|
|
}
|
|
|
|
if (!c) break;
|
|
if (c >= SupportedCharCount) break;
|
|
|
|
vint charIndex = charMap[c];
|
|
currentState = transitions[currentState * charSetCount + charIndex];
|
|
}
|
|
|
|
if (result.finalState == -1)
|
|
{
|
|
if (terminateLength > 0)
|
|
{
|
|
result.terminateState = terminateState;
|
|
}
|
|
result.length = terminateLength;
|
|
return false;
|
|
}
|
|
else
|
|
{
|
|
return true;
|
|
}
|
|
}
|
|
|
|
template<typename TChar>
|
|
bool PureInterpretor::Match(const TChar* input, const TChar* start, PureResult& result)
|
|
{
|
|
CharReader<TChar> reader(input);
|
|
while (reader.Read())
|
|
{
|
|
if (MatchHead(reader.Reading(), start, result))
|
|
{
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// Return the index of the state in which matching begins.
vint PureInterpretor::GetStartState()
{
	return startState;
}
|
|
|
|
// Return the state reached from `state` on the character `input`,
// or -1 when the state index or the character is out of range
// (the table itself also stores -1 for "no transition").
vint PureInterpretor::Transit(char32_t input, vint state)
{
	const bool knownState = 0 <= state && state < stateCount;
	// char32_t is unsigned, so only the upper bound of the character needs checking.
	if (!knownState || input > MaxChar32)
	{
		return -1;
	}
	return transitions[state * charSetCount + charMap[input]];
}
|
|
|
|
bool PureInterpretor::IsFinalState(vint state)
|
|
{
|
|
return 0 <= state && state < stateCount&& finalState[state];
|
|
}
|
|
|
|
// Return true when no transition leaves `state`.
// -1 and any other index outside [0, stateCount) are reported as dead instead of
// indexing the transition table out of bounds; this matches Transit(), which
// returns -1 for such states.
bool PureInterpretor::IsDeadState(vint state)
{
	if (state < 0 || state >= stateCount) return true;

	for (vint i = 0; i < charSetCount; i++)
	{
		if (transitions[state * charSetCount + i] != -1)
		{
			return false;
		}
	}
	return true;
}
|
|
|
|
void PureInterpretor::PrepareForRelatedFinalStateTable()
|
|
{
|
|
if (!relatedFinalState)
|
|
{
|
|
relatedFinalState = new vint[stateCount];
|
|
for (vint i = 0; i < stateCount; i++)
|
|
{
|
|
relatedFinalState[i] = finalState[i] ? i : -1;
|
|
}
|
|
while (true)
|
|
{
|
|
vint modifyCount = 0;
|
|
for (vint i = 0; i < stateCount; i++)
|
|
{
|
|
if (relatedFinalState[i] == -1)
|
|
{
|
|
vint state = -1;
|
|
for (vint j = 0; j < charSetCount; j++)
|
|
{
|
|
vint nextState = transitions[i * charSetCount + j];
|
|
if (nextState != -1)
|
|
{
|
|
state = relatedFinalState[nextState];
|
|
if (state != -1)
|
|
{
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
if (state != -1)
|
|
{
|
|
relatedFinalState[i] = state;
|
|
modifyCount++;
|
|
}
|
|
}
|
|
}
|
|
if (modifyCount == 0)
|
|
{
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Return the entry of the related-final-state table for `state`.
// Returns -1 when the table has not been built yet (see PrepareForRelatedFinalStateTable)
// or when `state` is outside [0, stateCount), instead of reading outside the table.
vint PureInterpretor::GetRelatedFinalState(vint state)
{
	if (!relatedFinalState) return -1;
	if (state < 0 || state >= stateCount) return -1;
	return relatedFinalState[state];
}
|
|
|
|
// PureInterpretor::MatchHead and PureInterpretor::Match are member templates defined in this file,
// so they are explicitly instantiated here for wchar_t, char8_t, char16_t and char32_t.
template bool PureInterpretor::MatchHead<wchar_t>(const wchar_t* input, const wchar_t* start, PureResult& result);
|
|
template bool PureInterpretor::MatchHead<char8_t>(const char8_t* input, const char8_t* start, PureResult& result);
|
|
template bool PureInterpretor::MatchHead<char16_t>(const char16_t* input, const char16_t* start, PureResult& result);
|
|
template bool PureInterpretor::MatchHead<char32_t>(const char32_t* input, const char32_t* start, PureResult& result);
|
|
|
|
template bool PureInterpretor::Match<wchar_t>(const wchar_t* input, const wchar_t* start, PureResult& result);
|
|
template bool PureInterpretor::Match<char8_t>(const char8_t* input, const char8_t* start, PureResult& result);
|
|
template bool PureInterpretor::Match<char16_t>(const char16_t* input, const char16_t* start, PureResult& result);
|
|
template bool PureInterpretor::Match<char32_t>(const char32_t* input, const char32_t* start, PureResult& result);
|
|
}
|
|
}
|
|
|
|
/***********************************************************************
|
|
.\REGEXRICH.CPP
|
|
***********************************************************************/
|
|
/***********************************************************************
|
|
Author: Zihan Chen (vczh)
|
|
Licensed under https://github.com/vczh-libraries/License
|
|
***********************************************************************/
|
|
|
|
|
|
namespace vl
|
|
{
|
|
namespace regex_internal
|
|
{
|
|
|
|
/***********************************************************************
|
|
Data Structures for Backtracking
|
|
***********************************************************************/
|
|
|
|
// Records why a StateSaver was stored (see StateSaver::storeType).
enum class StateStoreType
|
|
{
|
|
// Stored because a positive lookahead transition was taken from this state
Positive,
|
|
// Stored because a negative lookahead transition was taken from this state
Negative,
|
|
// Default value: stored for any other reason
Other
|
|
};
|
|
|
|
template<typename TChar>
|
|
class StateSaver
|
|
{
|
|
public:
|
|
|
|
CharReader<TChar> reader; // Current reading position
|
|
char32_t ch; // Current character
|
|
State* currentState; // Current state
|
|
vint minTransition = 0; // The first transition to backtrack
|
|
vint captureCount = 0; // Available capture count (the list size may larger than this)
|
|
vint stateSaverCount = 0; // Available saver count (the list size may larger than this)
|
|
vint extensionSaverAvailable = -1; // Available extension saver count (the list size may larger than this)
|
|
vint extensionSaverCount = 0; // Available extension saver count (during executing)
|
|
StateStoreType storeType = StateStoreType::Other; // Reason to keep this record
|
|
|
|
StateSaver(const TChar* input, State* _currentState)
|
|
: reader(input)
|
|
, currentState(_currentState)
|
|
{
|
|
ch = reader.Read();
|
|
}
|
|
|
|
StateSaver(const StateSaver&) = default;
|
|
StateSaver& operator=(const StateSaver&) = default;
|
|
|
|
void RestoreReaderTo(StateSaver<TChar>& saver)
|
|
{
|
|
saver.reader = reader;
|
|
saver.ch = ch;
|
|
}
|
|
};
|
|
|
|
template<typename TChar>
|
|
class ExtensionSaver
|
|
{
|
|
public:
|
|
CharReader<TChar> reader; // The reading position
|
|
char32_t ch; // Current character
|
|
vint previous; // Previous extension saver index
|
|
vint captureListIndex; // Where to write the captured text
|
|
Transition* transition; // The extension begin transition (Capture, Positive, Negative)
|
|
|
|
ExtensionSaver(const StateSaver<TChar>& saver)
|
|
: reader(saver.reader)
|
|
, ch(saver.ch)
|
|
{
|
|
}
|
|
|
|
ExtensionSaver(const ExtensionSaver&) = default;
|
|
ExtensionSaver& operator=(const ExtensionSaver&) = default;
|
|
|
|
void RestoreReaderTo(StateSaver<TChar>& saver)
|
|
{
|
|
saver.reader = reader;
|
|
saver.ch = ch;
|
|
}
|
|
};
|
|
}
|
|
|
|
namespace regex_internal
|
|
{
|
|
using namespace collections;
|
|
|
|
template<typename TChar>
|
|
void Push(List<ExtensionSaver<TChar>>& elements, vint& available, vint& count, const ExtensionSaver<TChar>& element)
|
|
{
|
|
if (elements.Count() == count)
|
|
{
|
|
elements.Add(element);
|
|
}
|
|
else
|
|
{
|
|
elements[count] = element;
|
|
}
|
|
auto& current = elements[count];
|
|
current.previous = available;
|
|
available = count++;
|
|
}
|
|
|
|
template<typename TChar>
|
|
ExtensionSaver<TChar> Pop(List<ExtensionSaver<TChar>>& elements, vint& available, vint& count)
|
|
{
|
|
auto& current = elements[available];
|
|
available = current.previous;
|
|
return current;
|
|
}
|
|
|
|
template<typename T>
|
|
void PushNonSaver(List<T>& elements, vint& count, const T& element)
|
|
{
|
|
if (elements.Count() == count)
|
|
{
|
|
elements.Add(element);
|
|
}
|
|
else
|
|
{
|
|
elements[count] = element;
|
|
}
|
|
count++;
|
|
}
|
|
|
|
template<typename T>
|
|
T PopNonSaver(List<T>& elements, vint& count)
|
|
{
|
|
return elements[--count];
|
|
}
|
|
}
|
|
|
|
namespace regex_internal
|
|
{
|
|
|
|
/***********************************************************************
|
|
RichInterpretor
|
|
***********************************************************************/
|
|
|
|
// Attach one UserData record to every state of the automaton and decide for each state
// whether the matcher has to save its parsing state before leaving it.
// A state needs saving when it has a lookahead transition, more than one non-character
// transition, or a mix of character and non-character transitions.
RichInterpretor::RichInterpretor(Ptr<Automaton> _dfa)
	:dfa(_dfa)
{
	datas = new UserData[dfa->states.Count()];

	// TODO: (enumerable) foreach
	for (vint stateIndex = 0; stateIndex < dfa->states.Count(); stateIndex++)
	{
		State* state = dfa->states[stateIndex].Obj();
		vint charEdges = 0;
		vint nonCharEdges = 0;
		bool hasLookahead = false;

		// TODO: (enumerable) foreach
		for (vint edgeIndex = 0; edgeIndex < state->transitions.Count(); edgeIndex++)
		{
			const auto edgeType = state->transitions[edgeIndex]->type;
			if (edgeType == Transition::Chars)
			{
				charEdges++;
				continue;
			}

			nonCharEdges++;
			if (edgeType == Transition::Negative || edgeType == Transition::Positive)
			{
				hasLookahead = true;
			}
		}

		datas[stateIndex].NeedKeepState = hasLookahead || nonCharEdges > 1 || (nonCharEdges != 0 && charEdges != 0);
		state->userData = &datas[stateIndex];
	}
}
|
|
|
|
// Release the per-state UserData array allocated by the constructor.
RichInterpretor::~RichInterpretor()
{
	delete[] datas;
}
|
|
|
|
template<typename TChar>
|
|
bool RichInterpretor::MatchHead(const TChar* input, const TChar* start, RichResult& result)
|
|
{
|
|
List<StateSaver<TChar>> stateSavers;
|
|
List<ExtensionSaver<TChar>> extensionSavers;
|
|
|
|
StateSaver<TChar> currentState(input, dfa->startState);
|
|
|
|
while (!currentState.currentState->finalState)
|
|
{
|
|
bool found = false; // true means at least one transition matches the input
|
|
StateSaver<TChar> oldState = currentState;
|
|
// Iterate through all transitions from the current state
|
|
// TODO: (enumerable) foreach:reversed
|
|
for (vint i = currentState.minTransition; i < currentState.currentState->transitions.Count(); i++)
|
|
{
|
|
Transition* transition = currentState.currentState->transitions[i];
|
|
switch (transition->type)
|
|
{
|
|
case Transition::Chars:
|
|
{
|
|
// match the input if the current character fall into the range
|
|
CharRange range = transition->range;
|
|
found =
|
|
range.begin <= currentState.ch &&
|
|
range.end >= currentState.ch;
|
|
if (found)
|
|
{
|
|
currentState.ch = currentState.reader.Read();
|
|
}
|
|
}
|
|
break;
|
|
case Transition::BeginString:
|
|
{
|
|
// match the input if this is the first character, and it is not consumed
|
|
found = currentState.reader.Index() == 0 && input == start;
|
|
}
|
|
break;
|
|
case Transition::EndString:
|
|
{
|
|
// match the input if this is after the last character, and it is not consumed
|
|
found = currentState.ch == 0;
|
|
}
|
|
break;
|
|
case Transition::Nop:
|
|
{
|
|
// match without any condition
|
|
found = true;
|
|
}
|
|
break;
|
|
case Transition::Capture:
|
|
{
|
|
// Push the capture information
|
|
ExtensionSaver<TChar> saver(currentState);
|
|
saver.captureListIndex = currentState.captureCount;
|
|
saver.transition = transition;
|
|
Push(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount, saver);
|
|
|
|
// Push the capture record, and it will be written if the input matches the regex
|
|
CaptureRecord capture;
|
|
capture.capture = transition->capture;
|
|
capture.start = currentState.reader.Index() + (input - start);
|
|
capture.length = -1;
|
|
PushNonSaver(result.captures, currentState.captureCount, capture);
|
|
|
|
found = true;
|
|
}
|
|
break;
|
|
case Transition::Match:
|
|
{
|
|
vint index = 0;
|
|
for (vint j = 0; j < currentState.captureCount; j++)
|
|
{
|
|
CaptureRecord& capture = result.captures[j];
|
|
// If the capture name matched
|
|
if (capture.capture == transition->capture)
|
|
{
|
|
// If the capture index matched, or it is -1
|
|
if (capture.length != -1 && (transition->index == -1 || transition->index == index))
|
|
{
|
|
// If the captured text matched
|
|
if (memcmp(start + capture.start, input + currentState.reader.Index(), sizeof(TChar) * capture.length) == 0)
|
|
{
|
|
// Consume so much input
|
|
vint targetIndex = currentState.reader.Index() + capture.length;
|
|
while (currentState.reader.Index() < targetIndex)
|
|
{
|
|
currentState.ch = currentState.reader.Read();
|
|
}
|
|
CHECK_ERROR(currentState.reader.Index() == targetIndex, L"vl::regex_internal::RichInterpretor::MatchHead<TChar>(const TChar*, const TChar*, RichResult&)#Input code could be an incorrect unicode sequence.");
|
|
found = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Fail if f the captured text with the specified name and index doesn't match
|
|
if (transition->index != -1 && index == transition->index)
|
|
{
|
|
break;
|
|
}
|
|
else
|
|
{
|
|
index++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
case Transition::Positive:
|
|
{
|
|
// Push the positive lookahead information
|
|
ExtensionSaver<TChar> saver(currentState);
|
|
saver.captureListIndex = -1;
|
|
saver.transition = transition;
|
|
Push(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount, saver);
|
|
|
|
// Set found = true so that PushNonSaver(oldState) happens later
|
|
oldState.storeType = StateStoreType::Positive;
|
|
found = true;
|
|
}
|
|
break;
|
|
case Transition::Negative:
|
|
{
|
|
// Push the positive lookahead information
|
|
|
|
ExtensionSaver<TChar> saver(currentState);
|
|
saver.captureListIndex = -1;
|
|
saver.transition = transition;
|
|
Push(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount, saver);
|
|
|
|
// Set found = true so that PushNonSaver(oldState) happens later
|
|
oldState.storeType = StateStoreType::Negative;
|
|
found = true;
|
|
}
|
|
break;
|
|
case Transition::NegativeFail:
|
|
{
|
|
// NegativeFail will be used when the nagative lookahead failed
|
|
}
|
|
break;
|
|
case Transition::End:
|
|
{
|
|
// Find the corresponding extension saver so that we can know how to deal with a matched sub regex that ends here
|
|
ExtensionSaver extensionSaver = Pop(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount);
|
|
switch (extensionSaver.transition->type)
|
|
{
|
|
case Transition::Capture:
|
|
{
|
|
// Write the captured text
|
|
CaptureRecord& capture = result.captures[extensionSaver.captureListIndex];
|
|
capture.length = currentState.reader.Index() + (input - start) - capture.start;
|
|
found = true;
|
|
}
|
|
break;
|
|
case Transition::Positive:
|
|
// Find the last positive lookahead state saver
|
|
for (vint j = currentState.stateSaverCount - 1; j >= 0; j--)
|
|
{
|
|
auto& stateSaver = stateSavers[j];
|
|
if (stateSaver.storeType == StateStoreType::Positive)
|
|
{
|
|
// restore the parsing state just before matching the positive lookahead, since positive lookahead doesn't consume input
|
|
stateSaver.RestoreReaderTo(oldState);
|
|
oldState.stateSaverCount = j;
|
|
stateSaver.RestoreReaderTo(currentState);
|
|
currentState.stateSaverCount = j;
|
|
break;
|
|
}
|
|
}
|
|
found = true;
|
|
break;
|
|
case Transition::Negative:
|
|
// Find the last negative lookahead state saver
|
|
for (vint j = currentState.stateSaverCount - 1; j >= 0; j--)
|
|
{
|
|
auto& stateSaver = stateSavers[j];
|
|
if (stateSaver.storeType == StateStoreType::Negative)
|
|
{
|
|
// restore the parsing state just before matching the negative lookahead, since positive lookahead doesn't consume input
|
|
oldState = stateSaver;
|
|
oldState.storeType = StateStoreType::Other;
|
|
currentState = stateSaver;
|
|
currentState.storeType = StateStoreType::Other;
|
|
i = currentState.minTransition - 1;
|
|
break;
|
|
}
|
|
}
|
|
break;
|
|
default:;
|
|
}
|
|
}
|
|
break;
|
|
default:;
|
|
}
|
|
|
|
// Save the parsing state when necessary
|
|
if (found)
|
|
{
|
|
UserData* data = (UserData*)currentState.currentState->userData;
|
|
if (data->NeedKeepState)
|
|
{
|
|
oldState.minTransition = i + 1;
|
|
PushNonSaver(stateSavers, currentState.stateSaverCount, oldState);
|
|
}
|
|
currentState.currentState = transition->target;
|
|
currentState.minTransition = 0;
|
|
break;
|
|
}
|
|
}
|
|
|
|
// If no transition from the current state can be used
|
|
if (!found)
|
|
{
|
|
// If there is a chance to do backtracking
|
|
if (currentState.stateSaverCount)
|
|
{
|
|
currentState = PopNonSaver(stateSavers, currentState.stateSaverCount);
|
|
// minTransition - 1 is always valid since the value is stored with adding 1
|
|
// So minTransition - 1 record the transition, which is the reason the parsing state is saved
|
|
if (currentState.currentState->transitions[currentState.minTransition - 1]->type == Transition::Negative)
|
|
{
|
|
// Find the next NegativeFail transition
|
|
// Because when a negative lookahead regex failed to match, it is actually succeeded
|
|
// Since a negative lookahead means we don't want to match this regex
|
|
// TODO: (enumerable) foreach:reversed
|
|
for (vint i = 0; i < currentState.currentState->transitions.Count(); i++)
|
|
{
|
|
Transition* transition = currentState.currentState->transitions[i];
|
|
if (transition->type == Transition::NegativeFail)
|
|
{
|
|
// Restore the state to the target of NegativeFail to let the parsing continue
|
|
currentState.currentState = transition->target;
|
|
currentState.minTransition = 0;
|
|
currentState.storeType = StateStoreType::Other;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (currentState.currentState->finalState)
|
|
{
|
|
// Keep available captures if succeeded
|
|
result.start = input - start;
|
|
result.length = currentState.reader.Index();
|
|
for (vint i = result.captures.Count() - 1; i >= currentState.captureCount; i--)
|
|
{
|
|
result.captures.RemoveAt(i);
|
|
}
|
|
return true;
|
|
}
|
|
else
|
|
{
|
|
// Clear captures if failed
|
|
result.captures.Clear();
|
|
return false;
|
|
}
|
|
}
|
|
|
|
template<typename TChar>
|
|
bool RichInterpretor::Match(const TChar* input, const TChar* start, RichResult& result)
|
|
{
|
|
CharReader<TChar> reader(input);
|
|
while (reader.Read())
|
|
{
|
|
if (MatchHead(reader.Reading(), start, result))
|
|
{
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// Return the capture names stored in the automaton this interpretor was built from.
const List<U32String>& RichInterpretor::CaptureNames()
{
	return dfa->captureNames;
}
|
|
|
|
// RichInterpretor::MatchHead and RichInterpretor::Match are member templates defined in this file,
// so they are explicitly instantiated here for wchar_t, char8_t, char16_t and char32_t.
template bool RichInterpretor::MatchHead<wchar_t>(const wchar_t* input, const wchar_t* start, RichResult& result);
|
|
template bool RichInterpretor::MatchHead<char8_t>(const char8_t* input, const char8_t* start, RichResult& result);
|
|
template bool RichInterpretor::MatchHead<char16_t>(const char16_t* input, const char16_t* start, RichResult& result);
|
|
template bool RichInterpretor::MatchHead<char32_t>(const char32_t* input, const char32_t* start, RichResult& result);
|
|
|
|
template bool RichInterpretor::Match<wchar_t>(const wchar_t* input, const wchar_t* start, RichResult& result);
|
|
template bool RichInterpretor::Match<char8_t>(const char8_t* input, const char8_t* start, RichResult& result);
|
|
template bool RichInterpretor::Match<char16_t>(const char16_t* input, const char16_t* start, RichResult& result);
|
|
template bool RichInterpretor::Match<char32_t>(const char32_t* input, const char32_t* start, RichResult& result);
|
|
}
|
|
}
|
|
|
|
/***********************************************************************
|
|
.\AST\REGEXEXPRESSION.CPP
|
|
***********************************************************************/
|
|
/***********************************************************************
|
|
Author: Zihan Chen (vczh)
|
|
Licensed under https://github.com/vczh-libraries/License
|
|
***********************************************************************/
|
|
|
|
|
|
namespace vl
|
|
{
|
|
namespace regex_internal
|
|
{
|
|
|
|
/***********************************************************************
|
|
MergeAlgorithm
|
|
***********************************************************************/
|
|
|
|
// Argument object handed to every MergeAlgorithm::Apply overload.
class MergeParameter
|
|
{
|
|
public:
|
|
// Named expressions; MergeAlgorithm looks a UsingExpression's name up in this map
Expression::Map definitions;
|
|
// NOTE(review): not used in the visible part of MergeAlgorithm - presumably the regex being merged; confirm against the caller
RegexExpression* regex = nullptr;
|
|
};
|
|
|
|
// Rebuilds an expression tree in which every UsingExpression has been replaced
// by the merged form of the definition it names. All other nodes are copied.
class MergeAlgorithm : public RegexExpressionAlgorithm<Ptr<Expression>, MergeParameter*>
{
public:
	// Copies the ranges and the reverse flag.
	Ptr<Expression> Apply(CharSetExpression* expression, MergeParameter* target) override
	{
		auto copy = Ptr(new CharSetExpression);
		copy->reverse = expression->reverse;
		CopyFrom(copy->ranges, expression->ranges);
		return copy;
	}

	// Copies the loop bounds and greediness, then merges the loop body.
	Ptr<Expression> Apply(LoopExpression* expression, MergeParameter* target) override
	{
		auto copy = Ptr(new LoopExpression);
		copy->min = expression->min;
		copy->max = expression->max;
		copy->preferLong = expression->preferLong;
		copy->expression = Invoke(expression->expression, target);
		return copy;
	}

	// Merges the left operand first, then the right one.
	Ptr<Expression> Apply(SequenceExpression* expression, MergeParameter* target) override
	{
		auto copy = Ptr(new SequenceExpression);
		copy->left = Invoke(expression->left, target);
		copy->right = Invoke(expression->right, target);
		return copy;
	}

	// Merges the left operand first, then the right one.
	Ptr<Expression> Apply(AlternateExpression* expression, MergeParameter* target) override
	{
		auto copy = Ptr(new AlternateExpression);
		copy->left = Invoke(expression->left, target);
		copy->right = Invoke(expression->right, target);
		return copy;
	}

	// Begin-of-string markers carry no data; a fresh node is returned.
	Ptr<Expression> Apply(BeginExpression* expression, MergeParameter* target) override
	{
		return Ptr(new BeginExpression);
	}

	// End-of-string markers carry no data; a fresh node is returned.
	Ptr<Expression> Apply(EndExpression* expression, MergeParameter* target) override
	{
		return Ptr(new EndExpression);
	}

	// Copies the capture name and merges the captured body.
	Ptr<Expression> Apply(CaptureExpression* expression, MergeParameter* target) override
	{
		auto copy = Ptr(new CaptureExpression);
		copy->name = expression->name;
		copy->expression = Invoke(expression->expression, target);
		return copy;
	}

	// Copies the referenced capture name and index.
	Ptr<Expression> Apply(MatchExpression* expression, MergeParameter* target) override
	{
		auto copy = Ptr(new MatchExpression);
		copy->index = expression->index;
		copy->name = expression->name;
		return copy;
	}

	// Merges the body of a positive lookahead.
	Ptr<Expression> Apply(PositiveExpression* expression, MergeParameter* target) override
	{
		auto copy = Ptr(new PositiveExpression);
		copy->expression = Invoke(expression->expression, target);
		return copy;
	}

	// Merges the body of a negative lookahead.
	Ptr<Expression> Apply(NegativeExpression* expression, MergeParameter* target) override
	{
		auto copy = Ptr(new NegativeExpression);
		copy->expression = Invoke(expression->expression, target);
		return copy;
	}

	// Resolves a named reference.
	// - Already merged: the stored expression is returned (the same Ptr is shared by every use).
	// - Currently being merged (null entry): throws ArgumentException for the reference loop.
	// - Defined in target->regex but not merged yet: merges it, caches it and returns it.
	// - Unknown name: throws ArgumentException.
	Ptr<Expression> Apply(UsingExpression* expression, MergeParameter* target) override
	{
		const auto& name = expression->name;

		if (target->definitions.Keys().Contains(name))
		{
			Ptr<Expression> merged = target->definitions[name];
			if (!merged)
			{
				// The placeholder is still in place, so this definition refers back to itself.
				throw ArgumentException(L"Regular expression syntax error: Found reference loops in\"" + u32tow(expression->name) + L"\".", L"vl::regex_internal::RegexExpression::Merge", L"");
			}
			return merged;
		}

		if (!target->regex->definitions.Keys().Contains(name))
		{
			throw ArgumentException(L"Regular expression syntax error: Cannot find sub expression reference\"" + u32tow(expression->name) + L"\".", L"vl::regex_internal::RegexExpression::Merge", L"");
		}

		// Leave a null placeholder while the definition is expanded so that a
		// nested reference to the same name is reported as a loop.
		target->definitions.Add(name, nullptr);
		Ptr<Expression> merged = Invoke(target->regex->definitions[name], target);
		target->definitions.Set(name, merged);
		return merged;
	}
};
|
|
|
|
/***********************************************************************
|
|
CharSetExpression
|
|
***********************************************************************/
|
|
|
|
// Adds a range to this char set unless it conflicts with a range already stored.
// A range given with begin > end is normalized by swapping its bounds first.
// A stored range conflicts when the new range compares neither less nor greater
// than it (CharRange's operators are defined elsewhere; presumably that means
// the two ranges overlap).
// Returns false and leaves the set untouched on conflict, true after adding.
bool CharSetExpression::AddRangeWithConflict(CharRange range)
{
	if (range.end < range.begin)
	{
		const char32_t lower = range.end;
		range.end = range.begin;
		range.begin = lower;
	}

	// TODO: (enumerable) foreach
	for (vint index = 0; index < ranges.Count(); index++)
	{
		const bool separated = range < ranges[index] || range > ranges[index];
		if (!separated)
		{
			return false;
		}
	}

	ranges.Add(range);
	return true;
}
|
|
|
|
/***********************************************************************
|
|
RegexExpression
|
|
***********************************************************************/
|
|
|
|
// Returns the main expression with all named sub expression references inlined.
// Exceptions thrown by MergeAlgorithm (unknown or looping references) propagate.
Ptr<Expression> RegexExpression::Merge()
{
	MergeParameter context;
	context.regex = this;

	MergeAlgorithm algorithm;
	return algorithm.Invoke(expression, &context);
}
|
|
|
|
/***********************************************************************
|
|
Expression::Apply
|
|
***********************************************************************/
|
|
|
|
void CharSetExpression::Apply(IRegexExpressionAlgorithm& algorithm)
|
|
{
|
|
algorithm.Visit(this);
|
|
}
|
|
|
|
void LoopExpression::Apply(IRegexExpressionAlgorithm& algorithm)
|
|
{
|
|
algorithm.Visit(this);
|
|
}
|
|
|
|
void SequenceExpression::Apply(IRegexExpressionAlgorithm& algorithm)
|
|
{
|
|
algorithm.Visit(this);
|
|
}
|
|
|
|
void AlternateExpression::Apply(IRegexExpressionAlgorithm& algorithm)
|
|
{
|
|
algorithm.Visit(this);
|
|
}
|
|
|
|
void BeginExpression::Apply(IRegexExpressionAlgorithm& algorithm)
|
|
{
|
|
algorithm.Visit(this);
|
|
}
|
|
|
|
void EndExpression::Apply(IRegexExpressionAlgorithm& algorithm)
|
|
{
|
|
algorithm.Visit(this);
|
|
}
|
|
|
|
void CaptureExpression::Apply(IRegexExpressionAlgorithm& algorithm)
|
|
{
|
|
algorithm.Visit(this);
|
|
}
|
|
|
|
void MatchExpression::Apply(IRegexExpressionAlgorithm& algorithm)
|
|
{
|
|
algorithm.Visit(this);
|
|
}
|
|
|
|
void PositiveExpression::Apply(IRegexExpressionAlgorithm& algorithm)
|
|
{
|
|
algorithm.Visit(this);
|
|
}
|
|
|
|
void NegativeExpression::Apply(IRegexExpressionAlgorithm& algorithm)
|
|
{
|
|
algorithm.Visit(this);
|
|
}
|
|
|
|
void UsingExpression::Apply(IRegexExpressionAlgorithm& algorithm)
|
|
{
|
|
algorithm.Visit(this);
|
|
}
|
|
}
|
|
}
|
|
|
|
/***********************************************************************
|
|
.\AST\REGEXEXPRESSION_CANTREATASPURE.CPP
|
|
***********************************************************************/
|
|
/***********************************************************************
|
|
Author: Zihan Chen (vczh)
|
|
Licensed under https://github.com/vczh-libraries/License
|
|
***********************************************************************/
|
|
|
|
|
|
namespace vl
|
|
{
|
|
namespace regex_internal
|
|
{
|
|
|
|
/***********************************************************************
|
|
CanTreatAsPureAlgorithm
|
|
***********************************************************************/
|
|
|
|
// Answers Expression::CanTreatAsPure(). The result is true only when the tree
// is built from char sets, greedy loops, sequences, alternations and captures;
// any other node kind makes it false. The void* parameter is never read.
class CanTreatAsPureAlgorithm : public RegexExpressionAlgorithm<bool, void*>
{
public:
	bool Apply(CharSetExpression* expression, void* target) override
	{
		return true;
	}

	// A loop qualifies only when it is greedy (preferLong) and its body qualifies.
	bool Apply(LoopExpression* expression, void* target) override
	{
		if (!expression->preferLong)
		{
			return false;
		}
		return Invoke(expression->expression, nullptr);
	}

	// Both operands must qualify; the right one is only examined if the left one does.
	bool Apply(SequenceExpression* expression, void* target) override
	{
		if (!Invoke(expression->left, nullptr))
		{
			return false;
		}
		return Invoke(expression->right, nullptr);
	}

	// Both branches must qualify; the right one is only examined if the left one does.
	bool Apply(AlternateExpression* expression, void* target) override
	{
		if (!Invoke(expression->left, nullptr))
		{
			return false;
		}
		return Invoke(expression->right, nullptr);
	}

	bool Apply(BeginExpression* expression, void* target) override
	{
		return false;
	}

	bool Apply(EndExpression* expression, void* target) override
	{
		return false;
	}

	// A capture is transparent here: the answer is that of the captured body.
	bool Apply(CaptureExpression* expression, void* target) override
	{
		return Invoke(expression->expression, nullptr);
	}

	bool Apply(MatchExpression* expression, void* target) override
	{
		return false;
	}

	bool Apply(PositiveExpression* expression, void* target) override
	{
		return false;
	}

	bool Apply(NegativeExpression* expression, void* target) override
	{
		return false;
	}

	bool Apply(UsingExpression* expression, void* target) override
	{
		return false;
	}
};
|
|
|
|
/***********************************************************************
|
|
Expression
|
|
***********************************************************************/
|
|
|
|
bool Expression::CanTreatAsPure()
|
|
{
|
|
return CanTreatAsPureAlgorithm().Invoke(this, 0);
|
|
}
|
|
}
|
|
}
|
|
|
|
/***********************************************************************
|
|
.\AST\REGEXEXPRESSION_CHARSET.CPP
|
|
***********************************************************************/
|
|
/***********************************************************************
|
|
Author: Zihan Chen (vczh)
|
|
Licensed under https://github.com/vczh-libraries/License
|
|
***********************************************************************/
|
|
|
|
|
|
namespace vl
|
|
{
|
|
namespace regex_internal
|
|
{
|
|
class NormalizedCharSet
|
|
{
|
|
public:
|
|
CharRange::List ranges;
|
|
};
|
|
|
|
/***********************************************************************
|
|
CharSetAlgorithm
|
|
***********************************************************************/
|
|
|
|
// Common tree walk for algorithms that work on the char sets of an expression.
// Every node that has children forwards to them; nodes without children do
// nothing. Handling of CharSetExpression itself (Apply and Process) is left to
// the derived class.
class CharSetAlgorithm : public RegexExpressionAlgorithm<void, NormalizedCharSet*>
{
public:
	// Called by Loop once for each effective range of a char set.
	virtual void Process(CharSetExpression* expression, NormalizedCharSet* target, CharRange range) = 0;

	// Feeds the effective ranges of `expression` to Process.
	// - Not reversed: every entry of `ranges` is passed through as is.
	// - Reversed: the gaps between the entries are passed instead, covering
	//   1..MaxChar32. This walks `ranges` in index order and relies on the
	//   entries being ascending and non-overlapping (presumably guaranteed by
	//   CharRange::List; verify against its definition).
	void Loop(CharSetExpression* expression, CharRange::List& ranges, NormalizedCharSet* target)
	{
		if (!expression->reverse)
		{
			// TODO: (enumerable) foreach
			for (vint index = 0; index < ranges.Count(); index++)
			{
				Process(expression, target, ranges[index]);
			}
			return;
		}

		// First character that has not been covered by an excluded range yet.
		char32_t next = 1;
		// TODO: (enumerable) foreach
		for (vint index = 0; index < ranges.Count(); index++)
		{
			const CharRange excluded = ranges[index];
			if (next < excluded.begin)
			{
				Process(expression, target, CharRange(next, excluded.begin - 1));
			}
			next = excluded.end + 1;
		}
		if (next <= MaxChar32)
		{
			Process(expression, target, CharRange(next, MaxChar32));
		}
	}

	void Apply(LoopExpression* expression, NormalizedCharSet* target) override
	{
		Invoke(expression->expression, target);
	}

	void Apply(SequenceExpression* expression, NormalizedCharSet* target) override
	{
		Invoke(expression->left, target);
		Invoke(expression->right, target);
	}

	void Apply(AlternateExpression* expression, NormalizedCharSet* target) override
	{
		Invoke(expression->left, target);
		Invoke(expression->right, target);
	}

	// No char sets below a begin marker.
	void Apply(BeginExpression* expression, NormalizedCharSet* target) override
	{
	}

	// No char sets below an end marker.
	void Apply(EndExpression* expression, NormalizedCharSet* target) override
	{
	}

	void Apply(CaptureExpression* expression, NormalizedCharSet* target) override
	{
		Invoke(expression->expression, target);
	}

	// No char sets below a capture back reference.
	void Apply(MatchExpression* expression, NormalizedCharSet* target) override
	{
	}

	void Apply(PositiveExpression* expression, NormalizedCharSet* target) override
	{
		Invoke(expression->expression, target);
	}

	void Apply(NegativeExpression* expression, NormalizedCharSet* target) override
	{
		Invoke(expression->expression, target);
	}

	// Named references are not followed.
	void Apply(UsingExpression* expression, NormalizedCharSet* target) override
	{
	}
};
|
|
|
|
/***********************************************************************
|
|
BuildNormalizedCharSetAlgorithm
|
|
***********************************************************************/
|
|
|
|
class BuildNormalizedCharSetAlgorithm : public CharSetAlgorithm
|
|
{
|
|
public:
|
|
void Process(CharSetExpression* expression, NormalizedCharSet* target, CharRange range)
|
|
{
|
|
vint index = 0;
|
|
while (index < target->ranges.Count())
|
|
{
|
|
CharRange current = target->ranges[index];
|
|
if (current<range || current>range)
|
|
{
|
|
index++;
|
|
}
|
|
else if (current.begin < range.begin)
|
|
{
|
|
// range : [ ?
|
|
// current : [ ]
|
|
target->ranges.RemoveAt(index);
|
|
target->ranges.Add(CharRange(current.begin, range.begin - 1));
|
|
target->ranges.Add(CharRange(range.begin, current.end));
|
|
index++;
|
|
}
|
|
else if (current.begin > range.begin)
|
|
{
|
|
// range : [ ]
|
|
// current : [ ?
|
|
target->ranges.Add(CharRange(range.begin, current.begin - 1));
|
|
range.begin = current.begin;
|
|
}
|
|
else if (current.end < range.end)
|
|
{
|
|
// range : [ ]
|
|
// current : [ ]
|
|
range.begin = current.end + 1;
|
|
index++;
|
|
}
|
|
else if (current.end > range.end)
|
|
{
|
|
// range : [ ]
|
|
// current : [ ]
|
|
target->ranges.RemoveAt(index);
|
|
target->ranges.Add(range);
|
|
target->ranges.Add(CharRange(range.end + 1, current.end));
|
|
return;
|
|
}
|
|
else
|
|
{
|
|
// range : [ ]
|
|
// current : [ ]
|
|
return;
|
|
}
|
|
}
|
|
target->ranges.Add(range);
|
|
}
|
|
|
|
void Apply(CharSetExpression* expression, NormalizedCharSet* target)
|
|
{
|
|
Loop(expression, expression->ranges, target);
|
|
}
|
|
};
|
|
|
|
/***********************************************************************
|
|
SetNormalizedCharSetAlgorithm
|
|
***********************************************************************/
|
|
|
|
class SetNormalizedCharSetAlgorithm : public CharSetAlgorithm
|
|
{
|
|
public:
|
|
void Process(CharSetExpression* expression, NormalizedCharSet* target, CharRange range)
|
|
{
|
|
// TODO: (enumerable) foreach
|
|
for (vint j = 0; j < target->ranges.Count(); j++)
|
|
{
|
|
CharRange targetRange = target->ranges[j];
|
|
if (range.begin <= targetRange.begin && targetRange.end <= range.end)
|
|
{
|
|
expression->ranges.Add(targetRange);
|
|
}
|
|
}
|
|
}
|
|
|
|
void Apply(CharSetExpression* expression, NormalizedCharSet* target)
|
|
{
|
|
CharRange::List source;
|
|
CopyFrom(source, expression->ranges);
|
|
expression->ranges.Clear();
|
|
Loop(expression, source, target);
|
|
expression->reverse = false;
|
|
}
|
|
};
|
|
|
|
/***********************************************************************
|
|
Expression
|
|
***********************************************************************/
|
|
|
|
// Collects the char ranges used by this expression, rewrites every char set in
// the tree in terms of them, and copies the collected ranges into `subsets`.
void Expression::NormalizeCharSet(CharRange::List& subsets)
{
	NormalizedCharSet charSet;

	BuildNormalizedCharSetAlgorithm builder;
	builder.Invoke(this, &charSet);

	SetNormalizedCharSetAlgorithm rewriter;
	rewriter.Invoke(this, &charSet);

	CopyFrom(subsets, charSet.ranges);
}
|
|
|
|
// Merges the char ranges used by this expression into `subsets`. The incoming
// content of `subsets` is the starting point; the expression is not modified.
void Expression::CollectCharSet(CharRange::List& subsets)
{
	NormalizedCharSet charSet;
	CopyFrom(charSet.ranges, subsets);

	BuildNormalizedCharSetAlgorithm builder;
	builder.Invoke(this, &charSet);

	CopyFrom(subsets, charSet.ranges);
}
|
|
|
|
// Rewrites every char set of this expression in terms of the ranges in `subsets`.
// `subsets` is only read.
void Expression::ApplyCharSet(CharRange::List& subsets)
{
	NormalizedCharSet charSet;
	CopyFrom(charSet.ranges, subsets);

	SetNormalizedCharSetAlgorithm rewriter;
	rewriter.Invoke(this, &charSet);
}
|
|
}
|
|
}
|
|
|
|
/***********************************************************************
|
|
.\AST\REGEXEXPRESSION_GENERATEEPSILONNFA.CPP
|
|
***********************************************************************/
|
|
/***********************************************************************
|
|
Author: Zihan Chen (vczh)
|
|
Licensed under https://github.com/vczh-libraries/License
|
|
***********************************************************************/
|
|
|
|
|
|
namespace vl
|
|
{
|
|
namespace regex_internal
|
|
{
|
|
|
|
/***********************************************************************
|
|
EpsilonNfaAlgorithm
|
|
***********************************************************************/
|
|
|
|
class EpsilonNfaInfo
|
|
{
|
|
public:
|
|
Ptr<Automaton> automaton;
|
|
};
|
|
|
|
class EpsilonNfa
|
|
{
|
|
public:
|
|
State* start;
|
|
State* end;
|
|
|
|
EpsilonNfa()
|
|
{
|
|
start = 0;
|
|
end = 0;
|
|
}
|
|
};
|
|
|
|
// Builds, inside the target Automaton, an epsilon-NFA fragment (start/end state
// pair) for each kind of expression node.
class EpsilonNfaAlgorithm : public RegexExpressionAlgorithm<EpsilonNfa, Automaton*>
{
private:
	// Returns the index of `name` in target->captureNames, registering it first if absent; -1 for an empty name.
	static vint FindOrAddCapture(const U32String& name, Automaton* target)
	{
		vint capture = -1;
		if (name != U32String::Empty)
		{
			capture = target->captureNames.IndexOf(name);
			if (capture == -1)
			{
				capture = target->captureNames.Count();
				target->captureNames.Add(name);
			}
		}
		return capture;
	}

public:
	// Chains fragment b after fragment a with an epsilon transition.
	// When a has not been built yet (null start), b is returned unchanged.
	EpsilonNfa Connect(EpsilonNfa a, EpsilonNfa b, Automaton* target)
	{
		if (a.start)
		{
			target->NewEpsilon(a.end, b.start);
			a.end = b.end;
			return a;
		}
		else
		{
			return b;
		}
	}

	// Two states joined by one character transition per range.
	EpsilonNfa Apply(CharSetExpression* expression, Automaton* target) override
	{
		EpsilonNfa nfa;
		nfa.start = target->NewState();
		nfa.end = target->NewState();
		// TODO: (enumerable) foreach
		for (vint i = 0; i < expression->ranges.Count(); i++)
		{
			target->NewChars(nfa.start, nfa.end, expression->ranges[i]);
		}
		return nfa;
	}

	// Expands a loop into `min` mandatory copies of the body followed by either
	// an unbounded loop (max == -1) or `max - min` optional copies.
	// For each optional part, the order in which the "enter body" and "skip"
	// transitions are created depends on preferLong.
	EpsilonNfa Apply(LoopExpression* expression, Automaton* target) override
	{
		EpsilonNfa head;
		for (vint i = 0; i < expression->min; i++)
		{
			EpsilonNfa body = Invoke(expression->expression, target);
			head = Connect(head, body, target);
		}
		if (expression->max == -1)
		{
			EpsilonNfa body = Invoke(expression->expression, target);
			if (!head.start)
			{
				head.start = head.end = target->NewState();
			}
			State* loopBegin = head.end;
			State* loopEnd = target->NewState();
			if (expression->preferLong)
			{
				target->NewEpsilon(loopBegin, body.start);
				target->NewEpsilon(body.end, loopBegin);
				target->NewNop(loopBegin, loopEnd);
			}
			else
			{
				target->NewNop(loopBegin, loopEnd);
				target->NewEpsilon(loopBegin, body.start);
				target->NewEpsilon(body.end, loopBegin);
			}
			head.end = loopEnd;
		}
		else if (expression->max > expression->min)
		{
			for (vint i = expression->min; i < expression->max; i++)
			{
				EpsilonNfa body = Invoke(expression->expression, target);
				State* start = target->NewState();
				State* end = target->NewState();
				if (expression->preferLong)
				{
					target->NewEpsilon(start, body.start);
					target->NewEpsilon(body.end, end);
					target->NewNop(start, end);
				}
				else
				{
					target->NewNop(start, end);
					target->NewEpsilon(start, body.start);
					target->NewEpsilon(body.end, end);
				}
				body.start = start;
				body.end = end;
				head = Connect(head, body, target);
			}
		}
		if (!head.start)
		{
			// Zero repetitions (e.g. "a{0}"): nothing was built above. Callers
			// dereference start/end unconditionally, so return a single-state
			// fragment that matches the empty string, the same shape the
			// unbounded branch uses for an empty head.
			head.start = head.end = target->NewState();
		}
		return head;
	}

	// Left fragment followed by right fragment.
	EpsilonNfa Apply(SequenceExpression* expression, Automaton* target) override
	{
		EpsilonNfa a = Invoke(expression->left, target);
		EpsilonNfa b = Invoke(expression->right, target);
		return Connect(a, b, target);
	}

	// A new start state fans out to both branches and both branches join in a
	// new end state; the left branch's transitions are created first.
	EpsilonNfa Apply(AlternateExpression* expression, Automaton* target) override
	{
		EpsilonNfa result;
		result.start = target->NewState();
		result.end = target->NewState();
		EpsilonNfa a = Invoke(expression->left, target);
		EpsilonNfa b = Invoke(expression->right, target);
		target->NewEpsilon(result.start, a.start);
		target->NewEpsilon(a.end, result.end);
		target->NewEpsilon(result.start, b.start);
		target->NewEpsilon(b.end, result.end);
		return result;
	}

	EpsilonNfa Apply(BeginExpression* expression, Automaton* target) override
	{
		EpsilonNfa result;
		result.start = target->NewState();
		result.end = target->NewState();
		target->NewBeginString(result.start, result.end);
		return result;
	}

	EpsilonNfa Apply(EndExpression* expression, Automaton* target) override
	{
		EpsilonNfa result;
		result.start = target->NewState();
		result.end = target->NewState();
		target->NewEndString(result.start, result.end);
		return result;
	}

	// Wraps the body between a capture transition and an end transition.
	// The capture index is -1 for an unnamed capture.
	EpsilonNfa Apply(CaptureExpression* expression, Automaton* target) override
	{
		EpsilonNfa result;
		result.start = target->NewState();
		result.end = target->NewState();

		vint capture = FindOrAddCapture(expression->name, target);

		EpsilonNfa body = Invoke(expression->expression, target);
		target->NewCapture(result.start, body.start, capture);
		target->NewEnd(body.end, result.end);
		return result;
	}

	// A single match transition referring to a capture by name (or -1) and index.
	EpsilonNfa Apply(MatchExpression* expression, Automaton* target) override
	{
		vint capture = FindOrAddCapture(expression->name, target);

		EpsilonNfa result;
		result.start = target->NewState();
		result.end = target->NewState();
		target->NewMatch(result.start, result.end, capture, expression->index);
		return result;
	}

	// Wraps the body between a positive transition and an end transition.
	EpsilonNfa Apply(PositiveExpression* expression, Automaton* target) override
	{
		EpsilonNfa result;
		result.start = target->NewState();
		result.end = target->NewState();
		EpsilonNfa body = Invoke(expression->expression, target);
		target->NewPositive(result.start, body.start);
		target->NewEnd(body.end, result.end);
		return result;
	}

	// Like the positive case, plus a negative-fail transition that goes directly from start to end.
	EpsilonNfa Apply(NegativeExpression* expression, Automaton* target) override
	{
		EpsilonNfa result;
		result.start = target->NewState();
		result.end = target->NewState();
		EpsilonNfa body = Invoke(expression->expression, target);
		target->NewNegative(result.start, body.start);
		target->NewEnd(body.end, result.end);
		target->NewNegativeFail(result.start, result.end);
		return result;
	}

	// Named references cannot be turned into states here; reaching this is reported via CHECK_FAIL.
	EpsilonNfa Apply(UsingExpression* expression, Automaton* target) override
	{
		CHECK_FAIL(L"RegexExpression::GenerateEpsilonNfa()#UsingExpression cannot create state machine.");
	}
};
|
|
|
|
/***********************************************************************
|
|
Expression
|
|
***********************************************************************/
|
|
|
|
// Creates a new Automaton, builds the epsilon-NFA of this expression inside it,
// marks the fragment's end state as final and records its start state.
Ptr<Automaton> Expression::GenerateEpsilonNfa()
{
	auto automaton = Ptr(new Automaton);

	EpsilonNfaAlgorithm algorithm;
	const EpsilonNfa nfa = algorithm.Invoke(this, automaton.Obj());

	nfa.end->finalState = true;
	automaton->startState = nfa.start;
	return automaton;
}
|
|
}
|
|
}
|
|
|
|
/***********************************************************************
|
|
.\AST\REGEXEXPRESSION_HASNOEXTENSION.CPP
|
|
***********************************************************************/
|
|
/***********************************************************************
|
|
Author: Zihan Chen (vczh)
|
|
Licensed under https://github.com/vczh-libraries/License
|
|
***********************************************************************/
|
|
|
|
|
|
namespace vl
|
|
{
|
|
namespace regex_internal
|
|
{
|
|
|
|
/***********************************************************************
|
|
HasNoExtensionAlgorithm
|
|
***********************************************************************/
|
|
|
|
// Answers Expression::HasNoExtension(). The result is true only when the tree
// is built from char sets, greedy loops, sequences and alternations; any other
// node kind (including captures) makes it false. The void* parameter is never read.
class HasNoExtensionAlgorithm : public RegexExpressionAlgorithm<bool, void*>
{
public:
	bool Apply(CharSetExpression* expression, void* target) override
	{
		return true;
	}

	// A loop qualifies only when it is greedy (preferLong) and its body qualifies.
	bool Apply(LoopExpression* expression, void* target) override
	{
		if (!expression->preferLong)
		{
			return false;
		}
		return Invoke(expression->expression, nullptr);
	}

	// Both operands must qualify; the right one is only examined if the left one does.
	bool Apply(SequenceExpression* expression, void* target) override
	{
		if (!Invoke(expression->left, nullptr))
		{
			return false;
		}
		return Invoke(expression->right, nullptr);
	}

	// Both branches must qualify; the right one is only examined if the left one does.
	bool Apply(AlternateExpression* expression, void* target) override
	{
		if (!Invoke(expression->left, nullptr))
		{
			return false;
		}
		return Invoke(expression->right, nullptr);
	}

	bool Apply(BeginExpression* expression, void* target) override
	{
		return false;
	}

	bool Apply(EndExpression* expression, void* target) override
	{
		return false;
	}

	bool Apply(CaptureExpression* expression, void* target) override
	{
		return false;
	}

	bool Apply(MatchExpression* expression, void* target) override
	{
		return false;
	}

	bool Apply(PositiveExpression* expression, void* target) override
	{
		return false;
	}

	bool Apply(NegativeExpression* expression, void* target) override
	{
		return false;
	}

	bool Apply(UsingExpression* expression, void* target) override
	{
		return false;
	}
};
|
|
|
|
/***********************************************************************
|
|
Expression
|
|
***********************************************************************/
|
|
|
|
bool Expression::HasNoExtension()
|
|
{
|
|
return HasNoExtensionAlgorithm().Invoke(this, 0);
|
|
}
|
|
}
|
|
}
|
|
|
|
/***********************************************************************
|
|
.\AST\REGEXEXPRESSION_ISEQUAL.CPP
|
|
***********************************************************************/
|
|
/***********************************************************************
|
|
Author: Zihan Chen (vczh)
|
|
Licensed under https://github.com/vczh-libraries/License
|
|
***********************************************************************/
|
|
|
|
|
|
namespace vl
|
|
{
|
|
namespace regex_internal
|
|
{
|
|
|
|
/***********************************************************************
|
|
IsEqualAlgorithm
|
|
***********************************************************************/
|
|
|
|
// Structural comparison of two expression trees. The visited node is compared
// against `target`: both must have the same dynamic type, equal fields and
// (recursively) equal children.
class IsEqualAlgorithm : public RegexExpressionAlgorithm<bool, Expression*>
{
public:
	// Same reverse flag and the same ranges in the same order.
	bool Apply(CharSetExpression* expression, Expression* target) override
	{
		auto expected = dynamic_cast<CharSetExpression*>(target);
		if (!expected) return false;
		if (expression->reverse != expected->reverse) return false;
		if (expression->ranges.Count() != expected->ranges.Count()) return false;
		// TODO: (enumerable) foreach:indexed
		for (vint index = 0; index < expression->ranges.Count(); index++)
		{
			if (expression->ranges[index] != expected->ranges[index]) return false;
		}
		return true;
	}

	// Same bounds, same greediness and equal bodies.
	bool Apply(LoopExpression* expression, Expression* target) override
	{
		auto expected = dynamic_cast<LoopExpression*>(target);
		if (!expected) return false;
		if (expression->min != expected->min) return false;
		if (expression->max != expected->max) return false;
		if (expression->preferLong != expected->preferLong) return false;
		return Invoke(expression->expression, expected->expression.Obj());
	}

	// Equal left operands and equal right operands.
	bool Apply(SequenceExpression* expression, Expression* target) override
	{
		auto expected = dynamic_cast<SequenceExpression*>(target);
		if (!expected) return false;
		return Invoke(expression->left, expected->left.Obj())
			&& Invoke(expression->right, expected->right.Obj());
	}

	// Equal left branches and equal right branches (order matters).
	bool Apply(AlternateExpression* expression, Expression* target) override
	{
		auto expected = dynamic_cast<AlternateExpression*>(target);
		if (!expected) return false;
		return Invoke(expression->left, expected->left.Obj())
			&& Invoke(expression->right, expected->right.Obj());
	}

	bool Apply(BeginExpression* expression, Expression* target) override
	{
		return dynamic_cast<BeginExpression*>(target) != nullptr;
	}

	bool Apply(EndExpression* expression, Expression* target) override
	{
		return dynamic_cast<EndExpression*>(target) != nullptr;
	}

	// Same capture name and equal bodies.
	bool Apply(CaptureExpression* expression, Expression* target) override
	{
		auto expected = dynamic_cast<CaptureExpression*>(target);
		if (!expected) return false;
		if (expression->name != expected->name) return false;
		return Invoke(expression->expression, expected->expression.Obj());
	}

	// Same referenced capture name and index.
	bool Apply(MatchExpression* expression, Expression* target) override
	{
		auto expected = dynamic_cast<MatchExpression*>(target);
		if (!expected) return false;
		if (expression->name != expected->name) return false;
		if (expression->index != expected->index) return false;
		return true;
	}

	bool Apply(PositiveExpression* expression, Expression* target) override
	{
		auto expected = dynamic_cast<PositiveExpression*>(target);
		if (!expected) return false;
		return Invoke(expression->expression, expected->expression.Obj());
	}

	bool Apply(NegativeExpression* expression, Expression* target) override
	{
		auto expected = dynamic_cast<NegativeExpression*>(target);
		if (!expected) return false;
		return Invoke(expression->expression, expected->expression.Obj());
	}

	// Same referenced name; the definitions themselves are not compared.
	bool Apply(UsingExpression* expression, Expression* target) override
	{
		auto expected = dynamic_cast<UsingExpression*>(target);
		if (!expected) return false;
		if (expression->name != expected->name) return false;
		return true;
	}
};
|
|
|
|
/***********************************************************************
|
|
Expression
|
|
***********************************************************************/
|
|
|
|
// Returns whether this expression tree and `expression` are structurally equal
// according to IsEqualAlgorithm.
bool Expression::IsEqual(vl::regex_internal::Expression* expression)
{
	IsEqualAlgorithm comparer;
	return comparer.Invoke(this, expression);
}
|
|
}
|
|
}
|
|
|
|
/***********************************************************************
|
|
.\AST\REGEXPARSER.CPP
|
|
***********************************************************************/
|
|
/***********************************************************************
|
|
Author: Zihan Chen (vczh)
|
|
Licensed under https://github.com/vczh-libraries/License
|
|
***********************************************************************/
|
|
|
|
|
|
namespace vl
|
|
{
|
|
namespace regex_internal
|
|
{
|
|
|
|
/***********************************************************************
|
|
Helper Functions
|
|
***********************************************************************/
|
|
|
|
// Consumes one character from `input` if it equals `c`.
// Returns true and advances `input` on a match; otherwise leaves `input`
// untouched and returns false.
bool IsChar(const char32_t*& input, char32_t c)
{
	if (*input != c)
	{
		return false;
	}
	++input;
	return true;
}
|
|
|
|
template<vint Size>
|
|
bool IsChars(const char32_t*& input, const char32_t(&chars)[Size])
|
|
{
|
|
for (char32_t c : chars)
|
|
{
|
|
if (*input == c)
|
|
{
|
|
input++;
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
template<vint Size>
|
|
bool IsStr(const char32_t*& input, const char32_t(&str)[Size])
|
|
{
|
|
for (vint i = 0; i < Size - 1; i++)
|
|
{
|
|
if (input[i] != str[i]) return false;
|
|
}
|
|
input += Size - 1;
|
|
return true;
|
|
}
|
|
|
|
bool IsPositiveInteger(const char32_t*& input, vint& number)
|
|
{
|
|
bool readed = false;
|
|
number = 0;
|
|
while (U'0' <= *input && *input <= U'9')
|
|
{
|
|
number = number * 10 + (*input++) - U'0';
|
|
readed = true;
|
|
}
|
|
return readed;
|
|
}
|
|
|
|
bool IsName(const char32_t*& input, U32String& name)
|
|
{
|
|
const char32_t* read = input;
|
|
if ((U'A' <= *read && *read <= U'Z') || (U'a' <= *read && *read <= U'z') || *read == U'_')
|
|
{
|
|
read++;
|
|
while ((U'A' <= *read && *read <= U'Z') || (U'a' <= *read && *read <= U'z') || (U'0' <= *read && *read <= U'9') || *read == U'_')
|
|
{
|
|
read++;
|
|
}
|
|
}
|
|
if (input == read)
|
|
{
|
|
return false;
|
|
}
|
|
else
|
|
{
|
|
name = U32String::CopyFrom(input, vint(read - input));
|
|
input = read;
|
|
return true;
|
|
}
|
|
}
|
|
|
|
// Parses a loop suffix at `input`: "+", "*", "?", "{n}", "{n,}" or "{n,m}",
// optionally followed by "?" which clears preferLong.
// Returns a LoopExpression carrying min/max/preferLong (its body is not set
// here; max == -1 means unbounded), or null when `input` is at the end or does
// not start a loop suffix. Throws ArgumentException for a malformed "{...}".
Ptr<LoopExpression> ParseLoop(const char32_t*& input)
{
	vint min = 0;
	vint max = 0;

	if (!*input)
	{
		return nullptr;
	}

	if (IsChar(input, U'+'))
	{
		min = 1;
		max = -1;
	}
	else if (IsChar(input, U'*'))
	{
		min = 0;
		max = -1;
	}
	else if (IsChar(input, U'?'))
	{
		min = 0;
		max = 1;
	}
	else if (IsChar(input, U'{'))
	{
		bool legal = IsPositiveInteger(input, min);
		if (legal)
		{
			if (!IsChar(input, U','))
			{
				// "{n}": exactly n times.
				max = min;
			}
			else if (!IsPositiveInteger(input, max))
			{
				// "{n,}": no upper bound.
				max = -1;
			}
			legal = IsChar(input, U'}');
		}
		if (!legal)
		{
			throw ArgumentException(L"Regular expression syntax error: Illegal loop expression.", L"vl::regex_internal::ParseLoop", L"input");
		}
	}
	else
	{
		return nullptr;
	}

	auto loop = Ptr(new LoopExpression);
	loop->min = min;
	loop->max = max;
	loop->preferLong = !IsChar(input, U'?');
	return loop;
}
|
|
|
|
// Parses one atom that is not a parenthesized construct:
//   "^" / "$"            -> BeginExpression / EndExpression
//   "\x" or "/x"         -> an escaped character or a predefined class (. r n t s S d D l L w W)
//   "[...]" / "[^...]"   -> an explicit character set
//   any other character  -> a single-character set
// Returns null at the end of input, or when the next character is one of "()+*?{}|"
// (in which case input is left pointing at that character).
// Throws ArgumentException on an illegal escape or a malformed character set.
Ptr<Expression> ParseCharSet(const char32_t*& input)
{
	if (!*input)
	{
		return nullptr;
	}
	else if (IsChar(input, U'^'))
	{
		return Ptr(new BeginExpression);
	}
	else if (IsChar(input, U'$'))
	{
		return Ptr(new EndExpression);
	}
	else if (IsChar(input, U'\\') || IsChar(input, U'/'))
	{
		auto expression = Ptr(new CharSetExpression);
		expression->reverse = false;
		switch (*input)
		{
		case U'.':
			expression->ranges.Add(CharRange(1, MaxChar32));
			break;
		case U'r':
			expression->ranges.Add(CharRange(U'\r', U'\r'));
			break;
		case U'n':
			expression->ranges.Add(CharRange(U'\n', U'\n'));
			break;
		case U't':
			expression->ranges.Add(CharRange(U'\t', U'\t'));
			break;
		case U'\\':case U'/':case U'(':case U')':case U'+':case U'*':case U'?':case U'|':
		case U'{':case U'}':case U'[':case U']':case U'<':case U'>':
		case U'^':case U'$':case U'!':case U'=':
			expression->ranges.Add(CharRange(*input, *input));
			break;
		// The upper-case classes are the reversed form of the lower-case ones,
		// so each of them sets reverse and then shares the lower-case ranges.
		case U'S':
			expression->reverse = true;
			[[fallthrough]];
		case U's':
			expression->ranges.Add(CharRange(U' ', U' '));
			expression->ranges.Add(CharRange(U'\r', U'\r'));
			expression->ranges.Add(CharRange(U'\n', U'\n'));
			expression->ranges.Add(CharRange(U'\t', U'\t'));
			break;
		case U'D':
			expression->reverse = true;
			[[fallthrough]];
		case U'd':
			expression->ranges.Add(CharRange(U'0', U'9'));
			break;
		case U'L':
			expression->reverse = true;
			[[fallthrough]];
		case U'l':
			expression->ranges.Add(CharRange(U'_', U'_'));
			expression->ranges.Add(CharRange(U'A', U'Z'));
			expression->ranges.Add(CharRange(U'a', U'z'));
			break;
		case U'W':
			expression->reverse = true;
			[[fallthrough]];
		case U'w':
			expression->ranges.Add(CharRange(U'_', U'_'));
			expression->ranges.Add(CharRange(U'0', U'9'));
			expression->ranges.Add(CharRange(U'A', U'Z'));
			expression->ranges.Add(CharRange(U'a', U'z'));
			break;
		default:
			// Also reached when the pattern ends right after the escape character.
			throw ArgumentException(L"Regular expression syntax error: Illegal character escaping.", L"vl::regex_internal::ParseCharSet", L"input");
		}
		input++;
		return expression;
	}
	else if (IsChar(input, U'['))
	{
		// Reports a malformed "[...]" with the same function/parameter names
		// as the other throws in this function.
		const auto throwIllegalCharSet = []()
		{
			throw ArgumentException(L"Regular expression syntax error: Illegal character set definition.", L"vl::regex_internal::ParseCharSet", L"input");
		};

		auto expression = Ptr(new CharSetExpression);
		expression->reverse = IsChar(input, U'^');

		// midState is true when only the first endpoint (a) of the pending item
		// has been read; the next character read then becomes the second endpoint (b).
		bool midState = false;
		char32_t a = U'\0';
		char32_t b = U'\0';
		while (true)
		{
			// Step 1: read one (possibly escaped) character into c.
			char32_t c = U'\0';
			if (IsChar(input, U'\\') || IsChar(input, U'/'))
			{
				switch (*input)
				{
				case U'r':
					c = U'\r';
					break;
				case U'n':
					c = U'\n';
					break;
				case U't':
					c = U'\t';
					break;
				case U'-':case U'[':case U']':case U'\\':case U'/':case U'^':case U'$':
					c = *input;
					break;
				default:
					throw ArgumentException(L"Regular expression syntax error: Illegal character escaping, only \"rnt-[]\\/\" are legal escaped characters in [].", L"vl::regex_internal::ParseCharSet", L"input");
				}
				input++;
			}
			else if (IsChars(input, U"-]"))
			{
				// "-" or "]" where a character is required
				throwIllegalCharSet();
			}
			else if (*input)
			{
				c = *input++;
			}
			else
			{
				// the pattern ended inside "[...]"
				throwIllegalCharSet();
			}
			(midState ? b : a) = c;
			midState = !midState;

			// Step 2: decide what follows the character just read.
			if (IsChar(input, U']'))
			{
				if (midState)
				{
					// a single character, not a range
					b = a;
				}
				if (!expression->AddRangeWithConflict(CharRange(a, b)))
				{
					throwIllegalCharSet();
				}
				break;
			}
			else if (IsChar(input, U'-'))
			{
				// "-" is only legal right after the first endpoint of a range
				if (!midState)
				{
					throwIllegalCharSet();
				}
			}
			else
			{
				if (midState)
				{
					b = a;
				}
				if (!expression->AddRangeWithConflict(CharRange(a, b)))
				{
					throwIllegalCharSet();
				}
				midState = false;
			}
		}
		return expression;
	}
	else if (IsChars(input, U"()+*?{}|"))
	{
		// Not an atom: give the character back so the caller can handle it.
		input--;
		return nullptr;
	}
	else
	{
		auto expression = Ptr(new CharSetExpression);
		expression->reverse = false;
		expression->ranges.Add(CharRange(*input, *input));
		input++;
		return expression;
	}
}
|
|
|
|
// Parses a parenthesized construct at input:
//   "(=e)"            -> PositiveExpression
//   "(!e)"            -> NegativeExpression
//   "(<&name>)"       -> UsingExpression
//   "(<$name>)", "(<$name;index>)", "(<$index>)" -> MatchExpression
//   "(<name>e)"       -> named CaptureExpression
//   "(?e)"            -> CaptureExpression without a name
//   "(e)"             -> e itself
// Returns null when input does not start with "(".
// Throws ArgumentException when a required name, number, ">" or ")" is missing.
Ptr<Expression> ParseFunction(const char32_t*& input)
{
	const auto requireRightBracket = [&]()
	{
		if (!IsChar(input, U')'))
		{
			throw ArgumentException(L"Regular expression syntax error: \")\" expected.", L"vl::regex_internal::ParseFunction", L"input");
		}
	};
	const auto requireGreater = [&]()
	{
		if (!IsChar(input, U'>'))
		{
			throw ArgumentException(L"Regular expression syntax error: \">\" expected.", L"vl::regex_internal::ParseFunction", L"input");
		}
	};
	const auto requireName = [&](U32String& name)
	{
		if (!IsName(input, name))
		{
			throw ArgumentException(L"Regular expression syntax error: Identifier expected.", L"vl::regex_internal::ParseFunction", L"input");
		}
	};
	const auto requireNumber = [&](vint& number)
	{
		if (!IsPositiveInteger(input, number))
		{
			throw ArgumentException(L"Regular expression syntax error: Number expected.", L"vl::regex_internal::ParseFunction", L"input");
		}
	};

	// The longer prefixes must be tested before "(<" and "(".
	if (IsStr(input, U"(="))
	{
		Ptr<Expression> inner = ParseExpression(input);
		requireRightBracket();
		auto positive = Ptr(new PositiveExpression);
		positive->expression = inner;
		return positive;
	}
	if (IsStr(input, U"(!"))
	{
		Ptr<Expression> inner = ParseExpression(input);
		requireRightBracket();
		auto negative = Ptr(new NegativeExpression);
		negative->expression = inner;
		return negative;
	}
	if (IsStr(input, U"(<&"))
	{
		U32String name;
		requireName(name);
		requireGreater();
		requireRightBracket();
		auto usingNode = Ptr(new UsingExpression);
		usingNode->name = name;
		return usingNode;
	}
	if (IsStr(input, U"(<$"))
	{
		U32String name;
		vint index = -1;
		if (IsName(input, name))
		{
			// an index is optional after a name, introduced by ";"
			if (IsChar(input, U';'))
			{
				requireNumber(index);
			}
		}
		else
		{
			// without a name the index is mandatory
			requireNumber(index);
		}
		requireGreater();
		requireRightBracket();
		auto match = Ptr(new MatchExpression);
		match->name = name;
		match->index = index;
		return match;
	}
	if (IsStr(input, U"(<"))
	{
		U32String name;
		requireName(name);
		requireGreater();
		auto inner = ParseExpression(input);
		requireRightBracket();
		auto capture = Ptr(new CaptureExpression);
		capture->name = name;
		capture->expression = inner;
		return capture;
	}
	if (IsStr(input, U"(?"))
	{
		auto inner = ParseExpression(input);
		requireRightBracket();
		auto capture = Ptr(new CaptureExpression);
		capture->expression = inner;
		return capture;
	}
	if (IsChar(input, U'('))
	{
		auto inner = ParseExpression(input);
		requireRightBracket();
		return inner;
	}
	return nullptr;
}
|
|
|
|
// Parses one atom (ParseCharSet first, then ParseFunction) followed by any
// number of quantifiers. Each quantifier wraps everything parsed so far.
// Returns null when no atom can be read at input.
Ptr<Expression> ParseUnit(const char32_t*& input)
{
	Ptr<Expression> unit = ParseCharSet(input);
	if (!unit)
	{
		unit = ParseFunction(input);
		if (!unit)
		{
			return nullptr;
		}
	}

	for (Ptr<LoopExpression> loop = ParseLoop(input); loop; loop = ParseLoop(input))
	{
		loop->expression = unit;
		unit = loop;
	}
	return unit;
}
|
|
|
|
// Parses a run of consecutive units and chains them left-associatively into
// SequenceExpression nodes. Returns whatever the first ParseUnit call returned
// (possibly null) when no second unit follows.
Ptr<Expression> ParseJoin(const char32_t*& input)
{
	auto joined = ParseUnit(input);
	while (auto next = ParseUnit(input))
	{
		auto sequence = Ptr(new SequenceExpression);
		sequence->left = joined;
		sequence->right = next;
		joined = sequence;
	}
	return joined;
}
|
|
|
|
// Parses "join ( '|' join )*" and chains the alternatives left-associatively
// into AlternateExpression nodes.
// Returns null when nothing can be parsed at input.
// Throws ArgumentException when either side of a "|" is missing, so an
// AlternateExpression is never built with a null operand.
Ptr<Expression> ParseAlt(const char32_t*& input)
{
	auto expression = ParseJoin(input);
	while (IsChar(input, U'|'))
	{
		if (!expression)
		{
			// "|" with nothing in front of it, e.g. "|a" or "(|a)"
			throw ArgumentException(L"Regular expression syntax error: Expression expected.", L"vl::regex_internal::ParseAlt", L"input");
		}

		auto right = ParseJoin(input);
		if (!right)
		{
			// "|" with nothing after it, e.g. "a|"
			throw ArgumentException(L"Regular expression syntax error: Expression expected.", L"vl::regex_internal::ParseAlt", L"input");
		}

		auto alternate = Ptr(new AlternateExpression);
		alternate->left = expression;
		alternate->right = right;
		expression = alternate;
	}
	return expression;
}
|
|
|
|
// Entry point for parsing a (sub)expression. Alternation is the outermost
// construct, so this simply forwards to ParseAlt.
Ptr<Expression> ParseExpression(const char32_t*& input)
{
	Ptr<Expression> parsed = ParseAlt(input);
	return parsed;
}
|
|
|
|
// Parses a complete regular expression.
// The pattern may start with any number of definitions "(<#name>expr)", which are
// stored in RegexExpression::definitions; the rest is the main expression.
// Any ArgumentException raised while parsing is converted into a RegexException
// carrying the message, the pattern and the offset the parser had reached.
Ptr<RegexExpression> ParseRegexExpression(const U32String& code)
{
	auto regex = Ptr(new RegexExpression);
	const char32_t* start = code.Buffer();
	const char32_t* input = start;
	try
	{
		while (IsStr(input, U"(<#"))
		{
			U32String name;
			if (!IsName(input, name))
			{
				throw ArgumentException(L"Regular expression syntax error: Identifier expected.", L"vl::regex_internal::ParseRegexExpression", L"code");
			}
			if (!IsChar(input, U'>'))
			{
				throw ArgumentException(L"Regular expression syntax error: \">\" expected.", L"vl::regex_internal::ParseRegexExpression", L"code");
			}

			Ptr<Expression> sub = ParseExpression(input);
			if (!IsChar(input, U')'))
			{
				throw ArgumentException(L"Regular expression syntax error: \")\" expected.", L"vl::regex_internal::ParseRegexExpression", L"code");
			}

			if (regex->definitions.Keys().Contains(name))
			{
				throw ArgumentException(L"Regular expression syntax error: Found duplicated sub expression name: \"" + u32tow(name) + L"\". ", L"vl::regex_internal::ParseRegexExpression", L"code");
			}
			regex->definitions.Add(name, sub);
		}

		regex->expression = ParseExpression(input);
		if (!regex->expression)
		{
			throw ArgumentException(L"Regular expression syntax error: Expression expected.", L"vl::regex_internal::ParseRegexExpression", L"code");
		}
		if (*input)
		{
			// something is left that the grammar could not consume, e.g. an unmatched ")"
			throw ArgumentException(L"Regular expression syntax error: Found unnecessary tokens.", L"vl::regex_internal::ParseRegexExpression", L"code");
		}
		return regex;
	}
	catch (const ArgumentException& e)
	{
		// input is declared outside the try block so the failing offset is still available here
		throw RegexException(e.Message(), code, input - start);
	}
}
|
|
|
|
// Converts a literal string into a pattern that matches exactly that string:
// the characters \ / ( ) + * ? | { } [ ] < > ^ $ ! = are prefixed with "\",
// and CR / LF / TAB become "\r" / "\n" / "\t". Everything else is copied as is.
U32String EscapeTextForRegex(const U32String& literalString)
{
	U32String escaped;
	for (vint index = 0; index < literalString.Length(); index++)
	{
		const char32_t ch = literalString[index];
		if (ch == U'\r')
		{
			escaped += U"\\r";
		}
		else if (ch == U'\n')
		{
			escaped += U"\\n";
		}
		else if (ch == U'\t')
		{
			escaped += U"\\t";
		}
		else
		{
			bool needsBackslash = false;
			switch (ch)
			{
			case U'\\':case U'/':case U'(':case U')':case U'+':case U'*':case U'?':case U'|':
			case U'{':case U'}':case U'[':case U']':case U'<':case U'>':
			case U'^':case U'$':case U'!':case U'=':
				needsBackslash = true;
				break;
			default:
				break;
			}

			if (needsBackslash)
			{
				escaped += U32String(U"\\") + U32String::FromChar(ch);
			}
			else
			{
				escaped += U32String::FromChar(ch);
			}
		}
	}
	return escaped;
}
|
|
|
|
// Reverses escaping: "\x" or "/x" becomes x, except that r / n / t become
// CR / LF / TAB. An escape character at the very end of the text has nothing
// to escape and is copied unchanged.
U32String UnescapeTextForRegex(const U32String& escapedText)
{
	U32String unescaped;
	const vint length = escapedText.Length();
	for (vint index = 0; index < length; index++)
	{
		char32_t ch = escapedText[index];
		const bool startsEscape = (ch == U'\\' || ch == U'/') && index < length - 1;
		if (startsEscape)
		{
			// consume the escaped character and translate it
			ch = escapedText[++index];
			if (ch == U'r')
			{
				ch = U'\r';
			}
			else if (ch == U'n')
			{
				ch = U'\n';
			}
			else if (ch == U't')
			{
				ch = U'\t';
			}
		}
		unescaped += U32String::FromChar(ch);
	}
	return unescaped;
}
|
|
|
|
// Rewrites every escape sequence so that it uses "\" as the escape character
// ("/x" becomes "\x"); the escaped character itself is kept. An escape character
// at the very end of the text is copied unchanged.
U32String NormalizeEscapedTextForRegex(const U32String& escapedText)
{
	U32String normalized;
	const vint length = escapedText.Length();
	for (vint index = 0; index < length; index++)
	{
		const char32_t ch = escapedText[index];
		const bool startsEscape = (ch == U'\\' || ch == U'/') && index < length - 1;
		if (!startsEscape)
		{
			normalized += U32String::FromChar(ch);
			continue;
		}

		index++;
		normalized += U32String(U"\\") + U32String::FromChar(escapedText[index]);
	}
	return normalized;
}
|
|
|
|
bool IsRegexEscapedLiteralString(const U32String& regex)
|
|
{
|
|
for (vint i = 0; i < regex.Length(); i++)
|
|
{
|
|
char32_t c = regex[i];
|
|
if (c == U'\\' || c == U'/')
|
|
{
|
|
i++;
|
|
}
|
|
else
|
|
{
|
|
switch (c)
|
|
{
|
|
case U'\\':case U'/':case U'(':case U')':case U'+':case U'*':case U'?':case U'|':
|
|
case U'{':case U'}':case U'[':case U']':case U'<':case U'>':
|
|
case U'^':case U'$':case U'!':case U'=':
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
|
|
/***********************************************************************
|
|
.\AST\REGEXWRITER.CPP
|
|
***********************************************************************/
|
|
/***********************************************************************
|
|
Author: Zihan Chen (vczh)
|
|
Licensed under https://github.com/vczh-libraries/License
|
|
***********************************************************************/
|
|
|
|
|
|
namespace vl
|
|
{
|
|
namespace regex
|
|
{
|
|
using namespace vl::regex_internal;
|
|
|
|
/***********************************************************************
|
|
RegexNode
|
|
***********************************************************************/
|
|
|
|
// Wraps an already built expression tree; the node shares it through the Ptr.
RegexNode::RegexNode(Ptr<vl::regex_internal::Expression> _expression)
	: expression(_expression)
{
}
|
|
|
|
// One or more repetitions: the same bounds the parser uses for "+".
RegexNode RegexNode::Some()const
{
	const vint atLeast = 1;
	const vint noUpperBound = -1;
	return Loop(atLeast, noUpperBound);
}
|
|
|
|
// Zero or more repetitions: the same bounds the parser uses for "*".
RegexNode RegexNode::Any()const
{
	const vint atLeast = 0;
	const vint noUpperBound = -1;
	return Loop(atLeast, noUpperBound);
}
|
|
|
|
// Zero or one occurrence: the same bounds the parser uses for "?".
RegexNode RegexNode::Opt()const
{
	const vint atLeast = 0;
	const vint atMost = 1;
	return Loop(atLeast, atMost);
}
|
|
|
|
// Builds a LoopExpression around this node's expression with the given bounds.
// preferLong is always set, which is what the parser produces for a quantifier
// that is not followed by "?".
RegexNode RegexNode::Loop(vint min, vint max)const
{
	auto loop = Ptr(new LoopExpression);
	loop->expression = expression;
	loop->min = min;
	loop->max = max;
	loop->preferLong = true;
	return RegexNode(loop);
}
|
|
|
|
// At least min repetitions with no upper bound (max is passed as -1).
RegexNode RegexNode::AtLeast(vint min)const
{
	const vint noUpperBound = -1;
	return Loop(min, noUpperBound);
}
|
|
|
|
// Concatenation: a SequenceExpression with this node on the left and node on the right.
RegexNode RegexNode::operator+(const RegexNode& node)const
{
	auto sequence = Ptr(new SequenceExpression);
	sequence->left = expression;
	sequence->right = node.expression;
	return RegexNode(sequence);
}
|
|
|
|
// Alternation: an AlternateExpression with this node on the left and node on the right.
RegexNode RegexNode::operator|(const RegexNode& node)const
{
	auto alternate = Ptr(new AlternateExpression);
	alternate->left = expression;
	alternate->right = node.expression;
	return RegexNode(alternate);
}
|
|
|
|
// Unary plus: wraps this node in a PositiveExpression, the node type the
// parser creates for "(=...)".
RegexNode RegexNode::operator+()const
{
	auto positive = Ptr(new PositiveExpression);
	positive->expression = expression;
	return RegexNode(positive);
}
|
|
|
|
// Unary minus: wraps this node in a NegativeExpression, the node type the
// parser creates for "(!...)".
RegexNode RegexNode::operator-()const
{
	auto negative = Ptr(new NegativeExpression);
	negative->expression = expression;
	return RegexNode(negative);
}
|
|
|
|
// Complements a character set: copies the ranges and flips the reverse flag.
// Raises an error through CHECK_ERROR when this node is not a CharSetExpression.
RegexNode RegexNode::operator!()const
{
	CharSetExpression* const original = dynamic_cast<CharSetExpression*>(expression.Obj());
	CHECK_ERROR(original != nullptr, L"RegexNode::operator!()#operator ! can only applies on charset expressions.");

	auto flipped = Ptr(new CharSetExpression);
	flipped->reverse = !original->reverse;
	CopyFrom(flipped->ranges, original->ranges);
	return RegexNode(flipped);
}
|
|
|
|
// Union of two non-reversed character sets.
// Raises an error through CHECK_ERROR when either operand is not a
// CharSetExpression, when either one is reversed, or when a range of the
// right operand is rejected by AddRangeWithConflict.
RegexNode RegexNode::operator%(const RegexNode& node)const
{
	CharSetExpression* const lhs = dynamic_cast<CharSetExpression*>(expression.Obj());
	CharSetExpression* const rhs = dynamic_cast<CharSetExpression*>(node.expression.Obj());
	CHECK_ERROR(lhs && rhs && !lhs->reverse && !rhs->reverse, L"RegexNode::operator%(const RegexNode&)#operator % only connects non-reverse charset expressions.");

	auto merged = Ptr(new CharSetExpression);
	merged->reverse = false;
	CopyFrom(merged->ranges, lhs->ranges);
	// TODO: (enumerable) foreach
	for (vint index = 0; index < rhs->ranges.Count(); index++)
	{
		const bool added = merged->AddRangeWithConflict(rhs->ranges[index]);
		CHECK_ERROR(added, L"RegexNode::operator%(const RegexNode&)#Failed to create charset expression from operator %.");
	}
	return RegexNode(merged);
}
|
|
|
|
/***********************************************************************
|
|
Regex Writer
|
|
***********************************************************************/
|
|
|
|
// Builds a named capture around node, the node type the parser creates for "(<name>...)".
RegexNode rCapture(const U32String& name, const RegexNode& node)
{
	auto capture = Ptr(new CaptureExpression);
	capture->expression = node.expression;
	capture->name = name;
	return RegexNode(capture);
}
|
|
|
|
// Builds a UsingExpression with the given name, the node type the parser creates for "(<&name>)".
RegexNode rUsing(const U32String& name)
{
	auto usingNode = Ptr(new UsingExpression);
	usingNode->name = name;
	return RegexNode(usingNode);
}
|
|
|
|
// Builds a MatchExpression with a name and an index, the node type the parser
// creates for "(<$name;index>)".
RegexNode rMatch(const U32String& name, vint index)
{
	auto match = Ptr(new MatchExpression);
	match->index = index;
	match->name = name;
	return RegexNode(match);
}
|
|
|
|
// Builds a MatchExpression with only an index; the name is left as constructed.
RegexNode rMatch(vint index)
{
	auto match = Ptr(new MatchExpression);
	match->index = index;
	return RegexNode(match);
}
|
|
|
|
// Builds a BeginExpression node, the node type the parser creates for "^".
RegexNode rBegin()
{
	auto begin = Ptr(new BeginExpression);
	return RegexNode(begin);
}
|
|
|
|
// Builds an EndExpression node, the node type the parser creates for "$".
RegexNode rEnd()
{
	auto end = Ptr(new EndExpression);
	return RegexNode(end);
}
|
|
|
|
// Builds a non-reversed character set holding the range [a, b].
// When b is 0 the set holds the single character a.
RegexNode rC(char32_t a, char32_t b)
{
	const char32_t last = b ? b : a;
	auto charset = Ptr(new CharSetExpression);
	charset->reverse = false;
	charset->AddRangeWithConflict(CharRange(a, last));
	return RegexNode(charset);
}
|
|
|
|
// Digits: the range '0'..'9'.
RegexNode r_d()
{
	const char32_t firstDigit = U'0';
	const char32_t lastDigit = U'9';
	return rC(firstDigit, lastDigit);
}
|
|
|
|
// Letters and underscore: 'a'..'z', 'A'..'Z' and '_' merged with operator%.
RegexNode r_l()
{
	const RegexNode lower = rC(U'a', U'z');
	const RegexNode upper = rC(U'A', U'Z');
	const RegexNode underscore = rC(U'_');
	return lower % upper % underscore;
}
|
|
|
|
// Digits, letters and underscore: '0'..'9', 'a'..'z', 'A'..'Z' and '_' merged with operator%.
RegexNode r_w()
{
	const RegexNode digits = rC(U'0', U'9');
	const RegexNode lower = rC(U'a', U'z');
	const RegexNode upper = rC(U'A', U'Z');
	const RegexNode underscore = rC(U'_');
	return digits % lower % upper % underscore;
}
|
|
|
|
// Every character from 1 to MaxChar32, the same range the parser uses for "\.".
RegexNode rAnyChar()
{
	const char32_t first = 1;
	return rC(first, MaxChar32);
}
|
|
}
|
|
}
|
|
|
|
/***********************************************************************
|
|
.\AUTOMATON\REGEXAUTOMATON.CPP
|
|
***********************************************************************/
|
|
/***********************************************************************
|
|
Author: Zihan Chen (vczh)
|
|
Licensed under https://github.com/vczh-libraries/License
|
|
***********************************************************************/
|
|
|
|
|
|
namespace vl
|
|
{
|
|
namespace regex_internal
|
|
{
|
|
using namespace collections;
|
|
|
|
/***********************************************************************
|
|
Automaton
|
|
***********************************************************************/
|
|
|
|
// A new automaton has no start state until one is assigned.
Automaton::Automaton()
{
	startState = nullptr;
}
|
|
|
|
// Creates a non-final state with cleared userData, stores it in states
// (which keeps it alive) and returns a raw pointer to it.
State* Automaton::NewState()
{
	auto owned = Ptr(new State);
	State* const created = owned.Obj();
	created->finalState = false;
	created->userData = 0;
	states.Add(owned);
	return created;
}
|
|
|
|
// Creates a transition from start to end, registers it in start->transitions,
// in end->inputs and in this automaton's transitions list, and returns a raw
// pointer to it. Only source and target are assigned here; the caller fills in
// type and any other fields.
Transition* Automaton::NewTransition(State* start, State* end)
{
	auto owned = Ptr(new Transition);
	Transition* const created = owned.Obj();
	created->source = start;
	created->target = end;
	start->transitions.Add(created);
	end->inputs.Add(created);
	transitions.Add(owned);
	return created;
}
|
|
|
|
// Creates a Chars transition from start to end carrying the given character range.
Transition* Automaton::NewChars(State* start, State* end, CharRange range)
{
	Transition* const created = NewTransition(start, end);
	created->range = range;
	created->type = Transition::Chars;
	return created;
}
|
|
|
|
// Creates an Epsilon transition from start to end.
Transition* Automaton::NewEpsilon(State* start, State* end)
{
	Transition* const created = NewTransition(start, end);
	created->type = Transition::Epsilon;
	return created;
}
|
|
|
|
// Creates a BeginString transition from start to end.
Transition* Automaton::NewBeginString(State* start, State* end)
{
	Transition* const created = NewTransition(start, end);
	created->type = Transition::BeginString;
	return created;
}
|
|
|
|
// Creates an EndString transition from start to end.
Transition* Automaton::NewEndString(State* start, State* end)
{
	Transition* const created = NewTransition(start, end);
	created->type = Transition::EndString;
	return created;
}
|
|
|
|
// Creates a Nop transition from start to end.
Transition* Automaton::NewNop(State* start, State* end)
{
	Transition* const created = NewTransition(start, end);
	created->type = Transition::Nop;
	return created;
}
|
|
|
|
// Creates a Capture transition from start to end carrying the given capture value.
Transition* Automaton::NewCapture(State* start, State* end, vint capture)
{
	Transition* const created = NewTransition(start, end);
	created->capture = capture;
	created->type = Transition::Capture;
	return created;
}
|
|
|
|
// Creates a Match transition from start to end carrying the given capture and index values.
Transition* Automaton::NewMatch(State* start, State* end, vint capture, vint index)
{
	Transition* const created = NewTransition(start, end);
	created->capture = capture;
	created->index = index;
	created->type = Transition::Match;
	return created;
}
|
|
|
|
// Creates a Positive transition from start to end.
Transition* Automaton::NewPositive(State* start, State* end)
{
	Transition* const created = NewTransition(start, end);
	created->type = Transition::Positive;
	return created;
}
|
|
|
|
// Creates a Negative transition from start to end.
Transition* Automaton::NewNegative(State* start, State* end)
{
	Transition* const created = NewTransition(start, end);
	created->type = Transition::Negative;
	return created;
}
|
|
|
|
// Creates a NegativeFail transition from start to end.
Transition* Automaton::NewNegativeFail(State* start, State* end)
{
	Transition* const created = NewTransition(start, end);
	created->type = Transition::NegativeFail;
	return created;
}
|
|
|
|
// Creates an End transition from start to end.
Transition* Automaton::NewEnd(State* start, State* end)
{
	Transition* const created = NewTransition(start, end);
	created->type = Transition::End;
	return created;
}
|
|
|
|
/***********************************************************************
|
|
Helpers
|
|
***********************************************************************/
|
|
|
|
// Epsilon predicate that accepts Epsilon, Nop, Capture and End transitions;
// every other transition type is rejected.
bool PureEpsilonChecker(Transition* transition)
{
	const auto type = transition->type;
	return type == Transition::Epsilon
		|| type == Transition::Nop
		|| type == Transition::Capture
		|| type == Transition::End;
}
|
|
|
|
// Epsilon predicate that accepts only Epsilon transitions.
bool RichEpsilonChecker(Transition* transition)
{
	return transition->type == Transition::Epsilon;
}
|
|
|
|
// Two transitions are equal when they have the same type and the same payload
// for that type: range for Chars, capture for Capture, capture and index for
// Match. All other types carry no payload that is compared.
bool AreEqual(Transition* transA, Transition* transB)
{
	if (transA->type != transB->type)
	{
		return false;
	}

	const auto type = transA->type;
	if (type == Transition::Chars)
	{
		return transA->range == transB->range;
	}
	if (type == Transition::Capture)
	{
		return transA->capture == transB->capture;
	}
	if (type == Transition::Match)
	{
		return transA->capture == transB->capture && transA->index == transB->index;
	}
	return true;
}
|
|
|
|
// Collect epsilon states and non-epsilon transitions, their order are maintained to match the e-NFA
|
|
void CollectEpsilon(State* targetState, State* sourceState, bool(*epsilonChecker)(Transition*), List<State*>& epsilonStates, List<Transition*>& transitions)
|
|
{
|
|
if (!epsilonStates.Contains(sourceState))
|
|
{
|
|
epsilonStates.Add(sourceState);
|
|
// TODO: (enumerable) foreach:alterable
|
|
for (vint i = 0; i < sourceState->transitions.Count(); i++)
|
|
{
|
|
Transition* transition = sourceState->transitions[i];
|
|
if (epsilonChecker(transition))
|
|
{
|
|
if (!epsilonStates.Contains(transition->target))
|
|
{
|
|
if (transition->target->finalState)
|
|
{
|
|
targetState->finalState = true;
|
|
}
|
|
CollectEpsilon(targetState, transition->target, epsilonChecker, epsilonStates, transitions);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
transitions.Add(transition);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Builds a new automaton from source in which every transition accepted by
// epsilonChecker has been removed: each new state gets copies of all transitions
// rejected by epsilonChecker that are reachable from its source state through
// accepted ones. nfaStateMap receives, for every new state, the source state it
// was created from.
Ptr<Automaton> EpsilonNfaToNfa(Ptr<Automaton> source, bool(*epsilonChecker)(Transition*), Dictionary<State*, State*>& nfaStateMap)
{
	auto target = Ptr(new Automaton);
	Dictionary<State*, State*> sourceToTarget;	// state of source -> state of target
	List<State*> closure;						// states reachable through accepted transitions
	List<Transition*> kept;						// transitions rejected by epsilonChecker

	State* const firstState = target->NewState();
	sourceToTarget.Add(source->startState, firstState);
	nfaStateMap.Add(firstState, source->startState);
	target->startState = target->states[0].Obj();
	CopyFrom(target->captureNames, source->captureNames);

	// target->states grows inside the loop, so Count() is re-read on every iteration
	// and newly created states are processed as well.
	// TODO: (enumerable) foreach
	for (vint stateIndex = 0; stateIndex < target->states.Count(); stateIndex++)
	{
		State* const targetState = target->states[stateIndex].Obj();
		State* const sourceState = nfaStateMap[targetState];
		if (sourceState->finalState)
		{
			targetState->finalState = true;
		}

		// Reset the per-state work lists, then collect the closure of sourceState.
		closure.Clear();
		kept.Clear();
		CollectEpsilon(targetState, sourceState, epsilonChecker, closure, kept);

		// TODO: (enumerable) foreach
		for (vint keptIndex = 0; keptIndex < kept.Count(); keptIndex++)
		{
			Transition* const original = kept[keptIndex];
			State* const originalTarget = original->target;

			// Create and map a new state the first time a source state is seen as a target.
			if (!sourceToTarget.Keys().Contains(originalTarget))
			{
				State* const created = target->NewState();
				sourceToTarget.Add(originalTarget, created);
				nfaStateMap.Add(created, originalTarget);
			}

			// Copy the transition between the two mapped states.
			Transition* const copy = target->NewTransition(targetState, sourceToTarget[originalTarget]);
			copy->type = original->type;
			copy->range = original->range;
			copy->capture = original->capture;
			copy->index = original->index;
		}
	}
	return target;
}
|
|
|
|
// Builds a new automaton from source by grouping NFA states: every state of the
// result stands for a list of NFA states, recorded in dfaStateMap. Outgoing
// transitions of those NFA states that compare equal with AreEqual are merged
// into one transition of the result.
Ptr<Automaton> NfaToDfa(Ptr<Automaton> source, Group<State*, State*>& dfaStateMap)
{
	auto target = Ptr(new Automaton);
	CopyFrom(target->captureNames, source->captureNames);

	State* const dfaStart = target->NewState();
	target->startState = dfaStart;
	dfaStateMap.Add(dfaStart, source->startState);

	// New states are appended to target->states while this loop runs and are
	// visited by it as well.
	for (auto currentState_ : target->states)
	{
		Group<Transition*, Transition*> classToMembers;			// representative -> equal NFA transitions
		Dictionary<Transition*, Transition*> memberToClass;		// NFA transition -> its representative
		List<Transition*> classesInOrder;						// representatives in first-seen order

		State* const currentState = currentState_.Obj();

		// Classify every transition leaving any NFA state behind currentState.
		for (auto nfaState : dfaStateMap[currentState])
		{
			for (auto nfaTransition : nfaState->transitions)
			{
				Transition* representative = nullptr;

				// Has this very transition been classified already?
				const vint known = memberToClass.Keys().IndexOf(nfaTransition);
				if (known != -1)
				{
					representative = memberToClass.Values()[known];
				}

				// Otherwise look for an existing class with an equal input.
				if (representative == nullptr)
				{
					// TODO: (enumerable) foreach
					for (vint classIndex = 0; classIndex < classesInOrder.Count(); classIndex++)
					{
						Transition* const candidate = classesInOrder[classIndex];
						if (AreEqual(candidate, nfaTransition))
						{
							representative = candidate;
							break;
						}
					}
				}

				// Otherwise the transition starts a class of its own.
				if (representative == nullptr)
				{
					representative = nfaTransition;
					classesInOrder.Add(representative);
				}

				classToMembers.Add(representative, nfaTransition);
				memberToClass.Add(nfaTransition, representative);
			}
		}

		// Emit one transition per class.
		for (auto transitionClass : classesInOrder)
		{
			auto&& members = classToMembers[transitionClass];

			// Distinct targets of all transitions in the class.
			List<State*> nfaTargets;
			CopyFrom(
				nfaTargets,
				From(members)
					.Select([](auto t) { return t->target; })
					.Distinct()
				);

			// Look for a state already recorded with exactly this list of NFA states.
			// The scan has no early exit, so the last matching entry is the one used.
			State* dfaState = 0;
			// TODO: (enumerable) foreach on dictionary
			for (vint mapIndex = 0; mapIndex < dfaStateMap.Count(); mapIndex++)
			{
				if (CompareEnumerable(nfaTargets, dfaStateMap.GetByIndex(mapIndex)) == 0)
				{
					dfaState = dfaStateMap.Keys()[mapIndex];
				}
			}

			// None found: create one; it is final when any of its NFA states is final.
			if (!dfaState)
			{
				dfaState = target->NewState();
				// TODO: (enumerable) foreach
				for (vint targetIndex = 0; targetIndex < nfaTargets.Count(); targetIndex++)
				{
					State* const nfaTarget = nfaTargets[targetIndex];
					dfaStateMap.Add(dfaState, nfaTarget);
					if (nfaTarget->finalState)
					{
						dfaState->finalState = true;
					}
				}
			}

			// Connect currentState to it with a copy of the class representative.
			Transition* const created = target->NewTransition(currentState, dfaState);
			created->type = transitionClass->type;
			created->range = transitionClass->range;
			created->capture = transitionClass->capture;
			created->index = transitionClass->index;
		}
	}

	return target;
}
|
|
}
|
|
}
|