Files
GacUI/Import/VlppRegex.cpp
2021-10-27 01:31:04 -07:00

3999 lines
100 KiB
C++

/***********************************************************************
THIS FILE IS AUTOMATICALLY GENERATED. DO NOT MODIFY
DEVELOPER: Zihan Chen(vczh)
***********************************************************************/
#include "VlppRegex.h"
/***********************************************************************
.\REGEX.CPP
***********************************************************************/
/***********************************************************************
Author: Zihan Chen (vczh)
Licensed under https://github.com/vczh-libraries/License
***********************************************************************/
namespace vl
{
namespace regex
{
using namespace collections;
using namespace regex_internal;
/***********************************************************************
RegexString
***********************************************************************/
// Build an empty fragment positioned at `_start`.
// The length is zero and `value` is left default-constructed.
RegexString::RegexString(vint _start)
    : start(_start)
    , length(0)
{
}
// Build a fragment that covers `_length` characters of `_string`, beginning at `_start`.
// A zero-length fragment stores an empty value and never calls Sub.
RegexString::RegexString(const WString& _string, vint _start, vint _length)
    : value(_length != 0 ? _string.Sub(_start, _length) : L"")
    , start(_start)
    , length(_length)
{
}
// Position of the fragment, as given to the constructor.
vint RegexString::Start()const
{
    return this->start;
}
// Number of characters covered by the fragment.
vint RegexString::Length()const
{
    return this->length;
}
// Text stored for the fragment (empty for zero-length fragments).
const WString& RegexString::Value()const
{
    return this->value;
}
// Two fragments are equal when position, length and text all agree.
bool RegexString::operator==(const RegexString& string)const
{
    if (start != string.start) return false;
    if (length != string.length) return false;
    return value == string.value;
}
/***********************************************************************
RegexMatch
***********************************************************************/
// Successful match produced by the pure interpretor: only the overall
// result range is recorded; no captures or groups are filled.
RegexMatch::RegexMatch(const WString& _string, PureResult* _result)
    : success(true)
    , result(_string, _result->start, _result->length)
{
}
// Successful match produced by the rich interpretor.
// Every capture record is copied out of `_result`: records whose capture
// index is -1 go to the anonymous capture list, all others are stored in
// `groups` under the name looked up from `_rich->CaptureNames()`.
RegexMatch::RegexMatch(const WString& _string, RichResult* _result, RichInterpretor* _rich)
    : success(true)
    , result(_string, _result->start, _result->length)
{
    const vint recordCount = _result->captures.Count();
    for (vint index = 0; index < recordCount; index++)
    {
        CaptureRecord& record = _result->captures[index];
        if (record.capture != -1)
        {
            groups.Add(_rich->CaptureNames().Get(record.capture), RegexString(_string, record.start, record.length));
        }
        else
        {
            captures.Add(RegexString(_string, record.start, record.length));
        }
    }
}
// Unsuccessful "match": wraps a fragment of text that the regex did not match.
RegexMatch::RegexMatch(const RegexString& _result)
    : success(false)
    , result(_result)
{
}
bool RegexMatch::Success()const
{
return success;
}
// The fragment of text this match object describes.
const RegexString& RegexMatch::Result()const
{
    return this->result;
}
// Captures whose record carried capture index -1 (anonymous captures).
const RegexMatch::CaptureList& RegexMatch::Captures()const
{
    return this->captures;
}
// Captures stored under a capture name.
const RegexMatch::CaptureGroup& RegexMatch::Groups()const
{
    return this->groups;
}
/***********************************************************************
Regex
***********************************************************************/
void Regex::Process(const WString& text, bool keepEmpty, bool keepSuccess, bool keepFail, RegexMatch::List& matches)const
{
if(rich)
{
const wchar_t* start=text.Buffer();
const wchar_t* input=start;
RichResult result;
while(rich->Match(input, start, result))
{
vint offset=input-start;
if(keepFail)
{
if(result.start>offset || keepEmpty)
{
matches.Add(new RegexMatch(RegexString(text, offset, result.start-offset)));
}
}
if(keepSuccess)
{
matches.Add(new RegexMatch(text, &result, rich));
}
input=start+result.start+result.length;
}
if(keepFail)
{
vint remain=input-start;
vint length=text.Length()-remain;
if(length || keepEmpty)
{
matches.Add(new RegexMatch(RegexString(text, remain, length)));
}
}
}
else
{
const wchar_t* start=text.Buffer();
const wchar_t* input=start;
PureResult result;
while(pure->Match(input, start, result))
{
vint offset=input-start;
if(keepFail)
{
if(result.start>offset || keepEmpty)
{
matches.Add(new RegexMatch(RegexString(text, offset, result.start-offset)));
}
}
if(keepSuccess)
{
matches.Add(new RegexMatch(text, &result));
}
input=start+result.start+result.length;
}
if(keepFail)
{
vint remain=input-start;
vint length=text.Length()-remain;
if(length || keepEmpty)
{
matches.Add(new RegexMatch(RegexString(text, remain, length)));
}
}
}
}
// Compile `code` into one or both interpretors.
//   preferPure == false                  -> rich only
//   preferPure && HasNoExtension()       -> pure only
//   preferPure && CanTreatAsPure()       -> pure and rich
//   preferPure otherwise                 -> rich only
// If building either interpretor throws, whatever was already created is
// deleted before the exception is rethrown.
// NOTE(review): assumes `pure` and `rich` start out null (initialized in the
// class declaration, which is not visible here) - confirm.
Regex::Regex(const WString& code, bool preferPure)
{
    CharRange::List subsets;
    RegexExpression::Ref regex = ParseRegexExpression(code);
    Expression::Ref expression = regex->Merge();
    expression->NormalizeCharSet(subsets);

    bool pureRequired = false;
    bool richRequired = false;
    if (!preferPure)
    {
        richRequired = true;
    }
    else if (expression->HasNoExtension())
    {
        pureRequired = true;
    }
    else
    {
        pureRequired = expression->CanTreatAsPure();
        richRequired = true;
    }

    try
    {
        if (pureRequired)
        {
            Dictionary<State*, State*> nfaStateMap;
            Group<State*, State*> dfaStateMap;
            Automaton::Ref eNfa = expression->GenerateEpsilonNfa();
            Automaton::Ref nfa = EpsilonNfaToNfa(eNfa, PureEpsilonChecker, nfaStateMap);
            Automaton::Ref dfa = NfaToDfa(nfa, dfaStateMap);
            pure = new PureInterpretor(dfa, subsets);
        }
        if (richRequired)
        {
            Dictionary<State*, State*> nfaStateMap;
            Group<State*, State*> dfaStateMap;
            Automaton::Ref eNfa = expression->GenerateEpsilonNfa();
            Automaton::Ref nfa = EpsilonNfaToNfa(eNfa, RichEpsilonChecker, nfaStateMap);
            Automaton::Ref dfa = NfaToDfa(nfa, dfaStateMap);
            rich = new RichInterpretor(dfa);
        }
    }
    catch (...)
    {
        // Deleting a null pointer is a no-op, so no guards are needed.
        delete pure;
        delete rich;
        throw;
    }
}
// Release both interpretors; deleting a null pointer is a no-op.
Regex::~Regex()
{
    delete pure;
    delete rich;
}
// True when no rich interpretor was created.
bool Regex::IsPureMatch()const
{
    return rich == nullptr;
}
// True when a pure interpretor was created.
bool Regex::IsPureTest()const
{
    return pure != nullptr;
}
// Run MatchHead on the start of `text`, preferring the rich interpretor.
// Returns a new RegexMatch on success and a null reference otherwise.
RegexMatch::Ref Regex::MatchHead(const WString& text)const
{
    if (!rich)
    {
        PureResult pureResult;
        if (!pure->MatchHead(text.Buffer(), text.Buffer(), pureResult))
        {
            return 0;
        }
        return new RegexMatch(text, &pureResult);
    }

    RichResult richResult;
    if (!rich->MatchHead(text.Buffer(), text.Buffer(), richResult))
    {
        return 0;
    }
    return new RegexMatch(text, &richResult, rich);
}
// Run Match on `text`, preferring the rich interpretor.
// Returns a new RegexMatch on success and a null reference otherwise.
RegexMatch::Ref Regex::Match(const WString& text)const
{
    if (!rich)
    {
        PureResult pureResult;
        if (!pure->Match(text.Buffer(), text.Buffer(), pureResult))
        {
            return 0;
        }
        return new RegexMatch(text, &pureResult);
    }

    RichResult richResult;
    if (!rich->Match(text.Buffer(), text.Buffer(), richResult))
    {
        return 0;
    }
    return new RegexMatch(text, &richResult, rich);
}
bool Regex::TestHead(const WString& text)const
{
if(pure)
{
PureResult result;
return pure->MatchHead(text.Buffer(), text.Buffer(), result);
}
else
{
RichResult result;
return rich->MatchHead(text.Buffer(), text.Buffer(), result);
}
}
bool Regex::Test(const WString& text)const
{
if(pure)
{
PureResult result;
return pure->Match(text.Buffer(), text.Buffer(), result);
}
else
{
RichResult result;
return rich->Match(text.Buffer(), text.Buffer(), result);
}
}
// Collect only the matched pieces of `text`.
void Regex::Search(const WString& text, RegexMatch::List& matches)const
{
    const bool keepEmpty = false;
    const bool keepSuccess = true;
    const bool keepFail = false;
    Process(text, keepEmpty, keepSuccess, keepFail, matches);
}
// Collect only the unmatched pieces of `text`; `keepEmptyMatch` decides
// whether zero-length unmatched pieces are kept.
void Regex::Split(const WString& text, bool keepEmptyMatch, RegexMatch::List& matches)const
{
    const bool keepSuccess = false;
    const bool keepFail = true;
    Process(text, keepEmptyMatch, keepSuccess, keepFail, matches);
}
// Collect both matched and unmatched pieces of `text`; `keepEmptyMatch`
// decides whether zero-length unmatched pieces are kept.
void Regex::Cut(const WString& text, bool keepEmptyMatch, RegexMatch::List& matches)const
{
    const bool keepSuccess = true;
    const bool keepFail = true;
    Process(text, keepEmptyMatch, keepSuccess, keepFail, matches);
}
/***********************************************************************
RegexTokens
***********************************************************************/
// Tokens are equal when length, token id and the `reading` pointer all agree.
// Note that `reading` is compared as a pointer, not by content.
bool RegexToken::operator==(const RegexToken& _token)const
{
    if (length != _token.length) return false;
    if (token != _token.token) return false;
    return reading == _token.reading;
}
// Compare the token text with a null-terminated string.
// The lengths must agree and the first `length` characters at `reading`
// must equal `_token`.
bool RegexToken::operator==(const wchar_t* _token)const
{
    // wcslen returns size_t while `length` is a vint; compare in the signed
    // domain to avoid an implicit signed/unsigned conversion.
    if (static_cast<vint>(wcslen(_token)) != length)
    {
        return false;
    }
    // `length` equals a wcslen result here, so it is non-negative.
    return wcsncmp(reading, _token, static_cast<size_t>(length)) == 0;
}
// Enumerates the tokens of a null-terminated buffer by repeatedly running
// PureInterpretor::MatchHead from the current reading position.
// Consecutive unrecognized pieces (token id -1) are merged into one token.
// Row/column positions are tracked while tokens are produced.
class RegexTokenEnumerator : public Object, public IEnumerator<RegexToken>
{
protected:
    RegexToken token;                   // token exposed by Current()
    vint index = -1;                    // index of the current token, -1 before the first Next()
    PureInterpretor* pure;              // not owned
    const Array<vint>& stateTokens;     // maps a DFA state to a token id
    const wchar_t* start;               // beginning of the whole input
    vint codeIndex;                     // copied into every produced token
    RegexProc proc;
    const wchar_t* reading;             // next character to be consumed
    vint rowStart = 0;                  // row of the next token to produce
    vint columnStart = 0;               // column of the next token to produce
    bool cacheAvailable = false;        // true when cacheToken holds a token already read
    RegexToken cacheToken;              // token read ahead while finishing the previous one
public:
    // Copy every piece of enumeration state (initializers follow declaration order).
    RegexTokenEnumerator(const RegexTokenEnumerator& enumerator)
        : token(enumerator.token)
        , index(enumerator.index)
        , pure(enumerator.pure)
        , stateTokens(enumerator.stateTokens)
        , start(enumerator.start)
        , codeIndex(enumerator.codeIndex)
        , proc(enumerator.proc)
        , reading(enumerator.reading)
        , rowStart(enumerator.rowStart)
        , columnStart(enumerator.columnStart)
        , cacheAvailable(enumerator.cacheAvailable)
        , cacheToken(enumerator.cacheToken)
    {
    }

    // Start enumerating at `_start`; `_stateTokens` is held by reference and must outlive the enumerator.
    RegexTokenEnumerator(PureInterpretor* _pure, const Array<vint>& _stateTokens, const wchar_t* _start, vint _codeIndex, RegexProc _proc)
        : index(-1)
        , pure(_pure)
        , stateTokens(_stateTokens)
        , start(_start)
        , codeIndex(_codeIndex)
        , proc(_proc)
        , reading(_start)
    {
    }

    IEnumerator<RegexToken>* Clone()const
    {
        return new RegexTokenEnumerator(*this);
    }

    const RegexToken& Current()const
    {
        return token;
    }

    vint Index()const
    {
        return index;
    }

    // Produce the next token. Returns false when the input is exhausted and
    // no read-ahead token is pending.
    bool Next()
    {
        if (!cacheAvailable && !*reading) return false;
        if (cacheAvailable)
        {
            // Resume with the token that was read ahead by the previous call.
            token = cacheToken;
            cacheAvailable = false;
        }
        else
        {
            // token == -2 marks "nothing read yet" for the loop below.
            token.reading = reading;
            token.start = 0;
            token.length = 0;
            token.token = -2;
            token.completeToken = true;
        }
        token.rowStart = rowStart;
        token.columnStart = columnStart;
        token.rowEnd = rowStart;
        token.columnEnd = columnStart;
        token.codeIndex = codeIndex;

        PureResult result;
        while (*reading)
        {
            vint id = -1;
            bool completeToken = true;
            if (!pure->MatchHead(reading, start, result))
            {
                result.start = reading - start;
                // No match: see whether the state where matching stopped is related to a final state.
                if (id == -1 && result.terminateState != -1)
                {
                    vint state = pure->GetRelatedFinalState(result.terminateState);
                    if (state != -1)
                    {
                        id = stateTokens[state];
                    }
                }
                if (id == -1)
                {
                    // Unrecognized input: consume a single character.
                    result.length = 1;
                }
                else
                {
                    completeToken = false;
                }
            }
            else
            {
                id = stateTokens.Get(result.finalState);
            }

            if (id != -1 && proc.extendProc)
            {
                // Let the user callback adjust length / id / completeness.
                // This local intentionally has the same name as the member `token`.
                RegexProcessingToken token(result.start, result.length, id, completeToken, nullptr);
                proc.extendProc(proc.argument, reading, -1, true, token);
#if _DEBUG
                CHECK_ERROR(token.interTokenState == nullptr, L"RegexTokenEnumerator::Next()#The extendProc is only allowed to create interTokenState in RegexLexerColorizer.");
#endif
                result.length = token.length;
                id = token.token;
                completeToken = token.completeToken;
            }

            if (token.token == -2)
            {
                // First piece read in this call.
                token.start = result.start;
                token.length = result.length;
                token.token = id;
                token.completeToken = completeToken;
            }
            else if (token.token == id && id == -1)
            {
                // Merge consecutive unrecognized pieces.
                token.length += result.length;
            }
            else
            {
                // A different token begins: keep it for the next call.
                cacheAvailable = true;
                cacheToken.reading = reading;
                cacheToken.start = result.start;
                cacheToken.length = result.length;
                cacheToken.codeIndex = codeIndex;
                cacheToken.token = id;
                cacheToken.completeToken = completeToken;
            }
            reading += result.length;
            if (cacheAvailable)
            {
                break;
            }
        }

        index++;
        // Advance the row/column counters over the token; rowEnd/columnEnd
        // end up at the position of the token's last character.
        for (vint i = 0; i < token.length; i++)
        {
            token.rowEnd = rowStart;
            token.columnEnd = columnStart;
            if (token.reading[i] == L'\n')
            {
                rowStart++;
                columnStart = 0;
            }
            else
            {
                columnStart++;
            }
        }
        return true;
    }

    // Rewind to the beginning of the input.
    // The row/column counters are rewound as well; otherwise tokens produced
    // after a reset would report positions continuing from the previous pass.
    void Reset()
    {
        index = -1;
        reading = start;
        rowStart = 0;
        columnStart = 0;
        cacheAvailable = false;
    }

    // Append every remaining token for which `discard(token id)` returns false.
    // `discard` must not be null.
    void ReadToEnd(List<RegexToken>& tokens, bool(*discard)(vint))
    {
        while (Next())
        {
            if (!discard(token.token))
            {
                tokens.Add(token);
            }
        }
    }
};
// Remember everything needed to tokenize `_code` later; no work is done here.
RegexTokens::RegexTokens(PureInterpretor* _pure, const Array<vint>& _stateTokens, const WString& _code, vint _codeIndex, RegexProc _proc)
    : pure(_pure)
    , stateTokens(_stateTokens)
    , code(_code)
    , codeIndex(_codeIndex)
    , proc(_proc)
{
}
// Member-wise copy of another token source.
RegexTokens::RegexTokens(const RegexTokens& tokens)
    : pure(tokens.pure)
    , stateTokens(tokens.stateTokens)
    , code(tokens.code)
    , codeIndex(tokens.codeIndex)
    , proc(tokens.proc)
{
}
// Create a fresh enumerator positioned at the beginning of `code`.
// The enumerator reads directly from `code.Buffer()`.
IEnumerator<RegexToken>* RegexTokens::CreateEnumerator()const
{
    const wchar_t* input = code.Buffer();
    return new RegexTokenEnumerator(pure, stateTokens, input, codeIndex, proc);
}
// Discard predicate used when the caller supplies none: keeps every token.
bool DefaultDiscard(vint token)
{
    (void)token;
    return false;
}
// Tokenize the whole input and append the tokens that `discard` does not reject.
// A null `discard` keeps every token.
void RegexTokens::ReadToEnd(collections::List<RegexToken>& tokens, bool(*discard)(vint))const
{
    if (!discard)
    {
        discard = &DefaultDiscard;
    }
    RegexTokenEnumerator enumerator(pure, stateTokens, code.Buffer(), codeIndex, proc);
    enumerator.ReadToEnd(tokens, discard);
}
/***********************************************************************
RegexLexerWalker
***********************************************************************/
// Bind the walker to an interpretor and its state-to-token table.
RegexLexerWalker::RegexLexerWalker(PureInterpretor* _pure, const Array<vint>& _stateTokens)
    : pure(_pure)
    , stateTokens(_stateTokens)
{
}
// Member-wise copy of another walker.
RegexLexerWalker::RegexLexerWalker(const RegexLexerWalker& tokens)
    : pure(tokens.pure)
    , stateTokens(tokens.stateTokens)
{
}
// Nothing to release here: this destructor deletes nothing.
RegexLexerWalker::~RegexLexerWalker()
{
}
// Nothing to release here: this destructor deletes nothing.
RegexTokens::~RegexTokens()
{
}
// Start state of the underlying interpretor.
vint RegexLexerWalker::GetStartState()const
{
    const vint startState = pure->GetStartState();
    return startState;
}
// Token id associated with the final state related to `state`.
// Returns -1 when `state` is -1 or when no related final state exists.
vint RegexLexerWalker::GetRelatedToken(vint state)const
{
    if (state == -1)
    {
        return -1;
    }
    const vint finalState = pure->GetRelatedFinalState(state);
    if (finalState == -1)
    {
        return -1;
    }
    return stateTokens.Get(finalState);
}
void RegexLexerWalker::Walk(wchar_t input, vint& state, vint& token, bool& finalState, bool& previousTokenStop)const
{
vint previousState=state;
token=-1;
finalState=false;
previousTokenStop=false;
if(state==-1)
{
state=pure->GetStartState();
previousTokenStop=true;
}
state=pure->Transit(input, state);
if(state==-1)
{
previousTokenStop=true;
if(previousState==-1)
{
finalState=true;
return;
}
else if(pure->IsFinalState(previousState))
{
state=pure->Transit(input, pure->GetStartState());
}
}
if(pure->IsFinalState(state))
{
token=stateTokens.Get(state);
finalState=true;
return;
}
else
{
finalState=state==-1;
return;
}
}
// Convenience overload: feed one character and return only the resulting state.
vint RegexLexerWalker::Walk(wchar_t input, vint state)const
{
    vint ignoredToken = -1;
    bool ignoredFinal = false;
    bool ignoredStop = false;
    Walk(input, state, ignoredToken, ignoredFinal, ignoredStop);
    return state;
}
bool RegexLexerWalker::IsClosedToken(const wchar_t* input, vint length)const
{
vint state=pure->GetStartState();
for(vint i=0;i<length;i++)
{
state=pure->Transit(input[i], state);
if(state==-1) return true;
if(pure->IsDeadState(state)) return true;
}
return false;
}
bool RegexLexerWalker::IsClosedToken(const WString& input)const
{
return IsClosedToken(input.Buffer(), input.Length());
}
/***********************************************************************
RegexLexerColorizer
***********************************************************************/
// Create a colorizer whose current state is the walker's start state.
RegexLexerColorizer::RegexLexerColorizer(const RegexLexerWalker& _walker, RegexProc _proc)
    : walker(_walker)
    , proc(_proc)
{
    const vint startState = walker.GetStartState();
    internalState.currentState = startState;
}
// Member-wise copy, including the internal state.
RegexLexerColorizer::RegexLexerColorizer(const RegexLexerColorizer& colorizer)
    : walker(colorizer.walker)
    , proc(colorizer.proc)
    , internalState(colorizer.internalState)
{
}
// Nothing to release here: this destructor deletes nothing.
RegexLexerColorizer::~RegexLexerColorizer()
{
}
// Return a copy of the colorizer's internal state.
RegexLexerColorizer::InternalState RegexLexerColorizer::GetInternalState()
{
    return this->internalState;
}
// Replace the colorizer's internal state with `state`.
void RegexLexerColorizer::SetInternalState(InternalState state)
{
    this->internalState = state;
}
// Advance the internal state over a single character without colorizing.
void RegexLexerColorizer::Pass(wchar_t input)
{
    const bool colorize = false;
    WalkOneToken(&input, 1, 0, colorize);
}
// Start state reported by the walker.
vint RegexLexerColorizer::GetStartState()const
{
    const vint startState = walker.GetStartState();
    return startState;
}
// Hand a recognized token to extendProc, record any inter-token state it
// returns, and optionally report the (possibly adjusted) token to colorizeProc.
// In debug builds the callback's result is validated.
// The caller must ensure proc.extendProc is not null.
void RegexLexerColorizer::CallExtendProcAndColorizeProc(const wchar_t* input, vint length, RegexProcessingToken& token, bool colorize)
{
    vint lengthBefore = token.length;
    proc.extendProc(proc.argument, input + token.start, length - token.start, false, token);
#if _DEBUG
    {
        bool pausedAtTheEnd = token.start + token.length == length && !token.completeToken;
        CHECK_ERROR(
            token.completeToken || pausedAtTheEnd,
            L"RegexLexerColorizer::WalkOneToken(const wchar_t*, vint, vint, bool)#The extendProc is not allowed pause before the end of the input."
            );
        CHECK_ERROR(
            token.completeToken || token.token != -1,
            L"RegexLexerColorizer::WalkOneToken(const wchar_t*, vint, vint, bool)#The extendProc is not allowed to pause without a valid token id."
            );
        CHECK_ERROR(
            lengthBefore <= token.length,
            L"RegexLexerColorizer::WalkOneToken(const wchar_t*, vint, vint, bool)#The extendProc is not allowed to decrease the token length."
            );
        CHECK_ERROR(
            (token.interTokenState == nullptr) == !pausedAtTheEnd,
            L"RegexLexerColorizer::Colorize(const wchar_t*, vint, void*)#The extendProc should return an inter token state object if and only if a valid token does not end at the end of the input."
            );
    }
#endif
    // Always store the returned inter-token state; remember the token id only when one was returned.
    internalState.interTokenState = token.interTokenState;
    if (internalState.interTokenState)
    {
        internalState.interTokenId = token.token;
    }
    if (colorize)
    {
        proc.colorizeProc(proc.argument, token.start, token.length, token.token);
    }
}
// Process one token of `input` (of `length` characters) beginning at index `start`.
// When `colorize` is true the token is reported through proc.colorizeProc.
// Returns the index just past the processed token (the `start` for the next call);
// this is `length` when the input has been consumed to its end.
// internalState is updated so that a later call can continue where this one stopped.
vint RegexLexerColorizer::WalkOneToken(const wchar_t* input, vint length, vint start, bool colorize)
{
// A token was paused by extendProc in an earlier call: hand it the whole
// input so it can continue, then report the continued part starting at 0.
// Note that `start` is not used in this branch.
if (internalState.interTokenState)
{
RegexProcessingToken token(-1, -1, internalState.interTokenId, false, internalState.interTokenState);
proc.extendProc(proc.argument, input, length, false, token);
#if _DEBUG
{
bool pausedAtTheEnd = token.length == length && !token.completeToken;
CHECK_ERROR(
token.completeToken || pausedAtTheEnd,
L"RegexLexerColorizer::WalkOneToken(const wchar_t*, vint, vint, bool)#The extendProc is not allowed to pause before the end of the input."
);
CHECK_ERROR(
token.completeToken || token.token == internalState.interTokenId,
L"RegexLexerColorizer::WalkOneToken(const wchar_t*, vint, vint, bool)#The extendProc is not allowed to continue pausing with a different token id."
);
CHECK_ERROR(
(token.interTokenState == nullptr) == !pausedAtTheEnd,
L"RegexLexerColorizer::Colorize(const wchar_t*, vint, void*)#The extendProc should return an inter token state object if and only if a valid token does not end at the end of the input."
);
}
#endif
if (colorize)
{
proc.colorizeProc(proc.argument, 0, token.length, token.token);
}
// Store the new inter-token state; clear the remembered token id when there is none.
if (!(internalState.interTokenState = token.interTokenState))
{
internalState.interTokenId = -1;
}
return token.length;
}
// Bookkeeping for the most recent position at which the walker reported finalState.
vint lastFinalStateLength = 0;
vint lastFinalStateToken = -1;
vint lastFinalStateState = -1;
// State the colorizer was in when this call began.
vint tokenStartState = internalState.currentState;
for (vint i = start; i < length; i++)
{
vint currentToken = -1;
bool finalState = false;
bool previousTokenStop = false;
walker.Walk(input[i], internalState.currentState, currentToken, finalState, previousTokenStop);
// The walker reported that the previous token stopped before input[i].
if (previousTokenStop)
{
if (proc.extendProc && lastFinalStateToken != -1)
{
// A valid token was seen: let extendProc adjust it before reporting.
RegexProcessingToken token(start, lastFinalStateLength, lastFinalStateToken, true, nullptr);
CallExtendProcAndColorizeProc(input, length, token, colorize);
if (token.completeToken)
{
internalState.currentState = walker.GetStartState();
}
return start + token.length;
}
else if (i == start)
{
// Stopped on the very first character. Only when this call began in the
// start state is that character reported as a one-character token with
// id -1; otherwise the loop simply continues.
if (tokenStartState == GetStartState())
{
if (colorize)
{
proc.colorizeProc(proc.argument, start, 1, -1);
}
internalState.currentState = walker.GetStartState();
return i + 1;
}
}
else
{
// Report up to the last recorded final position and resume from the state recorded there.
if (colorize)
{
proc.colorizeProc(proc.argument, start, lastFinalStateLength, lastFinalStateToken);
}
internalState.currentState = lastFinalStateState;
return start + lastFinalStateLength;
}
}
if (finalState)
{
lastFinalStateLength = i + 1 - start;
lastFinalStateToken = currentToken;
lastFinalStateState = internalState.currentState;
}
}
// The end of the input was reached without the walker stopping a token.
if (lastFinalStateToken != -1 && start + lastFinalStateLength == length)
{
// A valid token ends exactly at the end of the input.
if (proc.extendProc)
{
RegexProcessingToken token(start, lastFinalStateLength, lastFinalStateToken, true, nullptr);
CallExtendProcAndColorizeProc(input, length, token, colorize);
}
else if (colorize)
{
proc.colorizeProc(proc.argument, start, lastFinalStateLength, lastFinalStateToken);
}
}
else if (colorize)
{
// Otherwise report the rest of the input with the token related to the current state.
proc.colorizeProc(proc.argument, start, length - start, walker.GetRelatedToken(internalState.currentState));
}
return length;
}
void* RegexLexerColorizer::Colorize(const wchar_t* input, vint length)
{
vint index = 0;
while (index != length)
{
index = WalkOneToken(input, length, index, true);
}
return internalState.interTokenState;
}
/***********************************************************************
RegexLexer
***********************************************************************/
// Build one combined DFA that recognizes every regex in `tokens`.
// Each token's index in `tokens` becomes its token id. After construction
// stateTokens[s] holds, for DFA state s, the smallest token id among the
// states merged into it; states that are final for no token hold tokens' count.
RegexLexer::RegexLexer(const collections::IEnumerable<WString>& tokens, RegexProc _proc)
:proc(_proc)
{
// Build DFA for all tokens
List<Expression::Ref> expressions;
List<Automaton::Ref> dfas;
CharRange::List subsets;
Ptr<IEnumerator<WString>> enumerator = tokens.CreateEnumerator();
// First pass: parse every regex and collect the character sets of all of them into `subsets`.
while (enumerator->Next())
{
const WString& code = enumerator->Current();
RegexExpression::Ref regex = ParseRegexExpression(code);
Expression::Ref expression = regex->Merge();
expression->CollectCharSet(subsets);
expressions.Add(expression);
}
// Second pass: apply the shared `subsets` to each expression and build its own DFA.
for (vint i = 0; i < expressions.Count(); i++)
{
Dictionary<State*, State*> nfaStateMap;
Group<State*, State*> dfaStateMap;
Expression::Ref expression = expressions[i];
expression->ApplyCharSet(subsets);
Automaton::Ref eNfa = expression->GenerateEpsilonNfa();
Automaton::Ref nfa = EpsilonNfaToNfa(eNfa, PureEpsilonChecker, nfaStateMap);
Automaton::Ref dfa = NfaToDfa(nfa, dfaStateMap);
dfas.Add(dfa);
}
// Mark all states in DFAs
// Final states carry the token index; non-final states carry dfas.Count(),
// which is larger than every token index.
for (vint i = 0; i < dfas.Count(); i++)
{
Automaton::Ref dfa = dfas[i];
for (vint j = 0; j < dfa->states.Count(); j++)
{
if (dfa->states[j]->finalState)
{
dfa->states[j]->userData = (void*)i;
}
else
{
dfa->states[j]->userData = (void*)dfas.Count();
}
}
}
// Connect all DFAs to an e-NFA
Automaton::Ref bigEnfa = new Automaton;
for (vint i = 0; i < dfas.Count(); i++)
{
CopyFrom(bigEnfa->states, dfas[i]->states);
CopyFrom(bigEnfa->transitions, dfas[i]->transitions);
}
// A new start state reaches every token's DFA through an epsilon transition.
bigEnfa->startState = bigEnfa->NewState();
for (vint i = 0; i < dfas.Count(); i++)
{
bigEnfa->NewEpsilon(bigEnfa->startState, dfas[i]->startState);
}
// Build a single DFA out of the e-NFA
Dictionary<State*, State*> nfaStateMap;
Group<State*, State*> dfaStateMap;
Automaton::Ref bigNfa = EpsilonNfaToNfa(bigEnfa, PureEpsilonChecker, nfaStateMap);
// Carry the token marks over from the e-NFA states to the NFA states that represent them.
for (vint i = 0; i < nfaStateMap.Keys().Count(); i++)
{
void* userData = nfaStateMap.Values().Get(i)->userData;
nfaStateMap.Keys()[i]->userData = userData;
}
Automaton::Ref bigDfa = NfaToDfa(bigNfa, dfaStateMap);
// Each DFA state stands for a set of NFA states: keep the smallest mark,
// so the token with the lowest index wins when several tokens match.
for (vint i = 0; i < dfaStateMap.Keys().Count(); i++)
{
void* userData = dfaStateMap.GetByIndex(i).Get(0)->userData;
for (vint j = 1; j < dfaStateMap.GetByIndex(i).Count(); j++)
{
void* newData = dfaStateMap.GetByIndex(i).Get(j)->userData;
if (userData > newData)
{
userData = newData;
}
}
dfaStateMap.Keys()[i]->userData = userData;
}
// Build state machine
pure = new PureInterpretor(bigDfa, subsets);
// Copy every state's mark into the state-to-token table.
stateTokens.Resize(bigDfa->states.Count());
for (vint i = 0; i < stateTokens.Count(); i++)
{
void* userData = bigDfa->states[i]->userData;
stateTokens[i] = (vint)userData;
}
}
// Release the interpretor; deleting a null pointer is a no-op.
RegexLexer::~RegexLexer()
{
    delete pure;
}
// Create a token source for `code`. The related-final-state table of the
// interpretor is prepared first.
RegexTokens RegexLexer::Parse(const WString& code, vint codeIndex)const
{
    pure->PrepareForRelatedFinalStateTable();
    RegexTokens tokens(pure, stateTokens, code, codeIndex, proc);
    return tokens;
}
// Create a walker over this lexer's automaton. The related-final-state
// table of the interpretor is prepared first.
RegexLexerWalker RegexLexer::Walk()const
{
    pure->PrepareForRelatedFinalStateTable();
    RegexLexerWalker walker(pure, stateTokens);
    return walker;
}
// Create a colorizer built on a fresh walker and this lexer's callbacks.
RegexLexerColorizer RegexLexer::Colorize()const
{
    RegexLexerWalker walker = Walk();
    return RegexLexerColorizer(walker, proc);
}
}
}
/***********************************************************************
.\REGEXAUTOMATON.CPP
***********************************************************************/
/***********************************************************************
Author: Zihan Chen (vczh)
Licensed under https://github.com/vczh-libraries/License
***********************************************************************/
namespace vl
{
namespace regex_internal
{
using namespace collections;
/***********************************************************************
Automaton
***********************************************************************/
// An empty automaton has no start state yet.
Automaton::Automaton()
{
    startState = nullptr;
}
// Create a non-final state with no user data and register it in `states`.
State* Automaton::NewState()
{
    State* created = new State;
    created->finalState = false;
    created->userData = 0;
    states.Add(created);
    return created;
}
// Create a transition from `start` to `end`, link it into both states and
// register it in `transitions`. The caller is expected to set `type` and
// any type-specific fields afterwards.
Transition* Automaton::NewTransition(State* start, State* end)
{
    // Value-initialize so that fields a particular transition type never
    // assigns (e.g. capture/index on a Chars transition) hold a defined value;
    // EpsilonNfaToNfa and NfaToDfa copy all of these fields regardless of type.
    // NOTE(review): assumes Transition declares no default member initializers
    // of its own - if it does, this is simply equivalent to the previous code.
    Transition* transition = new Transition();
    transition->source = start;
    transition->target = end;
    start->transitions.Add(transition);
    end->inputs.Add(transition);
    transitions.Add(transition);
    return transition;
}
// Create a transition that consumes a character in `range`.
Transition* Automaton::NewChars(State* start, State* end, CharRange range)
{
    Transition* edge = NewTransition(start, end);
    edge->type = Transition::Chars;
    edge->range = range;
    return edge;
}
// Create a transition of type Epsilon.
Transition* Automaton::NewEpsilon(State* start, State* end)
{
    Transition* edge = NewTransition(start, end);
    edge->type = Transition::Epsilon;
    return edge;
}
// Create a transition of type BeginString.
Transition* Automaton::NewBeginString(State* start, State* end)
{
    Transition* edge = NewTransition(start, end);
    edge->type = Transition::BeginString;
    return edge;
}
// Create a transition of type EndString.
Transition* Automaton::NewEndString(State* start, State* end)
{
    Transition* edge = NewTransition(start, end);
    edge->type = Transition::EndString;
    return edge;
}
// Create a transition of type Nop.
Transition* Automaton::NewNop(State* start, State* end)
{
    Transition* edge = NewTransition(start, end);
    edge->type = Transition::Nop;
    return edge;
}
// Create a transition of type Capture carrying the capture index `capture`.
Transition* Automaton::NewCapture(State* start, State* end, vint capture)
{
    Transition* edge = NewTransition(start, end);
    edge->type = Transition::Capture;
    edge->capture = capture;
    return edge;
}
// Create a transition of type Match carrying `capture` and `index`.
Transition* Automaton::NewMatch(State* start, State* end, vint capture, vint index)
{
    Transition* edge = NewTransition(start, end);
    edge->type = Transition::Match;
    edge->capture = capture;
    edge->index = index;
    return edge;
}
// Create a transition of type Positive.
Transition* Automaton::NewPositive(State* start, State* end)
{
    Transition* edge = NewTransition(start, end);
    edge->type = Transition::Positive;
    return edge;
}
// Create a transition of type Negative.
Transition* Automaton::NewNegative(State* start, State* end)
{
    Transition* edge = NewTransition(start, end);
    edge->type = Transition::Negative;
    return edge;
}
// Create a transition of type NegativeFail.
Transition* Automaton::NewNegativeFail(State* start, State* end)
{
    Transition* edge = NewTransition(start, end);
    edge->type = Transition::NegativeFail;
    return edge;
}
// Create a transition of type End.
Transition* Automaton::NewEnd(State* start, State* end)
{
    Transition* edge = NewTransition(start, end);
    edge->type = Transition::End;
    return edge;
}
/***********************************************************************
Helpers
***********************************************************************/
// Epsilon predicate used when building a pure automaton:
// Epsilon, Nop, Capture and End transitions are all treated as epsilon.
bool PureEpsilonChecker(Transition* transition)
{
    const auto type = transition->type;
    return type == Transition::Epsilon
        || type == Transition::Nop
        || type == Transition::Capture
        || type == Transition::End;
}
// Epsilon predicate used when building a rich automaton:
// only Epsilon transitions are treated as epsilon.
bool RichEpsilonChecker(Transition* transition)
{
    return transition->type == Transition::Epsilon;
}
// Decide whether two transitions represent the same input.
// The types must agree; in addition Chars compares the range, Capture
// compares the capture index, and Match compares capture and index.
// All other types are equal as soon as the types agree.
bool AreEqual(Transition* transA, Transition* transB)
{
    if (transA->type != transB->type)
    {
        return false;
    }
    if (transA->type == Transition::Chars)
    {
        return transA->range == transB->range;
    }
    if (transA->type == Transition::Capture)
    {
        return transA->capture == transB->capture;
    }
    if (transA->type == Transition::Match)
    {
        return transA->capture == transB->capture && transA->index == transB->index;
    }
    return true;
}
// Collect the epsilon closure of `sourceState` into `epsilonStates` and every
// non-epsilon transition leaving that closure into `transitions`. The order of
// both lists follows the order of the transitions in the e-NFA.
// `targetState` becomes final when a final state is entered through an epsilon
// transition. States already present in `epsilonStates` are not visited again.
void CollectEpsilon(State* targetState, State* sourceState, bool(*epsilonChecker)(Transition*), List<State*>& epsilonStates, List<Transition*>& transitions)
{
    if (epsilonStates.Contains(sourceState))
    {
        return;
    }
    epsilonStates.Add(sourceState);

    for (vint i = 0; i < sourceState->transitions.Count(); i++)
    {
        Transition* edge = sourceState->transitions[i];
        if (!epsilonChecker(edge))
        {
            transitions.Add(edge);
            continue;
        }
        if (epsilonStates.Contains(edge->target))
        {
            continue;
        }
        if (edge->target->finalState)
        {
            targetState->finalState = true;
        }
        CollectEpsilon(targetState, edge->target, epsilonChecker, epsilonStates, transitions);
    }
}
// Build a new automaton from `source` in which every transition accepted by
// `epsilonChecker` has been eliminated.
// On return `nfaStateMap` maps each state of the new automaton to the state of
// `source` that it was created for.
Automaton::Ref EpsilonNfaToNfa(Automaton::Ref source, bool(*epsilonChecker)(Transition*), Dictionary<State*, State*>& nfaStateMap)
{
Automaton::Ref target=new Automaton;
Dictionary<State*, State*> stateMap; // source->target
List<State*> epsilonStates; // current epsilon closure
List<Transition*> transitions; // current non-epsilon transitions
// The start state of the source is the first state created in the target.
stateMap.Add(source->startState, target->NewState());
nfaStateMap.Add(stateMap[source->startState], source->startState);
target->startState=target->states[0].Obj();
CopyFrom(target->captureNames, source->captureNames);
// target->states grows inside the loop (NewState appends to it), so states
// created below are processed by later iterations.
for(vint i=0;i<target->states.Count();i++)
{
State* targetState=target->states[i].Obj();
State* sourceState=nfaStateMap[targetState];
// A target state is final when the source state it stands for is final
if(sourceState->finalState)
{
targetState->finalState=true;
}
// Clear the per-state working lists
epsilonStates.Clear();
transitions.Clear();
// Collect epsilon states and non-epsilon transitions
CollectEpsilon(targetState, sourceState, epsilonChecker, epsilonStates, transitions);
// Iterate through all non-epsilon transitions
for(vint j=0;j<transitions.Count();j++)
{
Transition* transition=transitions[j];
// Create and map a new target state if a new non-epsilon state is found in the e-NFA
if(!stateMap.Keys().Contains(transition->target))
{
stateMap.Add(transition->target, target->NewState());
nfaStateMap.Add(stateMap[transition->target], transition->target);
}
// Copy transition to connect between two non-epsilon state
Transition* newTransition=target->NewTransition(targetState, stateMap[transition->target]);
newTransition->capture=transition->capture;
newTransition->index=transition->index;
newTransition->range=transition->range;
newTransition->type=transition->type;
}
}
return target;
}
// Build a new automaton from `source` in which, for every state, transitions
// that AreEqual() considers the same input are merged into a single transition
// leading to a state that stands for the set of all their targets.
// On return `dfaStateMap` maps each state of the new automaton to the set of
// `source` states it stands for.
Automaton::Ref NfaToDfa(Automaton::Ref source, Group<State*, State*>& dfaStateMap)
{
Automaton::Ref target=new Automaton;
Group<Transition*, Transition*> nfaTransitions;
List<Transition*> transitionClasses; // Maintain order for nfaTransitions.Keys
CopyFrom(target->captureNames, source->captureNames);
// The new start state stands for the source start state alone.
State* startState=target->NewState();
target->startState=startState;
dfaStateMap.Add(startState, source->startState);
// Working sets reused across iterations
SortedList<State*> transitionTargets;
SortedList<State*> relativeStates;
transitionTargets.SetLessMemoryMode(false);
relativeStates.SetLessMemoryMode(false);
// target->states grows inside the loop (NewState appends to it), so states
// created below are processed by later iterations.
for(vint i=0;i<target->states.Count();i++)
{
State* currentState=target->states[i].Obj();
nfaTransitions.Clear();
transitionClasses.Clear();
// Iterate through all NFA states which represent the DFA state
const List<State*>& nfaStates=dfaStateMap[currentState];
for(vint j=0;j<nfaStates.Count();j++)
{
State* nfaState=nfaStates.Get(j);
// Iterate through all transitions from those NFA states
for(vint k=0;k<nfaState->transitions.Count();k++)
{
Transition* nfaTransition=nfaState->transitions[k];
// Check if there is any key in nfaTransitions that has the same input as the current transition
Transition* transitionClass=0;
for(vint l=0;l<nfaTransitions.Keys().Count();l++)
{
Transition* key=nfaTransitions.Keys()[l];
if(AreEqual(key, nfaTransition))
{
transitionClass=key;
break;
}
}
// Create a new key if not
if(transitionClass==0)
{
transitionClass=nfaTransition;
transitionClasses.Add(transitionClass);
}
// Group the transition
nfaTransitions.Add(transitionClass, nfaTransition);
}
}
// Iterate through all key transition that represent all existing transition inputs from the same state
for(vint j=0;j<transitionClasses.Count();j++)
{
const List<Transition*>& transitionSet=nfaTransitions[transitionClasses[j]];
// Sort all target states and keep unique
transitionTargets.Clear();
for(vint l=0;l<transitionSet.Count();l++)
{
State* nfaState=transitionSet.Get(l)->target;
if(!transitionTargets.Contains(nfaState))
{
transitionTargets.Add(nfaState);
}
}
// Check if these NFA states represent a created DFA state
State* dfaState=0;
for(vint k=0;k<dfaStateMap.Count();k++)
{
// Sort NFA states for a certain DFA state
CopyFrom(relativeStates, dfaStateMap.GetByIndex(k));
// Compare two NFA states set
if(relativeStates.Count()==transitionTargets.Count())
{
bool equal=true;
for(vint l=0;l<relativeStates.Count();l++)
{
if(relativeStates[l]!=transitionTargets[l])
{
equal=false;
break;
}
}
if(equal)
{
dfaState=dfaStateMap.Keys()[k];
break;
}
}
}
// Create a new DFA state if there is not
// The new state is final when any of the NFA states it stands for is final
if(!dfaState)
{
dfaState=target->NewState();
for(vint k=0;k<transitionTargets.Count();k++)
{
dfaStateMap.Add(dfaState, transitionTargets[k]);
if(transitionTargets[k]->finalState)
{
dfaState->finalState=true;
}
}
}
// Create corresponding DFA transition
Transition* transitionClass=transitionClasses[j];
Transition* newTransition=target->NewTransition(currentState, dfaState);
newTransition->capture=transitionClass->capture;
newTransition->index=transitionClass->index;
newTransition->range=transitionClass->range;
newTransition->type=transitionClass->type;
}
}
return target;
}
}
}
/***********************************************************************
.\REGEXDATA.CPP
***********************************************************************/
/***********************************************************************
Author: Zihan Chen (vczh)
Licensed under https://github.com/vczh-libraries/License
***********************************************************************/
namespace vl
{
namespace regex_internal
{
/***********************************************************************
CharRange
***********************************************************************/
// Default constructor: both bounds are initialised to the NUL character.
CharRange::CharRange()
	: begin(L'\0'), end(L'\0')
{
}
// Builds the range [_begin, _end]; the bounds are stored as given, without reordering.
CharRange::CharRange(wchar_t _begin, wchar_t _end)
	: begin(_begin), end(_end)
{
}
// True when this range lies entirely before `item` (its end precedes item's begin).
bool CharRange::operator<(CharRange item)const
{
	return item.begin>end;
}
// True when this range is identical to `item` or lies entirely before it.
bool CharRange::operator<=(CharRange item)const
{
	if(*this<item)
	{
		return true;
	}
	return *this==item;
}
// True when this range lies entirely after `item` (item's end precedes this begin).
bool CharRange::operator>(CharRange item)const
{
	return begin>item.end;
}
// True when this range is identical to `item` or lies entirely after it.
bool CharRange::operator>=(CharRange item)const
{
	if(*this>item)
	{
		return true;
	}
	return *this==item;
}
// Two ranges are equal only when both bounds match.
bool CharRange::operator==(CharRange item)const
{
	if(begin!=item.begin)
	{
		return false;
	}
	return end==item.end;
}
// Two ranges differ when at least one bound differs.
bool CharRange::operator!=(CharRange item)const
{
	return !(begin==item.begin && end==item.end);
}
// True when the whole range precedes the character `item`.
bool CharRange::operator<(wchar_t item)const
{
	return item>end;
}
// True when the range starts at or before `item`; only `begin` is consulted.
bool CharRange::operator<=(wchar_t item)const
{
	return item>=begin;
}
// True when the whole range follows the character `item`.
bool CharRange::operator>(wchar_t item)const
{
	return begin>item;
}
// True when the range ends at or after `item`; only `end` is consulted.
bool CharRange::operator>=(wchar_t item)const
{
	return end>=item;
}
// "Equality" with a character means containment: begin <= item <= end.
bool CharRange::operator==(wchar_t item)const
{
	return !(item<begin || end<item);
}
// True when `item` falls outside [begin, end].
bool CharRange::operator!=(wchar_t item)const
{
	return !(begin<=item && item<=end);
}
}
}
/***********************************************************************
.\REGEXEXPRESSION.CPP
***********************************************************************/
/***********************************************************************
Author: Zihan Chen (vczh)
Licensed under https://github.com/vczh-libraries/License
***********************************************************************/
namespace vl
{
namespace regex_internal
{
/***********************************************************************
IsEqualAlgorithm
***********************************************************************/
// Structural equality visitor: each overload answers whether `target` has the same
// dynamic type as the visited expression and the same fields / sub-expressions.
class IsEqualAlgorithm : public RegexExpressionAlgorithm<bool, Expression*>
{
public:
	// Char sets match when the reverse flag and every range (position by position) match.
	bool Apply(CharSetExpression* expression, Expression* target)
	{
		CharSetExpression* other=dynamic_cast<CharSetExpression*>(target);
		if(!other)
		{
			return false;
		}
		if(expression->reverse!=other->reverse)
		{
			return false;
		}
		const vint count=expression->ranges.Count();
		if(count!=other->ranges.Count())
		{
			return false;
		}
		for(vint i=0;i<count;i++)
		{
			if(expression->ranges[i]!=other->ranges[i])
			{
				return false;
			}
		}
		return true;
	}
	// Loops match on min, max, preferLong and the looped body.
	bool Apply(LoopExpression* expression, Expression* target)
	{
		LoopExpression* other=dynamic_cast<LoopExpression*>(target);
		if(!other)
		{
			return false;
		}
		return expression->min==other->min
			&& expression->max==other->max
			&& expression->preferLong==other->preferLong
			&& Invoke(expression->expression, other->expression.Obj());
	}
	// Sequences match when left and right operands match pairwise.
	bool Apply(SequenceExpression* expression, Expression* target)
	{
		SequenceExpression* other=dynamic_cast<SequenceExpression*>(target);
		if(!other)
		{
			return false;
		}
		return Invoke(expression->left, other->left.Obj())
			&& Invoke(expression->right, other->right.Obj());
	}
	// Alternations match when left and right operands match pairwise (order matters).
	bool Apply(AlternateExpression* expression, Expression* target)
	{
		AlternateExpression* other=dynamic_cast<AlternateExpression*>(target);
		if(!other)
		{
			return false;
		}
		return Invoke(expression->left, other->left.Obj())
			&& Invoke(expression->right, other->right.Obj());
	}
	// Begin anchors carry no data: a type match is enough.
	bool Apply(BeginExpression* expression, Expression* target)
	{
		return dynamic_cast<BeginExpression*>(target)!=0;
	}
	// End anchors carry no data: a type match is enough.
	bool Apply(EndExpression* expression, Expression* target)
	{
		return dynamic_cast<EndExpression*>(target)!=0;
	}
	// Captures match on name and captured body.
	bool Apply(CaptureExpression* expression, Expression* target)
	{
		CaptureExpression* other=dynamic_cast<CaptureExpression*>(target);
		if(!other)
		{
			return false;
		}
		if(expression->name!=other->name)
		{
			return false;
		}
		return Invoke(expression->expression, other->expression.Obj());
	}
	// Back-references match on name and index.
	bool Apply(MatchExpression* expression, Expression* target)
	{
		MatchExpression* other=dynamic_cast<MatchExpression*>(target);
		if(!other)
		{
			return false;
		}
		if(expression->name!=other->name)
		{
			return false;
		}
		return expression->index==other->index;
	}
	// Positive look-aheads match when their bodies match.
	bool Apply(PositiveExpression* expression, Expression* target)
	{
		PositiveExpression* other=dynamic_cast<PositiveExpression*>(target);
		if(!other)
		{
			return false;
		}
		return Invoke(expression->expression, other->expression.Obj());
	}
	// Negative look-aheads match when their bodies match.
	bool Apply(NegativeExpression* expression, Expression* target)
	{
		NegativeExpression* other=dynamic_cast<NegativeExpression*>(target);
		if(!other)
		{
			return false;
		}
		return Invoke(expression->expression, other->expression.Obj());
	}
	// Sub-expression references match on the referenced name.
	bool Apply(UsingExpression* expression, Expression* target)
	{
		UsingExpression* other=dynamic_cast<UsingExpression*>(target);
		if(!other)
		{
			return false;
		}
		return !(expression->name!=other->name);
	}
};
/***********************************************************************
HasNoExtensionAlgorithm
***********************************************************************/
// Visitor answering true only for trees built exclusively from char sets,
// preferLong loops, sequences and alternations. Every other node kind yields false.
// The void* argument is unused and always passed as 0.
class HasNoExtensionAlgorithm : public RegexExpressionAlgorithm<bool, void*>
{
public:
	bool Apply(CharSetExpression* expression, void* target)
	{
		return true;
	}
	// A loop qualifies only if it prefers the long match and its body qualifies.
	bool Apply(LoopExpression* expression, void* target)
	{
		if(!expression->preferLong)
		{
			return false;
		}
		return Invoke(expression->expression, 0);
	}
	// Both operands must qualify; the right side is skipped when the left fails.
	bool Apply(SequenceExpression* expression, void* target)
	{
		if(!Invoke(expression->left, 0))
		{
			return false;
		}
		return Invoke(expression->right, 0);
	}
	// Both branches must qualify; the right side is skipped when the left fails.
	bool Apply(AlternateExpression* expression, void* target)
	{
		if(!Invoke(expression->left, 0))
		{
			return false;
		}
		return Invoke(expression->right, 0);
	}
	// All remaining node kinds disqualify the expression.
	bool Apply(BeginExpression* expression, void* target)
	{
		return false;
	}
	bool Apply(EndExpression* expression, void* target)
	{
		return false;
	}
	bool Apply(CaptureExpression* expression, void* target)
	{
		return false;
	}
	bool Apply(MatchExpression* expression, void* target)
	{
		return false;
	}
	bool Apply(PositiveExpression* expression, void* target)
	{
		return false;
	}
	bool Apply(NegativeExpression* expression, void* target)
	{
		return false;
	}
	bool Apply(UsingExpression* expression, void* target)
	{
		return false;
	}
};
/***********************************************************************
CanTreatAsPureAlgorithm
***********************************************************************/
// Same rules as HasNoExtensionAlgorithm except that a capture is transparent:
// it qualifies whenever its body qualifies. The void* argument is unused (always 0).
class CanTreatAsPureAlgorithm : public RegexExpressionAlgorithm<bool, void*>
{
public:
	bool Apply(CharSetExpression* expression, void* target)
	{
		return true;
	}
	// A loop qualifies only if it prefers the long match and its body qualifies.
	bool Apply(LoopExpression* expression, void* target)
	{
		if(!expression->preferLong)
		{
			return false;
		}
		return Invoke(expression->expression, 0);
	}
	// Both operands must qualify; the right side is skipped when the left fails.
	bool Apply(SequenceExpression* expression, void* target)
	{
		if(!Invoke(expression->left, 0))
		{
			return false;
		}
		return Invoke(expression->right, 0);
	}
	// Both branches must qualify; the right side is skipped when the left fails.
	bool Apply(AlternateExpression* expression, void* target)
	{
		if(!Invoke(expression->left, 0))
		{
			return false;
		}
		return Invoke(expression->right, 0);
	}
	bool Apply(BeginExpression* expression, void* target)
	{
		return false;
	}
	bool Apply(EndExpression* expression, void* target)
	{
		return false;
	}
	// The capture itself is ignored; only the captured body decides.
	bool Apply(CaptureExpression* expression, void* target)
	{
		const bool bodyIsPure=Invoke(expression->expression, 0);
		return bodyIsPure;
	}
	bool Apply(MatchExpression* expression, void* target)
	{
		return false;
	}
	bool Apply(PositiveExpression* expression, void* target)
	{
		return false;
	}
	bool Apply(NegativeExpression* expression, void* target)
	{
		return false;
	}
	bool Apply(UsingExpression* expression, void* target)
	{
		return false;
	}
};
/***********************************************************************
CharSetNormalizationAlgorithm
***********************************************************************/
// Accumulator shared by the char-set algorithms below: holds the list of character
// ranges that BuildNormalizedCharSetAlgorithm splits into non-overlapping pieces and
// that SetNormalizedCharSetAlgorithm reads back.
class NormalizedCharSet
{
public:
// The ranges collected so far.
CharRange::List ranges;
};
// Base visitor for algorithms that only care about the char sets inside an
// expression tree. Composite nodes forward to their children, leaf nodes without
// characters do nothing, and derived classes supply Apply(CharSetExpression*) plus
// Process(), which receives one effective character range at a time.
class CharSetAlgorithm : public RegexExpressionAlgorithm<void, NormalizedCharSet*>
{
public:
	// Called once for every effective range of a visited char set.
	virtual void Process(CharSetExpression* expression, NormalizedCharSet* target, CharRange range)=0;

	// Feeds Process() with the effective ranges of `expression`:
	//  - not reversed: every entry of `ranges` as is;
	//  - reversed: the gaps between the entries of `ranges` inside [1, 65535].
	// The gap computation walks `ranges` front to back, so it relies on them being
	// ordered by position (presumably guaranteed by CharRange::List -- confirm).
	void Loop(CharSetExpression* expression, CharRange::List& ranges, NormalizedCharSet* target)
	{
		if(expression->reverse)
		{
			// The lower bound of the next gap is tracked in vint rather than wchar_t:
			// where wchar_t is 16 bits wide, "range.end+1" for range.end==65535 would
			// wrap to 0 and the final check below would emit a bogus [0, 65535] gap.
			vint begin=1;
			for(vint i=0;i<ranges.Count();i++)
			{
				CharRange range=ranges[i];
				if(static_cast<vint>(range.begin)>begin)
				{
					Process(expression, target, CharRange(static_cast<wchar_t>(begin), range.begin-1));
				}
				begin=static_cast<vint>(range.end)+1;
			}
			if(begin<=65535)
			{
				Process(expression, target, CharRange(static_cast<wchar_t>(begin), 65535));
			}
		}
		else
		{
			for(vint i=0;i<ranges.Count();i++)
			{
				Process(expression, target, ranges[i]);
			}
		}
	}
	// Composite nodes: recurse into the children.
	void Apply(LoopExpression* expression, NormalizedCharSet* target)
	{
		Invoke(expression->expression, target);
	}
	void Apply(SequenceExpression* expression, NormalizedCharSet* target)
	{
		Invoke(expression->left, target);
		Invoke(expression->right, target);
	}
	void Apply(AlternateExpression* expression, NormalizedCharSet* target)
	{
		Invoke(expression->left, target);
		Invoke(expression->right, target);
	}
	// Anchors contain no characters.
	void Apply(BeginExpression* expression, NormalizedCharSet* target)
	{
	}
	void Apply(EndExpression* expression, NormalizedCharSet* target)
	{
	}
	void Apply(CaptureExpression* expression, NormalizedCharSet* target)
	{
		Invoke(expression->expression, target);
	}
	// Back-references contain no char sets of their own.
	void Apply(MatchExpression* expression, NormalizedCharSet* target)
	{
	}
	void Apply(PositiveExpression* expression, NormalizedCharSet* target)
	{
		Invoke(expression->expression, target);
	}
	void Apply(NegativeExpression* expression, NormalizedCharSet* target)
	{
		Invoke(expression->expression, target);
	}
	// Sub-expression references are not followed here.
	void Apply(UsingExpression* expression, NormalizedCharSet* target)
	{
	}
};
// Collects every effective character range of an expression tree into
// target->ranges, splitting ranges where they overlap so that any two stored
// ranges end up either identical or disjoint.
class BuildNormalizedCharSetAlgorithm : public CharSetAlgorithm
{
public:
// Merges `range` into target->ranges. `range` is passed by value and is shrunk
// from the left as its leading parts get accounted for; whatever is left when the
// scan finishes is added as a new entry.
// NOTE(review): the index arithmetic appears to rely on target->ranges keeping its
// entries ordered on Add() -- confirm against the CharRange::List definition.
void Process(CharSetExpression* expression, NormalizedCharSet* target, CharRange range)
{
vint index=0;
while(index<target->ranges.Count())
{
CharRange current=target->ranges[index];
// `current` lies entirely before or after `range`: no overlap, move on.
if(current<range || current>range)
{
index++;
}
else if(current.begin<range.begin)
{
// Overlap, and `current` starts first: cut `current` at range.begin into
// [current.begin, range.begin-1] and [range.begin, current.end].
target->ranges.RemoveAt(index);
target->ranges.Add(CharRange(current.begin, range.begin-1));
target->ranges.Add(CharRange(range.begin, current.end));
index++;
}
else if(current.begin>range.begin)
{
// Overlap, and `range` starts first: store the part of `range` that precedes
// `current`, then continue with the rest of `range` (index is not advanced).
target->ranges.Add(CharRange(range.begin, current.begin-1));
range.begin=current.begin;
}
else if(current.end<range.end)
{
// Same begin, `current` ends first: `current` already covers the head of
// `range`, so only the tail after current.end remains to be merged.
range.begin=current.end+1;
index++;
}
else if(current.end>range.end)
{
// Same begin, `range` ends first: cut `current` into `range` and the
// remainder [range.end+1, current.end]; nothing of `range` is left.
target->ranges.RemoveAt(index);
target->ranges.Add(range);
target->ranges.Add(CharRange(range.end+1, current.end));
return;
}
else
{
// `range` and `current` are identical: already stored.
return;
}
}
// The remaining part of `range` overlaps nothing that is stored.
target->ranges.Add(range);
}
// Feeds every effective range of the char set (gaps, when reversed) to Process().
void Apply(CharSetExpression* expression, NormalizedCharSet* target)
{
Loop(expression, expression->ranges, target);
}
};
class SetNormalizedCharSetAlgorithm : public CharSetAlgorithm
{
public:
void Process(CharSetExpression* expression, NormalizedCharSet* target, CharRange range)
{
for(vint j=0;j<target->ranges.Count();j++)
{
CharRange targetRange=target->ranges[j];
if(range.begin<=targetRange.begin && targetRange.end<=range.end)
{
expression->ranges.Add(targetRange);
}
}
}
void Apply(CharSetExpression* expression, NormalizedCharSet* target)
{
CharRange::List source;
CopyFrom(source, expression->ranges);
expression->ranges.Clear();
Loop(expression, source, target);
expression->reverse=false;
}
};
/***********************************************************************
MergeAlgorithm
***********************************************************************/
// State threaded through MergeAlgorithm while sub-expression references are expanded.
class MergeParameter
{
public:
// Expansion results keyed by sub-expression name; MergeAlgorithm stores a null
// entry while a name is still being expanded, to detect reference loops.
Expression::Map definitions;
// The regex whose named definitions are looked up during the merge.
RegexExpression* regex;
};
// Produces a copy of an expression tree in which every UsingExpression is replaced
// by the (merged) definition it names. All other nodes are cloned field by field.
class MergeAlgorithm : public RegexExpressionAlgorithm<Expression::Ref, MergeParameter*>
{
public:
	Expression::Ref Apply(CharSetExpression* expression, MergeParameter* target)
	{
		Ptr<CharSetExpression> clone=new CharSetExpression;
		clone->reverse=expression->reverse;
		CopyFrom(clone->ranges, expression->ranges);
		return clone;
	}
	Expression::Ref Apply(LoopExpression* expression, MergeParameter* target)
	{
		Ptr<LoopExpression> clone=new LoopExpression;
		clone->min=expression->min;
		clone->max=expression->max;
		clone->preferLong=expression->preferLong;
		clone->expression=Invoke(expression->expression, target);
		return clone;
	}
	Expression::Ref Apply(SequenceExpression* expression, MergeParameter* target)
	{
		Ptr<SequenceExpression> clone=new SequenceExpression;
		clone->left=Invoke(expression->left, target);
		clone->right=Invoke(expression->right, target);
		return clone;
	}
	Expression::Ref Apply(AlternateExpression* expression, MergeParameter* target)
	{
		Ptr<AlternateExpression> clone=new AlternateExpression;
		clone->left=Invoke(expression->left, target);
		clone->right=Invoke(expression->right, target);
		return clone;
	}
	Expression::Ref Apply(BeginExpression* expression, MergeParameter* target)
	{
		return new BeginExpression;
	}
	Expression::Ref Apply(EndExpression* expression, MergeParameter* target)
	{
		return new EndExpression;
	}
	Expression::Ref Apply(CaptureExpression* expression, MergeParameter* target)
	{
		Ptr<CaptureExpression> clone=new CaptureExpression;
		clone->expression=Invoke(expression->expression, target);
		clone->name=expression->name;
		return clone;
	}
	Expression::Ref Apply(MatchExpression* expression, MergeParameter* target)
	{
		Ptr<MatchExpression> clone=new MatchExpression;
		clone->index=expression->index;
		clone->name=expression->name;
		return clone;
	}
	Expression::Ref Apply(PositiveExpression* expression, MergeParameter* target)
	{
		Ptr<PositiveExpression> clone=new PositiveExpression;
		clone->expression=Invoke(expression->expression, target);
		return clone;
	}
	Expression::Ref Apply(NegativeExpression* expression, MergeParameter* target)
	{
		Ptr<NegativeExpression> clone=new NegativeExpression;
		clone->expression=Invoke(expression->expression, target);
		return clone;
	}
	// Resolves a named reference. A name that was already expanded returns the stored
	// result; a name that is still being expanded (null entry) is a reference loop;
	// an unknown name is an error. Both error cases throw ArgumentException.
	Expression::Ref Apply(UsingExpression* expression, MergeParameter* target)
	{
		const bool seenBefore=target->definitions.Keys().Contains(expression->name);
		if(seenBefore)
		{
			Expression::Ref cached=target->definitions[expression->name];
			if(!cached)
			{
				throw ArgumentException(L"Regular expression syntax error: Found reference loops in\""+expression->name+L"\".", L"vl::regex_internal::RegexExpression::Merge", L"");
			}
			return cached;
		}
		if(!target->regex->definitions.Keys().Contains(expression->name))
		{
			throw ArgumentException(L"Regular expression syntax error: Cannot find sub expression reference\""+expression->name+L"\".", L"vl::regex_internal::RegexExpression::Merge", L"");
		}
		// Mark the name as "in progress" before descending, so that a nested reference
		// to the same name is reported as a loop.
		target->definitions.Add(expression->name, 0);
		Expression::Ref merged=Invoke(target->regex->definitions[expression->name], target);
		target->definitions.Set(expression->name, merged);
		return merged;
	}
};
/***********************************************************************
EpsilonNfaAlgorithm
***********************************************************************/
// Holder for a reference to an automaton.
// NOTE(review): not referenced anywhere in the visible part of this file.
class EpsilonNfaInfo
{
public:
Automaton::Ref automaton;
};
class EpsilonNfa
{
public:
State* start;
State* end;
EpsilonNfa()
{
start=0;
end=0;
}
};
// Visitor that translates an expression tree into states and transitions of the
// given Automaton. Each Apply() returns the fragment (entry/exit state pair) that
// was built for the visited node.
class EpsilonNfaAlgorithm : public RegexExpressionAlgorithm<EpsilonNfa, Automaton*>
{
public:
	// Chains fragment `b` after fragment `a` with an epsilon transition.
	// An empty `a` (null start) simply yields `b`.
	EpsilonNfa Connect(EpsilonNfa a, EpsilonNfa b, Automaton* target)
	{
		if(a.start)
		{
			target->NewEpsilon(a.end, b.start);
			a.end=b.end;
			return a;
		}
		else
		{
			return b;
		}
	}
	// One character transition per range between a fresh start and end state.
	EpsilonNfa Apply(CharSetExpression* expression, Automaton* target)
	{
		EpsilonNfa nfa;
		nfa.start=target->NewState();
		nfa.end=target->NewState();
		for(vint i=0;i<expression->ranges.Count();i++)
		{
			target->NewChars(nfa.start, nfa.end, expression->ranges[i]);
		}
		return nfa;
	}
	// Builds `min` mandatory copies of the body, followed by either an unbounded
	// loop (max==-1) or (max-min) optional copies.
	// NOTE(review): the order in which the Nop and Epsilon transitions are created
	// differs between preferLong and non-preferLong; presumably transition order
	// encodes matching priority -- confirm in the interpretor.
	EpsilonNfa Apply(LoopExpression* expression, Automaton* target)
	{
		EpsilonNfa head;
		for(vint i=0;i<expression->min;i++)
		{
			EpsilonNfa body=Invoke(expression->expression, target);
			head=Connect(head, body, target);
		}
		if(expression->max==-1)
		{
			EpsilonNfa body=Invoke(expression->expression, target);
			if(!head.start)
			{
				head.start=head.end=target->NewState();
			}
			State* loopBegin=head.end;
			State* loopEnd=target->NewState();
			if(expression->preferLong)
			{
				target->NewEpsilon(loopBegin, body.start);
				target->NewEpsilon(body.end, loopBegin);
				target->NewNop(loopBegin, loopEnd);
			}
			else
			{
				target->NewNop(loopBegin, loopEnd);
				target->NewEpsilon(loopBegin, body.start);
				target->NewEpsilon(body.end, loopBegin);
			}
			head.end=loopEnd;
		}
		else if(expression->max>expression->min)
		{
			for(vint i=expression->min;i<expression->max;i++)
			{
				EpsilonNfa body=Invoke(expression->expression, target);
				State* start=target->NewState();
				State* end=target->NewState();
				if(expression->preferLong)
				{
					target->NewEpsilon(start, body.start);
					target->NewEpsilon(body.end, end);
					target->NewNop(start, end);
				}
				else
				{
					target->NewNop(start, end);
					target->NewEpsilon(start, body.start);
					target->NewEpsilon(body.end, end);
				}
				body.start=start;
				body.end=end;
				head=Connect(head, body, target);
			}
		}
		if(!head.start)
		{
			// A loop that repeats zero times (e.g. "x{0}", min==0 and max==0) builds
			// nothing above. Returning the empty fragment would make callers
			// dereference null states (Connect() uses b.start, GenerateEpsilonNfa()
			// uses result.end), so represent "match nothing" as a single state that
			// is both the entry and the exit.
			head.start=head.end=target->NewState();
		}
		return head;
	}
	// Left fragment followed by right fragment.
	EpsilonNfa Apply(SequenceExpression* expression, Automaton* target)
	{
		EpsilonNfa a=Invoke(expression->left, target);
		EpsilonNfa b=Invoke(expression->right, target);
		return Connect(a, b, target);
	}
	// Fresh entry/exit states with epsilon transitions into and out of both branches;
	// the left branch's transitions are created first.
	EpsilonNfa Apply(AlternateExpression* expression, Automaton* target)
	{
		EpsilonNfa result;
		result.start=target->NewState();
		result.end=target->NewState();
		EpsilonNfa a=Invoke(expression->left, target);
		EpsilonNfa b=Invoke(expression->right, target);
		target->NewEpsilon(result.start, a.start);
		target->NewEpsilon(a.end, result.end);
		target->NewEpsilon(result.start, b.start);
		target->NewEpsilon(b.end, result.end);
		return result;
	}
	// A single begin-of-string transition between two fresh states.
	EpsilonNfa Apply(BeginExpression* expression, Automaton* target)
	{
		EpsilonNfa result;
		result.start=target->NewState();
		result.end=target->NewState();
		target->NewBeginString(result.start, result.end);
		return result;
	}
	// A single end-of-string transition between two fresh states.
	EpsilonNfa Apply(EndExpression* expression, Automaton* target)
	{
		EpsilonNfa result;
		result.start=target->NewState();
		result.end=target->NewState();
		target->NewEndString(result.start, result.end);
		return result;
	}
	// Wraps the body between a Capture transition and an End transition. Named
	// captures get an index into target->captureNames (registered on first use);
	// anonymous captures use -1.
	EpsilonNfa Apply(CaptureExpression* expression, Automaton* target)
	{
		EpsilonNfa result;
		result.start=target->NewState();
		result.end=target->NewState();
		vint capture=-1;
		if(expression->name!=L"")
		{
			capture=target->captureNames.IndexOf(expression->name);
			if(capture==-1)
			{
				capture=target->captureNames.Count();
				target->captureNames.Add(expression->name);
			}
		}
		EpsilonNfa body=Invoke(expression->expression, target);
		target->NewCapture(result.start, body.start, capture);
		target->NewEnd(body.end, result.end);
		return result;
	}
	// A single Match transition carrying the capture index (or -1 when unnamed)
	// and the expression's own index.
	EpsilonNfa Apply(MatchExpression* expression, Automaton* target)
	{
		vint capture=-1;
		if(expression->name!=L"")
		{
			capture=target->captureNames.IndexOf(expression->name);
			if(capture==-1)
			{
				capture=target->captureNames.Count();
				target->captureNames.Add(expression->name);
			}
		}
		EpsilonNfa result;
		result.start=target->NewState();
		result.end=target->NewState();
		target->NewMatch(result.start, result.end, capture, expression->index);
		return result;
	}
	// Wraps the body between a Positive transition and an End transition.
	EpsilonNfa Apply(PositiveExpression* expression, Automaton* target)
	{
		EpsilonNfa result;
		result.start=target->NewState();
		result.end=target->NewState();
		EpsilonNfa body=Invoke(expression->expression, target);
		target->NewPositive(result.start, body.start);
		target->NewEnd(body.end, result.end);
		return result;
	}
	// Like the positive case, plus a NegativeFail transition from entry to exit.
	EpsilonNfa Apply(NegativeExpression* expression, Automaton* target)
	{
		EpsilonNfa result;
		result.start=target->NewState();
		result.end=target->NewState();
		EpsilonNfa body=Invoke(expression->expression, target);
		target->NewNegative(result.start, body.start);
		target->NewEnd(body.end, result.end);
		target->NewNegativeFail(result.start, result.end);
		return result;
	}
	// References must have been expanded (see RegexExpression::Merge) before the
	// automaton is generated; reaching this overload is reported via CHECK_FAIL.
	EpsilonNfa Apply(UsingExpression* expression, Automaton* target)
	{
		CHECK_FAIL(L"RegexExpression::GenerateEpsilonNfa()#UsingExpression cannot create state machine.");
	}
};
/***********************************************************************
Expression
***********************************************************************/
// Structural comparison of this expression tree with `expression`.
bool Expression::IsEqual(vl::regex_internal::Expression *expression)
{
	IsEqualAlgorithm comparer;
	return comparer.Invoke(this, expression);
}
bool Expression::HasNoExtension()
{
return HasNoExtensionAlgorithm().Invoke(this, 0);
}
bool Expression::CanTreatAsPure()
{
return CanTreatAsPureAlgorithm().Invoke(this, 0);
}
// Splits all char sets of this tree into non-overlapping ranges, rewrites the tree
// to use them, and copies the resulting range list into `subsets`.
void Expression::NormalizeCharSet(CharRange::List& subsets)
{
	NormalizedCharSet normalized;

	// Pass 1: collect and split the ranges.
	BuildNormalizedCharSetAlgorithm collector;
	collector.Invoke(this, &normalized);

	// Pass 2: rewrite every char set in terms of the split ranges.
	SetNormalizedCharSetAlgorithm rewriter;
	rewriter.Invoke(this, &normalized);

	CopyFrom(subsets, normalized.ranges);
}
// Merges the char sets of this tree into the ranges already present in `subsets`
// (in/out parameter); the tree itself is not rewritten.
void Expression::CollectCharSet(CharRange::List& subsets)
{
	NormalizedCharSet normalized;
	CopyFrom(normalized.ranges, subsets);

	BuildNormalizedCharSetAlgorithm collector;
	collector.Invoke(this, &normalized);

	CopyFrom(subsets, normalized.ranges);
}
// Rewrites every char set of this tree in terms of the ranges given in `subsets`.
void Expression::ApplyCharSet(CharRange::List& subsets)
{
	NormalizedCharSet normalized;
	CopyFrom(normalized.ranges, subsets);

	SetNormalizedCharSetAlgorithm rewriter;
	rewriter.Invoke(this, &normalized);
}
// Builds a new automaton for this expression: the fragment's entry becomes the
// start state and its exit is flagged as final.
Automaton::Ref Expression::GenerateEpsilonNfa()
{
	Automaton::Ref automaton=new Automaton;
	EpsilonNfaAlgorithm builder;
	EpsilonNfa fragment=builder.Invoke(this, automaton.Obj());
	fragment.end->finalState=true;
	automaton->startState=fragment.start;
	return automaton;
}
/***********************************************************************
CharSetExpression
***********************************************************************/
// Adds `range` to this char set unless it overlaps a range that is already present.
// Bounds given in the wrong order are swapped first. Returns false (and adds
// nothing) on overlap, true otherwise.
bool CharSetExpression::AddRangeWithConflict(CharRange range)
{
	if(range.end<range.begin)
	{
		const wchar_t low=range.end;
		range.end=range.begin;
		range.begin=low;
	}
	const vint count=ranges.Count();
	for(vint i=0;i<count;i++)
	{
		const bool disjoint=range<ranges[i] || range>ranges[i];
		if(!disjoint)
		{
			return false;
		}
	}
	ranges.Add(range);
	return true;
}
/***********************************************************************
RegexExpression
***********************************************************************/
// Returns a copy of the main expression with every sub-expression reference
// expanded from this regex's definitions.
Expression::Ref RegexExpression::Merge()
{
	MergeParameter context;
	context.regex=this;
	MergeAlgorithm merger;
	return merger.Invoke(expression, &context);
}
/***********************************************************************
Expression::Apply
***********************************************************************/
// Visitor dispatch: passes this CharSetExpression to the algorithm's matching Visit overload.
void CharSetExpression::Apply(IRegexExpressionAlgorithm& algorithm)
{
algorithm.Visit(this);
}
// Visitor dispatch: passes this LoopExpression to the algorithm's matching Visit overload.
void LoopExpression::Apply(IRegexExpressionAlgorithm& algorithm)
{
algorithm.Visit(this);
}
// Visitor dispatch: passes this SequenceExpression to the algorithm's matching Visit overload.
void SequenceExpression::Apply(IRegexExpressionAlgorithm& algorithm)
{
algorithm.Visit(this);
}
// Visitor dispatch: passes this AlternateExpression to the algorithm's matching Visit overload.
void AlternateExpression::Apply(IRegexExpressionAlgorithm& algorithm)
{
algorithm.Visit(this);
}
// Visitor dispatch: passes this BeginExpression to the algorithm's matching Visit overload.
void BeginExpression::Apply(IRegexExpressionAlgorithm& algorithm)
{
algorithm.Visit(this);
}
// Visitor dispatch: passes this EndExpression to the algorithm's matching Visit overload.
void EndExpression::Apply(IRegexExpressionAlgorithm& algorithm)
{
algorithm.Visit(this);
}
// Visitor dispatch: passes this CaptureExpression to the algorithm's matching Visit overload.
void CaptureExpression::Apply(IRegexExpressionAlgorithm& algorithm)
{
algorithm.Visit(this);
}
// Visitor dispatch: passes this MatchExpression to the algorithm's matching Visit overload.
void MatchExpression::Apply(IRegexExpressionAlgorithm& algorithm)
{
algorithm.Visit(this);
}
// Visitor dispatch: passes this PositiveExpression to the algorithm's matching Visit overload.
void PositiveExpression::Apply(IRegexExpressionAlgorithm& algorithm)
{
algorithm.Visit(this);
}
// Visitor dispatch: passes this NegativeExpression to the algorithm's matching Visit overload.
void NegativeExpression::Apply(IRegexExpressionAlgorithm& algorithm)
{
algorithm.Visit(this);
}
// Visitor dispatch: passes this UsingExpression to the algorithm's matching Visit overload.
void UsingExpression::Apply(IRegexExpressionAlgorithm& algorithm)
{
algorithm.Visit(this);
}
}
}
/***********************************************************************
.\REGEXPARSER.CPP
***********************************************************************/
/***********************************************************************
Author: Zihan Chen (vczh)
Licensed under https://github.com/vczh-libraries/License
***********************************************************************/
namespace vl
{
namespace regex_internal
{
/***********************************************************************
Helper Functions
***********************************************************************/
// Consumes one character from `input` if it equals `c`.
// Returns true and advances `input` on a match; otherwise leaves `input` untouched.
bool IsChar(const wchar_t*& input, wchar_t c)
{
	if(*input!=c)
	{
		return false;
	}
	++input;
	return true;
}
// Consumes one character from `input` if it is one of the characters in `chars`.
// On a match, stores the character in `c`, advances `input` and returns true.
// Otherwise returns false and leaves both `input` and `c` untouched.
bool IsChars(const wchar_t*& input, const wchar_t* chars, wchar_t& c)
{
	// wcschr() treats the terminating NUL as part of the searched string, so without
	// this guard the end of the input would "match" any set and `input` would be
	// advanced past its terminator.
	if(*input==L'\0')
	{
		return false;
	}
	const wchar_t* position=::wcschr(chars, *input);
	if(position)
	{
		c=*input++;
		return true;
	}
	else
	{
		return false;
	}
}
// Consumes the literal `str` from `input` if `input` starts with it.
// Returns true and advances `input` past the literal on a match; otherwise leaves
// `input` untouched.
bool IsStr(const wchar_t*& input, const wchar_t* str)
{
	const size_t length=wcslen(str);
	if(wcsncmp(input, str, length)!=0)
	{
		return false;
	}
	input+=length;
	return true;
}
// Convenience overload for callers that do not need to know which character of
// `chars` was consumed; delegates to the three-argument IsChars.
bool IsChars(const wchar_t*& input, const wchar_t* chars)
{
	wchar_t matched;
	const bool consumed=IsChars(input, chars, matched);
	return consumed;
}
// Reads a run of decimal digits from `input` into `number`.
// `number` is reset to 0 first, so it is 0 when no digit is found. Returns true if
// at least one digit was consumed; `input` is advanced past the digits.
bool IsPositiveInteger(const wchar_t*& input, vint& number)
{
	const wchar_t* const firstDigit=input;
	number=0;
	for(;;)
	{
		if(*input<L'0' || L'9'<*input)
		{
			break;
		}
		const wchar_t digit=*input++;
		number=number*10+digit-L'0';
	}
	return input!=firstDigit;
}
// Reads an identifier ([A-Za-z_][A-Za-z0-9_]*) from `input` into `name`.
// Returns false and changes nothing when `input` does not start with a letter or
// underscore; otherwise stores the identifier, advances `input` and returns true.
bool IsName(const wchar_t*& input, WString& name)
{
	const wchar_t* cursor=input;
	const wchar_t head=*cursor;
	const bool headIsValid=(L'A'<=head && head<=L'Z') || (L'a'<=head && head<=L'z') || head==L'_';
	if(!headIsValid)
	{
		return false;
	}
	cursor++;
	for(;;)
	{
		const wchar_t c=*cursor;
		const bool isLetter=(L'A'<=c && c<=L'Z') || (L'a'<=c && c<=L'z');
		const bool isDigit=L'0'<=c && c<=L'9';
		if(!(isLetter || isDigit || c==L'_'))
		{
			break;
		}
		cursor++;
	}
	name=WString::CopyFrom(input, vint(cursor-input));
	input=cursor;
	return true;
}
// Parses an optional loop suffix: "+", "*", "?", "{n}", "{n,}" or "{n,m}", each
// optionally followed by "?" which clears preferLong. A max of -1 means unbounded.
// Returns null when `input` does not start with a loop suffix; the returned
// LoopExpression has no body yet (the caller attaches it).
// Throws ArgumentException for a malformed "{...}" suffix.
Ptr<LoopExpression> ParseLoop(const wchar_t*& input)
{
	vint min=0;
	vint max=0;
	if(!*input)
	{
		return 0;
	}
	if(IsChar(input, L'+'))
	{
		min=1;
		max=-1;
	}
	else if(IsChar(input, L'*'))
	{
		min=0;
		max=-1;
	}
	else if(IsChar(input, L'?'))
	{
		min=0;
		max=1;
	}
	else if(IsChar(input, L'{'))
	{
		if(!IsPositiveInteger(input, min))
		{
			throw ArgumentException(L"Regular expression syntax error: Illegal loop expression.", L"vl::regex_internal::ParseLoop", L"input");
		}
		if(!IsChar(input, L','))
		{
			// "{n}": exactly n repetitions.
			max=min;
		}
		else if(!IsPositiveInteger(input, max))
		{
			// "{n,}": no upper bound.
			max=-1;
		}
		if(!IsChar(input, L'}'))
		{
			throw ArgumentException(L"Regular expression syntax error: Illegal loop expression.", L"vl::regex_internal::ParseLoop", L"input");
		}
	}
	else
	{
		return 0;
	}

	LoopExpression* loop=new LoopExpression;
	loop->min=min;
	loop->max=max;
	// A trailing "?" is consumed here and selects the short match.
	loop->preferLong=!IsChar(input, L'?');
	return loop;
}
// Parses one character-level unit at `input`:
//   - "^" / "$"            -> BeginExpression / EndExpression
//   - "\x" or "/x"         -> an escaped character or character class
//   - "[...]" / "[^...]"   -> an explicit character set
//   - any other character  -> a char set containing just that character
// Returns null at the end of the input and when the next character is one of
// "()+*?{}|" (that character is left unconsumed for the caller).
// Throws ArgumentException for an illegal escape or a malformed "[...]" set.
Ptr<Expression> ParseCharSet(const wchar_t*& input)
{
if(!*input)
{
return 0;
}
else if(IsChar(input, L'^'))
{
return new BeginExpression;
}
else if(IsChar(input, L'$'))
{
return new EndExpression;
}
// Both backslash and forward slash introduce an escape.
else if(IsChar(input, L'\\') || IsChar(input, L'/'))
{
Ptr<CharSetExpression> expression=new CharSetExpression;
expression->reverse=false;
switch(*input)
{
case L'.':
// Any character except NUL.
expression->ranges.Add(CharRange(1, 65535));
break;
case L'r':
expression->ranges.Add(CharRange(L'\r', L'\r'));
break;
case L'n':
expression->ranges.Add(CharRange(L'\n', L'\n'));
break;
case L't':
expression->ranges.Add(CharRange(L'\t', L'\t'));
break;
// Escaped punctuation stands for itself.
case L'\\':case L'/':case L'(':case L')':case L'+':case L'*':case L'?':case L'|':
case L'{':case L'}':case L'[':case L']':case L'<':case L'>':
case L'^':case L'$':case L'!':case L'=':
expression->ranges.Add(CharRange(*input, *input));
break;
// For each class below, the upper-case letter sets the reverse flag and then
// deliberately falls through to the lower-case case that adds the ranges.
case L'S':
expression->reverse=true;
case L's':
expression->ranges.Add(CharRange(L' ', L' '));
expression->ranges.Add(CharRange(L'\r', L'\r'));
expression->ranges.Add(CharRange(L'\n', L'\n'));
expression->ranges.Add(CharRange(L'\t', L'\t'));
break;
case L'D':
expression->reverse=true;
case L'd':
expression->ranges.Add(CharRange(L'0', L'9'));
break;
case L'L':
expression->reverse=true;
case L'l':
expression->ranges.Add(CharRange(L'_', L'_'));
expression->ranges.Add(CharRange(L'A', L'Z'));
expression->ranges.Add(CharRange(L'a', L'z'));
break;
case L'W':
expression->reverse=true;
case L'w':
expression->ranges.Add(CharRange(L'_', L'_'));
expression->ranges.Add(CharRange(L'0', L'9'));
expression->ranges.Add(CharRange(L'A', L'Z'));
expression->ranges.Add(CharRange(L'a', L'z'));
break;
default:
throw ArgumentException(L"Regular expression syntax error: Illegal character escaping.", L"vl::regex_internal::ParseCharSet", L"input");
}
// Consume the character that followed the escape introducer.
input++;
return expression;
}
else if(IsChar(input, L'['))
{
Ptr<CharSetExpression> expression=new CharSetExpression;
// A leading "^" negates the set.
if(IsChar(input, L'^'))
{
expression->reverse=true;
}
else
{
expression->reverse=false;
}
// Small state machine over the set's content. `a` and `b` are the bounds of the
// range being read; midState is true after `a` has been read and before `b` has.
bool midState=false;
wchar_t a=L'\0';
wchar_t b=L'\0';
while(true)
{
// Step 1: read one (possibly escaped) character into `a` or `b`.
if(IsChar(input, L'\\') || IsChar(input, L'/'))
{
wchar_t c=L'\0';
switch(*input)
{
case L'r':
c=L'\r';
break;
case L'n':
c=L'\n';
break;
case L't':
c=L'\t';
break;
case L'-':case L'[':case L']':case L'\\':case L'/':case L'^':case L'$':
c=*input;
break;
default:
// NOTE(review): the message lists "rnt-[]\/" but the cases above also accept "^" and "$".
throw ArgumentException(L"Regular expression syntax error: Illegal character escaping, only \"rnt-[]\\/\" are legal escaped characters in [].", L"vl::regex_internal::ParseCharSet", L"input");
}
input++;
midState?b=c:a=c;
midState=!midState;
}
// An unescaped "-" or "]" is not allowed where a character is expected.
else if(IsChars(input, L"-]"))
{
goto THROW_EXCEPTION;
}
else if(*input)
{
midState?b=*input++:a=*input++;
midState=!midState;
}
else
{
// End of input inside "[...]".
goto THROW_EXCEPTION;
}
// Step 2: decide what follows the character just read.
if(IsChar(input, L']'))
{
// End of the set: flush the pending range. With only `a` read, it is the
// single character [a, a].
if(midState)
{
b=a;
}
if(!expression->AddRangeWithConflict(CharRange(a, b)))
{
goto THROW_EXCEPTION;
}
break;
}
else if(IsChar(input, L'-'))
{
// "-" is only legal directly after the first bound of a range.
if(!midState)
{
goto THROW_EXCEPTION;
}
}
else
{
// Another character follows: flush the pending range (single character when
// only `a` was read) and start over. Overlapping ranges are rejected.
if(midState)
{
b=a;
}
if(expression->AddRangeWithConflict(CharRange(a, b)))
{
midState=false;
}
else
{
goto THROW_EXCEPTION;
}
}
}
return expression;
THROW_EXCEPTION:
throw ArgumentException(L"Regular expression syntax error: Illegal character set definition.");
}
else if(IsChars(input, L"()+*?{}|"))
{
// Structural characters belong to the caller: undo the consumption done by IsChars.
input--;
return 0;
}
else
{
// Any other character matches itself.
CharSetExpression* expression=new CharSetExpression;
expression->reverse=false;
expression->ranges.Add(CharRange(*input, *input));
input++;
return expression;
}
}
// Parses one parenthesised construct at `input`:
//   (=expr)          positive look-ahead
//   (!expr)          negative look-ahead
//   (<&name>)        reference to a named sub-expression
//   (<$name>) / (<$name;index>) / (<$index>)   match of a previous capture
//   (<name>expr)     named capture
//   (?expr)          anonymous capture
//   (expr)           plain grouping
// Returns null when `input` does not start with "(".
// Throws ArgumentException when a name, number, ">" or ")" is missing.
Ptr<Expression> ParseFunction(const wchar_t*& input)
{
	if(IsStr(input, L"(="))
	{
		Ptr<Expression> body=ParseExpression(input);
		if(!IsChar(input, L')'))
		{
			throw ArgumentException(L"Regular expression syntax error: \")\" expected.", L"vl::regex_internal::ParseFunction", L"input");
		}
		PositiveExpression* positive=new PositiveExpression;
		positive->expression=body;
		return positive;
	}
	if(IsStr(input, L"(!"))
	{
		Ptr<Expression> body=ParseExpression(input);
		if(!IsChar(input, L')'))
		{
			throw ArgumentException(L"Regular expression syntax error: \")\" expected.", L"vl::regex_internal::ParseFunction", L"input");
		}
		NegativeExpression* negative=new NegativeExpression;
		negative->expression=body;
		return negative;
	}
	if(IsStr(input, L"(<&"))
	{
		WString name;
		if(!IsName(input, name))
		{
			throw ArgumentException(L"Regular expression syntax error: Identifier expected.", L"vl::regex_internal::ParseFunction", L"input");
		}
		if(!IsChar(input, L'>'))
		{
			throw ArgumentException(L"Regular expression syntax error: \">\" expected.", L"vl::regex_internal::ParseFunction", L"input");
		}
		if(!IsChar(input, L')'))
		{
			throw ArgumentException(L"Regular expression syntax error: \")\" expected.", L"vl::regex_internal::ParseFunction", L"input");
		}
		UsingExpression* reference=new UsingExpression;
		reference->name=name;
		return reference;
	}
	if(IsStr(input, L"(<$"))
	{
		WString name;
		vint index=-1;
		// A number is required when there is no name, or when the name is followed by ";".
		const bool hasName=IsName(input, name);
		const bool expectsIndex=!hasName || IsChar(input, L';');
		if(expectsIndex && !IsPositiveInteger(input, index))
		{
			throw ArgumentException(L"Regular expression syntax error: Number expected.", L"vl::regex_internal::ParseFunction", L"input");
		}
		if(!IsChar(input, L'>'))
		{
			throw ArgumentException(L"Regular expression syntax error: \">\" expected.", L"vl::regex_internal::ParseFunction", L"input");
		}
		if(!IsChar(input, L')'))
		{
			throw ArgumentException(L"Regular expression syntax error: \")\" expected.", L"vl::regex_internal::ParseFunction", L"input");
		}
		MatchExpression* match=new MatchExpression;
		match->name=name;
		match->index=index;
		return match;
	}
	if(IsStr(input, L"(<"))
	{
		WString name;
		if(!IsName(input, name))
		{
			throw ArgumentException(L"Regular expression syntax error: Identifier expected.", L"vl::regex_internal::ParseFunction", L"input");
		}
		if(!IsChar(input, L'>'))
		{
			throw ArgumentException(L"Regular expression syntax error: \">\" expected.", L"vl::regex_internal::ParseFunction", L"input");
		}
		Ptr<Expression> body=ParseExpression(input);
		if(!IsChar(input, L')'))
		{
			throw ArgumentException(L"Regular expression syntax error: \")\" expected.", L"vl::regex_internal::ParseFunction", L"input");
		}
		CaptureExpression* capture=new CaptureExpression;
		capture->name=name;
		capture->expression=body;
		return capture;
	}
	if(IsStr(input, L"(?"))
	{
		Ptr<Expression> body=ParseExpression(input);
		if(!IsChar(input, L')'))
		{
			throw ArgumentException(L"Regular expression syntax error: \")\" expected.", L"vl::regex_internal::ParseFunction", L"input");
		}
		CaptureExpression* capture=new CaptureExpression;
		capture->expression=body;
		return capture;
	}
	if(IsChar(input, L'('))
	{
		Ptr<Expression> body=ParseExpression(input);
		if(!IsChar(input, L')'))
		{
			throw ArgumentException(L"Regular expression syntax error: \")\" expected.", L"vl::regex_internal::ParseFunction", L"input");
		}
		return body;
	}
	return 0;
}
// Parses one unit (a character-level item or, failing that, a parenthesised
// construct) followed by any number of loop suffixes. Each suffix wraps the unit
// parsed so far. Returns null when no unit starts at `input`.
Ptr<Expression> ParseUnit(const wchar_t*& input)
{
	Ptr<Expression> unit=ParseCharSet(input);
	if(!unit)
	{
		unit=ParseFunction(input);
		if(!unit)
		{
			return 0;
		}
	}
	for(Ptr<LoopExpression> loop=ParseLoop(input);loop;loop=ParseLoop(input))
	{
		loop->expression=unit;
		unit=loop;
	}
	return unit;
}
// Parses a run of consecutive units and folds them into a left-nested chain of
// SequenceExpression nodes. The result is whatever the first ParseUnit returned
// when no further unit follows.
Ptr<Expression> ParseJoin(const wchar_t*& input)
{
	Ptr<Expression> expression=ParseUnit(input);
	for(Ptr<Expression> next=ParseUnit(input);next;next=ParseUnit(input))
	{
		SequenceExpression* sequence=new SequenceExpression;
		sequence->left=expression;
		sequence->right=next;
		expression=sequence;
	}
	return expression;
}
// Parses sequences separated by "|" and folds them into a left-nested chain of
// AlternateExpression nodes.
// Throws ArgumentException when a "|" is not followed by an expression.
Ptr<Expression> ParseAlt(const wchar_t*& input)
{
	Ptr<Expression> expression=ParseJoin(input);
	while(IsChar(input, L'|'))
	{
		Ptr<Expression> branch=ParseJoin(input);
		if(!branch)
		{
			throw ArgumentException(L"Regular expression syntax error: Expression expected.", L"vl::regex_internal::ParseAlt", L"input");
		}
		AlternateExpression* alternate=new AlternateExpression;
		alternate->left=expression;
		alternate->right=branch;
		expression=alternate;
	}
	return expression;
}
// Entry point of the expression grammar; a full expression is an alternation,
// so this forwards to ParseAlt.
Ptr<Expression> ParseExpression(const wchar_t*& input)
{
return ParseAlt(input);
}
// Parses a complete regular expression: zero or more leading "(<#name>expr)"
// sub-expression definitions followed by the main expression, which must consume
// the whole input.
// Any ArgumentException raised while parsing is rethrown as a ParsingException
// carrying the message, the original code and the offset reached in the input.
RegexExpression::Ref ParseRegexExpression(const WString& code)
{
	RegexExpression::Ref regex=new RegexExpression;
	const wchar_t* const start=code.Buffer();
	const wchar_t* input=start;
	try
	{
		// Leading named definitions.
		while(IsStr(input, L"(<#"))
		{
			WString name;
			if(!IsName(input, name))
			{
				throw ArgumentException(L"Regular expression syntax error: Identifier expected.", L"vl::regex_internal::ParseRegexExpression", L"code");
			}
			if(!IsChar(input, L'>'))
			{
				throw ArgumentException(L"Regular expression syntax error: \">\" expected.", L"vl::regex_internal::ParseFunction", L"input");
			}
			Ptr<Expression> definition=ParseExpression(input);
			if(!IsChar(input, L')'))
			{
				throw ArgumentException(L"Regular expression syntax error: \")\" expected.", L"vl::regex_internal::ParseFunction", L"input");
			}
			if(regex->definitions.Keys().Contains(name))
			{
				throw ArgumentException(L"Regular expression syntax error: Found duplicated sub expression name: \""+name+L"\". ", L"vl::regex_internal::ParseFunction", L"input");
			}
			regex->definitions.Add(name, definition);
		}

		// Main expression.
		regex->expression=ParseExpression(input);
		if(!regex->expression)
		{
			throw ArgumentException(L"Regular expression syntax error: Expression expected.", L"vl::regex_internal::ParseUnit", L"input");
		}
		if(*input)
		{
			throw ArgumentException(L"Regular expression syntax error: Found unnecessary tokens.", L"vl::regex_internal::ParseUnit", L"input");
		}
		return regex;
	}
	catch(const ArgumentException& e)
	{
		throw ParsingException(e.Message(), code, input-start);
	}
}
// Converts a literal string into regex source that matches it:
// CR, LF and TAB become \r, \n and \t; the listed punctuation characters are
// prefixed with a backslash; every other character is copied unchanged.
WString EscapeTextForRegex(const WString& literalString)
{
	WString escaped;
	const vint total=literalString.Length();
	for(vint index=0;index<total;index++)
	{
		const wchar_t ch=literalString[index];
		if(ch==L'\r')
		{
			escaped+=L"\\r";
			continue;
		}
		if(ch==L'\n')
		{
			escaped+=L"\\n";
			continue;
		}
		if(ch==L'\t')
		{
			escaped+=L"\\t";
			continue;
		}

		bool needsBackslash=false;
		switch(ch)
		{
		case L'\\':case L'/':case L'(':case L')':case L'+':case L'*':case L'?':case L'|':
		case L'{':case L'}':case L'[':case L']':case L'<':case L'>':
		case L'^':case L'$':case L'!':case L'=':
			needsBackslash=true;
			break;
		default:
			break;
		}

		if(needsBackslash)
		{
			escaped+=WString(L"\\")+WString::FromChar(ch);
		}
		else
		{
			escaped+=WString::FromChar(ch);
		}
	}
	return escaped;
}
// Reverses escaping: a '\' or '/' followed by another character yields that
// character, with r, n and t translated to CR, LF and TAB. An escape
// character in the last position is copied as is.
WString UnescapeTextForRegex(const WString& escapedText)
{
	WString text;
	const vint lastIndex=escapedText.Length()-1;
	vint index=0;
	while(index<=lastIndex)
	{
		wchar_t ch=escapedText[index++];
		const bool startsEscape=(ch==L'\\' || ch==L'/') && index<=lastIndex;
		if(startsEscape)
		{
			// Consume the escaped character as well.
			ch=escapedText[index++];
			if(ch==L'r')
			{
				text+=L"\r";
				continue;
			}
			if(ch==L'n')
			{
				text+=L"\n";
				continue;
			}
			if(ch==L't')
			{
				text+=L"\t";
				continue;
			}
		}
		text+=WString::FromChar(ch);
	}
	return text;
}
// Rewrites every escape sequence so that it starts with a backslash: "/x" and
// "\x" both become "\x". An escape character in the last position, and every
// non-escaped character, is copied unchanged.
WString NormalizeEscapedTextForRegex(const WString& escapedText)
{
	WString normalized;
	const vint lastIndex=escapedText.Length()-1;
	vint index=0;
	while(index<=lastIndex)
	{
		const wchar_t ch=escapedText[index++];
		const bool startsEscape=(ch==L'\\' || ch==L'/') && index<=lastIndex;
		if(startsEscape)
		{
			const wchar_t escapedChar=escapedText[index++];
			normalized+=WString(L"\\")+WString::FromChar(escapedChar);
		}
		else
		{
			normalized+=WString::FromChar(ch);
		}
	}
	return normalized;
}
bool IsRegexEscapedLiteralString(const WString& regex)
{
for(vint i=0;i<regex.Length();i++)
{
wchar_t c=regex[i];
if(c==L'\\' || c==L'/')
{
i++;
}
else
{
switch(c)
{
case L'\\':case L'/':case L'(':case L')':case L'+':case L'*':case L'?':case L'|':
case L'{':case L'}':case L'[':case L']':case L'<':case L'>':
case L'^':case L'$':case L'!':case L'=':
return false;
}
}
}
return true;
}
}
}
/***********************************************************************
.\REGEXPURE.CPP
***********************************************************************/
/***********************************************************************
Author: Zihan Chen (vczh)
Licensed under https://github.com/vczh-libraries/License
***********************************************************************/
namespace vl
{
namespace regex_internal
{
/***********************************************************************
PureInterpretor
***********************************************************************/
// Builds lookup tables from a DFA whose transitions are all Transition::Chars.
//   charMap    : character -> index of the subset containing it; characters
//                covered by no subset map to the extra index charSetCount-1.
//   transition : [state][char index] -> target state index, or -1.
//   finalState : [state] -> whether the DFA state is final.
// CHECK_ERROR fires if a transition is not a Chars transition or if its range
// is not one of the given subsets.
PureInterpretor::PureInterpretor(Automaton::Ref dfa, CharRange::List& subsets)
	:transition(0)
	,finalState(0)
	,relatedFinalState(0)
{
	stateCount=dfa->states.Count();
	charSetCount=subsets.Count()+1;
	startState=dfa->states.IndexOf(dfa->startState);

	// Character classification table.
	const vint fallbackIndex=charSetCount-1;
	for(vint ch=0;ch<SupportedCharCount;ch++)
	{
		charMap[ch]=fallbackIndex;
	}
	for(vint subset=0;subset<subsets.Count();subset++)
	{
		const CharRange& range=subsets[subset];
		for(vint ch=range.begin;ch<=range.end;ch++)
		{
			charMap[ch]=subset;
		}
	}

	// Transition table, one row per state.
	transition=new vint*[stateCount];
	for(vint row=0;row<stateCount;row++)
	{
		transition[row]=new vint[charSetCount];
		vint* cells=transition[row];
		for(vint column=0;column<charSetCount;column++)
		{
			cells[column]=-1;
		}

		State* source=dfa->states[row].Obj();
		for(vint edgeIndex=0;edgeIndex<source->transitions.Count();edgeIndex++)
		{
			Transition* edge=source->transitions[edgeIndex];
			if(edge->type==Transition::Chars)
			{
				const vint column=subsets.IndexOf(edge->range);
				CHECK_ERROR(column!=-1, L"PureInterpretor::PureInterpretor(Automaton::Ref, CharRange::List&)#Specified chars don't appear in the normalized char ranges.");
				cells[column]=dfa->states.IndexOf(edge->target);
			}
			else
			{
				CHECK_ERROR(false, L"PureInterpretor::PureInterpretor(Automaton::Ref, CharRange::List&)#PureInterpretor only accepts Transition::Chars transitions.");
			}
		}
	}

	// Final state flags.
	finalState=new bool[stateCount];
	for(vint state=0;state<stateCount;state++)
	{
		finalState[state]=dfa->states[state]->finalState;
	}
}
// Releases the tables allocated by the constructor and by
// PrepareForRelatedFinalStateTable.
PureInterpretor::~PureInterpretor()
{
	// delete[] on a null pointer is a no-op, so the table that may never have
	// been built needs no explicit check.
	delete[] relatedFinalState;
	delete[] finalState;
	vint row=0;
	while(row<stateCount)
	{
		delete[] transition[row];
		row++;
	}
	delete[] transition;
}
bool PureInterpretor::MatchHead(const wchar_t* input, const wchar_t* start, PureResult& result)
{
result.start=input-start;
result.length=-1;
result.finalState=-1;
result.terminateState=-1;
vint currentState=startState;
vint terminateState=-1;
vint terminateLength=-1;
const wchar_t* read=input;
while(currentState!=-1)
{
terminateState=currentState;
terminateLength=read-input;
if(finalState[currentState])
{
result.length=terminateLength;
result.finalState=currentState;
}
if(!*read)break;
#ifdef VCZH_GCC
if(*read>=SupportedCharCount)break;
#endif
vint charIndex=charMap[*read++];
currentState=transition[currentState][charIndex];
}
if(result.finalState==-1)
{
if(terminateLength>0)
{
result.terminateState=terminateState;
}
result.length=terminateLength;
return false;
}
else
{
return true;
}
}
// Tries MatchHead at every non-terminator position starting from `input` and
// stops at the first position that matches. The terminator position itself is
// never tried.
bool PureInterpretor::Match(const wchar_t* input, const wchar_t* start, PureResult& result)
{
	for(const wchar_t* cursor=input;*cursor;cursor++)
	{
		if(MatchHead(cursor, start, result))
		{
			return true;
		}
	}
	return false;
}
// Index of the DFA start state, as computed by the constructor.
vint PureInterpretor::GetStartState()
{
	return this->startState;
}
// Performs a single DFA step.
//   input : the character to consume.
//   state : the current state index.
// Returns the next state index, or -1 when `state` is out of range, when
// there is no transition for `input`, or (under VCZH_GCC) when `input` is not
// below SupportedCharCount.
vint PureInterpretor::Transit(wchar_t input, vint state)
{
	if(state<0 || state>=stateCount)
	{
		return -1;
	}
#ifdef VCZH_GCC
	// Same guard as MatchHead: the constructor only fills charMap for indices
	// below SupportedCharCount, so larger characters must not be used as an
	// index. MatchHead stops matching on such a character; here that is
	// reported as "no transition".
	if(input>=SupportedCharCount)
	{
		return -1;
	}
#endif
	const vint charIndex=charMap[input];
	return transition[state][charIndex];
}
// True only for a valid state index whose state is marked final.
bool PureInterpretor::IsFinalState(vint state)
{
	if(state<0)
	{
		return false;
	}
	if(state>=stateCount)
	{
		return false;
	}
	return finalState[state];
}
// Returns true when no character can move the automaton out of `state`.
// Any index outside [0, stateCount) is treated as dead. The original code
// only handled -1 and indexed the transition table with other invalid values,
// whereas Transit and IsFinalState already range-check the state.
bool PureInterpretor::IsDeadState(vint state)
{
	if(state<0 || state>=stateCount)
	{
		return true;
	}
	for(vint i=0;i<charSetCount;i++)
	{
		if(transition[state][i]!=-1)
		{
			return false;
		}
	}
	return true;
}
void PureInterpretor::PrepareForRelatedFinalStateTable()
{
if(!relatedFinalState)
{
relatedFinalState=new vint[stateCount];
for(vint i=0;i<stateCount;i++)
{
relatedFinalState[i]=finalState[i]?i:-1;
}
while(true)
{
vint modifyCount=0;
for(vint i=0;i<stateCount;i++)
{
if(relatedFinalState[i]==-1)
{
vint state=-1;
for(vint j=0;j<charSetCount;j++)
{
vint nextState=transition[i][j];
if(nextState!=-1)
{
state=relatedFinalState[nextState];
if(state!=-1)
{
break;
}
}
}
if(state!=-1)
{
relatedFinalState[i]=state;
modifyCount++;
}
}
}
if(modifyCount==0)
{
break;
}
}
}
}
// Looks up the table built by PrepareForRelatedFinalStateTable.
// Returns -1 when the table has not been built.
vint PureInterpretor::GetRelatedFinalState(vint state)
{
	if(!relatedFinalState)
	{
		return -1;
	}
	return relatedFinalState[state];
}
}
}
/***********************************************************************
.\REGEXRICH.CPP
***********************************************************************/
/***********************************************************************
Author: Zihan Chen (vczh)
Licensed under https://github.com/vczh-libraries/License
***********************************************************************/
namespace vl
{
namespace regex_internal
{
/***********************************************************************
Data Structures for Backtracking
***********************************************************************/
// Snapshot of the rich interpreter's progress, stored so that matching can
// backtrack to it later.
class StateSaver
{
public:
	// Why the snapshot was stored.
	enum StateStoreType
	{
		Positive,
		Negative,
		Other
	};

	const wchar_t*		reading;					// position in the input being read
	State*				currentState;				// automaton state at that position
	vint				minTransition;				// first transition to try when backtracking
	vint				captureCount;				// number of valid captures (the list may be larger)
	vint				stateSaverCount;			// number of valid state savers (the list may be larger)
	vint				extensionSaverAvailable;	// index of the current extension saver (the list may be larger)
	vint				extensionSaverCount;		// number of extension savers in use during execution
	StateStoreType		storeType;					// reason this record is kept

	// Two snapshots are equal when position, state, next transition and
	// capture count agree; the remaining fields are not compared.
	bool operator==(const StateSaver& saver)const
	{
		if(reading != saver.reading) return false;
		if(currentState != saver.currentState) return false;
		if(minTransition != saver.minTransition) return false;
		return captureCount == saver.captureCount;
	}
};
// Record pushed when an extension (Capture, Positive or Negative transition)
// begins, and looked up again when its End transition is reached.
class ExtensionSaver
{
public:
	vint				previous;			// index of the previous extension saver
	vint				captureListIndex;	// where the captured text is to be written
	Transition*			transition;			// transition that opened the extension
	const wchar_t*		reading;			// input position when the extension began

	// `previous` does not take part in the comparison.
	bool operator==(const ExtensionSaver& saver)const
	{
		if(captureListIndex != saver.captureListIndex) return false;
		if(transition != saver.transition) return false;
		return reading == saver.reading;
	}
};
}
namespace regex_internal
{
using namespace collections;
// Stores `element` at slot `count` (growing the list only when that slot does
// not exist yet), links it to the previous top through `previous`, makes it
// the new top (`available`) and increments `count`.
void Push(List<ExtensionSaver>& elements, vint& available, vint& count, const ExtensionSaver& element)
{
	const bool needsNewSlot=(elements.Count()==count);
	if(needsNewSlot)
	{
		elements.Add(element);
	}
	else
	{
		elements[count]=element;
	}
	elements[count].previous=available;
	available=count;
	count++;
}
// Returns a copy of the top record and moves `available` back to the record
// it was linked to. The slot itself stays in the list and `count` is not
// modified.
ExtensionSaver Pop(List<ExtensionSaver>& elements, vint& available, vint& count)
{
	const vint top=available;
	available=elements[top].previous;
	return elements[top];
}
// Writes `element` at logical index `count`, appending to the list only when
// the list is exactly `count` long, then increments `count`.
template<typename T, typename K>
void PushNonSaver(List<T, K>& elements, vint& count, const T& element)
{
	const bool needsNewSlot=(elements.Count()==count);
	if(needsNewSlot)
	{
		elements.Add(element);
	}
	else
	{
		elements[count]=element;
	}
	count++;
}
// Decrements `count` and returns a copy of the element at the new index.
// The list itself is not shrunk.
template<typename T, typename K>
T PopNonSaver(List<T, K>& elements, vint& count)
{
	count--;
	return elements[count];
}
}
namespace regex_internal
{
/***********************************************************************
CaptureRecord
***********************************************************************/
// Records are equal when capture id, start and length all agree.
bool CaptureRecord::operator==(const CaptureRecord& record)const
{
	if(capture!=record.capture)
	{
		return false;
	}
	if(start!=record.start)
	{
		return false;
	}
	return length==record.length;
}
/***********************************************************************
RichInterpretor
***********************************************************************/
// Attaches a UserData record to every state of the automaton.
// NeedKeepState is set for a state when any of these holds:
//   - it has a Positive or Negative transition,
//   - it has more than one non-Chars transition,
//   - it has both Chars and non-Chars transitions.
RichInterpretor::RichInterpretor(Automaton::Ref _dfa)
	:dfa(_dfa)
{
	const vint stateTotal=dfa->states.Count();
	datas=new UserData[stateTotal];
	for(vint stateIndex=0;stateIndex<stateTotal;stateIndex++)
	{
		State* node=dfa->states[stateIndex].Obj();
		vint charEdgeTotal=0;
		vint otherEdgeTotal=0;
		bool hasLookahead=false;
		for(vint edgeIndex=0;edgeIndex<node->transitions.Count();edgeIndex++)
		{
			switch(node->transitions[edgeIndex]->type)
			{
			case Transition::Chars:
				charEdgeTotal++;
				break;
			case Transition::Negative:
			case Transition::Positive:
				hasLookahead=true;
				otherEdgeTotal++;
				break;
			default:
				otherEdgeTotal++;
			}
		}

		const bool severalOtherEdges=otherEdgeTotal>1;
		const bool mixedEdges=otherEdgeTotal!=0 && charEdgeTotal!=0;
		datas[stateIndex].NeedKeepState=hasLookahead || severalOtherEdges || mixedEdges;
		node->userData=&datas[stateIndex];
	}
}
// Frees the UserData array allocated by the constructor.
RichInterpretor::~RichInterpretor()
{
	delete[] this->datas;
}
// Tries to match the automaton against the text beginning exactly at `input`.
//   input  : position where the match must begin.
//   start  : beginning of the whole string; used for BeginString checks and for
//            turning pointers into offsets.
//   result : receives start, length and captures on success; its capture list is
//            cleared on failure.
// Returns true when a final state is reached.
// The loop walks transitions from the current state; when none applies it pops the
// most recently saved state and resumes from the transition after the one that was
// taken there. It stops when a final state is reached or nothing is left to pop.
bool RichInterpretor::MatchHead(const wchar_t* input, const wchar_t* start, RichResult& result)
{
List<StateSaver> stateSavers;
List<ExtensionSaver> extensionSavers;
StateSaver currentState;
currentState.captureCount=0;
currentState.currentState=dfa->startState;
currentState.extensionSaverAvailable=-1;
currentState.extensionSaverCount=0;
currentState.minTransition=0;
currentState.reading=input;
currentState.stateSaverCount=0;
currentState.storeType=StateSaver::Other;
while (!currentState.currentState->finalState)
{
bool found = false; // true means at least one transition matches the input
// Snapshot taken before any transition is tried; this is what gets saved for backtracking
StateSaver oldState = currentState;
// Iterate through all transitions from the current state
for (vint i = currentState.minTransition; i < currentState.currentState->transitions.Count(); i++)
{
Transition* transition = currentState.currentState->transitions[i];
switch (transition->type)
{
case Transition::Chars:
{
// Match the input if the current character falls into the range, and consume it
CharRange range = transition->range;
found =
range.begin <= *currentState.reading &&
range.end >= *currentState.reading;
if (found)
{
currentState.reading++;
}
}
break;
case Transition::BeginString:
{
// Match if the reading position is the beginning of the string; nothing is consumed
found = currentState.reading == start;
}
break;
case Transition::EndString:
{
// Match if the reading position is at the terminating zero; nothing is consumed
found = *currentState.reading == L'\0';
}
break;
case Transition::Nop:
{
// Match without any condition
found = true;
}
break;
case Transition::Capture:
{
// Push the capture information
ExtensionSaver saver;
saver.captureListIndex = currentState.captureCount;
saver.reading = currentState.reading;
saver.transition = transition;
Push(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount, saver);
// Push the capture record; its length stays -1 until the matching End transition fills it in
CaptureRecord capture;
capture.capture = transition->capture;
capture.start = currentState.reading - start;
capture.length = -1;
PushNonSaver(result.captures, currentState.captureCount, capture);
found = true;
}
break;
case Transition::Match:
{
// `index` counts the captures seen so far that have the requested capture name
vint index = 0;
for (vint j = 0; j < currentState.captureCount; j++)
{
CaptureRecord& capture = result.captures[j];
// If the capture name matched
if (capture.capture == transition->capture)
{
// If the capture is complete, and the requested index is -1 or equals the current index
if (capture.length != -1 && (transition->index == -1 || transition->index == index))
{
// If the captured text matched
if (wcsncmp(start + capture.start, currentState.reading, capture.length) == 0)
{
// Consume so much input
currentState.reading += capture.length;
found = true;
break;
}
}
// Stop searching once the capture with the specified index has been examined without a match
if (transition->index != -1 && index == transition->index)
{
break;
}
else
{
index++;
}
}
}
}
break;
case Transition::Positive:
{
// Push the positive lookahead information
ExtensionSaver saver;
saver.captureListIndex = -1;
saver.reading = currentState.reading;
saver.transition = transition;
Push(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount, saver);
// Set found = true so that PushNonSaver(oldState) happens later
oldState.storeType = StateSaver::Positive;
found = true;
}
break;
case Transition::Negative:
{
// Push the negative lookahead information
ExtensionSaver saver;
saver.captureListIndex = -1;
saver.reading = currentState.reading;
saver.transition = transition;
Push(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount, saver);
// Set found = true so that PushNonSaver(oldState) happens later
oldState.storeType = StateSaver::Negative;
found = true;
}
break;
case Transition::NegativeFail:
{
// Never taken here; it is followed in the backtracking code below when a negative lookahead failed to match
}
break;
case Transition::End:
{
// Find the corresponding extension saver so that we can know how to deal with a matched sub regex that ends here
ExtensionSaver extensionSaver = Pop(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount);
switch (extensionSaver.transition->type)
{
case Transition::Capture:
{
// Write the length of the captured text
CaptureRecord& capture = result.captures[extensionSaver.captureListIndex];
capture.length = (currentState.reading - start) - capture.start;
found = true;
}
break;
case Transition::Positive:
// Find the last positive lookahead state saver
for (vint j = currentState.stateSaverCount - 1; j >= 0; j--)
{
StateSaver& stateSaver = stateSavers[j];
if (stateSaver.storeType == StateSaver::Positive)
{
// Restore the reading position from just before the positive lookahead, since positive lookahead doesn't consume input,
// and discard the state savers from index j upwards
oldState.reading = stateSaver.reading;
oldState.stateSaverCount = j;
currentState.reading = stateSaver.reading;
currentState.stateSaverCount = j;
break;
}
}
found = true;
break;
case Transition::Negative:
// Find the last negative lookahead state saver
for (vint j = currentState.stateSaverCount - 1; j >= 0; j--)
{
StateSaver& stateSaver = stateSavers[j];
if (stateSaver.storeType == StateSaver::Negative)
{
// The sub regex of the negative lookahead matched, so this path fails:
// restore the whole parsing state saved just before the negative lookahead
oldState = stateSaver;
oldState.storeType = StateSaver::Other;
currentState = stateSaver;
currentState.storeType = StateSaver::Other;
// found stays false, so the enclosing for loop continues; its i++ resumes at minTransition
i = currentState.minTransition - 1;
break;
}
}
break;
default:;
}
}
break;
default:;
}
// Save the parsing state when necessary, then follow the transition
if (found)
{
UserData* data = (UserData*)currentState.currentState->userData;
if (data->NeedKeepState)
{
// Remember which transition to try next if this path fails
oldState.minTransition = i + 1;
PushNonSaver(stateSavers, currentState.stateSaverCount, oldState);
}
currentState.currentState = transition->target;
currentState.minTransition = 0;
break;
}
}
// If no transition from the current state can be used
if (!found)
{
// If there is a chance to do backtracking
if (currentState.stateSaverCount)
{
currentState = PopNonSaver(stateSavers, currentState.stateSaverCount);
// minTransition - 1 is always valid since the value is stored with adding 1
// So minTransition - 1 is the transition that was taken when the parsing state was saved
if (currentState.currentState->transitions[currentState.minTransition - 1]->type == Transition::Negative)
{
// Find the next NegativeFail transition
// Because when a negative lookahead regex failed to match, it is actually succeeded
// Since a negative lookahead means we don't want to match this regex
for (vint i = 0; i < currentState.currentState->transitions.Count(); i++)
{
Transition* transition = currentState.currentState->transitions[i];
if (transition->type == Transition::NegativeFail)
{
// Restore the state to the target of NegativeFail to let the parsing continue
currentState.currentState = transition->target;
currentState.minTransition = 0;
currentState.storeType = StateSaver::Other;
break;
}
}
}
}
else
{
// Nothing left to backtrack to: the match has failed
break;
}
}
}
if (currentState.currentState->finalState)
{
// Keep available captures if succeeded; entries at index captureCount and above are removed
result.start = input - start;
result.length = (currentState.reading - start) - result.start;
for (vint i = result.captures.Count() - 1; i >= currentState.captureCount; i--)
{
result.captures.RemoveAt(i);
}
return true;
}
else
{
// Clear captures if failed
result.captures.Clear();
return false;
}
}
// Tries MatchHead at every non-terminator position starting from `input` and
// stops at the first position that matches. The terminator position itself is
// never tried.
bool RichInterpretor::Match(const wchar_t* input, const wchar_t* start, RichResult& result)
{
	for(const wchar_t* cursor=input;*cursor;cursor++)
	{
		if(MatchHead(cursor, start, result))
		{
			return true;
		}
	}
	return false;
}
// Exposes the capture name list stored in the automaton.
const List<WString>& RichInterpretor::CaptureNames()
{
	const List<WString>& names=dfa->captureNames;
	return names;
}
}
}
/***********************************************************************
.\REGEXWRITER.CPP
***********************************************************************/
/***********************************************************************
Author: Zihan Chen (vczh)
Licensed under https://github.com/vczh-libraries/License
***********************************************************************/
namespace vl
{
namespace regex
{
using namespace vl::regex_internal;
/***********************************************************************
RegexNode
***********************************************************************/
// Wraps an expression tree so that it can be combined with the operators and
// helper functions of the regex writer.
RegexNode::RegexNode(vl::regex_internal::Expression::Ref _expression)
	: expression(_expression)
{
}
// Loop with min 1 and max -1 (presumably "no upper bound"; confirm against
// LoopExpression).
RegexNode RegexNode::Some()const
{
	const vint minimum=1;
	const vint maximum=-1;
	return Loop(minimum, maximum);
}
// Loop with min 0 and max -1 (presumably "no upper bound"; confirm against
// LoopExpression).
RegexNode RegexNode::Any()const
{
	const vint minimum=0;
	const vint maximum=-1;
	return Loop(minimum, maximum);
}
// Loop with min 0 and max 1, i.e. the node becomes optional.
RegexNode RegexNode::Opt()const
{
	const vint minimum=0;
	const vint maximum=1;
	return Loop(minimum, maximum);
}
// Wraps this node in a LoopExpression with the given bounds. The loop is
// always created with preferLong set to true.
RegexNode RegexNode::Loop(vint min, vint max)const
{
	LoopExpression* repeated=new LoopExpression;
	repeated->expression=expression;
	repeated->preferLong=true;
	repeated->min=min;
	repeated->max=max;
	return RegexNode(repeated);
}
// Loop with the given minimum and max -1 (presumably "no upper bound";
// confirm against LoopExpression).
RegexNode RegexNode::AtLeast(vint min)const
{
	const vint maximum=-1;
	return Loop(min, maximum);
}
// Sequence: this node followed by `node`.
RegexNode RegexNode::operator+(const RegexNode& node)const
{
	SequenceExpression* sequence=new SequenceExpression;
	sequence->right=node.expression;
	sequence->left=expression;
	return RegexNode(sequence);
}
// Alternation: this node as the left branch, `node` as the right branch.
RegexNode RegexNode::operator|(const RegexNode& node)const
{
	AlternateExpression* alternation=new AlternateExpression;
	alternation->right=node.expression;
	alternation->left=expression;
	return RegexNode(alternation);
}
// Unary plus: wraps this node in a PositiveExpression.
RegexNode RegexNode::operator+()const
{
	PositiveExpression* wrapped=new PositiveExpression;
	wrapped->expression=this->expression;
	return RegexNode(wrapped);
}
// Unary minus: wraps this node in a NegativeExpression.
RegexNode RegexNode::operator-()const
{
	NegativeExpression* wrapped=new NegativeExpression;
	wrapped->expression=this->expression;
	return RegexNode(wrapped);
}
// Produces a copy of a charset node with its `reverse` flag toggled.
// CHECK_ERROR fires when this node is not a CharSetExpression.
RegexNode RegexNode::operator!()const
{
	CharSetExpression* original=dynamic_cast<CharSetExpression*>(expression.Obj());
	CHECK_ERROR(original, L"RegexNode::operator!()#operator ! can only applies on charset expressions.");
	Ptr<CharSetExpression> inverted=new CharSetExpression;
	inverted->reverse=!original->reverse;
	CopyFrom(inverted->ranges, original->ranges);
	return RegexNode(inverted);
}
// Merges two non-reversed charset nodes into one charset containing the
// ranges of both. CHECK_ERROR fires when either operand is not a non-reversed
// CharSetExpression, or when a range of the right operand cannot be added.
RegexNode RegexNode::operator%(const RegexNode& node)const
{
	CharSetExpression* lhs=dynamic_cast<CharSetExpression*>(expression.Obj());
	CharSetExpression* rhs=dynamic_cast<CharSetExpression*>(node.expression.Obj());
	CHECK_ERROR(lhs && rhs && !lhs->reverse && !rhs->reverse, L"RegexNode::operator%(const RegexNode&)#operator % only connects non-reverse charset expressions.");

	Ptr<CharSetExpression> merged=new CharSetExpression;
	merged->reverse=false;
	CopyFrom(merged->ranges, lhs->ranges);
	const vint rangeTotal=rhs->ranges.Count();
	for(vint index=0;index<rangeTotal;index++)
	{
		// Keep the call outside the macro so it runs regardless of how
		// CHECK_ERROR evaluates its condition.
		const bool added=merged->AddRangeWithConflict(rhs->ranges[index]);
		if(!added)
		{
			CHECK_ERROR(false, L"RegexNode::operator%(const RegexNode&)#Failed to create charset expression from operator %.");
		}
	}
	return RegexNode(merged);
}
/***********************************************************************
Regex Writer
***********************************************************************/
// Wraps `node` in a CaptureExpression with the given name.
RegexNode rCapture(const WString& name, const RegexNode& node)
{
	CaptureExpression* capture=new CaptureExpression;
	capture->expression=node.expression;
	capture->name=name;
	return RegexNode(capture);
}
// Creates a UsingExpression that refers to `name`.
RegexNode rUsing(const WString& name)
{
	UsingExpression* reference=new UsingExpression;
	reference->name=name;
	return RegexNode(reference);
}
// Creates a MatchExpression for the capture `name` with the given index.
RegexNode rMatch(const WString& name, vint index)
{
	MatchExpression* backReference=new MatchExpression;
	backReference->index=index;
	backReference->name=name;
	return RegexNode(backReference);
}
// Creates a MatchExpression with the given index; the name is left as
// default-constructed.
RegexNode rMatch(vint index)
{
	MatchExpression* backReference=new MatchExpression;
	backReference->index=index;
	return RegexNode(backReference);
}
// Creates a node holding a BeginExpression.
RegexNode rBegin()
{
	BeginExpression* anchor=new BeginExpression;
	return RegexNode(anchor);
}
// Creates a node holding an EndExpression.
RegexNode rEnd()
{
	EndExpression* anchor=new EndExpression;
	return RegexNode(anchor);
}
// Creates a non-reversed charset for the range [a, b].
// When `b` is zero the range is the single character `a`.
RegexNode rC(wchar_t a, wchar_t b)
{
	const wchar_t last=b?b:a;
	CharSetExpression* charset=new CharSetExpression;
	charset->reverse=false;
	charset->AddRangeWithConflict(CharRange(a, last));
	return RegexNode(charset);
}
// Charset of the decimal digits '0'..'9'.
RegexNode r_d()
{
	RegexNode digits=rC(L'0', L'9');
	return digits;
}
// Charset of 'a'..'z', 'A'..'Z' and '_'.
RegexNode r_l()
{
	RegexNode lower=rC(L'a', L'z');
	RegexNode upper=rC(L'A', L'Z');
	RegexNode underscore=rC(L'_');
	return lower%upper%underscore;
}
// Charset of '0'..'9', 'a'..'z', 'A'..'Z' and '_'.
RegexNode r_w()
{
	RegexNode digits=rC(L'0', L'9');
	RegexNode lower=rC(L'a', L'z');
	RegexNode upper=rC(L'A', L'Z');
	RegexNode underscore=rC(L'_');
	return digits%lower%upper%underscore;
}
RegexNode rAnyChar()
{
return rC(1, 65535);
}
}
}