mirror of
https://github.com/vczh-libraries/Release.git
synced 2026-02-06 12:01:39 +08:00
3999 lines
100 KiB
C++
3999 lines
100 KiB
C++
/***********************************************************************
|
|
THIS FILE IS AUTOMATICALLY GENERATED. DO NOT MODIFY
|
|
DEVELOPER: Zihan Chen(vczh)
|
|
***********************************************************************/
|
|
#include "VlppRegex.h"
|
|
|
|
/***********************************************************************
|
|
.\REGEX.CPP
|
|
***********************************************************************/
|
|
/***********************************************************************
|
|
Author: Zihan Chen (vczh)
|
|
Licensed under https://github.com/vczh-libraries/License
|
|
***********************************************************************/
|
|
|
|
|
|
namespace vl
|
|
{
|
|
namespace regex
|
|
{
|
|
using namespace collections;
|
|
using namespace regex_internal;
|
|
|
|
/***********************************************************************
|
|
RegexString
|
|
***********************************************************************/
|
|
|
|
// Builds a zero-length string record located at _start; the text value is left default-constructed.
RegexString::RegexString(vint _start)
    : start(_start)
    , length(0)
{
}
|
|
|
|
// Records the range [_start, _start + _length) of _string and stores a copy of that substring.
// A zero _length stores an empty string without calling Sub.
RegexString::RegexString(const WString& _string, vint _start, vint _length)
    : value(_length != 0 ? _string.Sub(_start, _length) : L"")
    , start(_start)
    , length(_length)
{
}
|
|
|
|
// Returns the stored start position.
vint RegexString::Start() const
{
    return this->start;
}
|
|
|
|
// Returns the stored length.
vint RegexString::Length() const
{
    return this->length;
}
|
|
|
|
// Returns a reference to the stored substring.
const WString& RegexString::Value() const
{
    return this->value;
}
|
|
|
|
bool RegexString::operator==(const RegexString& string)const
|
|
{
|
|
return start==string.start && length==string.length && value==string.value;
|
|
}
|
|
|
|
/***********************************************************************
|
|
RegexMatch
|
|
***********************************************************************/
|
|
|
|
// Successful match built from a PureResult: only the overall matched range is recorded.
RegexMatch::RegexMatch(const WString& _string, PureResult* _result)
    : success(true)
    , result(_string, _result->start, _result->length)
{
}
|
|
|
|
// Successful match built from a RichResult.
// Besides the overall range, every capture record is copied out of _string:
// records whose capture id is -1 go to the anonymous capture list, all others
// are stored in the named group table under the name _rich reports for that id.
RegexMatch::RegexMatch(const WString& _string, RichResult* _result, RichInterpretor* _rich)
    : success(true)
    , result(_string, _result->start, _result->length)
{
    for (vint index = 0; index < _result->captures.Count(); index++)
    {
        CaptureRecord& record = _result->captures[index];
        RegexString piece(_string, record.start, record.length);
        if (record.capture != -1)
        {
            groups.Add(_rich->CaptureNames().Get(record.capture), piece);
        }
        else
        {
            captures.Add(piece);
        }
    }
}
|
|
|
|
// Non-matching fragment: wraps _result and marks the record as unsuccessful.
RegexMatch::RegexMatch(const RegexString& _result)
    : success(false)
    , result(_result)
{
}
|
|
|
|
bool RegexMatch::Success()const
|
|
{
|
|
return success;
|
|
}
|
|
|
|
// Returns the overall range covered by this record.
const RegexString& RegexMatch::Result() const
{
    return this->result;
}
|
|
|
|
// Returns the list of anonymous captures.
const RegexMatch::CaptureList& RegexMatch::Captures() const
{
    return this->captures;
}
|
|
|
|
// Returns the table of named captures.
const RegexMatch::CaptureGroup& RegexMatch::Groups() const
{
    return this->groups;
}
|
|
|
|
/***********************************************************************
|
|
Regex
|
|
***********************************************************************/
|
|
|
|
void Regex::Process(const WString& text, bool keepEmpty, bool keepSuccess, bool keepFail, RegexMatch::List& matches)const
|
|
{
|
|
if(rich)
|
|
{
|
|
const wchar_t* start=text.Buffer();
|
|
const wchar_t* input=start;
|
|
RichResult result;
|
|
while(rich->Match(input, start, result))
|
|
{
|
|
vint offset=input-start;
|
|
if(keepFail)
|
|
{
|
|
if(result.start>offset || keepEmpty)
|
|
{
|
|
matches.Add(new RegexMatch(RegexString(text, offset, result.start-offset)));
|
|
}
|
|
}
|
|
if(keepSuccess)
|
|
{
|
|
matches.Add(new RegexMatch(text, &result, rich));
|
|
}
|
|
input=start+result.start+result.length;
|
|
}
|
|
if(keepFail)
|
|
{
|
|
vint remain=input-start;
|
|
vint length=text.Length()-remain;
|
|
if(length || keepEmpty)
|
|
{
|
|
matches.Add(new RegexMatch(RegexString(text, remain, length)));
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
const wchar_t* start=text.Buffer();
|
|
const wchar_t* input=start;
|
|
PureResult result;
|
|
while(pure->Match(input, start, result))
|
|
{
|
|
vint offset=input-start;
|
|
if(keepFail)
|
|
{
|
|
if(result.start>offset || keepEmpty)
|
|
{
|
|
matches.Add(new RegexMatch(RegexString(text, offset, result.start-offset)));
|
|
}
|
|
}
|
|
if(keepSuccess)
|
|
{
|
|
matches.Add(new RegexMatch(text, &result));
|
|
}
|
|
input=start+result.start+result.length;
|
|
}
|
|
if(keepFail)
|
|
{
|
|
vint remain=input-start;
|
|
vint length=text.Length()-remain;
|
|
if(length || keepEmpty)
|
|
{
|
|
matches.Add(new RegexMatch(RegexString(text, remain, length)));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Parses code and builds the interpretor(s) needed to run it.
//   preferPure == false : only the rich interpretor is built.
//   preferPure == true  : the pure interpretor is built when the expression has
//                         no extension or can be treated as pure; the rich one
//                         is still built unless the expression has no extension.
// If building an interpretor throws, any interpretor already created is deleted
// and the exception is rethrown.
// NOTE(review): the catch block reads pure/rich before this constructor has
// necessarily assigned them; presumably both are null-initialized in the class
// declaration — confirm.
Regex::Regex(const WString& code, bool preferPure)
{
    CharRange::List subsets;
    RegexExpression::Ref regex = ParseRegexExpression(code);
    Expression::Ref expression = regex->Merge();
    expression->NormalizeCharSet(subsets);

    bool pureRequired = false;
    bool richRequired = true;
    if (preferPure)
    {
        const bool noExtension = expression->HasNoExtension();
        pureRequired = noExtension || expression->CanTreatAsPure();
        richRequired = !noExtension;
    }

    try
    {
        if (pureRequired)
        {
            Dictionary<State*, State*> nfaStateMap;
            Group<State*, State*> dfaStateMap;
            Automaton::Ref epsilonNfa = expression->GenerateEpsilonNfa();
            Automaton::Ref plainNfa = EpsilonNfaToNfa(epsilonNfa, PureEpsilonChecker, nfaStateMap);
            Automaton::Ref finalDfa = NfaToDfa(plainNfa, dfaStateMap);
            pure = new PureInterpretor(finalDfa, subsets);
        }
        if (richRequired)
        {
            Dictionary<State*, State*> nfaStateMap;
            Group<State*, State*> dfaStateMap;
            Automaton::Ref epsilonNfa = expression->GenerateEpsilonNfa();
            Automaton::Ref plainNfa = EpsilonNfaToNfa(epsilonNfa, RichEpsilonChecker, nfaStateMap);
            Automaton::Ref finalDfa = NfaToDfa(plainNfa, dfaStateMap);
            rich = new RichInterpretor(finalDfa);
        }
    }
    catch (...)
    {
        if (pure)
        {
            delete pure;
        }
        if (rich)
        {
            delete rich;
        }
        throw;
    }
}
|
|
|
|
// Releases whichever interpretors were created by the constructor.
Regex::~Regex()
{
    if (pure)
    {
        delete pure;
    }
    if (rich)
    {
        delete rich;
    }
}
|
|
|
|
bool Regex::IsPureMatch()const
|
|
{
|
|
return rich?false:true;
|
|
}
|
|
|
|
bool Regex::IsPureTest()const
|
|
{
|
|
return pure?true:false;
|
|
}
|
|
|
|
// Runs MatchHead of the active interpretor (rich when available, otherwise pure)
// on the beginning of text. Returns a new match record on success, or a null
// reference when the interpretor reports no match.
RegexMatch::Ref Regex::MatchHead(const WString& text)const
{
    const wchar_t* const buffer = text.Buffer();
    if (rich)
    {
        RichResult richResult;
        if (!rich->MatchHead(buffer, buffer, richResult))
        {
            return nullptr;
        }
        return new RegexMatch(text, &richResult, rich);
    }

    PureResult pureResult;
    if (!pure->MatchHead(buffer, buffer, pureResult))
    {
        return nullptr;
    }
    return new RegexMatch(text, &pureResult);
}
|
|
|
|
// Runs Match of the active interpretor (rich when available, otherwise pure)
// on text. Returns a new match record on success, or a null reference when the
// interpretor reports no match.
RegexMatch::Ref Regex::Match(const WString& text)const
{
    const wchar_t* const buffer = text.Buffer();
    if (rich)
    {
        RichResult richResult;
        if (!rich->Match(buffer, buffer, richResult))
        {
            return nullptr;
        }
        return new RegexMatch(text, &richResult, rich);
    }

    PureResult pureResult;
    if (!pure->Match(buffer, buffer, pureResult))
    {
        return nullptr;
    }
    return new RegexMatch(text, &pureResult);
}
|
|
|
|
bool Regex::TestHead(const WString& text)const
|
|
{
|
|
if(pure)
|
|
{
|
|
PureResult result;
|
|
return pure->MatchHead(text.Buffer(), text.Buffer(), result);
|
|
}
|
|
else
|
|
{
|
|
RichResult result;
|
|
return rich->MatchHead(text.Buffer(), text.Buffer(), result);
|
|
}
|
|
}
|
|
|
|
bool Regex::Test(const WString& text)const
|
|
{
|
|
if(pure)
|
|
{
|
|
PureResult result;
|
|
return pure->Match(text.Buffer(), text.Buffer(), result);
|
|
}
|
|
else
|
|
{
|
|
RichResult result;
|
|
return rich->Match(text.Buffer(), text.Buffer(), result);
|
|
}
|
|
}
|
|
|
|
void Regex::Search(const WString& text, RegexMatch::List& matches)const
|
|
{
|
|
Process(text, false, true, false, matches);
|
|
}
|
|
|
|
void Regex::Split(const WString& text, bool keepEmptyMatch, RegexMatch::List& matches)const
|
|
{
|
|
Process(text, keepEmptyMatch, false, true, matches);
|
|
}
|
|
|
|
void Regex::Cut(const WString& text, bool keepEmptyMatch, RegexMatch::List& matches)const
|
|
{
|
|
Process(text, keepEmptyMatch, true, true, matches);
|
|
}
|
|
|
|
/***********************************************************************
|
|
RegexTokens
|
|
***********************************************************************/
|
|
|
|
bool RegexToken::operator==(const RegexToken& _token)const
|
|
{
|
|
return length==_token.length && token==_token.token && reading==_token.reading;
|
|
}
|
|
|
|
bool RegexToken::operator==(const wchar_t* _token)const
|
|
{
|
|
return wcslen(_token)==length && wcsncmp(reading, _token, length)==0;
|
|
}
|
|
|
|
// Enumerates the tokens of a null-terminated buffer.
// Each call to Next() runs the pure interpretor at the current reading position
// and produces one RegexToken; consecutive unrecognized characters (token id -1)
// are merged into a single token. Row and column positions are tracked by
// counting L'\n' characters in the consumed text.
class RegexTokenEnumerator : public Object, public IEnumerator<RegexToken>
{
protected:
    RegexToken              token;                  // token returned by Current()
    vint                    index = -1;             // index of the current token, -1 before the first Next()

    PureInterpretor*        pure;
    const Array<vint>&      stateTokens;            // maps an interpretor state to a token id
    const wchar_t*          start;                  // beginning of the input buffer
    vint                    codeIndex;
    RegexProc               proc;

    const wchar_t*          reading;                // next character to consume
    vint                    rowStart = 0;           // row of the next character to report
    vint                    columnStart = 0;        // column of the next character to report
    bool                    cacheAvailable = false; // true when cacheToken holds a token read ahead
    RegexToken              cacheToken;

public:
    // Member-wise copy; initializers are listed in declaration order.
    RegexTokenEnumerator(const RegexTokenEnumerator& enumerator)
        : token(enumerator.token)
        , index(enumerator.index)
        , pure(enumerator.pure)
        , stateTokens(enumerator.stateTokens)
        , start(enumerator.start)
        , codeIndex(enumerator.codeIndex)
        , proc(enumerator.proc)
        , reading(enumerator.reading)
        , rowStart(enumerator.rowStart)
        , columnStart(enumerator.columnStart)
        , cacheAvailable(enumerator.cacheAvailable)
        , cacheToken(enumerator.cacheToken)
    {
    }

    // Starts an enumeration at the beginning of _start.
    RegexTokenEnumerator(PureInterpretor* _pure, const Array<vint>& _stateTokens, const wchar_t* _start, vint _codeIndex, RegexProc _proc)
        : index(-1)
        , pure(_pure)
        , stateTokens(_stateTokens)
        , start(_start)
        , codeIndex(_codeIndex)
        , proc(_proc)
        , reading(_start)
    {
    }

    IEnumerator<RegexToken>* Clone()const
    {
        return new RegexTokenEnumerator(*this);
    }

    const RegexToken& Current()const
    {
        return token;
    }

    vint Index()const
    {
        return index;
    }

    // Advances to the next token. Returns false when the input is exhausted
    // and no read-ahead token is pending.
    bool Next()
    {
        if (!cacheAvailable && !*reading) return false;
        if (cacheAvailable)
        {
            // A token was read ahead by the previous call; continue from it.
            token = cacheToken;
            cacheAvailable = false;
        }
        else
        {
            // -2 marks "nothing read yet" so the first result below fills the token.
            token.reading = reading;
            token.start = 0;
            token.length = 0;
            token.token = -2;
            token.completeToken = true;
        }

        token.rowStart = rowStart;
        token.columnStart = columnStart;
        token.rowEnd = rowStart;
        token.columnEnd = columnStart;
        token.codeIndex = codeIndex;

        PureResult result;
        while (*reading)
        {
            vint id = -1;
            bool completeToken = true;
            if (!pure->MatchHead(reading, start, result))
            {
                result.start = reading - start;

                // No complete match: see whether the state where matching stopped
                // is related to a final state, which identifies an incomplete token.
                if (result.terminateState != -1)
                {
                    vint state = pure->GetRelatedFinalState(result.terminateState);
                    if (state != -1)
                    {
                        id = stateTokens[state];
                    }
                }

                if (id == -1)
                {
                    // Unrecognized input is consumed one character at a time.
                    result.length = 1;
                }
                else
                {
                    completeToken = false;
                }
            }
            else
            {
                id = stateTokens.Get(result.finalState);
            }

            if (id != -1 && proc.extendProc)
            {
                // Let the user callback adjust the length, id and completeness of the token.
                RegexProcessingToken processingToken(result.start, result.length, id, completeToken, nullptr);
                proc.extendProc(proc.argument, reading, -1, true, processingToken);
#if _DEBUG
                CHECK_ERROR(processingToken.interTokenState == nullptr, L"RegexTokenEnumerator::Next()#The extendProc is only allowed to create interTokenState in RegexLexerColorizer.");
#endif
                result.length = processingToken.length;
                id = processingToken.token;
                completeToken = processingToken.completeToken;
            }

            if (token.token == -2)
            {
                token.start = result.start;
                token.length = result.length;
                token.token = id;
                token.completeToken = completeToken;
            }
            else if (token.token == id && id == -1)
            {
                // Merge consecutive unrecognized characters into one token.
                token.length += result.length;
            }
            else
            {
                // A different token begins here; keep it for the next call.
                cacheAvailable = true;
                cacheToken.reading = reading;
                cacheToken.start = result.start;
                cacheToken.length = result.length;
                cacheToken.codeIndex = codeIndex;
                cacheToken.token = id;
                cacheToken.completeToken = completeToken;
            }
            reading += result.length;

            if (cacheAvailable)
            {
                break;
            }
        }

        index++;

        // Walk the token text to compute the position of its last character and
        // to advance the running row/column counters past it.
        for (vint i = 0; i < token.length; i++)
        {
            token.rowEnd = rowStart;
            token.columnEnd = columnStart;
            if (token.reading[i] == L'\n')
            {
                rowStart++;
                columnStart = 0;
            }
            else
            {
                columnStart++;
            }
        }
        return true;
    }

    // Rewinds the enumeration to the beginning of the input.
    // The row/column counters are rewound together with the reading position,
    // otherwise a second enumeration would report positions continuing from
    // where the first one stopped.
    void Reset()
    {
        index = -1;
        reading = start;
        rowStart = 0;
        columnStart = 0;
        cacheAvailable = false;
    }

    // Reads all remaining tokens, appending those for which discard returns false.
    // discard must not be null.
    void ReadToEnd(List<RegexToken>& tokens, bool(*discard)(vint))
    {
        while (Next())
        {
            if (!discard(token.token))
            {
                tokens.Add(token);
            }
        }
    }
};
|
|
|
|
// Stores everything needed to tokenize _code later; no tokenizing happens here.
RegexTokens::RegexTokens(PureInterpretor* _pure, const Array<vint>& _stateTokens, const WString& _code, vint _codeIndex, RegexProc _proc)
    : pure(_pure)
    , stateTokens(_stateTokens)
    , code(_code)
    , codeIndex(_codeIndex)
    , proc(_proc)
{
}
|
|
|
|
// Member-wise copy of another token collection.
RegexTokens::RegexTokens(const RegexTokens& tokens)
    : pure(tokens.pure)
    , stateTokens(tokens.stateTokens)
    , code(tokens.code)
    , codeIndex(tokens.codeIndex)
    , proc(tokens.proc)
{
}
|
|
|
|
// Creates a new enumerator positioned at the beginning of the stored code.
IEnumerator<RegexToken>* RegexTokens::CreateEnumerator()const
{
    const wchar_t* const buffer = code.Buffer();
    return new RegexTokenEnumerator(pure, stateTokens, buffer, codeIndex, proc);
}
|
|
|
|
// Discard predicate used when the caller supplies none: keeps every token.
bool DefaultDiscard(vint token)
{
    const bool discardThisToken = false;
    return discardThisToken;
}
|
|
|
|
// Tokenizes the whole stored code and appends to tokens every token for which
// discard returns false. A null discard keeps every token.
void RegexTokens::ReadToEnd(collections::List<RegexToken>& tokens, bool(*discard)(vint))const
{
    bool(*predicate)(vint) = discard;
    if (!predicate)
    {
        predicate = &DefaultDiscard;
    }
    RegexTokenEnumerator(pure, stateTokens, code.Buffer(), codeIndex, proc).ReadToEnd(tokens, predicate);
}
|
|
|
|
/***********************************************************************
|
|
RegexLexerWalker
|
|
***********************************************************************/
|
|
|
|
// Stores the interpretor and the state-to-token table used by the walker.
RegexLexerWalker::RegexLexerWalker(PureInterpretor* _pure, const Array<vint>& _stateTokens)
    : pure(_pure)
    , stateTokens(_stateTokens)
{
}
|
|
|
|
// Member-wise copy of another walker.
RegexLexerWalker::RegexLexerWalker(const RegexLexerWalker& tokens)
    : pure(tokens.pure)
    , stateTokens(tokens.stateTokens)
{
}
|
|
|
|
// Nothing to release here.
RegexLexerWalker::~RegexLexerWalker()
{
}
|
|
|
|
// Nothing to release here.
RegexTokens::~RegexTokens()
{
}
|
|
|
|
// Returns the start state reported by the interpretor.
vint RegexLexerWalker::GetStartState()const
{
    const vint startState = pure->GetStartState();
    return startState;
}
|
|
|
|
// Returns the token id of the final state related to state, or -1 when state
// is -1 or the interpretor reports no related final state.
vint RegexLexerWalker::GetRelatedToken(vint state)const
{
    if (state == -1)
    {
        return -1;
    }
    const vint relatedFinal = pure->GetRelatedFinalState(state);
    if (relatedFinal == -1)
    {
        return -1;
    }
    return stateTokens.Get(relatedFinal);
}
|
|
|
|
void RegexLexerWalker::Walk(wchar_t input, vint& state, vint& token, bool& finalState, bool& previousTokenStop)const
|
|
{
|
|
vint previousState=state;
|
|
token=-1;
|
|
finalState=false;
|
|
previousTokenStop=false;
|
|
if(state==-1)
|
|
{
|
|
state=pure->GetStartState();
|
|
previousTokenStop=true;
|
|
}
|
|
|
|
state=pure->Transit(input, state);
|
|
if(state==-1)
|
|
{
|
|
previousTokenStop=true;
|
|
if(previousState==-1)
|
|
{
|
|
finalState=true;
|
|
return;
|
|
}
|
|
else if(pure->IsFinalState(previousState))
|
|
{
|
|
state=pure->Transit(input, pure->GetStartState());
|
|
}
|
|
}
|
|
if(pure->IsFinalState(state))
|
|
{
|
|
token=stateTokens.Get(state);
|
|
finalState=true;
|
|
return;
|
|
}
|
|
else
|
|
{
|
|
finalState=state==-1;
|
|
return;
|
|
}
|
|
}
|
|
|
|
// Convenience overload: feeds one character and returns only the resulting state.
vint RegexLexerWalker::Walk(wchar_t input, vint state)const
{
    vint ignoredToken = -1;
    bool ignoredFinal = false;
    bool ignoredStop = false;
    Walk(input, state, ignoredToken, ignoredFinal, ignoredStop);
    return state;
}
|
|
|
|
bool RegexLexerWalker::IsClosedToken(const wchar_t* input, vint length)const
|
|
{
|
|
vint state=pure->GetStartState();
|
|
for(vint i=0;i<length;i++)
|
|
{
|
|
state=pure->Transit(input[i], state);
|
|
if(state==-1) return true;
|
|
if(pure->IsDeadState(state)) return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool RegexLexerWalker::IsClosedToken(const WString& input)const
|
|
{
|
|
return IsClosedToken(input.Buffer(), input.Length());
|
|
}
|
|
|
|
/***********************************************************************
|
|
RegexLexerColorizer
|
|
***********************************************************************/
|
|
|
|
// Stores the walker and callbacks, and positions the colorizer at the walker's start state.
RegexLexerColorizer::RegexLexerColorizer(const RegexLexerWalker& _walker, RegexProc _proc)
    : walker(_walker)
    , proc(_proc)
{
    const vint initialState = walker.GetStartState();
    internalState.currentState = initialState;
}
|
|
|
|
// Member-wise copy, including the current internal state.
RegexLexerColorizer::RegexLexerColorizer(const RegexLexerColorizer& colorizer)
    : walker(colorizer.walker)
    , proc(colorizer.proc)
    , internalState(colorizer.internalState)
{
}
|
|
|
|
// Nothing to release here.
RegexLexerColorizer::~RegexLexerColorizer()
{
}
|
|
|
|
// Returns a copy of the current internal state.
RegexLexerColorizer::InternalState RegexLexerColorizer::GetInternalState()
{
    return this->internalState;
}
|
|
// Replaces the current internal state with state.
void RegexLexerColorizer::SetInternalState(InternalState state)
{
    this->internalState = state;
}
|
|
|
|
void RegexLexerColorizer::Pass(wchar_t input)
|
|
{
|
|
WalkOneToken(&input, 1, 0, false);
|
|
}
|
|
|
|
// Returns the start state reported by the walker.
vint RegexLexerColorizer::GetStartState()const
{
    const vint startState = walker.GetStartState();
    return startState;
}
|
|
|
|
void RegexLexerColorizer::CallExtendProcAndColorizeProc(const wchar_t* input, vint length, RegexProcessingToken& token, bool colorize)
|
|
{
|
|
vint oldTokenLength = token.length;
|
|
proc.extendProc(proc.argument, input + token.start, length - token.start, false, token);
|
|
#if _DEBUG
|
|
{
|
|
bool pausedAtTheEnd = token.start + token.length == length && !token.completeToken;
|
|
CHECK_ERROR(
|
|
token.completeToken || pausedAtTheEnd,
|
|
L"RegexLexerColorizer::WalkOneToken(const wchar_t*, vint, vint, bool)#The extendProc is not allowed pause before the end of the input."
|
|
);
|
|
CHECK_ERROR(
|
|
token.completeToken || token.token != -1,
|
|
L"RegexLexerColorizer::WalkOneToken(const wchar_t*, vint, vint, bool)#The extendProc is not allowed to pause without a valid token id."
|
|
);
|
|
CHECK_ERROR(
|
|
oldTokenLength <= token.length,
|
|
L"RegexLexerColorizer::WalkOneToken(const wchar_t*, vint, vint, bool)#The extendProc is not allowed to decrease the token length."
|
|
);
|
|
CHECK_ERROR(
|
|
(token.interTokenState == nullptr) == !pausedAtTheEnd,
|
|
L"RegexLexerColorizer::Colorize(const wchar_t*, vint, void*)#The extendProc should return an inter token state object if and only if a valid token does not end at the end of the input."
|
|
);
|
|
}
|
|
#endif
|
|
if ((internalState.interTokenState = token.interTokenState))
|
|
{
|
|
internalState.interTokenId = token.token;
|
|
}
|
|
if (colorize)
|
|
{
|
|
proc.colorizeProc(proc.argument, token.start, token.length, token.token);
|
|
}
|
|
}
|
|
|
|
// Consumes one token from input[start, length) and returns the index just past it.
// When colorize is set the token is reported through proc.colorizeProc.
//
// If a previous call left an inter-token state, extendProc is asked to resume
// that token from the beginning of input and the walker is not consulted.
// Otherwise characters are walked one by one, remembering the most recent
// position at which the walker reported a final state; when the walker signals
// that the previous token stopped, the remembered token is emitted.
vint RegexLexerColorizer::WalkOneToken(const wchar_t* input, vint length, vint start, bool colorize)
{
    if (internalState.interTokenState)
    {
        RegexProcessingToken token(-1, -1, internalState.interTokenId, false, internalState.interTokenState);
        proc.extendProc(proc.argument, input, length, false, token);
#if _DEBUG
        {
            bool pausedAtEnd = token.length == length && !token.completeToken;
            CHECK_ERROR(
                token.completeToken || pausedAtEnd,
                L"RegexLexerColorizer::WalkOneToken(const wchar_t*, vint, vint, bool)#The extendProc is not allowed to pause before the end of the input."
                );
            CHECK_ERROR(
                token.completeToken || token.token == internalState.interTokenId,
                L"RegexLexerColorizer::WalkOneToken(const wchar_t*, vint, vint, bool)#The extendProc is not allowed to continue pausing with a different token id."
                );
            CHECK_ERROR(
                (token.interTokenState == nullptr) == !pausedAtEnd,
                L"RegexLexerColorizer::Colorize(const wchar_t*, vint, void*)#The extendProc should return an inter token state object if and only if a valid token does not end at the end of the input."
                );
        }
#endif
        if (colorize)
        {
            proc.colorizeProc(proc.argument, 0, token.length, token.token);
        }
        internalState.interTokenState = token.interTokenState;
        if (!internalState.interTokenState)
        {
            internalState.interTokenId = -1;
        }
        return token.length;
    }

    // Most recent position at which the walker reported a final state.
    vint acceptedLength = 0;
    vint acceptedToken = -1;
    vint acceptedState = -1;

    vint entryState = internalState.currentState;
    for (vint cursor = start; cursor < length; cursor++)
    {
        vint walkedToken = -1;
        bool walkedFinal = false;
        bool tokenStopped = false;
        walker.Walk(input[cursor], internalState.currentState, walkedToken, walkedFinal, tokenStopped);

        if (tokenStopped)
        {
            if (proc.extendProc && acceptedToken != -1)
            {
                RegexProcessingToken token(start, acceptedLength, acceptedToken, true, nullptr);
                CallExtendProcAndColorizeProc(input, length, token, colorize);
                if (token.completeToken)
                {
                    internalState.currentState = walker.GetStartState();
                }
                return start + token.length;
            }
            else if (cursor == start)
            {
                // The very first character stopped the walk: when the walk began
                // from the start state, emit it as a one-character token with id -1.
                if (entryState == GetStartState())
                {
                    if (colorize)
                    {
                        proc.colorizeProc(proc.argument, start, 1, -1);
                    }
                    internalState.currentState = walker.GetStartState();
                    return cursor + 1;
                }
            }
            else
            {
                if (colorize)
                {
                    proc.colorizeProc(proc.argument, start, acceptedLength, acceptedToken);
                }
                internalState.currentState = acceptedState;
                return start + acceptedLength;
            }
        }

        if (walkedFinal)
        {
            acceptedLength = cursor + 1 - start;
            acceptedToken = walkedToken;
            acceptedState = internalState.currentState;
        }
    }

    // The input ended before the walker signalled a stop.
    if (acceptedToken != -1 && start + acceptedLength == length)
    {
        if (proc.extendProc)
        {
            RegexProcessingToken token(start, acceptedLength, acceptedToken, true, nullptr);
            CallExtendProcAndColorizeProc(input, length, token, colorize);
        }
        else if (colorize)
        {
            proc.colorizeProc(proc.argument, start, acceptedLength, acceptedToken);
        }
    }
    else if (colorize)
    {
        proc.colorizeProc(proc.argument, start, length - start, walker.GetRelatedToken(internalState.currentState));
    }
    return length;
}
|
|
|
|
void* RegexLexerColorizer::Colorize(const wchar_t* input, vint length)
|
|
{
|
|
vint index = 0;
|
|
while (index != length)
|
|
{
|
|
index = WalkOneToken(input, length, index, true);
|
|
}
|
|
return internalState.interTokenState;
|
|
}
|
|
|
|
/***********************************************************************
|
|
RegexLexer
|
|
***********************************************************************/
|
|
|
|
// Builds one state machine that recognizes all token regular expressions at once.
// Each token's index in tokens becomes its token id. State::userData carries a
// token index encoded as a pointer-sized integer: final states of token i carry
// i, every other state of the per-token DFAs carries the token count. When
// several states are merged, the smallest value wins.
RegexLexer::RegexLexer(const collections::IEnumerable<WString>& tokens, RegexProc _proc)
    :proc(_proc)
{
    // Build DFA for all tokens
    List<Expression::Ref> expressions;
    List<Automaton::Ref> dfas;
    CharRange::List subsets;
    Ptr<IEnumerator<WString>> enumerator = tokens.CreateEnumerator();
    while (enumerator->Next())
    {
        const WString& code = enumerator->Current();

        RegexExpression::Ref regex = ParseRegexExpression(code);
        Expression::Ref expression = regex->Merge();
        expression->CollectCharSet(subsets);
        expressions.Add(expression);
    }
    for (vint i = 0; i < expressions.Count(); i++)
    {
        Dictionary<State*, State*> nfaStateMap;
        Group<State*, State*> dfaStateMap;
        Expression::Ref expression = expressions[i];
        expression->ApplyCharSet(subsets);
        Automaton::Ref eNfa = expression->GenerateEpsilonNfa();
        Automaton::Ref nfa = EpsilonNfaToNfa(eNfa, PureEpsilonChecker, nfaStateMap);
        Automaton::Ref dfa = NfaToDfa(nfa, dfaStateMap);
        dfas.Add(dfa);
    }

    // Mark all states in DFAs
    for (vint i = 0; i < dfas.Count(); i++)
    {
        Automaton::Ref dfa = dfas[i];
        for (vint j = 0; j < dfa->states.Count(); j++)
        {
            if (dfa->states[j]->finalState)
            {
                dfa->states[j]->userData = (void*)i;
            }
            else
            {
                dfa->states[j]->userData = (void*)dfas.Count();
            }
        }
    }

    // Connect all DFAs to an e-NFA
    Automaton::Ref bigEnfa = new Automaton;
    for (vint i = 0; i < dfas.Count(); i++)
    {
        CopyFrom(bigEnfa->states, dfas[i]->states);
        CopyFrom(bigEnfa->transitions, dfas[i]->transitions);
    }
    bigEnfa->startState = bigEnfa->NewState();
    for (vint i = 0; i < dfas.Count(); i++)
    {
        bigEnfa->NewEpsilon(bigEnfa->startState, dfas[i]->startState);
    }

    // Build a single DFA out of the e-NFA
    Dictionary<State*, State*> nfaStateMap;
    Group<State*, State*> dfaStateMap;
    Automaton::Ref bigNfa = EpsilonNfaToNfa(bigEnfa, PureEpsilonChecker, nfaStateMap);
    for (vint i = 0; i < nfaStateMap.Keys().Count(); i++)
    {
        void* userData = nfaStateMap.Values().Get(i)->userData;
        nfaStateMap.Keys()[i]->userData = userData;
    }
    Automaton::Ref bigDfa = NfaToDfa(bigNfa, dfaStateMap);
    for (vint i = 0; i < dfaStateMap.Keys().Count(); i++)
    {
        // userData holds encoded token indices, not addresses, so the minimum is
        // computed on the integer values instead of ordering unrelated pointers.
        vint tokenIndex = (vint)dfaStateMap.GetByIndex(i).Get(0)->userData;
        for (vint j = 1; j < dfaStateMap.GetByIndex(i).Count(); j++)
        {
            vint candidate = (vint)dfaStateMap.GetByIndex(i).Get(j)->userData;
            if (candidate < tokenIndex)
            {
                tokenIndex = candidate;
            }
        }
        dfaStateMap.Keys()[i]->userData = (void*)tokenIndex;
    }

    // Build state machine
    pure = new PureInterpretor(bigDfa, subsets);
    stateTokens.Resize(bigDfa->states.Count());
    for (vint i = 0; i < stateTokens.Count(); i++)
    {
        void* userData = bigDfa->states[i]->userData;
        stateTokens[i] = (vint)userData;
    }
}
|
|
|
|
// Releases the interpretor created by the constructor.
RegexLexer::~RegexLexer()
{
    if (pure)
    {
        delete pure;
    }
}
|
|
|
|
// Returns a token collection over code. The interpretor is asked to prepare its
// related-final-state table first.
RegexTokens RegexLexer::Parse(const WString& code, vint codeIndex)const
{
    pure->PrepareForRelatedFinalStateTable();
    RegexTokens tokens(pure, stateTokens, code, codeIndex, proc);
    return tokens;
}
|
|
|
|
// Returns a walker over this lexer's state machine. The interpretor is asked to
// prepare its related-final-state table first.
RegexLexerWalker RegexLexer::Walk()const
{
    pure->PrepareForRelatedFinalStateTable();
    RegexLexerWalker walker(pure, stateTokens);
    return walker;
}
|
|
|
|
// Returns a colorizer built on a fresh walker and this lexer's callbacks.
RegexLexerColorizer RegexLexer::Colorize()const
{
    return RegexLexerColorizer(this->Walk(), this->proc);
}
|
|
}
|
|
}
|
|
|
|
/***********************************************************************
|
|
.\REGEXAUTOMATON.CPP
|
|
***********************************************************************/
|
|
/***********************************************************************
|
|
Author: Zihan Chen (vczh)
|
|
Licensed under https://github.com/vczh-libraries/License
|
|
***********************************************************************/
|
|
|
|
|
|
namespace vl
|
|
{
|
|
namespace regex_internal
|
|
{
|
|
using namespace collections;
|
|
|
|
/***********************************************************************
|
|
Automaton
|
|
***********************************************************************/
|
|
|
|
// Creates an automaton with no start state.
Automaton::Automaton()
{
    startState = nullptr;
}
|
|
|
|
// Allocates a non-final state with no user data, registers it in states and returns it.
State* Automaton::NewState()
{
    State* created = new State;
    created->userData = nullptr;
    created->finalState = false;
    states.Add(created);
    return created;
}
|
|
|
|
// Allocates a transition from start to end, links it into both states and
// registers it in transitions. The transition type is left for the caller to set.
Transition* Automaton::NewTransition(State* start, State* end)
{
    Transition* edge = new Transition;
    edge->source = start;
    edge->target = end;

    start->transitions.Add(edge);
    end->inputs.Add(edge);
    transitions.Add(edge);
    return edge;
}
|
|
|
|
// Adds a Chars transition from start to end accepting range.
Transition* Automaton::NewChars(State* start, State* end, CharRange range)
{
    Transition* edge = NewTransition(start, end);
    edge->range = range;
    edge->type = Transition::Chars;
    return edge;
}
|
|
|
|
// Adds an Epsilon transition from start to end.
Transition* Automaton::NewEpsilon(State* start, State* end)
{
    Transition* edge = NewTransition(start, end);
    edge->type = Transition::Epsilon;
    return edge;
}
|
|
|
|
// Adds a BeginString transition from start to end.
Transition* Automaton::NewBeginString(State* start, State* end)
{
    Transition* edge = NewTransition(start, end);
    edge->type = Transition::BeginString;
    return edge;
}
|
|
|
|
// Adds an EndString transition from start to end.
Transition* Automaton::NewEndString(State* start, State* end)
{
    Transition* edge = NewTransition(start, end);
    edge->type = Transition::EndString;
    return edge;
}
|
|
|
|
// Adds a Nop transition from start to end.
Transition* Automaton::NewNop(State* start, State* end)
{
    Transition* edge = NewTransition(start, end);
    edge->type = Transition::Nop;
    return edge;
}
|
|
|
|
// Adds a Capture transition from start to end carrying the capture id.
Transition* Automaton::NewCapture(State* start, State* end, vint capture)
{
    Transition* edge = NewTransition(start, end);
    edge->capture = capture;
    edge->type = Transition::Capture;
    return edge;
}
|
|
|
|
// Adds a Match transition from start to end carrying the capture id and index.
Transition* Automaton::NewMatch(State* start, State* end, vint capture, vint index)
{
    Transition* edge = NewTransition(start, end);
    edge->capture = capture;
    edge->index = index;
    edge->type = Transition::Match;
    return edge;
}
|
|
|
|
// Adds a Positive transition from start to end.
Transition* Automaton::NewPositive(State* start, State* end)
{
    Transition* edge = NewTransition(start, end);
    edge->type = Transition::Positive;
    return edge;
}
|
|
|
|
// Adds a Negative transition from start to end.
Transition* Automaton::NewNegative(State* start, State* end)
{
    Transition* edge = NewTransition(start, end);
    edge->type = Transition::Negative;
    return edge;
}
|
|
|
|
// Adds a NegativeFail transition from start to end.
Transition* Automaton::NewNegativeFail(State* start, State* end)
{
    Transition* edge = NewTransition(start, end);
    edge->type = Transition::NegativeFail;
    return edge;
}
|
|
|
|
// Adds an End transition from start to end.
Transition* Automaton::NewEnd(State* start, State* end)
{
    Transition* edge = NewTransition(start, end);
    edge->type = Transition::End;
    return edge;
}
|
|
|
|
/***********************************************************************
|
|
Helpers
|
|
***********************************************************************/
|
|
|
|
// Epsilon predicate for the pure interpretor: Epsilon, Nop, Capture and End
// transitions are treated as epsilon; every other type is not.
bool PureEpsilonChecker(Transition* transition)
{
    return transition->type == Transition::Epsilon
        || transition->type == Transition::Nop
        || transition->type == Transition::Capture
        || transition->type == Transition::End;
}
|
|
|
|
// Epsilon predicate for the rich interpretor: only Epsilon transitions qualify.
bool RichEpsilonChecker(Transition* transition)
{
    return transition->type == Transition::Epsilon;
}
|
|
|
|
// Compares two transitions by type and by the payload relevant to that type:
// range for Chars, capture for Capture, capture and index for Match.
// Transitions of any other type are equal whenever their types are equal.
bool AreEqual(Transition* transA, Transition* transB)
{
    if (transA->type != transB->type)
    {
        return false;
    }
    if (transA->type == Transition::Chars)
    {
        return transA->range == transB->range;
    }
    if (transA->type == Transition::Capture)
    {
        return transA->capture == transB->capture;
    }
    if (transA->type == Transition::Match)
    {
        return transA->capture == transB->capture && transA->index == transB->index;
    }
    return true;
}
|
|
|
|
// Collect epsilon states and non-epsilon transitions, their order are maintained to match the e-NFA
|
|
void CollectEpsilon(State* targetState, State* sourceState, bool(*epsilonChecker)(Transition*), List<State*>& epsilonStates, List<Transition*>& transitions)
|
|
{
|
|
if(!epsilonStates.Contains(sourceState))
|
|
{
|
|
epsilonStates.Add(sourceState);
|
|
for(vint i=0;i<sourceState->transitions.Count();i++)
|
|
{
|
|
Transition* transition=sourceState->transitions[i];
|
|
if(epsilonChecker(transition))
|
|
{
|
|
if(!epsilonStates.Contains(transition->target))
|
|
{
|
|
if(transition->target->finalState)
|
|
{
|
|
targetState->finalState=true;
|
|
}
|
|
CollectEpsilon(targetState, transition->target, epsilonChecker, epsilonStates, transitions);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
transitions.Add(transition);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Builds a new automaton from `source` in which all transitions accepted by
// `epsilonChecker` have been eliminated.
//   source         : the e-NFA to convert
//   epsilonChecker : decides which transitions are treated as epsilon
//   nfaStateMap    : receives, for every state created in the result, the source state it stands for
// Returns the new automaton; capture names are copied from `source`.
Automaton::Ref EpsilonNfaToNfa(Automaton::Ref source, bool(*epsilonChecker)(Transition*), Dictionary<State*, State*>& nfaStateMap)
{
	Automaton::Ref target=new Automaton;
	Dictionary<State*, State*> stateMap;		// source->target
	List<State*> epsilonStates;					// current epsilon closure
	List<Transition*> transitions;				// current non-epsilon transitions

	State* firstState=target->NewState();
	stateMap.Add(source->startState, firstState);
	nfaStateMap.Add(firstState, source->startState);
	target->startState=target->states[0].Obj();
	CopyFrom(target->captureNames, source->captureNames);

	// target->states grows while this loop runs, so it also works as the work list.
	for(vint i=0;i<target->states.Count();i++)
	{
		State* targetState=target->states[i].Obj();
		State* sourceState=nfaStateMap[targetState];
		if(sourceState->finalState)
		{
			targetState->finalState=true;
		}

		// Reset the per-state caches, then gather the closure of sourceState.
		epsilonStates.Clear();
		transitions.Clear();
		CollectEpsilon(targetState, sourceState, epsilonChecker, epsilonStates, transitions);

		// Copy each non-epsilon transition, creating target states on demand.
		for(vint j=0;j<transitions.Count();j++)
		{
			Transition* original=transitions[j];
			State* originalTarget=original->target;
			if(!stateMap.Keys().Contains(originalTarget))
			{
				State* created=target->NewState();
				stateMap.Add(originalTarget, created);
				nfaStateMap.Add(created, originalTarget);
			}

			Transition* copied=target->NewTransition(targetState, stateMap[originalTarget]);
			copied->capture=original->capture;
			copied->index=original->index;
			copied->range=original->range;
			copied->type=original->type;
		}
	}
	return target;
}
|
|
|
|
// Builds a DFA from the NFA `source`.
//   source      : the NFA to convert
//   dfaStateMap : receives, for every DFA state, the NFA states it represents
// Transitions leaving the NFA states of one DFA state are grouped by AreEqual;
// each group becomes one DFA transition whose target is the DFA state that
// represents exactly the set of NFA targets of that group.
// Returns the new automaton; capture names are copied from `source`.
Automaton::Ref NfaToDfa(Automaton::Ref source, Group<State*, State*>& dfaStateMap)
{
	Automaton::Ref target=new Automaton;
	Group<Transition*, Transition*> nfaTransitions;		// representative -> transitions with the same input
	List<Transition*> transitionClasses;				// representatives, kept in discovery order

	CopyFrom(target->captureNames, source->captureNames);
	State* startState=target->NewState();
	target->startState=startState;
	dfaStateMap.Add(startState, source->startState);
	// NOTE(review): startState->finalState is not copied from source->startState;
	// only DFA states created in the loop below receive the flag. Confirm that
	// this is intended for automata whose start state is final.

	SortedList<State*> transitionTargets;
	SortedList<State*> relativeStates;
	transitionTargets.SetLessMemoryMode(false);
	relativeStates.SetLessMemoryMode(false);

	// target->states grows while this loop runs, so it also works as the work list.
	for(vint stateIndex=0;stateIndex<target->states.Count();stateIndex++)
	{
		State* currentState=target->states[stateIndex].Obj();
		nfaTransitions.Clear();
		transitionClasses.Clear();

		// Step 1: group all transitions of the represented NFA states by input.
		const List<State*>& nfaStates=dfaStateMap[currentState];
		for(vint n=0;n<nfaStates.Count();n++)
		{
			State* nfaState=nfaStates.Get(n);
			for(vint t=0;t<nfaState->transitions.Count();t++)
			{
				Transition* nfaTransition=nfaState->transitions[t];

				// Look for an existing group with the same input.
				Transition* representative=0;
				for(vint c=0;c<nfaTransitions.Keys().Count();c++)
				{
					Transition* candidate=nfaTransitions.Keys()[c];
					if(AreEqual(candidate, nfaTransition))
					{
						representative=candidate;
						break;
					}
				}
				// No such group: this transition starts a new one.
				if(!representative)
				{
					representative=nfaTransition;
					transitionClasses.Add(representative);
				}
				nfaTransitions.Add(representative, nfaTransition);
			}
		}

		// Step 2: turn each group into one DFA transition.
		for(vint classIndex=0;classIndex<transitionClasses.Count();classIndex++)
		{
			Transition* transitionClass=transitionClasses[classIndex];
			const List<Transition*>& members=nfaTransitions[transitionClass];

			// Unique NFA targets of the group, held in a SortedList.
			transitionTargets.Clear();
			for(vint m=0;m<members.Count();m++)
			{
				State* nfaTarget=members.Get(m)->target;
				if(!transitionTargets.Contains(nfaTarget))
				{
					transitionTargets.Add(nfaTarget);
				}
			}

			// Search for an existing DFA state that represents the same set.
			State* dfaState=0;
			for(vint k=0;k<dfaStateMap.Count();k++)
			{
				CopyFrom(relativeStates, dfaStateMap.GetByIndex(k));
				if(relativeStates.Count()!=transitionTargets.Count())
				{
					continue;
				}

				bool sameSet=true;
				for(vint l=0;l<relativeStates.Count();l++)
				{
					if(relativeStates[l]!=transitionTargets[l])
					{
						sameSet=false;
						break;
					}
				}
				if(sameSet)
				{
					dfaState=dfaStateMap.Keys()[k];
					break;
				}
			}

			// None found: create it, final if any represented NFA state is final.
			if(!dfaState)
			{
				dfaState=target->NewState();
				for(vint k=0;k<transitionTargets.Count();k++)
				{
					State* represented=transitionTargets[k];
					dfaStateMap.Add(dfaState, represented);
					if(represented->finalState)
					{
						dfaState->finalState=true;
					}
				}
			}

			// Create the DFA transition, copying the input of the representative.
			Transition* created=target->NewTransition(currentState, dfaState);
			created->capture=transitionClass->capture;
			created->index=transitionClass->index;
			created->range=transitionClass->range;
			created->type=transitionClass->type;
		}
	}

	return target;
}
|
|
}
|
|
}
|
|
|
|
/***********************************************************************
|
|
.\REGEXDATA.CPP
|
|
***********************************************************************/
|
|
/***********************************************************************
|
|
Author: Zihan Chen (vczh)
|
|
Licensed under https://github.com/vczh-libraries/License
|
|
***********************************************************************/
|
|
|
|
|
|
namespace vl
|
|
{
|
|
namespace regex_internal
|
|
{
|
|
|
|
/***********************************************************************
|
|
CharRange
|
|
***********************************************************************/
|
|
|
|
// Default constructor: both bounds are set to L'\0'.
CharRange::CharRange()
	: begin(L'\0')
	, end(L'\0')
{
}
|
|
|
|
// Constructs a range from the given bounds. The bounds are stored as given;
// no ordering check is performed here.
CharRange::CharRange(wchar_t _begin, wchar_t _end)
	: begin(_begin)
	, end(_end)
{
}
|
|
|
|
// True when this range ends before `item` begins.
bool CharRange::operator<(CharRange item)const
{
	return item.begin>end;
}
|
|
|
|
// True when this range is either strictly before `item` or identical to it.
bool CharRange::operator<=(CharRange item)const
{
	if(*this<item)
	{
		return true;
	}
	return *this==item;
}
|
|
|
|
// True when this range begins after `item` ends.
bool CharRange::operator>(CharRange item)const
{
	return begin>item.end;
}
|
|
|
|
// True when this range is either strictly after `item` or identical to it.
bool CharRange::operator>=(CharRange item)const
{
	if(*this>item)
	{
		return true;
	}
	return *this==item;
}
|
|
|
|
// True when both bounds of the two ranges are identical.
bool CharRange::operator==(CharRange item)const
{
	if(begin!=item.begin)
	{
		return false;
	}
	return end==item.end;
}
|
|
|
|
// True when at least one bound differs between the two ranges.
bool CharRange::operator!=(CharRange item)const
{
	return !(begin==item.begin && end==item.end);
}
|
|
|
|
bool CharRange::operator<(wchar_t item)const
|
|
{
|
|
return end<item;
|
|
}
|
|
|
|
bool CharRange::operator<=(wchar_t item)const
|
|
{
|
|
return begin<=item;
|
|
}
|
|
|
|
bool CharRange::operator>(wchar_t item)const
|
|
{
|
|
return item<begin;
|
|
}
|
|
|
|
bool CharRange::operator>=(wchar_t item)const
|
|
{
|
|
return item<=end;
|
|
}
|
|
|
|
bool CharRange::operator==(wchar_t item)const
|
|
{
|
|
return begin<=item && item<=end;
|
|
}
|
|
|
|
bool CharRange::operator!=(wchar_t item)const
|
|
{
|
|
return item<begin || end<item;
|
|
}
|
|
|
|
}
|
|
}
|
|
|
|
/***********************************************************************
|
|
.\REGEXEXPRESSION.CPP
|
|
***********************************************************************/
|
|
/***********************************************************************
|
|
Author: Zihan Chen (vczh)
|
|
Licensed under https://github.com/vczh-libraries/License
|
|
***********************************************************************/
|
|
|
|
|
|
namespace vl
|
|
{
|
|
namespace regex_internal
|
|
{
|
|
|
|
/***********************************************************************
|
|
IsEqualAlgorithm
|
|
***********************************************************************/
|
|
|
|
// Structural comparison of two expression trees.
// Each Apply overload receives the visited node and the node to compare with;
// it returns true only when `target` has the same dynamic type as the visited
// node and all compared members (and sub expressions) are equal.
class IsEqualAlgorithm : public RegexExpressionAlgorithm<bool, Expression*>
{
public:
	// Equal when `reverse` and every range (in order) are the same.
	bool Apply(CharSetExpression* expression, Expression* target)
	{
		CharSetExpression* other=dynamic_cast<CharSetExpression*>(target);
		if(!other)return false;
		if(expression->reverse!=other->reverse)return false;
		if(expression->ranges.Count()!=other->ranges.Count())return false;
		for(vint i=0;i<expression->ranges.Count();i++)
		{
			if(expression->ranges[i]!=other->ranges[i])return false;
		}
		return true;
	}

	// Equal when min, max, preferLong and the loop body are the same.
	bool Apply(LoopExpression* expression, Expression* target)
	{
		LoopExpression* other=dynamic_cast<LoopExpression*>(target);
		return other!=0
			&& expression->min==other->min
			&& expression->max==other->max
			&& expression->preferLong==other->preferLong
			&& Invoke(expression->expression, other->expression.Obj());
	}

	// Equal when both the left and the right operands are equal.
	bool Apply(SequenceExpression* expression, Expression* target)
	{
		SequenceExpression* other=dynamic_cast<SequenceExpression*>(target);
		return other!=0
			&& Invoke(expression->left, other->left.Obj())
			&& Invoke(expression->right, other->right.Obj());
	}

	// Equal when both the left and the right operands are equal.
	bool Apply(AlternateExpression* expression, Expression* target)
	{
		AlternateExpression* other=dynamic_cast<AlternateExpression*>(target);
		return other!=0
			&& Invoke(expression->left, other->left.Obj())
			&& Invoke(expression->right, other->right.Obj());
	}

	// Equal whenever the target is a BeginExpression as well.
	bool Apply(BeginExpression* expression, Expression* target)
	{
		return dynamic_cast<BeginExpression*>(target)!=0;
	}

	// Equal whenever the target is an EndExpression as well.
	bool Apply(EndExpression* expression, Expression* target)
	{
		return dynamic_cast<EndExpression*>(target)!=0;
	}

	// Equal when the capture name and the captured expression are the same.
	bool Apply(CaptureExpression* expression, Expression* target)
	{
		CaptureExpression* other=dynamic_cast<CaptureExpression*>(target);
		if(!other)return false;
		if(expression->name!=other->name)return false;
		return Invoke(expression->expression, other->expression.Obj());
	}

	// Equal when name and index are the same.
	bool Apply(MatchExpression* expression, Expression* target)
	{
		MatchExpression* other=dynamic_cast<MatchExpression*>(target);
		if(!other)return false;
		if(expression->name!=other->name)return false;
		return !(expression->index!=other->index);
	}

	// Equal when the inner expressions are equal.
	bool Apply(PositiveExpression* expression, Expression* target)
	{
		PositiveExpression* other=dynamic_cast<PositiveExpression*>(target);
		return other!=0 && Invoke(expression->expression, other->expression.Obj());
	}

	// Equal when the inner expressions are equal.
	bool Apply(NegativeExpression* expression, Expression* target)
	{
		NegativeExpression* other=dynamic_cast<NegativeExpression*>(target);
		return other!=0 && Invoke(expression->expression, other->expression.Obj());
	}

	// Equal when the referenced names are the same.
	bool Apply(UsingExpression* expression, Expression* target)
	{
		UsingExpression* other=dynamic_cast<UsingExpression*>(target);
		if(!other)return false;
		return !(expression->name!=other->name);
	}
};
|
|
|
|
/***********************************************************************
|
|
HasNoExtensionAlgorithm
|
|
***********************************************************************/
|
|
|
|
// Answers whether an expression tree consists only of character sets,
// prefer-long loops, sequences and alternations. Every other node kind
// (begin, end, capture, match, positive, negative, using) yields false.
// The void* argument is unused; 0 is always passed on to sub expressions.
class HasNoExtensionAlgorithm : public RegexExpressionAlgorithm<bool, void*>
{
public:
	bool Apply(CharSetExpression* expression, void* target)
	{
		return true;
	}

	// A loop qualifies only when it prefers the long match and its body qualifies.
	bool Apply(LoopExpression* expression, void* target)
	{
		if(!expression->preferLong)
		{
			return false;
		}
		return Invoke(expression->expression, 0);
	}

	// Both operands have to qualify (the right one is not visited if the left fails).
	bool Apply(SequenceExpression* expression, void* target)
	{
		if(!Invoke(expression->left, 0))
		{
			return false;
		}
		return Invoke(expression->right, 0);
	}

	// Both branches have to qualify (the right one is not visited if the left fails).
	bool Apply(AlternateExpression* expression, void* target)
	{
		if(!Invoke(expression->left, 0))
		{
			return false;
		}
		return Invoke(expression->right, 0);
	}

	bool Apply(BeginExpression* expression, void* target)
	{
		return false;
	}

	bool Apply(EndExpression* expression, void* target)
	{
		return false;
	}

	bool Apply(CaptureExpression* expression, void* target)
	{
		return false;
	}

	bool Apply(MatchExpression* expression, void* target)
	{
		return false;
	}

	bool Apply(PositiveExpression* expression, void* target)
	{
		return false;
	}

	bool Apply(NegativeExpression* expression, void* target)
	{
		return false;
	}

	bool Apply(UsingExpression* expression, void* target)
	{
		return false;
	}
};
|
|
|
|
/***********************************************************************
|
|
CanTreatAsPureAlgorithm
|
|
***********************************************************************/
|
|
|
|
// Same checks as HasNoExtensionAlgorithm with one difference: a capture is
// accepted when the captured expression is accepted. Begin, end, match,
// positive, negative and using nodes still yield false.
// The void* argument is unused; 0 is always passed on to sub expressions.
class CanTreatAsPureAlgorithm : public RegexExpressionAlgorithm<bool, void*>
{
public:
	bool Apply(CharSetExpression* expression, void* target)
	{
		return true;
	}

	// A loop qualifies only when it prefers the long match and its body qualifies.
	bool Apply(LoopExpression* expression, void* target)
	{
		if(!expression->preferLong)
		{
			return false;
		}
		return Invoke(expression->expression, 0);
	}

	// Both operands have to qualify (the right one is not visited if the left fails).
	bool Apply(SequenceExpression* expression, void* target)
	{
		if(!Invoke(expression->left, 0))
		{
			return false;
		}
		return Invoke(expression->right, 0);
	}

	// Both branches have to qualify (the right one is not visited if the left fails).
	bool Apply(AlternateExpression* expression, void* target)
	{
		if(!Invoke(expression->left, 0))
		{
			return false;
		}
		return Invoke(expression->right, 0);
	}

	bool Apply(BeginExpression* expression, void* target)
	{
		return false;
	}

	bool Apply(EndExpression* expression, void* target)
	{
		return false;
	}

	// The capture itself is ignored; only its content decides.
	bool Apply(CaptureExpression* expression, void* target)
	{
		const bool contentIsPure=Invoke(expression->expression, 0);
		return contentIsPure;
	}

	bool Apply(MatchExpression* expression, void* target)
	{
		return false;
	}

	bool Apply(PositiveExpression* expression, void* target)
	{
		return false;
	}

	bool Apply(NegativeExpression* expression, void* target)
	{
		return false;
	}

	bool Apply(UsingExpression* expression, void* target)
	{
		return false;
	}
};
|
|
|
|
/***********************************************************************
|
|
CharSetNormalizationAlgorithm
|
|
***********************************************************************/
|
|
|
|
class NormalizedCharSet
|
|
{
|
|
public:
|
|
CharRange::List ranges;
|
|
};
|
|
|
|
// Base class of the char set algorithms. It walks the expression tree and
// leaves the handling of CharSetExpression to the derived class, which calls
// Loop() to have Process() invoked for every effective range of a char set.
class CharSetAlgorithm : public RegexExpressionAlgorithm<void, NormalizedCharSet*>
{
public:
	// Called once for every effective range of a char set.
	virtual void Process(CharSetExpression* expression, NormalizedCharSet* target, CharRange range)=0;

	// Feeds `ranges` to Process(). When expression->reverse is set, the gaps
	// between the ranges inside [1, 65535] are fed instead (the complement);
	// this relies on `ranges` being enumerated in ascending order.
	void Loop(CharSetExpression* expression, CharRange::List& ranges, NormalizedCharSet* target)
	{
		if(expression->reverse)
		{
			// The next uncovered character is tracked in vint instead of wchar_t:
			// where wchar_t is 16 bits wide, `range.end+1` would wrap around to 0
			// for a range ending at 65535, and the check against 65535 below
			// would then emit the whole range [0, 65535] by mistake.
			vint begin=1;
			for(vint i=0;i<ranges.Count();i++)
			{
				CharRange range=ranges[i];
				if(range.begin>begin)
				{
					Process(expression, target, CharRange(static_cast<wchar_t>(begin), range.begin-1));
				}
				begin=static_cast<vint>(range.end)+1;
			}
			if(begin<=65535)
			{
				Process(expression, target, CharRange(static_cast<wchar_t>(begin), 65535));
			}
		}
		else
		{
			for(vint i=0;i<ranges.Count();i++)
			{
				Process(expression, target, ranges[i]);
			}
		}
	}

	void Apply(LoopExpression* expression, NormalizedCharSet* target)
	{
		Invoke(expression->expression, target);
	}

	void Apply(SequenceExpression* expression, NormalizedCharSet* target)
	{
		Invoke(expression->left, target);
		Invoke(expression->right, target);
	}

	void Apply(AlternateExpression* expression, NormalizedCharSet* target)
	{
		Invoke(expression->left, target);
		Invoke(expression->right, target);
	}

	// No char set below this node.
	void Apply(BeginExpression* expression, NormalizedCharSet* target)
	{
	}

	// No char set below this node.
	void Apply(EndExpression* expression, NormalizedCharSet* target)
	{
	}

	void Apply(CaptureExpression* expression, NormalizedCharSet* target)
	{
		Invoke(expression->expression, target);
	}

	// No char set below this node.
	void Apply(MatchExpression* expression, NormalizedCharSet* target)
	{
	}

	void Apply(PositiveExpression* expression, NormalizedCharSet* target)
	{
		Invoke(expression->expression, target);
	}

	void Apply(NegativeExpression* expression, NormalizedCharSet* target)
	{
		Invoke(expression->expression, target);
	}

	// References are not followed here.
	void Apply(UsingExpression* expression, NormalizedCharSet* target)
	{
	}
};
|
|
|
|
class BuildNormalizedCharSetAlgorithm : public CharSetAlgorithm
|
|
{
|
|
public:
|
|
void Process(CharSetExpression* expression, NormalizedCharSet* target, CharRange range)
|
|
{
|
|
vint index=0;
|
|
while(index<target->ranges.Count())
|
|
{
|
|
CharRange current=target->ranges[index];
|
|
if(current<range || current>range)
|
|
{
|
|
index++;
|
|
}
|
|
else if(current.begin<range.begin)
|
|
{
|
|
// range : [ ?
|
|
// current : [ ]
|
|
target->ranges.RemoveAt(index);
|
|
target->ranges.Add(CharRange(current.begin, range.begin-1));
|
|
target->ranges.Add(CharRange(range.begin, current.end));
|
|
index++;
|
|
}
|
|
else if(current.begin>range.begin)
|
|
{
|
|
// range : [ ]
|
|
// current : [ ?
|
|
target->ranges.Add(CharRange(range.begin, current.begin-1));
|
|
range.begin=current.begin;
|
|
}
|
|
else if(current.end<range.end)
|
|
{
|
|
// range : [ ]
|
|
// current : [ ]
|
|
range.begin=current.end+1;
|
|
index++;
|
|
}
|
|
else if(current.end>range.end)
|
|
{
|
|
// range : [ ]
|
|
// current : [ ]
|
|
target->ranges.RemoveAt(index);
|
|
target->ranges.Add(range);
|
|
target->ranges.Add(CharRange(range.end+1, current.end));
|
|
return;
|
|
}
|
|
else
|
|
{
|
|
// range : [ ]
|
|
// current : [ ]
|
|
return;
|
|
}
|
|
}
|
|
target->ranges.Add(range);
|
|
}
|
|
|
|
void Apply(CharSetExpression* expression, NormalizedCharSet* target)
|
|
{
|
|
Loop(expression, expression->ranges, target);
|
|
}
|
|
};
|
|
|
|
class SetNormalizedCharSetAlgorithm : public CharSetAlgorithm
|
|
{
|
|
public:
|
|
void Process(CharSetExpression* expression, NormalizedCharSet* target, CharRange range)
|
|
{
|
|
for(vint j=0;j<target->ranges.Count();j++)
|
|
{
|
|
CharRange targetRange=target->ranges[j];
|
|
if(range.begin<=targetRange.begin && targetRange.end<=range.end)
|
|
{
|
|
expression->ranges.Add(targetRange);
|
|
}
|
|
}
|
|
}
|
|
|
|
void Apply(CharSetExpression* expression, NormalizedCharSet* target)
|
|
{
|
|
CharRange::List source;
|
|
CopyFrom(source, expression->ranges);
|
|
expression->ranges.Clear();
|
|
Loop(expression, source, target);
|
|
expression->reverse=false;
|
|
}
|
|
};
|
|
|
|
/***********************************************************************
|
|
MergeAlgorithm
|
|
***********************************************************************/
|
|
|
|
class MergeParameter
|
|
{
|
|
public:
|
|
Expression::Map definitions;
|
|
RegexExpression* regex;
|
|
};
|
|
|
|
// Produces a copy of an expression tree in which every UsingExpression has
// been replaced by the (merged) sub expression it refers to.
class MergeAlgorithm : public RegexExpressionAlgorithm<Expression::Ref, MergeParameter*>
{
public:
	Expression::Ref Apply(CharSetExpression* expression, MergeParameter* target)
	{
		Ptr<CharSetExpression> copy=new CharSetExpression;
		copy->reverse=expression->reverse;
		CopyFrom(copy->ranges, expression->ranges);
		return copy;
	}

	Expression::Ref Apply(LoopExpression* expression, MergeParameter* target)
	{
		Ptr<LoopExpression> copy=new LoopExpression;
		copy->min=expression->min;
		copy->max=expression->max;
		copy->preferLong=expression->preferLong;
		copy->expression=Invoke(expression->expression, target);
		return copy;
	}

	Expression::Ref Apply(SequenceExpression* expression, MergeParameter* target)
	{
		Ptr<SequenceExpression> copy=new SequenceExpression;
		copy->left=Invoke(expression->left, target);
		copy->right=Invoke(expression->right, target);
		return copy;
	}

	Expression::Ref Apply(AlternateExpression* expression, MergeParameter* target)
	{
		Ptr<AlternateExpression> copy=new AlternateExpression;
		copy->left=Invoke(expression->left, target);
		copy->right=Invoke(expression->right, target);
		return copy;
	}

	Expression::Ref Apply(BeginExpression* expression, MergeParameter* target)
	{
		return new BeginExpression;
	}

	Expression::Ref Apply(EndExpression* expression, MergeParameter* target)
	{
		return new EndExpression;
	}

	Expression::Ref Apply(CaptureExpression* expression, MergeParameter* target)
	{
		Ptr<CaptureExpression> copy=new CaptureExpression;
		copy->expression=Invoke(expression->expression, target);
		copy->name=expression->name;
		return copy;
	}

	Expression::Ref Apply(MatchExpression* expression, MergeParameter* target)
	{
		Ptr<MatchExpression> copy=new MatchExpression;
		copy->index=expression->index;
		copy->name=expression->name;
		return copy;
	}

	Expression::Ref Apply(PositiveExpression* expression, MergeParameter* target)
	{
		Ptr<PositiveExpression> copy=new PositiveExpression;
		copy->expression=Invoke(expression->expression, target);
		return copy;
	}

	Expression::Ref Apply(NegativeExpression* expression, MergeParameter* target)
	{
		Ptr<NegativeExpression> copy=new NegativeExpression;
		copy->expression=Invoke(expression->expression, target);
		return copy;
	}

	// Resolves a reference to a named sub expression.
	// - Name already resolved: the stored result is returned (not copied again).
	// - Name currently being resolved (null placeholder): the reference is part
	//   of a loop, an ArgumentException is thrown.
	// - Name known to the regex: it is merged, stored and returned.
	// - Unknown name: an ArgumentException is thrown.
	Expression::Ref Apply(UsingExpression* expression, MergeParameter* target)
	{
		if(target->definitions.Keys().Contains(expression->name))
		{
			Expression::Ref resolved=target->definitions[expression->name];
			if(resolved)
			{
				return resolved;
			}
			throw ArgumentException(L"Regular expression syntax error: Found reference loops in\""+expression->name+L"\".", L"vl::regex_internal::RegexExpression::Merge", L"");
		}

		if(!target->regex->definitions.Keys().Contains(expression->name))
		{
			throw ArgumentException(L"Regular expression syntax error: Cannot find sub expression reference\""+expression->name+L"\".", L"vl::regex_internal::RegexExpression::Merge", L"");
		}

		// Put a null placeholder in place so that a nested reference to the
		// same name is detected as a loop while the definition is merged.
		target->definitions.Add(expression->name, 0);
		Expression::Ref merged=Invoke(target->regex->definitions[expression->name], target);
		target->definitions.Set(expression->name, merged);
		return merged;
	}
};
|
|
|
|
/***********************************************************************
|
|
EpsilonNfaAlgorithm
|
|
***********************************************************************/
|
|
|
|
class EpsilonNfaInfo
|
|
{
|
|
public:
|
|
Automaton::Ref automaton;
|
|
};
|
|
|
|
class EpsilonNfa
|
|
{
|
|
public:
|
|
State* start;
|
|
State* end;
|
|
|
|
EpsilonNfa()
|
|
{
|
|
start=0;
|
|
end=0;
|
|
}
|
|
};
|
|
|
|
// Builds an epsilon-NFA fragment for each kind of expression. All states and
// transitions are created inside the Automaton passed as `target`; the order
// in which they are created is significant and must not be changed.
class EpsilonNfaAlgorithm : public RegexExpressionAlgorithm<EpsilonNfa, Automaton*>
{
private:
	// Creates a fragment made of two fresh, unconnected states (start first).
	static EpsilonNfa NewFragment(Automaton* target)
	{
		EpsilonNfa fragment;
		fragment.start=target->NewState();
		fragment.end=target->NewState();
		return fragment;
	}

	// Returns the index of `name` in target->captureNames, registering the name
	// if it is not present yet. An empty name yields -1.
	static vint ResolveCapture(const WString& name, Automaton* target)
	{
		if(!(name!=L""))
		{
			return -1;
		}
		vint capture=target->captureNames.IndexOf(name);
		if(capture==-1)
		{
			capture=target->captureNames.Count();
			target->captureNames.Add(name);
		}
		return capture;
	}

	// Wraps `body` into an optional step: entry --Nop--> exit skips the body,
	// entry --Epsilon--> body --Epsilon--> exit runs it. The order in which the
	// transitions are created depends on preferLong.
	static void LinkOptional(Automaton* target, State* entry, State* exit, const EpsilonNfa& body, bool preferLong)
	{
		if(preferLong)
		{
			target->NewEpsilon(entry, body.start);
			target->NewEpsilon(body.end, exit);
			target->NewNop(entry, exit);
		}
		else
		{
			target->NewNop(entry, exit);
			target->NewEpsilon(entry, body.start);
			target->NewEpsilon(body.end, exit);
		}
	}

public:
	// Appends `b` to `a` with an epsilon transition. When `a` is still empty
	// (null start), `b` is returned unchanged.
	EpsilonNfa Connect(EpsilonNfa a, EpsilonNfa b, Automaton* target)
	{
		if(!a.start)
		{
			return b;
		}
		target->NewEpsilon(a.end, b.start);
		a.end=b.end;
		return a;
	}

	// Two states connected by one Chars transition per range.
	EpsilonNfa Apply(CharSetExpression* expression, Automaton* target)
	{
		EpsilonNfa nfa=NewFragment(target);
		for(vint i=0;i<expression->ranges.Count();i++)
		{
			target->NewChars(nfa.start, nfa.end, expression->ranges[i]);
		}
		return nfa;
	}

	// `min` mandatory copies of the body, followed by either an unbounded loop
	// (max==-1) or max-min optional copies.
	EpsilonNfa Apply(LoopExpression* expression, Automaton* target)
	{
		EpsilonNfa head;
		for(vint i=0;i<expression->min;i++)
		{
			EpsilonNfa body=Invoke(expression->expression, target);
			head=Connect(head, body, target);
		}

		if(expression->max==-1)
		{
			EpsilonNfa body=Invoke(expression->expression, target);
			if(!head.start)
			{
				head.start=head.end=target->NewState();
			}
			State* loopBegin=head.end;
			State* loopEnd=target->NewState();
			// The body loops back to loopBegin; the Nop leaves the loop.
			if(expression->preferLong)
			{
				target->NewEpsilon(loopBegin, body.start);
				target->NewEpsilon(body.end, loopBegin);
				target->NewNop(loopBegin, loopEnd);
			}
			else
			{
				target->NewNop(loopBegin, loopEnd);
				target->NewEpsilon(loopBegin, body.start);
				target->NewEpsilon(body.end, loopBegin);
			}
			head.end=loopEnd;
		}
		else if(expression->max>expression->min)
		{
			for(vint i=expression->min;i<expression->max;i++)
			{
				EpsilonNfa body=Invoke(expression->expression, target);
				EpsilonNfa optional=NewFragment(target);
				LinkOptional(target, optional.start, optional.end, body, expression->preferLong);
				head=Connect(head, optional, target);
			}
		}

		// A loop that builds no copy at all (min==0 and no optional part, such
		// as a repeat count of zero) would otherwise return a fragment with null
		// states, which callers dereference. Represent it as a single state,
		// i.e. a fragment that matches the empty string.
		if(!head.start)
		{
			head.start=head.end=target->NewState();
		}
		return head;
	}

	EpsilonNfa Apply(SequenceExpression* expression, Automaton* target)
	{
		EpsilonNfa a=Invoke(expression->left, target);
		EpsilonNfa b=Invoke(expression->right, target);
		return Connect(a, b, target);
	}

	// New start/end states with epsilon transitions around both branches;
	// the transitions of the left branch are created first.
	EpsilonNfa Apply(AlternateExpression* expression, Automaton* target)
	{
		EpsilonNfa result=NewFragment(target);
		EpsilonNfa a=Invoke(expression->left, target);
		EpsilonNfa b=Invoke(expression->right, target);
		target->NewEpsilon(result.start, a.start);
		target->NewEpsilon(a.end, result.end);
		target->NewEpsilon(result.start, b.start);
		target->NewEpsilon(b.end, result.end);
		return result;
	}

	EpsilonNfa Apply(BeginExpression* expression, Automaton* target)
	{
		EpsilonNfa result=NewFragment(target);
		target->NewBeginString(result.start, result.end);
		return result;
	}

	EpsilonNfa Apply(EndExpression* expression, Automaton* target)
	{
		EpsilonNfa result=NewFragment(target);
		target->NewEndString(result.start, result.end);
		return result;
	}

	// start --Capture--> body --End--> end. The capture index is resolved
	// before the body is built, so the outer name is registered first.
	EpsilonNfa Apply(CaptureExpression* expression, Automaton* target)
	{
		EpsilonNfa result=NewFragment(target);
		const vint capture=ResolveCapture(expression->name, target);

		EpsilonNfa body=Invoke(expression->expression, target);
		target->NewCapture(result.start, body.start, capture);
		target->NewEnd(body.end, result.end);
		return result;
	}

	// Two states connected by a Match transition for the capture and index.
	EpsilonNfa Apply(MatchExpression* expression, Automaton* target)
	{
		const vint capture=ResolveCapture(expression->name, target);
		EpsilonNfa result=NewFragment(target);
		target->NewMatch(result.start, result.end, capture, expression->index);
		return result;
	}

	// start --Positive--> body --End--> end.
	EpsilonNfa Apply(PositiveExpression* expression, Automaton* target)
	{
		EpsilonNfa result=NewFragment(target);
		EpsilonNfa body=Invoke(expression->expression, target);
		target->NewPositive(result.start, body.start);
		target->NewEnd(body.end, result.end);
		return result;
	}

	// start --Negative--> body --End--> end, plus start --NegativeFail--> end.
	EpsilonNfa Apply(NegativeExpression* expression, Automaton* target)
	{
		EpsilonNfa result=NewFragment(target);
		EpsilonNfa body=Invoke(expression->expression, target);
		target->NewNegative(result.start, body.start);
		target->NewEnd(body.end, result.end);
		target->NewNegativeFail(result.start, result.end);
		return result;
	}

	// References must have been resolved (see MergeAlgorithm) before a state
	// machine is generated; reaching this overload is reported as an error.
	EpsilonNfa Apply(UsingExpression* expression, Automaton* target)
	{
		CHECK_FAIL(L"RegexExpression::GenerateEpsilonNfa()#UsingExpression cannot create state machine.");
	}
};
|
|
|
|
/***********************************************************************
|
|
Expression
|
|
***********************************************************************/
|
|
|
|
// Compares this expression tree with `expression` using IsEqualAlgorithm.
bool Expression::IsEqual(vl::regex_internal::Expression *expression)
{
	IsEqualAlgorithm comparer;
	return comparer.Invoke(this, expression);
}
|
|
|
|
bool Expression::HasNoExtension()
|
|
{
|
|
return HasNoExtensionAlgorithm().Invoke(this, 0);
|
|
}
|
|
|
|
bool Expression::CanTreatAsPure()
|
|
{
|
|
return CanTreatAsPureAlgorithm().Invoke(this, 0);
|
|
}
|
|
|
|
// Collects the non-overlapping range pieces of this expression tree, rewrites
// every char set in the tree in terms of those pieces, and copies the pieces
// to `subsets`.
void Expression::NormalizeCharSet(CharRange::List& subsets)
{
	NormalizedCharSet normalized;

	BuildNormalizedCharSetAlgorithm builder;
	builder.Invoke(this, &normalized);

	SetNormalizedCharSetAlgorithm rewriter;
	rewriter.Invoke(this, &normalized);

	CopyFrom(subsets, normalized.ranges);
}
|
|
|
|
// Merges the ranges of this expression tree into `subsets`: the pieces already
// in `subsets` are the starting point, and the refined result is copied back.
// The expression tree itself is not rewritten.
void Expression::CollectCharSet(CharRange::List& subsets)
{
	NormalizedCharSet normalized;
	CopyFrom(normalized.ranges, subsets);

	BuildNormalizedCharSetAlgorithm builder;
	builder.Invoke(this, &normalized);

	CopyFrom(subsets, normalized.ranges);
}
|
|
|
|
// Rewrites every char set of this expression tree in terms of the pieces
// given in `subsets`. `subsets` itself is only read.
void Expression::ApplyCharSet(CharRange::List& subsets)
{
	NormalizedCharSet normalized;
	CopyFrom(normalized.ranges, subsets);

	SetNormalizedCharSetAlgorithm rewriter;
	rewriter.Invoke(this, &normalized);
}
|
|
|
|
// Builds the epsilon-NFA of this expression tree in a new automaton. The
// entry state of the generated fragment becomes the start state and its exit
// state is marked as final.
Automaton::Ref Expression::GenerateEpsilonNfa()
{
	Automaton::Ref automaton=new Automaton;

	EpsilonNfaAlgorithm builder;
	EpsilonNfa fragment=builder.Invoke(this, automaton.Obj());

	fragment.end->finalState=true;
	automaton->startState=fragment.start;
	return automaton;
}
|
|
|
|
/***********************************************************************
|
|
CharSetExpression
|
|
***********************************************************************/
|
|
|
|
// Adds `range` to the char set unless it overlaps a range that is already
// stored. Reversed bounds are swapped first.
// Returns true when the range has been added, false when it conflicts.
bool CharSetExpression::AddRangeWithConflict(CharRange range)
{
	if(range.begin>range.end)
	{
		const wchar_t lower=range.end;
		range.end=range.begin;
		range.begin=lower;
	}

	for(vint i=0;i<ranges.Count();i++)
	{
		// Two ranges are disjoint when one lies completely before the other.
		const bool disjoint=range<ranges[i] || range>ranges[i];
		if(!disjoint)
		{
			return false;
		}
	}

	ranges.Add(range);
	return true;
}
|
|
|
|
/***********************************************************************
|
|
RegexExpression
|
|
***********************************************************************/
|
|
|
|
// Returns a copy of the main expression in which all references to named sub
// expressions have been resolved by MergeAlgorithm, using the definitions of
// this RegexExpression.
Expression::Ref RegexExpression::Merge()
{
	MergeParameter parameter;
	parameter.regex=this;

	MergeAlgorithm merger;
	return merger.Invoke(expression, &parameter);
}
|
|
|
|
/***********************************************************************
|
|
Expression::Apply
|
|
***********************************************************************/
|
|
|
|
void CharSetExpression::Apply(IRegexExpressionAlgorithm& algorithm)
|
|
{
|
|
algorithm.Visit(this);
|
|
}
|
|
|
|
void LoopExpression::Apply(IRegexExpressionAlgorithm& algorithm)
|
|
{
|
|
algorithm.Visit(this);
|
|
}
|
|
|
|
void SequenceExpression::Apply(IRegexExpressionAlgorithm& algorithm)
|
|
{
|
|
algorithm.Visit(this);
|
|
}
|
|
|
|
void AlternateExpression::Apply(IRegexExpressionAlgorithm& algorithm)
|
|
{
|
|
algorithm.Visit(this);
|
|
}
|
|
|
|
void BeginExpression::Apply(IRegexExpressionAlgorithm& algorithm)
|
|
{
|
|
algorithm.Visit(this);
|
|
}
|
|
|
|
void EndExpression::Apply(IRegexExpressionAlgorithm& algorithm)
|
|
{
|
|
algorithm.Visit(this);
|
|
}
|
|
|
|
void CaptureExpression::Apply(IRegexExpressionAlgorithm& algorithm)
|
|
{
|
|
algorithm.Visit(this);
|
|
}
|
|
|
|
void MatchExpression::Apply(IRegexExpressionAlgorithm& algorithm)
|
|
{
|
|
algorithm.Visit(this);
|
|
}
|
|
|
|
void PositiveExpression::Apply(IRegexExpressionAlgorithm& algorithm)
|
|
{
|
|
algorithm.Visit(this);
|
|
}
|
|
|
|
void NegativeExpression::Apply(IRegexExpressionAlgorithm& algorithm)
|
|
{
|
|
algorithm.Visit(this);
|
|
}
|
|
|
|
void UsingExpression::Apply(IRegexExpressionAlgorithm& algorithm)
|
|
{
|
|
algorithm.Visit(this);
|
|
}
|
|
}
|
|
}
|
|
|
|
/***********************************************************************
|
|
.\REGEXPARSER.CPP
|
|
***********************************************************************/
|
|
/***********************************************************************
|
|
Author: Zihan Chen (vczh)
|
|
Licensed under https://github.com/vczh-libraries/License
|
|
***********************************************************************/
|
|
|
|
|
|
namespace vl
|
|
{
|
|
namespace regex_internal
|
|
{
|
|
|
|
/***********************************************************************
|
|
Helper Functions
|
|
***********************************************************************/
|
|
|
|
// Consumes one character from input if it equals c.
// input: reading cursor, advanced by one only on a match.
// Returns true when the character was consumed.
bool IsChar(const wchar_t*& input, wchar_t c)
{
	const bool matched=(*input==c);
	if(matched)
	{
		++input;
	}
	return matched;
}
|
|
|
|
// Consumes one character from input if it is one of the characters in chars.
// input: reading cursor, advanced by one only on a match.
// chars: null-terminated set of accepted characters.
// c:     receives the consumed character on a match; untouched otherwise.
// Returns true when a character was consumed.
bool IsChars(const wchar_t*& input, const wchar_t* chars, wchar_t& c)
{
	// wcschr treats the terminating L'\0' as part of the searched string, so
	// without this guard the end of the input would "match" any set and the
	// cursor would be moved past the terminator.
	if(*input==L'\0')
	{
		return false;
	}
	if(!::wcschr(chars, *input))
	{
		return false;
	}
	c=*input++;
	return true;
}
|
|
|
|
// Consumes the literal str from input if input starts with it.
// input: reading cursor, advanced by wcslen(str) only on a match.
// Returns true when the prefix was consumed.
bool IsStr(const wchar_t*& input, const wchar_t* str)
{
	const size_t prefixLength=wcslen(str);
	if(wcsncmp(input, str, prefixLength)!=0)
	{
		return false;
	}
	input+=prefixLength;
	return true;
}
|
|
|
|
bool IsChars(const wchar_t*& input, const wchar_t* chars)
|
|
{
|
|
wchar_t c;
|
|
return IsChars(input, chars, c);
|
|
}
|
|
|
|
// Reads a run of decimal digits from input.
// input:  reading cursor, advanced past every digit that was read.
// number: always reset to 0 first, then receives the decoded value.
// Returns true when at least one digit was consumed.
// NOTE(review): no overflow check is performed on very long digit runs.
bool IsPositiveInteger(const wchar_t*& input, vint& number)
{
	const wchar_t* const first=input;
	number=0;
	for(;L'0'<=*input && *input<=L'9';input++)
	{
		number=number*10+(*input)-L'0';
	}
	return input!=first;
}
|
|
|
|
bool IsName(const wchar_t*& input, WString& name)
|
|
{
|
|
const wchar_t* read=input;
|
|
if((L'A'<=*read && *read<=L'Z') || (L'a'<=*read && *read<=L'z') || *read==L'_')
|
|
{
|
|
read++;
|
|
while((L'A'<=*read && *read<=L'Z') || (L'a'<=*read && *read<=L'z') || (L'0'<=*read && *read<=L'9') || *read==L'_')
|
|
{
|
|
read++;
|
|
}
|
|
}
|
|
if(input==read)
|
|
{
|
|
return false;
|
|
}
|
|
else
|
|
{
|
|
name=WString::CopyFrom(input, vint(read-input));
|
|
input=read;
|
|
return true;
|
|
}
|
|
}
|
|
|
|
// Parses an optional loop suffix: "+", "*", "?", "{n}", "{n,}" or "{n,m}",
// each optionally followed by "?" to select the short (non-greedy) form.
// input: reading cursor, advanced past whatever was consumed.
// Returns a LoopExpression with min/max/preferLong filled in (max == -1 means
// unbounded), or null when the input does not start with a loop suffix.
// Throws ArgumentException when "{" is not followed by a well-formed counter.
Ptr<LoopExpression> ParseLoop(const wchar_t*& input)
{
	vint lower=0;
	vint upper=0;

	if(!*input)
	{
		return 0;
	}

	if(IsChar(input, L'+'))
	{
		lower=1;
		upper=-1;
	}
	else if(IsChar(input, L'*'))
	{
		lower=0;
		upper=-1;
	}
	else if(IsChar(input, L'?'))
	{
		lower=0;
		upper=1;
	}
	else if(IsChar(input, L'{'))
	{
		bool wellFormed=false;
		if(IsPositiveInteger(input, lower))
		{
			if(!IsChar(input, L','))
			{
				// "{n}": exactly n repetitions.
				upper=lower;
			}
			else if(!IsPositiveInteger(input, upper))
			{
				// "{n,}": no upper bound.
				upper=-1;
			}
			wellFormed=IsChar(input, L'}');
		}
		if(!wellFormed)
		{
			throw ArgumentException(L"Regular expression syntax error: Illegal loop expression.", L"vl::regex_internal::ParseLoop", L"input");
		}
	}
	else
	{
		return 0;
	}

	LoopExpression* loop=new LoopExpression;
	loop->min=lower;
	loop->max=upper;
	// A trailing "?" turns the loop into its short-match form.
	loop->preferLong=!IsChar(input, L'?');
	return loop;
}
|
|
|
|
// Parses one character-level unit of a regular expression:
//   "^" / "$"            -> BeginExpression / EndExpression
//   "\x" or "/x"         -> escaped character or predefined class (. r n t s S d D l L w W, or an escaped metacharacter)
//   "[...]" / "[^...]"   -> explicit character set
//   any other character  -> a single-character set
// input: reading cursor, advanced past whatever was consumed.
// Returns null at the end of input, or when the next character is one of
// "()+*?{}|" (the cursor is left on that character for the caller).
// Throws ArgumentException on an illegal escape or a malformed character set.
Ptr<Expression> ParseCharSet(const wchar_t*& input)
{
	if(!*input)
	{
		return 0;
	}
	else if(IsChar(input, L'^'))
	{
		return new BeginExpression;
	}
	else if(IsChar(input, L'$'))
	{
		return new EndExpression;
	}
	else if(IsChar(input, L'\\') || IsChar(input, L'/'))
	{
		Ptr<CharSetExpression> expression=new CharSetExpression;
		expression->reverse=false;
		switch(*input)
		{
		case L'.':
			expression->ranges.Add(CharRange(1, 65535));
			break;
		case L'r':
			expression->ranges.Add(CharRange(L'\r', L'\r'));
			break;
		case L'n':
			expression->ranges.Add(CharRange(L'\n', L'\n'));
			break;
		case L't':
			expression->ranges.Add(CharRange(L'\t', L'\t'));
			break;
		case L'\\':case L'/':case L'(':case L')':case L'+':case L'*':case L'?':case L'|':
		case L'{':case L'}':case L'[':case L']':case L'<':case L'>':
		case L'^':case L'$':case L'!':case L'=':
			expression->ranges.Add(CharRange(*input, *input));
			break;
		case L'S':
			expression->reverse=true;
			// fall through: \S is the complement of \s
		case L's':
			expression->ranges.Add(CharRange(L' ', L' '));
			expression->ranges.Add(CharRange(L'\r', L'\r'));
			expression->ranges.Add(CharRange(L'\n', L'\n'));
			expression->ranges.Add(CharRange(L'\t', L'\t'));
			break;
		case L'D':
			expression->reverse=true;
			// fall through: \D is the complement of \d
		case L'd':
			expression->ranges.Add(CharRange(L'0', L'9'));
			break;
		case L'L':
			expression->reverse=true;
			// fall through: \L is the complement of \l
		case L'l':
			expression->ranges.Add(CharRange(L'_', L'_'));
			expression->ranges.Add(CharRange(L'A', L'Z'));
			expression->ranges.Add(CharRange(L'a', L'z'));
			break;
		case L'W':
			expression->reverse=true;
			// fall through: \W is the complement of \w
		case L'w':
			expression->ranges.Add(CharRange(L'_', L'_'));
			expression->ranges.Add(CharRange(L'0', L'9'));
			expression->ranges.Add(CharRange(L'A', L'Z'));
			expression->ranges.Add(CharRange(L'a', L'z'));
			break;
		default:
			throw ArgumentException(L"Regular expression syntax error: Illegal character escaping.", L"vl::regex_internal::ParseCharSet", L"input");
		}
		input++;
		return expression;
	}
	else if(IsChar(input, L'['))
	{
		Ptr<CharSetExpression> expression=new CharSetExpression;
		expression->reverse=IsChar(input, L'^');

		// midState is true after the first end of a range has been read into
		// "a" and before the second end has been read into "b".
		bool midState=false;
		wchar_t a=L'\0';
		wchar_t b=L'\0';
		while(true)
		{
			// Step 1: read one (possibly escaped) character into a or b.
			if(IsChar(input, L'\\') || IsChar(input, L'/'))
			{
				wchar_t c=L'\0';
				switch(*input)
				{
				case L'r':
					c=L'\r';
					break;
				case L'n':
					c=L'\n';
					break;
				case L't':
					c=L'\t';
					break;
				case L'-':case L'[':case L']':case L'\\':case L'/':case L'^':case L'$':
					c=*input;
					break;
				default:
					throw ArgumentException(L"Regular expression syntax error: Illegal character escaping, only \"rnt-[]\\/^$\" are legal escaped characters in [].", L"vl::regex_internal::ParseCharSet", L"input");
				}
				input++;
				if(midState)
				{
					b=c;
				}
				else
				{
					a=c;
				}
				midState=!midState;
			}
			else if(IsChars(input, L"-]"))
			{
				// An unescaped "-" or "]" is not allowed where a character is expected.
				goto THROW_EXCEPTION;
			}
			else if(*input)
			{
				if(midState)
				{
					b=*input++;
				}
				else
				{
					a=*input++;
				}
				midState=!midState;
			}
			else
			{
				// The input ended before the closing "]".
				goto THROW_EXCEPTION;
			}

			// Step 2: decide what follows the character that was just read.
			if(IsChar(input, L']'))
			{
				if(midState)
				{
					// Only one end was read: it is a single character.
					b=a;
				}
				if(!expression->AddRangeWithConflict(CharRange(a, b)))
				{
					goto THROW_EXCEPTION;
				}
				break;
			}
			else if(IsChar(input, L'-'))
			{
				// "-" is only legal right after the first end of a range.
				if(!midState)
				{
					goto THROW_EXCEPTION;
				}
			}
			else
			{
				if(midState)
				{
					b=a;
				}
				if(expression->AddRangeWithConflict(CharRange(a, b)))
				{
					midState=false;
				}
				else
				{
					goto THROW_EXCEPTION;
				}
			}
		}
		return expression;
	THROW_EXCEPTION:
		throw ArgumentException(L"Regular expression syntax error: Illegal character set definition.", L"vl::regex_internal::ParseCharSet", L"input");
	}
	else if(IsChars(input, L"()+*?{}|"))
	{
		// Not a character unit: put the character back for the caller.
		input--;
		return 0;
	}
	else
	{
		// Hold the node in a Ptr (as the other branches do) so it is released
		// if adding the range throws.
		Ptr<CharSetExpression> expression=new CharSetExpression;
		expression->reverse=false;
		expression->ranges.Add(CharRange(*input, *input));
		input++;
		return expression;
	}
}
|
|
|
|
// Parses a parenthesized construct:
//   (=expr)        positive lookahead        (!expr)   negative lookahead
//   (<&name>)      use a named sub expression
//   (<$name>) (<$name;index>) (<$index>)     match a previous capture
//   (<name>expr)   named capture             (?expr)   anonymous capture
//   (expr)         plain grouping
// input: reading cursor, advanced past whatever was consumed.
// Returns the parsed expression, or null when input does not start with "(".
// Throws ArgumentException when a required ")", ">", identifier or number is missing.
Ptr<Expression> ParseFunction(const wchar_t*& input)
{
	// Consume-or-throw helpers shared by all the forms below.
	auto expectRightBracket=[&input]()
	{
		if(!IsChar(input, L')'))
		{
			throw ArgumentException(L"Regular expression syntax error: \")\" expected.", L"vl::regex_internal::ParseFunction", L"input");
		}
	};
	auto expectGreater=[&input]()
	{
		if(!IsChar(input, L'>'))
		{
			throw ArgumentException(L"Regular expression syntax error: \">\" expected.", L"vl::regex_internal::ParseFunction", L"input");
		}
	};
	auto expectName=[&input](WString& name)
	{
		if(!IsName(input, name))
		{
			throw ArgumentException(L"Regular expression syntax error: Identifier expected.", L"vl::regex_internal::ParseFunction", L"input");
		}
	};
	auto expectNumber=[&input](vint& number)
	{
		if(!IsPositiveInteger(input, number))
		{
			throw ArgumentException(L"Regular expression syntax error: Number expected.", L"vl::regex_internal::ParseFunction", L"input");
		}
	};

	// The longer prefixes must be tested before the shorter ones they contain.
	if(IsStr(input, L"(="))
	{
		Ptr<Expression> body=ParseExpression(input);
		expectRightBracket();
		PositiveExpression* lookahead=new PositiveExpression;
		lookahead->expression=body;
		return lookahead;
	}
	if(IsStr(input, L"(!"))
	{
		Ptr<Expression> body=ParseExpression(input);
		expectRightBracket();
		NegativeExpression* lookahead=new NegativeExpression;
		lookahead->expression=body;
		return lookahead;
	}
	if(IsStr(input, L"(<&"))
	{
		WString name;
		expectName(name);
		expectGreater();
		expectRightBracket();
		UsingExpression* reference=new UsingExpression;
		reference->name=name;
		return reference;
	}
	if(IsStr(input, L"(<$"))
	{
		WString name;
		vint index=-1;
		if(IsName(input, name))
		{
			// An index is optional after a name, but ";" must be followed by one.
			if(IsChar(input, L';'))
			{
				expectNumber(index);
			}
		}
		else
		{
			// Without a name, an index is mandatory.
			expectNumber(index);
		}
		expectGreater();
		expectRightBracket();
		MatchExpression* match=new MatchExpression;
		match->name=name;
		match->index=index;
		return match;
	}
	if(IsStr(input, L"(<"))
	{
		WString name;
		expectName(name);
		expectGreater();
		Ptr<Expression> body=ParseExpression(input);
		expectRightBracket();
		CaptureExpression* capture=new CaptureExpression;
		capture->name=name;
		capture->expression=body;
		return capture;
	}
	if(IsStr(input, L"(?"))
	{
		Ptr<Expression> body=ParseExpression(input);
		expectRightBracket();
		CaptureExpression* capture=new CaptureExpression;
		capture->expression=body;
		return capture;
	}
	if(IsChar(input, L'('))
	{
		Ptr<Expression> body=ParseExpression(input);
		expectRightBracket();
		return body;
	}
	return 0;
}
|
|
|
|
// Parses one unit: a character set or a parenthesized construct, followed by
// any number of loop suffixes, each wrapping everything parsed before it.
// Returns null when neither ParseCharSet nor ParseFunction recognizes the input.
Ptr<Expression> ParseUnit(const wchar_t*& input)
{
	Ptr<Expression> unit=ParseCharSet(input);
	if(!unit)
	{
		unit=ParseFunction(input);
	}
	if(!unit)
	{
		return 0;
	}

	for(Ptr<LoopExpression> loop=ParseLoop(input);loop;loop=ParseLoop(input))
	{
		loop->expression=unit;
		unit=loop;
	}
	return unit;
}
|
|
|
|
// Parses consecutive units and chains them into left-nested SequenceExpression
// nodes. Returns whatever the first ParseUnit call returned when no further
// unit follows (this may be null).
Ptr<Expression> ParseJoin(const wchar_t*& input)
{
	Ptr<Expression> joined=ParseUnit(input);
	for(Ptr<Expression> next=ParseUnit(input);next;next=ParseUnit(input))
	{
		SequenceExpression* pair=new SequenceExpression;
		pair->left=joined;
		pair->right=next;
		joined=pair;
	}
	return joined;
}
|
|
|
|
// Parses sequences separated by "|" and chains them into left-nested
// AlternateExpression nodes.
// Throws ArgumentException when a "|" is not followed by an expression.
Ptr<Expression> ParseAlt(const wchar_t*& input)
{
	Ptr<Expression> alternatives=ParseJoin(input);
	while(IsChar(input, L'|'))
	{
		Ptr<Expression> branch=ParseJoin(input);
		if(!branch)
		{
			throw ArgumentException(L"Regular expression syntax error: Expression expected.", L"vl::regex_internal::ParseAlt", L"input");
		}

		AlternateExpression* choice=new AlternateExpression;
		choice->left=alternatives;
		choice->right=branch;
		alternatives=choice;
	}
	return alternatives;
}
|
|
|
|
// Entry point for parsing a (sub) expression; the top-level rule is the
// alternation rule handled by ParseAlt.
Ptr<Expression> ParseExpression(const wchar_t*& input)
{
	Ptr<Expression> parsed=ParseAlt(input);
	return parsed;
}
|
|
|
|
// Parses a complete regular expression.
// The text may start with any number of named sub expression definitions of
// the form "(<#name>expr)", followed by the main expression.
// code: the regular expression source text.
// Returns a RegexExpression holding the definitions and the main expression.
// Throws ParsingException (built from the message of the underlying
// ArgumentException, the source text and the offset reached by the parser)
// on any syntax error.
RegexExpression::Ref ParseRegexExpression(const WString& code)
{
	RegexExpression::Ref regex=new RegexExpression;
	const wchar_t* start=code.Buffer();
	const wchar_t* input=start;
	try
	{
		while(IsStr(input, L"(<#"))
		{
			WString name;
			if(!IsName(input, name))
			{
				throw ArgumentException(L"Regular expression syntax error: Identifier expected.", L"vl::regex_internal::ParseRegexExpression", L"code");
			}
			if(!IsChar(input, L'>'))
			{
				throw ArgumentException(L"Regular expression syntax error: \">\" expected.", L"vl::regex_internal::ParseRegexExpression", L"code");
			}
			Ptr<Expression> sub=ParseExpression(input);
			if(!IsChar(input, L')'))
			{
				throw ArgumentException(L"Regular expression syntax error: \")\" expected.", L"vl::regex_internal::ParseRegexExpression", L"code");
			}
			if(regex->definitions.Keys().Contains(name))
			{
				throw ArgumentException(L"Regular expression syntax error: Found duplicated sub expression name: \""+name+L"\". ", L"vl::regex_internal::ParseRegexExpression", L"code");
			}
			else
			{
				regex->definitions.Add(name, sub);
			}
		}

		regex->expression=ParseExpression(input);
		if(!regex->expression)
		{
			throw ArgumentException(L"Regular expression syntax error: Expression expected.", L"vl::regex_internal::ParseRegexExpression", L"code");
		}
		if(*input)
		{
			// The parser stopped before the end of the text.
			throw ArgumentException(L"Regular expression syntax error: Found unnecessary tokens.", L"vl::regex_internal::ParseRegexExpression", L"code");
		}
		return regex;
	}
	catch(const ArgumentException& e)
	{
		// Re-throw with the source text and the position the parser reached.
		throw ParsingException(e.Message(), code, input-start);
	}
}
|
|
|
|
// Converts literal text into a regular expression that matches exactly that text.
// CR, LF and TAB become "\r", "\n" and "\t"; every metacharacter is prefixed
// with a backslash; all other characters are copied unchanged.
WString EscapeTextForRegex(const WString& literalString)
{
	WString escaped;
	const vint length=literalString.Length();
	for(vint position=0;position<length;position++)
	{
		const wchar_t ch=literalString[position];
		switch(ch)
		{
		case L'\r':
			escaped+=L"\\r";
			break;
		case L'\n':
			escaped+=L"\\n";
			break;
		case L'\t':
			escaped+=L"\\t";
			break;
		case L'\\':case L'/':case L'(':case L')':case L'+':case L'*':case L'?':case L'|':
		case L'{':case L'}':case L'[':case L']':case L'<':case L'>':
		case L'^':case L'$':case L'!':case L'=':
			escaped+=WString(L"\\")+WString::FromChar(ch);
			break;
		default:
			escaped+=WString::FromChar(ch);
		}
	}
	return escaped;
}
|
|
|
|
// Reverses EscapeTextForRegex: "\x" or "/x" is replaced by the character it
// denotes ("r", "n", "t" become CR, LF, TAB; anything else stands for itself).
// A lone escape character at the very end of the text is copied as is.
WString UnescapeTextForRegex(const WString& escapedText)
{
	WString plain;
	const vint length=escapedText.Length();
	for(vint position=0;position<length;position++)
	{
		wchar_t ch=escapedText[position];
		const bool startsEscape=(ch==L'\\' || ch==L'/') && position<length-1;
		if(startsEscape)
		{
			// Step onto the escaped character and translate it.
			ch=escapedText[++position];
			if(ch==L'r')
			{
				ch=L'\r';
			}
			else if(ch==L'n')
			{
				ch=L'\n';
			}
			else if(ch==L't')
			{
				ch=L'\t';
			}
		}
		plain+=WString::FromChar(ch);
	}
	return plain;
}
|
|
|
|
// Rewrites every escape sequence so that it starts with a backslash: "/x"
// becomes "\x" and "\x" is kept. A lone escape character at the very end of
// the text, and every unescaped character, is copied unchanged.
WString NormalizeEscapedTextForRegex(const WString& escapedText)
{
	WString normalized;
	const vint length=escapedText.Length();
	for(vint position=0;position<length;position++)
	{
		const wchar_t ch=escapedText[position];
		if((ch==L'\\' || ch==L'/') && position<length-1)
		{
			const wchar_t escapedChar=escapedText[++position];
			normalized+=WString(L"\\")+WString::FromChar(escapedChar);
		}
		else
		{
			normalized+=WString::FromChar(ch);
		}
	}
	return normalized;
}
|
|
|
|
bool IsRegexEscapedLiteralString(const WString& regex)
|
|
{
|
|
for(vint i=0;i<regex.Length();i++)
|
|
{
|
|
wchar_t c=regex[i];
|
|
if(c==L'\\' || c==L'/')
|
|
{
|
|
i++;
|
|
}
|
|
else
|
|
{
|
|
switch(c)
|
|
{
|
|
case L'\\':case L'/':case L'(':case L')':case L'+':case L'*':case L'?':case L'|':
|
|
case L'{':case L'}':case L'[':case L']':case L'<':case L'>':
|
|
case L'^':case L'$':case L'!':case L'=':
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
|
|
/***********************************************************************
|
|
.\REGEXPURE.CPP
|
|
***********************************************************************/
|
|
/***********************************************************************
|
|
Author: Zihan Chen (vczh)
|
|
Licensed under https://github.com/vczh-libraries/License
|
|
***********************************************************************/
|
|
|
|
|
|
namespace vl
|
|
{
|
|
namespace regex_internal
|
|
{
|
|
|
|
/***********************************************************************
|
|
PureInterpretor
|
|
***********************************************************************/
|
|
|
|
// Builds the table-driven interpreter from a DFA.
// dfa:     the automaton; every transition must be of type Transition::Chars.
// subsets: the normalized character ranges; the index of a range in this list
//          is the input index used in the transition table.
// One extra input index (charSetCount-1) stands for every character that is
// not covered by any range.
PureInterpretor::PureInterpretor(Automaton::Ref dfa, CharRange::List& subsets)
	:transition(0)
	,finalState(0)
	,relatedFinalState(0)
{
	stateCount=dfa->states.Count();
	charSetCount=subsets.Count()+1;
	startState=dfa->states.IndexOf(dfa->startState);

	// Map char to input index (equivalent char class)
	for(vint i=0;i<SupportedCharCount;i++)
	{
		charMap[i]=charSetCount-1;
	}
	for(vint i=0;i<subsets.Count();i++)
	{
		CharRange range=subsets[i];
		// Only the first SupportedCharCount entries of charMap are populated
		// above, so never write beyond them even if a range ends past that
		// limit (presumably possible where wchar_t is wider than the table,
		// see the VCZH_GCC guard in MatchHead).
		for(vint j=range.begin;j<=range.end && j<SupportedCharCount;j++)
		{
			charMap[j]=i;
		}
	}

	// Create transitions from DFA, using input index to represent input char
	transition=new vint*[stateCount];
	for(vint i=0;i<stateCount;i++)
	{
		transition[i]=new vint[charSetCount];
		for(vint j=0;j<charSetCount;j++)
		{
			// -1 means "no transition for this input".
			transition[i][j]=-1;
		}

		State* state=dfa->states[i].Obj();
		for(vint j=0;j<state->transitions.Count();j++)
		{
			Transition* dfaTransition=state->transitions[j];
			switch(dfaTransition->type)
			{
			case Transition::Chars:
				{
					vint index=subsets.IndexOf(dfaTransition->range);
					if(index==-1)
					{
						CHECK_ERROR(false, L"PureInterpretor::PureInterpretor(Automaton::Ref, CharRange::List&)#Specified chars don't appear in the normalized char ranges.");
					}
					transition[i][index]=dfa->states.IndexOf(dfaTransition->target);
				}
				break;
			default:
				CHECK_ERROR(false, L"PureInterpretor::PureInterpretor(Automaton::Ref, CharRange::List&)#PureInterpretor only accepts Transition::Chars transitions.");
			}
		}
	}

	// Mark final states
	finalState=new bool[stateCount];
	for(vint i=0;i<stateCount;i++)
	{
		finalState[i]=dfa->states[i]->finalState;
	}
}
|
|
|
|
// Releases the transition table (one row per state), the final state flags
// and, if it was ever built, the related final state table.
PureInterpretor::~PureInterpretor()
{
	for(vint row=0;row<stateCount;row++)
	{
		delete[] transition[row];
	}
	delete[] transition;
	delete[] finalState;
	// delete[] on a null pointer is a no-op, so no check is needed here.
	delete[] relatedFinalState;
}
|
|
|
|
bool PureInterpretor::MatchHead(const wchar_t* input, const wchar_t* start, PureResult& result)
|
|
{
|
|
result.start=input-start;
|
|
result.length=-1;
|
|
result.finalState=-1;
|
|
result.terminateState=-1;
|
|
|
|
vint currentState=startState;
|
|
vint terminateState=-1;
|
|
vint terminateLength=-1;
|
|
const wchar_t* read=input;
|
|
while(currentState!=-1)
|
|
{
|
|
terminateState=currentState;
|
|
terminateLength=read-input;
|
|
if(finalState[currentState])
|
|
{
|
|
result.length=terminateLength;
|
|
result.finalState=currentState;
|
|
}
|
|
if(!*read)break;
|
|
#ifdef VCZH_GCC
|
|
if(*read>=SupportedCharCount)break;
|
|
#endif
|
|
vint charIndex=charMap[*read++];
|
|
currentState=transition[currentState][charIndex];
|
|
}
|
|
|
|
if(result.finalState==-1)
|
|
{
|
|
if(terminateLength>0)
|
|
{
|
|
result.terminateState=terminateState;
|
|
}
|
|
result.length=terminateLength;
|
|
return false;
|
|
}
|
|
else
|
|
{
|
|
return true;
|
|
}
|
|
}
|
|
|
|
bool PureInterpretor::Match(const wchar_t* input, const wchar_t* start, PureResult& result)
|
|
{
|
|
const wchar_t* read=input;
|
|
while(*read)
|
|
{
|
|
if(MatchHead(read, start, result))
|
|
{
|
|
return true;
|
|
}
|
|
read++;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// Returns the index of the DFA start state, as computed by the constructor.
vint PureInterpretor::GetStartState()
{
	return this->startState;
}
|
|
|
|
// Performs a single transition.
// input: the character to consume.
// state: the current state index.
// Returns the next state index, or -1 when state is out of range, when the
// character is outside the supported range, or when there is no transition.
vint PureInterpretor::Transit(wchar_t input, vint state)
{
	if(state<0 || state>=stateCount)
	{
		return -1;
	}
#ifdef VCZH_GCC
	// Same guard as in MatchHead: never index charMap beyond its populated
	// entries.
	if(input>=SupportedCharCount)
	{
		return -1;
	}
#endif
	vint charIndex=charMap[input];
	return transition[state][charIndex];
}
|
|
|
|
// Returns true when state is a valid state index and that state is final.
bool PureInterpretor::IsFinalState(vint state)
{
	if(state<0 || state>=stateCount)
	{
		return false;
	}
	return finalState[state];
}
|
|
|
|
// Returns true when state is -1 or when the state has no outgoing transition
// for any input index.
// NOTE(review): any other out-of-range state index is not checked here.
bool PureInterpretor::IsDeadState(vint state)
{
	if(state==-1)
	{
		return true;
	}

	const vint* row=transition[state];
	for(vint charIndex=0;charIndex<charSetCount;charIndex++)
	{
		if(row[charIndex]!=-1)
		{
			return false;
		}
	}
	return true;
}
|
|
|
|
void PureInterpretor::PrepareForRelatedFinalStateTable()
|
|
{
|
|
if(!relatedFinalState)
|
|
{
|
|
relatedFinalState=new vint[stateCount];
|
|
for(vint i=0;i<stateCount;i++)
|
|
{
|
|
relatedFinalState[i]=finalState[i]?i:-1;
|
|
}
|
|
while(true)
|
|
{
|
|
vint modifyCount=0;
|
|
for(vint i=0;i<stateCount;i++)
|
|
{
|
|
if(relatedFinalState[i]==-1)
|
|
{
|
|
vint state=-1;
|
|
for(vint j=0;j<charSetCount;j++)
|
|
{
|
|
vint nextState=transition[i][j];
|
|
if(nextState!=-1)
|
|
{
|
|
state=relatedFinalState[nextState];
|
|
if(state!=-1)
|
|
{
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
if(state!=-1)
|
|
{
|
|
relatedFinalState[i]=state;
|
|
modifyCount++;
|
|
}
|
|
}
|
|
}
|
|
if(modifyCount==0)
|
|
{
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Returns the entry of the related final state table for state, or -1 when
// PrepareForRelatedFinalStateTable has not built the table yet.
vint PureInterpretor::GetRelatedFinalState(vint state)
{
	if(!relatedFinalState)
	{
		return -1;
	}
	return relatedFinalState[state];
}
|
|
}
|
|
}
|
|
|
|
/***********************************************************************
|
|
.\REGEXRICH.CPP
|
|
***********************************************************************/
|
|
/***********************************************************************
|
|
Author: Zihan Chen (vczh)
|
|
Licensed under https://github.com/vczh-libraries/License
|
|
***********************************************************************/
|
|
|
|
|
|
namespace vl
|
|
{
|
|
namespace regex_internal
|
|
{
|
|
|
|
/***********************************************************************
|
|
Data Structures for Backtracking
|
|
***********************************************************************/
|
|
|
|
class StateSaver
|
|
{
|
|
public:
|
|
enum StateStoreType
|
|
{
|
|
Positive,
|
|
Negative,
|
|
Other
|
|
};
|
|
|
|
const wchar_t* reading; // Current reading position
|
|
State* currentState; // Current state
|
|
vint minTransition; // The first transition to backtrack
|
|
vint captureCount; // Available capture count (the list size may larger than this)
|
|
vint stateSaverCount; // Available saver count (the list size may larger than this)
|
|
vint extensionSaverAvailable; // Available extension saver count (the list size may larger than this)
|
|
vint extensionSaverCount; // Available extension saver count (during executing)
|
|
StateStoreType storeType; // Reason to keep this record
|
|
|
|
bool operator==(const StateSaver& saver)const
|
|
{
|
|
return
|
|
reading == saver.reading &&
|
|
currentState == saver.currentState &&
|
|
minTransition == saver.minTransition &&
|
|
captureCount == saver.captureCount;
|
|
}
|
|
};
|
|
|
|
class ExtensionSaver
|
|
{
|
|
public:
|
|
vint previous; // Previous extension saver index
|
|
vint captureListIndex; // Where to write the captured text
|
|
Transition* transition; // The extension begin transition (Capture, Positive, Negative)
|
|
const wchar_t* reading; // The reading position
|
|
|
|
bool operator==(const ExtensionSaver& saver)const
|
|
{
|
|
return
|
|
captureListIndex == saver.captureListIndex &&
|
|
transition == saver.transition &&
|
|
reading == saver.reading;
|
|
}
|
|
};
|
|
}
|
|
|
|
namespace regex_internal
|
|
{
|
|
using namespace collections;
|
|
|
|
// Pushes an extension saver onto the linked stack stored inside elements.
// elements:  backing storage; slots at or beyond count are reused.
// available: index of the current top; becomes the index of the new record.
// count:     number of used slots; incremented by one.
// element:   the record to store; its "previous" field is overwritten with
//            the old top index.
void Push(List<ExtensionSaver>& elements, vint& available, vint& count, const ExtensionSaver& element)
{
	const vint slot=count;
	if(elements.Count()!=slot)
	{
		elements[slot]=element;
	}
	else
	{
		elements.Add(element);
	}

	elements[slot].previous=available;
	available=slot;
	count=slot+1;
}
|
|
|
|
// Pops the top extension saver from the linked stack stored inside elements.
// available: index of the current top; becomes the index of the record below it.
// count:     not modified here, so the popped slot is not reclaimed.
// Returns a copy of the popped record.
ExtensionSaver Pop(List<ExtensionSaver>& elements, vint& available, vint& count)
{
	ExtensionSaver top=elements[available];
	available=top.previous;
	return top;
}
|
|
|
|
template<typename T, typename K>
|
|
void PushNonSaver(List<T, K>& elements, vint& count, const T& element)
|
|
{
|
|
if(elements.Count()==count)
|
|
{
|
|
elements.Add(element);
|
|
}
|
|
else
|
|
{
|
|
elements[count]=element;
|
|
}
|
|
count++;
|
|
}
|
|
|
|
template<typename T, typename K>
|
|
T PopNonSaver(List<T, K>& elements, vint& count)
|
|
{
|
|
return elements[--count];
|
|
}
|
|
}
|
|
|
|
namespace regex_internal
|
|
{
|
|
/***********************************************************************
|
|
CaptureRecord
|
|
***********************************************************************/
|
|
|
|
bool CaptureRecord::operator==(const CaptureRecord& record)const
|
|
{
|
|
return capture==record.capture && start==record.start && length==record.length;
|
|
}
|
|
|
|
/***********************************************************************
|
|
RichInterpretor
|
|
***********************************************************************/
|
|
|
|
// Attaches a UserData record to every state of the automaton and decides,
// per state, whether the matching state must be saved for backtracking:
// when the state has a lookahead (Positive/Negative) transition, more than
// one non-character transition, or both character and non-character
// transitions.
RichInterpretor::RichInterpretor(Automaton::Ref _dfa)
	:dfa(_dfa)
{
	datas=new UserData[dfa->states.Count()];

	for(vint stateIndex=0;stateIndex<dfa->states.Count();stateIndex++)
	{
		State* state=dfa->states[stateIndex].Obj();
		vint charEdges=0;
		vint nonCharEdges=0;
		bool hasLookahead=false;

		for(vint edge=0;edge<state->transitions.Count();edge++)
		{
			switch(state->transitions[edge]->type)
			{
			case Transition::Chars:
				charEdges++;
				break;
			case Transition::Negative:
			case Transition::Positive:
				hasLookahead=true;
				nonCharEdges++;
				break;
			default:
				nonCharEdges++;
			}
		}

		datas[stateIndex].NeedKeepState=hasLookahead || nonCharEdges>1 || (nonCharEdges!=0 && charEdges!=0);
		state->userData=&datas[stateIndex];
	}
}
|
|
|
|
// Releases the per-state UserData array allocated by the constructor.
RichInterpretor::~RichInterpretor()
{
	delete[] this->datas;
}
|
|
|
|
bool RichInterpretor::MatchHead(const wchar_t* input, const wchar_t* start, RichResult& result)
|
|
{
|
|
List<StateSaver> stateSavers;
|
|
List<ExtensionSaver> extensionSavers;
|
|
|
|
StateSaver currentState;
|
|
currentState.captureCount=0;
|
|
currentState.currentState=dfa->startState;
|
|
currentState.extensionSaverAvailable=-1;
|
|
currentState.extensionSaverCount=0;
|
|
currentState.minTransition=0;
|
|
currentState.reading=input;
|
|
currentState.stateSaverCount=0;
|
|
currentState.storeType=StateSaver::Other;
|
|
|
|
while (!currentState.currentState->finalState)
|
|
{
|
|
bool found = false; // true means at least one transition matches the input
|
|
StateSaver oldState = currentState;
|
|
// Iterate through all transitions from the current state
|
|
for (vint i = currentState.minTransition; i < currentState.currentState->transitions.Count(); i++)
|
|
{
|
|
Transition* transition = currentState.currentState->transitions[i];
|
|
switch (transition->type)
|
|
{
|
|
case Transition::Chars:
|
|
{
|
|
// match the input if the current character fall into the range
|
|
CharRange range = transition->range;
|
|
found =
|
|
range.begin <= *currentState.reading &&
|
|
range.end >= *currentState.reading;
|
|
if (found)
|
|
{
|
|
currentState.reading++;
|
|
}
|
|
}
|
|
break;
|
|
case Transition::BeginString:
|
|
{
|
|
// match the input if this is the first character, and it is not consumed
|
|
found = currentState.reading == start;
|
|
}
|
|
break;
|
|
case Transition::EndString:
|
|
{
|
|
// match the input if this is after the last character, and it is not consumed
|
|
found = *currentState.reading == L'\0';
|
|
}
|
|
break;
|
|
case Transition::Nop:
|
|
{
|
|
// match without any condition
|
|
found = true;
|
|
}
|
|
break;
|
|
case Transition::Capture:
|
|
{
|
|
// Push the capture information
|
|
ExtensionSaver saver;
|
|
saver.captureListIndex = currentState.captureCount;
|
|
saver.reading = currentState.reading;
|
|
saver.transition = transition;
|
|
Push(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount, saver);
|
|
|
|
// Push the capture record, and it will be written if the input matches the regex
|
|
CaptureRecord capture;
|
|
capture.capture = transition->capture;
|
|
capture.start = currentState.reading - start;
|
|
capture.length = -1;
|
|
PushNonSaver(result.captures, currentState.captureCount, capture);
|
|
|
|
found = true;
|
|
}
|
|
break;
|
|
case Transition::Match:
|
|
{
|
|
vint index = 0;
|
|
for (vint j = 0; j < currentState.captureCount; j++)
|
|
{
|
|
CaptureRecord& capture = result.captures[j];
|
|
// If the capture name matched
|
|
if (capture.capture == transition->capture)
|
|
{
|
|
// If the capture index matched, or it is -1
|
|
if (capture.length != -1 && (transition->index == -1 || transition->index == index))
|
|
{
|
|
// If the captured text matched
|
|
if (wcsncmp(start + capture.start, currentState.reading, capture.length) == 0)
|
|
{
|
|
// Consume so much input
|
|
currentState.reading += capture.length;
|
|
found = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Fail if the captured text with the specified name and index doesn't match
|
|
if (transition->index != -1 && index == transition->index)
|
|
{
|
|
break;
|
|
}
|
|
else
|
|
{
|
|
index++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
case Transition::Positive:
|
|
{
|
|
// Push the positive lookahead information
|
|
ExtensionSaver saver;
|
|
saver.captureListIndex = -1;
|
|
saver.reading = currentState.reading;
|
|
saver.transition = transition;
|
|
Push(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount, saver);
|
|
|
|
// Set found = true so that PushNonSaver(oldState) happens later
|
|
oldState.storeType = StateSaver::Positive;
|
|
found = true;
|
|
}
|
|
break;
|
|
case Transition::Negative:
|
|
{
|
|
// Push the negative lookahead information
|
|
|
|
ExtensionSaver saver;
|
|
saver.captureListIndex = -1;
|
|
saver.reading = currentState.reading;
|
|
saver.transition = transition;
|
|
Push(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount, saver);
|
|
|
|
// Set found = true so that PushNonSaver(oldState) happens later
|
|
oldState.storeType = StateSaver::Negative;
|
|
found = true;
|
|
}
|
|
break;
|
|
case Transition::NegativeFail:
|
|
{
|
|
// NegativeFail will be used when the negative lookahead failed
|
|
}
|
|
break;
|
|
case Transition::End:
|
|
{
|
|
// Find the corresponding extension saver so that we can know how to deal with a matched sub regex that ends here
|
|
ExtensionSaver extensionSaver = Pop(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount);
|
|
switch (extensionSaver.transition->type)
|
|
{
|
|
case Transition::Capture:
|
|
{
|
|
// Write the captured text
|
|
CaptureRecord& capture = result.captures[extensionSaver.captureListIndex];
|
|
capture.length = (currentState.reading - start) - capture.start;
|
|
found = true;
|
|
}
|
|
break;
|
|
case Transition::Positive:
|
|
// Find the last positive lookahead state saver
|
|
for (vint j = currentState.stateSaverCount - 1; j >= 0; j--)
|
|
{
|
|
StateSaver& stateSaver = stateSavers[j];
|
|
if (stateSaver.storeType == StateSaver::Positive)
|
|
{
|
|
// restore the parsing state just before matching the positive lookahead, since positive lookahead doesn't consume input
|
|
oldState.reading = stateSaver.reading;
|
|
oldState.stateSaverCount = j;
|
|
currentState.reading = stateSaver.reading;
|
|
currentState.stateSaverCount = j;
|
|
break;
|
|
}
|
|
}
|
|
found = true;
|
|
break;
|
|
case Transition::Negative:
|
|
// Find the last negative lookahead state saver
|
|
for (vint j = currentState.stateSaverCount - 1; j >= 0; j--)
|
|
{
|
|
StateSaver& stateSaver = stateSavers[j];
|
|
if (stateSaver.storeType == StateSaver::Negative)
|
|
{
|
|
// restore the parsing state just before matching the negative lookahead, since negative lookahead doesn't consume input
|
|
oldState = stateSaver;
|
|
oldState.storeType = StateSaver::Other;
|
|
currentState = stateSaver;
|
|
currentState.storeType = StateSaver::Other;
|
|
i = currentState.minTransition - 1;
|
|
break;
|
|
}
|
|
}
|
|
break;
|
|
default:;
|
|
}
|
|
}
|
|
break;
|
|
default:;
|
|
}
|
|
|
|
// Save the parsing state when necessary
|
|
if (found)
|
|
{
|
|
UserData* data = (UserData*)currentState.currentState->userData;
|
|
if (data->NeedKeepState)
|
|
{
|
|
oldState.minTransition = i + 1;
|
|
PushNonSaver(stateSavers, currentState.stateSaverCount, oldState);
|
|
}
|
|
currentState.currentState = transition->target;
|
|
currentState.minTransition = 0;
|
|
break;
|
|
}
|
|
}
|
|
|
|
// If no transition from the current state can be used
|
|
if (!found)
|
|
{
|
|
// If there is a chance to do backtracking
|
|
if (currentState.stateSaverCount)
|
|
{
|
|
currentState = PopNonSaver(stateSavers, currentState.stateSaverCount);
|
|
// minTransition - 1 is always valid since the value is stored with adding 1
|
|
// So minTransition - 1 record the transition, which is the reason the parsing state is saved
|
|
if (currentState.currentState->transitions[currentState.minTransition - 1]->type == Transition::Negative)
|
|
{
|
|
// Find the next NegativeFail transition
|
|
// Because when a negative lookahead regex failed to match, it is actually succeeded
|
|
// Since a negative lookahead means we don't want to match this regex
|
|
for (vint i = 0; i < currentState.currentState->transitions.Count(); i++)
|
|
{
|
|
Transition* transition = currentState.currentState->transitions[i];
|
|
if (transition->type == Transition::NegativeFail)
|
|
{
|
|
// Restore the state to the target of NegativeFail to let the parsing continue
|
|
currentState.currentState = transition->target;
|
|
currentState.minTransition = 0;
|
|
currentState.storeType = StateSaver::Other;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (currentState.currentState->finalState)
|
|
{
|
|
// Keep available captures if succeeded
|
|
result.start = input - start;
|
|
result.length = (currentState.reading - start) - result.start;
|
|
for (vint i = result.captures.Count() - 1; i >= currentState.captureCount; i--)
|
|
{
|
|
result.captures.RemoveAt(i);
|
|
}
|
|
return true;
|
|
}
|
|
else
|
|
{
|
|
// Clear captures if failed
|
|
result.captures.Clear();
|
|
return false;
|
|
}
|
|
}
|
|
|
|
bool RichInterpretor::Match(const wchar_t* input, const wchar_t* start, RichResult& result)
|
|
{
|
|
const wchar_t* read=input;
|
|
while(*read)
|
|
{
|
|
if(MatchHead(read, start, result))
|
|
{
|
|
return true;
|
|
}
|
|
read++;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// Exposes the capture name list stored on the automaton held in `dfa`.
// The list is returned by reference, not copied.
const List<WString>& RichInterpretor::CaptureNames()
{
    const List<WString>& names = dfa->captureNames;
    return names;
}
|
|
}
|
|
}
|
|
|
|
/***********************************************************************
|
|
.\REGEXWRITER.CPP
|
|
***********************************************************************/
|
|
/***********************************************************************
|
|
Author: Zihan Chen (vczh)
|
|
Licensed under https://github.com/vczh-libraries/License
|
|
***********************************************************************/
|
|
|
|
|
|
namespace vl
|
|
{
|
|
namespace regex
|
|
{
|
|
using namespace vl::regex_internal;
|
|
|
|
/***********************************************************************
|
|
RegexNode
|
|
***********************************************************************/
|
|
|
|
// Wraps an existing expression tree in a RegexNode; the node simply stores the reference.
RegexNode::RegexNode(vl::regex_internal::Expression::Ref _expression)
    : expression(_expression)
{
}
|
|
|
|
// Shorthand for Loop(1, -1): repeat this node with a minimum of one iteration.
// NOTE(review): -1 as the maximum presumably means "no upper bound" - confirm against LoopExpression.
RegexNode RegexNode::Some()const
{
    const vint minimum = 1;
    const vint maximum = -1;
    return Loop(minimum, maximum);
}
|
|
|
|
// Shorthand for Loop(0, -1): repeat this node with a minimum of zero iterations.
// NOTE(review): -1 as the maximum presumably means "no upper bound" - confirm against LoopExpression.
RegexNode RegexNode::Any()const
{
    const vint minimum = 0;
    const vint maximum = -1;
    return Loop(minimum, maximum);
}
|
|
|
|
// Shorthand for Loop(0, 1): this node repeated between zero and one time.
RegexNode RegexNode::Opt()const
{
    const vint minimum = 0;
    const vint maximum = 1;
    return Loop(minimum, maximum);
}
|
|
|
|
// Builds a new node that wraps this node's expression in a LoopExpression.
//   min, max : stored verbatim in the loop's `min` and `max` fields (no validation here).
// The loop is always created with preferLong set to true.
RegexNode RegexNode::Loop(vint min, vint max)const
{
    LoopExpression* loop = new LoopExpression;
    loop->expression = expression;
    loop->preferLong = true;
    loop->min = min;
    loop->max = max;
    return RegexNode(loop);
}
|
|
|
|
// Shorthand for Loop(min, -1): repeat this node with the given minimum count.
// NOTE(review): -1 as the maximum presumably means "no upper bound" - confirm against LoopExpression.
RegexNode RegexNode::AtLeast(vint min)const
{
    const vint maximum = -1;
    return Loop(min, maximum);
}
|
|
|
|
// Concatenation: returns a node whose SequenceExpression has this node's
// expression as `left` and `node`'s expression as `right`.
RegexNode RegexNode::operator+(const RegexNode& node)const
{
    SequenceExpression* sequence = new SequenceExpression;
    sequence->right = node.expression;
    sequence->left = expression;
    return RegexNode(sequence);
}
|
|
|
|
// Alternation: returns a node whose AlternateExpression has this node's
// expression as `left` and `node`'s expression as `right`.
RegexNode RegexNode::operator|(const RegexNode& node)const
{
    AlternateExpression* alternation = new AlternateExpression;
    alternation->right = node.expression;
    alternation->left = expression;
    return RegexNode(alternation);
}
|
|
|
|
// Unary plus: wraps this node's expression in a PositiveExpression.
RegexNode RegexNode::operator+()const
{
    PositiveExpression* lookahead = new PositiveExpression;
    lookahead->expression = expression;
    return RegexNode(lookahead);
}
|
|
|
|
// Unary minus: wraps this node's expression in a NegativeExpression.
RegexNode RegexNode::operator-()const
{
    NegativeExpression* lookahead = new NegativeExpression;
    lookahead->expression = expression;
    return RegexNode(lookahead);
}
|
|
|
|
// Complement of a character set: returns a new CharSetExpression with the same
// ranges as this node and the `reverse` flag flipped.
// CHECK_ERROR fires when this node's expression is not a CharSetExpression.
RegexNode RegexNode::operator!()const
{
    CharSetExpression* original = dynamic_cast<CharSetExpression*>(expression.Obj());
    CHECK_ERROR(original, L"RegexNode::operator!()#operator ! can only applies on charset expressions.");

    Ptr<CharSetExpression> inverted = new CharSetExpression;
    inverted->reverse = !original->reverse;
    CopyFrom(inverted->ranges, original->ranges);
    return RegexNode(inverted);
}
|
|
|
|
// Union of two character sets: returns a new non-reversed CharSetExpression that
// starts as a copy of this node's ranges and then receives each of `node`'s ranges
// through AddRangeWithConflict.
// CHECK_ERROR fires when either operand is not a CharSetExpression, when either
// operand has `reverse` set, or when AddRangeWithConflict rejects a range.
RegexNode RegexNode::operator%(const RegexNode& node)const
{
    CharSetExpression* left = dynamic_cast<CharSetExpression*>(expression.Obj());
    CharSetExpression* right = dynamic_cast<CharSetExpression*>(node.expression.Obj());
    const bool operandsValid = left && right && !left->reverse && !right->reverse;
    CHECK_ERROR(operandsValid, L"RegexNode::operator%(const RegexNode&)#operator % only connects non-reverse charset expressions.");

    Ptr<CharSetExpression> merged = new CharSetExpression;
    CopyFrom(merged->ranges, left->ranges);
    merged->reverse = false;

    vint index = 0;
    while (index < right->ranges.Count())
    {
        // Keep the call outside CHECK_ERROR so it always runs exactly once per range.
        const bool added = merged->AddRangeWithConflict(right->ranges[index]);
        if (!added)
        {
            CHECK_ERROR(false, L"RegexNode::operator%(const RegexNode&)#Failed to create charset expression from operator %.");
        }
        index++;
    }
    return RegexNode(merged);
}
|
|
|
|
/***********************************************************************
|
|
Regex Writer
|
|
***********************************************************************/
|
|
|
|
// Creates a CaptureExpression node with the given name wrapping `node`'s expression.
RegexNode rCapture(const WString& name, const RegexNode& node)
{
    CaptureExpression* capture = new CaptureExpression;
    capture->expression = node.expression;
    capture->name = name;
    return RegexNode(capture);
}
|
|
|
|
// Creates a UsingExpression node whose `name` field is set to `name`.
RegexNode rUsing(const WString& name)
{
    UsingExpression* reference = new UsingExpression;
    reference->name = name;
    return RegexNode(reference);
}
|
|
|
|
// Creates a MatchExpression node with both `name` and `index` set from the arguments.
RegexNode rMatch(const WString& name, vint index)
{
    MatchExpression* backReference = new MatchExpression;
    backReference->index = index;
    backReference->name = name;
    return RegexNode(backReference);
}
|
|
|
|
// Creates a MatchExpression node with only `index` set; `name` keeps whatever
// value MatchExpression gives it on construction.
RegexNode rMatch(vint index)
{
    MatchExpression* backReference = new MatchExpression;
    backReference->index = index;
    return RegexNode(backReference);
}
|
|
|
|
// Creates a node holding a freshly allocated BeginExpression.
RegexNode rBegin()
{
    BeginExpression* anchor = new BeginExpression;
    return RegexNode(anchor);
}
|
|
|
|
// Creates a node holding a freshly allocated EndExpression.
RegexNode rEnd()
{
    EndExpression* anchor = new EndExpression;
    return RegexNode(anchor);
}
|
|
|
|
// Creates a non-reversed character set node covering the range [a, b].
// When `b` is the null character the range collapses to the single character `a`.
// The result of AddRangeWithConflict is not checked.
RegexNode rC(wchar_t a, wchar_t b)
{
    const wchar_t last = b ? b : a;
    CharSetExpression* charset = new CharSetExpression;
    charset->reverse = false;
    charset->AddRangeWithConflict(CharRange(a, last));
    return RegexNode(charset);
}
|
|
|
|
// Character set for the decimal digits '0' through '9'.
RegexNode r_d()
{
    RegexNode digits = rC(L'0', L'9');
    return digits;
}
|
|
|
|
// Character set for 'a'-'z', 'A'-'Z' and the underscore, merged with operator %.
RegexNode r_l()
{
    RegexNode lower = rC(L'a', L'z');
    RegexNode upper = rC(L'A', L'Z');
    RegexNode underscore = rC(L'_');
    return lower % upper % underscore;
}
|
|
|
|
// Character set for '0'-'9', 'a'-'z', 'A'-'Z' and the underscore, merged with operator %.
RegexNode r_w()
{
    RegexNode digits = rC(L'0', L'9');
    RegexNode lower = rC(L'a', L'z');
    RegexNode upper = rC(L'A', L'Z');
    RegexNode underscore = rC(L'_');
    return digits % lower % upper % underscore;
}
|
|
|
|
// Character set covering code units 1 through 65535 (the null character is excluded).
RegexNode rAnyChar()
{
    const wchar_t first = 1;
    const wchar_t last = 65535;
    return rC(first, last);
}
|
|
}
|
|
}
|