/***********************************************************************
THIS FILE IS AUTOMATICALLY GENERATED. DO NOT MODIFY
DEVELOPER: Zihan Chen(vczh)
***********************************************************************/
#include "Vlpp.h"
#include "VlppOS.h"
/***********************************************************************
.\REGEX.H
***********************************************************************/
/***********************************************************************
Author: Zihan Chen (vczh)
Licensed under https://github.com/vczh-libraries/License
***********************************************************************/
#ifndef VCZH_REGEX_REGEX
#define VCZH_REGEX_REGEX
namespace vl
{
// Forward declaration of the stream interface; this header only uses it by reference (RegexLexer_ constructor and Serialize).
namespace stream
{
class IStream;
}
// Forward declarations of the interpreter and result types. In this header the interpreters are only held through pointers.
namespace regex_internal
{
class PureResult;
class PureInterpretor;
class RichResult;
class RichInterpretor;
}
namespace regex
{
// Forward declarations so that classes declared later in this header can name these types (for example in friend declarations).
class RegexBase_;
class RegexLexerBase_;
template
class RegexLexer_;
/***********************************************************************
Data Structure
***********************************************************************/
/// A sub string of the string that a regular expression is matched against.
/// .
/// The sub string.
/// Read-only access to the stored text of this sub string.
const ObjectString& Value() const
{
	return value;
}
/// Two sub strings are equal when their start, length and value are all equal.
bool operator==(const RegexString_& string) const
{
	// Checked in the same order as before: start, then length, then value.
	if (!(start == string.start)) return false;
	if (!(length == string.length)) return false;
	return value == string.value;
}
};
/// A match produced by a regular expression.
/// Captures())
/// {
/// Console::WriteLine(capture.Value());
/// }
/// }
/// ]]>
// Read-only access to the list of captures of this match; the example fragment above iterates it and prints each capture's Value().
const CaptureList& Captures()const;
/// Get all sub strings that are captured by named groups.
/// Returns all sub strings that are captured by named groups.
/// In the example below, a group is looked up by the index of its name in CaptureNames().
/// C/S+)(/.*?))+$");
/// auto match = regex.MatchHead(L"C++ and C# are my favorite programing languages");
/// for (auto capture : match->Groups().Get(regex.CaptureNames().IndexOf(L"lang")))
/// {
/// Console::WriteLine(capture.Value());
/// }
/// }
/// ]]>
const CaptureGroup& Groups()const;
};
/***********************************************************************
Regex
***********************************************************************/
/// Base class of regular expressions. It holds pointers to the pure and rich interpreters and declares every matching operation.
class RegexBase_ abstract : public Object
{
protected:
// Both interpreters start as nullptr; IsPureMatch() and IsPureTest() below report which of them is set.
regex_internal::PureInterpretor* pure = nullptr;
regex_internal::RichInterpretor* rich = nullptr;
// NOTE(review): presumably the shared worker behind Search, Split and Cut (the three keep* flags suggest so).
// Its definition is not visible in this header -- confirm.
template
void Process(const ObjectString& text, bool keepEmpty, bool keepSuccess, bool keepFail, typename RegexMatch_::List& matches)const;
public:
RegexBase_() = default;
~RegexBase_();
/// Test if a DFA is used to match a string.
/// Returns true if a DFA is used, which is the case when no rich interpreter is set.
bool IsPureMatch() const { return rich ? false : true; }
/// Test if a DFA is used to test a string. It ignores all capturing.
/// Returns true if a DFA is used, which is the case when a pure interpreter is set.
bool IsPureTest() const { return pure ? true : false; }
/// Match a prefix of the text.
/// The text to match.
/// Result().Value());
/// }
/// ]]>
template
typename RegexMatch_::Ref MatchHead(const ObjectString& text)const;
/// Convenience overload: wraps the raw character pointer in an ObjectString and forwards to the overload above.
template
typename RegexMatch_::Ref MatchHead(const T* text) const { return MatchHead(ObjectString(text)); }
/// Match a sub string of the text.
/// The text to match.
/// Result().Value());
/// }
/// ]]>
template
typename RegexMatch_::Ref Match(const ObjectString& text)const;
/// Convenience overload: wraps the raw character pointer in an ObjectString and forwards to the overload above.
template
typename RegexMatch_::Ref Match(const T* text) const { return Match(ObjectString(text)); }
/// Match a prefix of the text, ignoring all capturing.
/// The text to match.
template
bool TestHead(const ObjectString& text)const;
/// Convenience overload: wraps the raw character pointer in an ObjectString and forwards to the overload above.
template
bool TestHead(const T* text) const { return TestHead(ObjectString(text)); }
/// Match a sub string of the text, ignoring all capturing.
/// The text to match.
template
bool Test(const ObjectString& text)const;
/// Convenience overload: wraps the raw character pointer in an ObjectString and forwards to the overload above.
template
bool Test(const T* text) const { return Test(ObjectString(text)); }
/// Find all matched fragments in the given text, returning all matched sub strings.
/// The text to match.
/// Returns all succeeded matches.
/// Result().Value());
/// }
/// }
/// ]]>
template
void Search(const ObjectString& text, typename RegexMatch_::List& matches)const;
/// Convenience overload: wraps the raw character pointer in an ObjectString and forwards to the overload above.
template
void Search(const T* text, typename RegexMatch_::List& matches) const { return Search(ObjectString(text), matches); }
/// Split the text by matched sub strings, returning all unmatched sub strings.
/// The text to match.
/// Set to true to keep all empty unmatched sub strings. This could happen when there is nothing between two matched sub strings.
/// Returns all failed matches.
/// Result().Value());
/// }
/// }
/// ]]>
template
void Split(const ObjectString& text, bool keepEmptyMatch, typename RegexMatch_::List& matches)const;
/// Convenience overload: wraps the raw character pointer in an ObjectString and forwards to the overload above.
template
void Split(const T* text, bool keepEmptyMatch, typename RegexMatch_::List& matches) const { return Split(ObjectString(text), keepEmptyMatch, matches); }
/// Cut the text by matched sub strings, returning all matched and unmatched sub strings.
/// The text to match.
/// Set to true to keep all empty matches. This could happen when there is nothing between two matched sub strings.
/// Returns all succeeded and failed matches.
/// Result().Value());
/// }
/// }
/// ]]>
template
void Cut(const ObjectString& text, bool keepEmptyMatch, typename RegexMatch_::List& matches)const;
/// Convenience overload: wraps the raw character pointer in an ObjectString and forwards to the overload above.
template
void Cut(const T* text, bool keepEmptyMatch, typename RegexMatch_::List& matches) const { return Cut(ObjectString(text), keepEmptyMatch, matches); }
};
///
///
/// \\, \/, \(, \), \+, \*, \?, \{, \}, \[, \], \<, \>, \^, \$, \!, \=: represents itself
///
///
///
/// Escaped characters in charset defined in a square bracket:
///
/// - \r: the CR character
/// - \n: the LF character
/// - \t: the tab character
/// - \-, \[, \], \\, \/, \^, \$: represents itself
///
///
///
///
///
/// Loops:
///
/// - regex{3}: repeats 3 times
/// - regex{3,}: repeats 3 or more times
/// - regex{1,3}: repeats 1 to 3 times
/// - regex?: repeats 0 or 1 times
/// - regex*: repeats 0 or more times
/// - regex+: repeats 1 or more times
///
/// If you add an additional ? right after a loop, it means repeating as few times as possible (DFA incompatible)
///
///
/// Capturing: (DFA incompatible)
///
/// - (regex): No capturing, just change the operators' association
/// - (?regex): Capture matched fragment
/// - (<name>regex): Capture matched fragment in a named group called "name"
/// - (<$i>): Match the i-th captured fragment, begins from 0
/// - (<$name;i>): Match the i-th captured fragment in the named group called "name", begins from 0
/// - (<$name>): Match any captured fragment in the named group called "name"
///
///
///
/// MISC
///
/// - (=regex): The prefix of the following text should match the regex, but it is not counted in the whole match (DFA incompatible)
/// - (!regex): Any prefix of the following text should not match the regex, and it is not counted in the whole match (DFA incompatible)
/// - (<#name>regex): Name the regex "name", and it applies here
/// - (<&name>): Copy the named regex "name" here and apply
///
///
///
///
///
/// The regular expression has pure mode and rich mode.
/// Pure mode means the regular expression is driven by a DFA, while the rich mode is not.
///
///
/// The regular expression can test a string instead of matching.
/// Testing only returns a bool indicating success or failure.
///
///
/// A regular expression built from a pattern string. It adds the list of capture names to RegexBase_ and cannot be copied.
template
class Regex_ : public RegexBase_
{
protected:
// Names of the named captures, exposed read-only through CaptureNames().
collections::List> captureNames;
public:
NOT_COPYABLE(Regex_);
/// Create a regular expression. It will crash if the regular expression produces syntax error.
/// The regular expression in a string.
/// Set to true to use DFA if possible. This is the default.
Regex_(const ObjectString& code, bool preferPure = true);
~Regex_() = default;
/// Get all names of named captures.
/// Returns all names of named captures.
const collections::List>& CaptureNames()const { return captureNames; }
};
/***********************************************************************
Tokenizer
***********************************************************************/
/// A token.
/// Identifies the regular expression that matches this token. -1 means this token is produced by an error.
vint token;
/// The pointer to where this token starts in the input string.
/// This pointer points into the string that was analyzed. You should keep a variable holding that string alive, so that this pointer stays valid.
const T* reading;
/// The "codeIndex" argument from [M:vl.regex.RegexLexer.Parse].
vint codeIndex;
/// True if this token is complete. False if this token does not end here. This could happen when colorizing a text line by line.
bool completeToken;
/// Row number of the first character, begins at 0.
vint rowStart;
/// Column number of the first character, begins at 0.
vint columnStart;
/// Row number of the last character, begins at 0.
vint rowEnd;
/// Column number of the last character, begins at 0.
vint columnEnd;
/// Two tokens are equal when their length, token id and reading pointer are all equal.
bool operator==(const RegexToken_& _token)const
{
	// Checked in the same order as before: length, then token, then reading.
	if (!(length == _token.length)) return false;
	if (!(token == _token.token)) return false;
	return reading == _token.reading;
}
};
/// Token information for .
struct RegexProcessingToken
{
///
/// The read only start position of the token.
/// This value will be -1 if is not null.
///
const vint start;
///
/// The length of the token, allowing to be updated by the callback.
/// When the callback returns, the length is not allowed to be decreased.
/// This value will be -1 if is not null.
///
vint length;
///
/// The id of the token, allowing to be updated by the callback.
///
vint token;
///
/// The flag indicating if this token is completed, allowing to be updated by the callback.
///
bool completeToken;
///
/// The inter token state object, allowing to be updated by the callback.
/// When the callback returns:
///
/// - if the completeText parameter is true in , it should be nullptr.
/// - if the token does not end at the end of the input, it should not be nullptr.
/// - if a token is completed in one attemp of extending, it should be nullptr.
///
///
void* interTokenState;
RegexProcessingToken(vint _start, vint _length, vint _token, bool _completeToken, void* _interTokenState)
:start(_start)
, length(_length)
, token(_token)
, completeToken(_completeToken)
, interTokenState(_interTokenState)
{
}
};
// Function pointer types for the three callbacks stored in the callback configuration below.
// Deleter: receives an inter token state pointer.
using RegexInterTokenStateDeleter = void(*)(void* interTokenState);
// Extend callback: receives the user argument, the text buffer with its length, the completeText flag and the token being processed.
template
using RegexTokenExtendProc = void(*)(void* argument, const T* reading, vint length, bool completeText, RegexProcessingToken& processingToken);
// Colorize callback: receives the user argument and the start, length and id of a token.
using RegexTokenColorizeProc = void(*)(void* argument, vint start, vint length, vint token);
/// Callback procedures
/// The deleter for an inter token state object (in the example below it deletes the state created by the extend proc).
/// This callback is not called automatically.
/// It is here to make the maintenance convenient for the caller.
///
RegexInterTokenStateDeleter deleter = nullptr;
///
/// The token extend callback. It is called after recognizing any token, and run a customized procedure to modify the token based on the given context.
/// If the length parameter is -1, it means the caller does not measure the incoming text buffer, which automatically indicates that the buffer is null-terminated.
/// If the length parameter is not -1, it means the number of available characters in the buffer.
/// The completeText parameter could be true or false. When it is false, it means that the buffer does not contain all the text.
///
///
///
/// This is very useful to recognize any token that cannot be expressed using a regular expression.
/// For example, a C++ literal string R"tag(the content)tag".
/// It is recommended to add a token for R"tag(,
/// and then use this extend proc to search for a )tag" to complete the token.
///
///
/// Important:
/// when colorizing a text line by line,
/// a cross-line token could be incomplete at the end of the line.
/// Because a given buffer ends at the end of that line,
/// the extend proc is not able to know right now about what is going on in the future.
/// Here is what is designed for,
/// the extend proc can store anything it wants using that pointer.
///
///
/// The caller can get this pointer from the return value of .
/// This pointer only available for cross-line tokens, it is obvious that one line produces at most one such pointer.
/// Then the caller keeps calling that function to walk through the whole string.
/// When the return value is changed, the pointer is no longer used, and it can be deleted by calling manually.
///
///
/// The first argument is .
///
///
/// The second argument is a pointer to the buffer of the first character in this token.
/// If the previous token is incomplete, then the buffer begins at the first character of the new buffer.
///
///
/// The third argument is the length of the recognized token in characters.
///
///
/// The fourth argument indicates if the token is completed.
/// Even if a token is completed, but the extend proc found that, the extend exceeds the end of the buffer,
/// then it can update the value to make it incomplete.
///
///
/// The fifth contains the context for this token. Fields except "start" are allowed to be updated by the extend proc.
///
///
/// tokenDefs;
/// tokenDefs.Add(L"/d+");
/// tokenDefs.Add(L"[a-zA-Z_]/w*");
/// tokenDefs.Add(L"\"([^\"/\\]|/\\/.)*\"");
/// tokenDefs.Add(L"R\"[^(]*/(");
/// tokenDefs.Add(L"[(){};]");
/// tokenDefs.Add(L"/s+");
/// tokenDefs.Add(L"///*+([^//*]|/*+[^//])*/*+//");
///
/// const wchar_t* lines[] = {
/// L"/*********************",
/// L"MAIN.CPP",
/// L"*********************/",
/// L"",
/// L"int main()",
/// L"{",
/// L" printf(\"This is a \\\"simple\\\" text.\");",
/// L" printf(R\"____(This is a",
/// L"\"multiple lined\"",
/// L"literal text)____\");",
/// L" return 0;",
/// L"}",
/// };
///
/// struct Argument
/// {
/// // for a real colorizer, you can put a color buffer here.
/// // the buffer is reused for every line of code.
/// // but for the demo, I put the current processing text instead.
/// // so that I am able to print what is processed.
/// const wchar_t* processingText = nullptr;
/// } argument;
///
/// struct InterTokenState
/// {
/// WString postfix;
/// };
///
/// RegexProc proc;
/// proc.argument = &argument;
/// proc.colorizeProc = [](void* argument, vint start, vint length, vint token)
/// {
/// // this is guaranteed by "proc.argument = &argument;"
/// auto text = reinterpret_cast(argument)->processingText;
/// Console::WriteLine(itow(token) + L": <" + WString(text + start, length) + L">");
/// };
/// proc.deleter = [](void* interTokenState)
/// {
/// delete reinterpret_cast(interTokenState);
/// };
/// proc.extendProc = [](void* argument, const wchar_t* reading, vint length, bool completeText, RegexProcessingToken& processingToken)
/// {
/// // 3 is R"[^(]*/(
/// // 7 is not used in tokenDefs, it is occupied to represent an extended literal string
/// if (processingToken.token == 3 || processingToken.token == 7)
/// {
/// // for calling wcsstr, create a buffer that is zero terminated
/// WString readingBuffer = length == -1 ? WString(reading, false) : WString(reading, length);
/// reading = readingBuffer.Buffer();
///
/// // get the postfix, which is )____" in this case
/// WString postfix;
/// if (processingToken.interTokenState)
/// {
/// postfix = reinterpret_cast(processingToken.interTokenState)->postfix;
/// }
/// else
/// {
/// postfix = L")" + WString(reading + 2, processingToken.length - 3) + L"\"";
/// }
///
/// // try to find if the postfix, which is )____" in this case, appear in the given buffer
/// auto find = wcsstr(reading, postfix.Buffer());
/// if (find)
/// {
/// // if we find the postfix, it means we find the end of the literal string
/// // here processingToken.token automatically becomes 7
/// // interTokenState needs to be nullptr to indicate this
/// processingToken.length = (vint)(find - reading) + postfix.Length();
/// processingToken.completeToken = true;
/// processingToken.interTokenState = nullptr;
/// }
/// else
/// {
/// // if we don't find the postfix, it means the end of the literal string is in future lines
/// // we need to set the token to 7, which is the real token id for literal strings
/// // since we change any token from 3 to 7, 3 will never be passed to colorizeProc in "token" argument
/// processingToken.length = readingBuffer.Length();
/// processingToken.token = 7;
/// processingToken.completeToken = false;
///
/// // we need to ensure that interTokenState is not nullptr, and we can save the postfix here
/// if (!completeText && !processingToken.interTokenState)
/// {
/// auto state = new InterTokenState;
/// state->postfix = postfix;
/// processingToken.interTokenState = state;
/// }
/// }
/// }
/// };
///
/// RegexLexer lexer(tokenDefs, proc);
/// RegexLexerColorizer colorizer = lexer.Colorize();
///
/// void* lastInterTokenState = nullptr;
/// for (auto [line, index] : indexed(From(lines)))
/// {
/// Console::WriteLine(L"Begin line " + itow(index));
/// argument.processingText = line;
/// void* interTokenState = colorizer.Colorize(line, wcslen(line));
///
/// if (lastInterTokenState && lastInterTokenState != interTokenState)
/// {
/// // call the deleter manually
/// proc.deleter(lastInterTokenState);
/// }
/// lastInterTokenState = interTokenState;
///
/// argument.processingText = nullptr;
/// colorizer.Pass(L'\r');
/// colorizer.Pass(L'\n');
/// Console::WriteLine(L"");
/// }
/// }
/// ]]>
// The token extend callback documented by the comment above; it stays nullptr unless the caller assigns one.
RegexTokenExtendProc extendProc = nullptr;
///
///
/// The colorizer callback. It is called when a token is recognized.
///
///
/// The first argument is the user argument object (the "argument" field).
///
///
/// The second argument is the position of the first character of the token in characters.
///
///
/// The third argument is the length of the recognized token in characters.
///
///
/// The fourth argument identifies the regular expression in the list (the first argument in the constructor of the lexer) that matches this token.
///
///
RegexTokenColorizeProc colorizeProc = nullptr;
///
/// The argument object that is passed as the first argument to the extend callback and the colorize callback.
///
void* argument = nullptr;
};
/// Token collection representing the result from the lexical analyzer. Call RegexLexerBase_::Parse to create this object.
/// ");
/// }
/// }
/// ]]>
/// Enumerable collection of tokens for one piece of input text.
/// The constructor that builds it from an interpreter is protected and reachable by the friend class RegexLexerBase_.
template
class RegexTokens_ : public collections::EnumerableBase>
{
friend class RegexLexerBase_;
protected:
// Interpreter and state-to-token table supplied by the creator. They are held by pointer and by reference;
// the defaulted destructor does not release them.
regex_internal::PureInterpretor* pure;
const collections::Array& stateTokens;
// The text to tokenize, stored by value.
ObjectString code;
// Extra value supplied by the creator (see the codeIndex argument of Parse).
vint codeIndex;
// Callback configuration supplied by the creator.
RegexProc_ proc;
RegexTokens_(regex_internal::PureInterpretor* _pure, const collections::Array& _stateTokens, const ObjectString& _code, vint _codeIndex, RegexProc_ _proc);
public:
RegexTokens_(const RegexTokens_& tokens);
~RegexTokens_() = default;
collections::IEnumerator>* CreateEnumerator() const override;
/// Copy all tokens.
/// Returns all tokens.
/// A callback to decide which kind of tokens to discard. The input is [F:vl.regex.RegexToken.token]. Returns true to discard this kind of tokens.
/// The callback defaults to nullptr.
/// tokenDefs;
/// tokenDefs.Add(L"/d+");
/// tokenDefs.Add(L"/w+");
/// tokenDefs.Add(L"/s+");
///
/// RegexLexer lexer(tokenDefs, {});
/// WString input = L"I have 2 books.";
/// auto tokenResult = lexer.Parse(input);
///
/// List filtered;
/// tokenResult.ReadToEnd(filtered, [](vint token) { return token < 0 || token == 2; });
///
/// for (auto token : tokenResult)
/// {
/// // input must be in a variable
/// // because token.reading points to a position from input.Buffer();
/// Console::WriteLine(itow(token.token) + L": <" + WString(token.reading, token.length) + L">");
/// }
/// }
/// ]]>
// The default uses nullptr rather than a literal 0, consistent with the other pointer defaults in this header.
void ReadToEnd(collections::List>& tokens, bool(*discard)(vint)=nullptr)const;
};
/***********************************************************************
RegexLexerWalker
***********************************************************************/
/// A type for walking through a text against a lexical analyzer. Call RegexLexerBase_::Walk to create this object.
/// ");
/// tokenBegin = reading;
/// tokenEnd = nullptr;
/// tokenId = -1;
/// state = walker.GetStartState();
/// }
/// else
/// {
/// Console::WriteLine(L"Recognized token: " + itow(tokenId) + L": <" + WString(tokenBegin, tokenEnd - tokenBegin) + L">");
/// tokenBegin = reading = tokenEnd;
/// tokenEnd = nullptr;
/// tokenId = -1;
/// state = walker.GetStartState();
/// }
/// }
/// else
/// {
/// Console::WriteLine(L"Unrecognized character: <" + WString(*tokenBegin) + L">");
/// tokenBegin++;
/// state = walker.GetStartState();
/// }
/// }
/// else if (finalState)
/// {
/// tokenEnd = reading;
/// tokenId = token;
/// }
/// }
/// }
/// ]]>
/// Walks through characters one at a time against the DFA of a lexical analyzer.
/// The constructor that builds it from an interpreter is protected and reachable by the friend class RegexLexerBase_.
template
class RegexLexerWalker_ : public Object
{
friend class RegexLexerBase_;
protected:
// Interpreter and state-to-token table supplied by the creator. They are held by pointer and by reference;
// the defaulted destructor does not release them.
regex_internal::PureInterpretor* pure;
const collections::Array& stateTokens;
RegexLexerWalker_(regex_internal::PureInterpretor* _pure, const collections::Array& _stateTokens);
public:
RegexLexerWalker_(const RegexLexerWalker_& tokens);
~RegexLexerWalker_() = default;
/// Get the start DFA state number, which represents the correct state before parsing any input.
/// Returns the DFA state number.
/// When calling Walk for the first character, the return value should be passed as the state.
vint GetStartState()const;
/// Test if this state can only lead to the end of one kind of token.
/// Returns the token index if this state can only lead to the end of one kind of token. Returns -1 if not.
/// The DFA state number.
vint GetRelatedToken(vint state)const;
/// Step forward by one character.
/// The input character.
/// The current state. Returns the new current state when this function returns.
/// Returns the token index at the end of the token.
/// Returns true if it reach the end of the token.
/// Returns true if the previous character is the end of the token.
///
///
/// The "finalState" argument is important.
/// When "previousTokenStop" becomes true,
/// it tells you that this character can no longer form a token with previous consumed characters.
/// But it does not mean that the recognized token ends at the previous token.
/// The recognized token could end earlier,
/// which is indicated at the last time when "finalState" becomes true.
///
///
/// See the example for about how to use this function.
///
///
void Walk(T input, vint& state, vint& token, bool& finalState, bool& previousTokenStop)const;
/// Step forward by one character.
/// Returns the new current state. It is used to walk the next character.
/// The input character.
/// The current state.
vint Walk(T input, vint state)const;
/// Test if the input text is a closed token.
/// Returns true if the input text is a closed token.
/// The input text.
/// Size of the input text in characters.
///
///
/// A closed token means that,
/// there is a prefix that is a recognized token.
/// At the same time, the input string itself could not be a token, or a prefix of any token.
/// the recognized token has ended before reaching the end of the string.
///
///
/// An unrecognized token is also considered as closed.
///
///
/// For example, assume we have a token defined by "/d+./d+":
///
/// - "2" is not a closed token, because it has not ended.
/// -
/// "2.5." is a closed token, because it has ended at "2.5",
/// and "2.5." could never be a prefix of any token,
/// unless we have another token defined by "/d+./d+./d+".
///
///
///
///
/// tokenDefs;
/// tokenDefs.Add(L"/d+./d+");
/// tokenDefs.Add(L"/d+");
///
/// RegexLexer lexer(tokenDefs, {});
/// RegexLexerWalker walker = lexer.Walk();
///
/// WString tests[] = { L".", L"2", L"2.", L"2.5", L"2.5." };
/// for (auto test : From(tests))
/// {
/// if (walker.IsClosedToken(test.Buffer(), test.Length()))
/// {
/// Console::WriteLine(test + L" is a closed token.");
/// }
/// else
/// {
/// Console::WriteLine(test + L" is not a closed token.");
/// }
/// }
/// }
/// ]]>
bool IsClosedToken(const T* input, vint length)const;
/// Test if the input is a closed token.
/// Returns true if the input text is a closed token.
/// The input text.
///
///
/// A closed token means that,
/// there is a prefix that is a recognized token.
/// At the same time, the input string itself could not be a token, or a prefix of any token.
/// the recognized token has ended before reaching the end of the string.
///
///
/// An unrecognized token is also considered as closed.
///
///
/// For example, assume we have a token defined by "/d+./d+":
///
/// - "2" is not a closed token, because it has not ended.
/// -
/// "2.5." is a closed token, because it has ended at "2.5",
/// and "2.5." could never be a prefix of any token,
/// unless we have another token defined by "/d+./d+./d+".
///
///
///
///
/// tokenDefs;
/// tokenDefs.Add(L"/d+./d+");
/// tokenDefs.Add(L"/d+");
///
/// RegexLexer lexer(tokenDefs, {});
/// RegexLexerWalker walker = lexer.Walk();
///
/// WString tests[] = { L".", L"2", L"2.", L"2.5", L"2.5." };
/// for (auto test : From(tests))
/// {
/// if (walker.IsClosedToken(test))
/// {
/// Console::WriteLine(test + L" is a closed token.");
/// }
/// else
/// {
/// Console::WriteLine(test + L" is not a closed token.");
/// }
/// }
/// }
/// ]]>
bool IsClosedToken(const ObjectString& input)const;
};
/***********************************************************************
RegexLexerColorizer
***********************************************************************/
/// Lexical colorizer. Call RegexLexerBase_::Colorize to create this object.
/// (argument)->processingText;
/// Console::WriteLine(itow(token) + L": <" + WString(text + start, length) + L">");
/// };
///
/// RegexLexer lexer(tokenDefs, proc);
/// RegexLexerColorizer colorizer = lexer.Colorize();
///
/// for (auto [line, index] : indexed(From(lines)))
/// {
/// Console::WriteLine(L"Begin line " + itow(index));
/// argument.processingText = line;
/// colorizer.Colorize(line, wcslen(line));
///
/// argument.processingText = nullptr;
/// colorizer.Pass(L'\r');
/// colorizer.Pass(L'\n');
/// Console::WriteLine(L"");
/// }
/// }
/// ]]>
/// Colorizes text piece by piece. It combines a walker, the callback configuration and an internal state that can be saved and restored.
/// The constructor that builds it from a walker is protected and reachable by the friend class RegexLexerBase_.
template
class RegexLexerColorizer_ : public Object
{
friend class RegexLexerBase_;
public:
/// Snapshot of the colorizer state, as returned by GetInternalState and accepted by SetInternalState.
struct InternalState
{
// Initial values: -1 for both numbers and nullptr for the state pointer.
vint currentState = -1;
vint interTokenId = -1;
void* interTokenState = nullptr;
};
protected:
RegexLexerWalker_ walker;
RegexProc_ proc;
InternalState internalState;
// Internal helpers; their definitions are not visible in this header.
void CallExtendProcAndColorizeProc(const T* input, vint length, RegexProcessingToken& token, bool colorize);
vint WalkOneToken(const T* input, vint length, vint start, bool colorize);
RegexLexerColorizer_(const RegexLexerWalker_& _walker, RegexProc_ _proc);
public:
RegexLexerColorizer_(const RegexLexerColorizer_& colorizer) = default;
~RegexLexerColorizer_() = default;
/// Get the internal state.
/// Returns the internal state.
///
///
/// If has not been called, the return value of this function is the start state.
///
///
/// If a text is multi-lined, could be called line by line, and the internal state is changed.
///
///
/// In order to colorize another piece of multi-lined text,
/// you can either save the start state and call to reset the state,
/// or call for a new colorizer.
///
///
InternalState GetInternalState();
/// Restore the colorizer to a specified state.
/// The state to restore.
void SetInternalState(InternalState state);
/// Step forward by one character.
/// The input character.
/// Callbacks in will be called except colorizeProc, which is from the second argument of the constructor of .
void Pass(T input);
/// Get the start DFA state number, which represents the correct state before colorizing any characters.
/// Returns the DFA state number.
vint GetStartState()const;
/// Colorize a text.
/// Returns an inter token state at the end of this line. It could be the same object as the one returned from the previous call.
/// The text to colorize.
/// Size of the text in characters.
///
/// See and for more information about the return value.
/// Callbacks in will be called, which is from the second argument of the constructor of .
///
void* Colorize(const T* input, vint length);
};
/***********************************************************************
RegexLexer
***********************************************************************/
/// Base class of lexical analyzers. It holds the interpreter pointer and the state-to-token table, and creates tokenizers and colorizers.
class RegexLexerBase_ abstract : public Object
{
protected:
// Starts as nullptr; how it is assigned and released is not visible in this header.
regex_internal::PureInterpretor* pure = nullptr;
collections::Array stateTokens;
public:
~RegexLexerBase_();
/// Tokenize an input text.
/// The text to tokenize.
/// Configuration of all callbacks. Defaults to an empty configuration.
/// Extra information that will be copied to [F:vl.regex.RegexToken.codeIndex]. Defaults to -1.
/// Callbacks in will be called when iterating through tokens, which is from the second argument of the constructor of .
template
RegexTokens_ Parse(const ObjectString& code, RegexProc_ proc = {}, vint codeIndex = -1)const;
/// Convenience overload: wraps the raw character pointer in an ObjectString and forwards to the overload above.
template
RegexTokens_ Parse(const T* code, RegexProc_ proc = {}, vint codeIndex = -1) const { return Parse(ObjectString(code), proc, codeIndex); }
// NOTE(review): the next comment line describes creating a walker, but no Walk declaration is visible here,
// although the explicit instantiation declarations later in this header refer to RegexLexerBase_::Walk -- confirm against the full file.
/// Create an equivalent walker from this lexical analyzer. A walker enables you to walk through characters one by one,
/// The character type of the text to parse.
/// Returns the colorizer.
/// Configuration of all callbacks.
template
RegexLexerColorizer_ Colorize(RegexProc_ proc)const;
};
/// Lexical analyzer.
/// All regular expressions; each one represents a kind of token.
RegexLexer_(const collections::IEnumerable>& tokens);
// Creates a lexical analyzer from an input stream.
// NOTE(review): presumably the stream holds data previously written by Serialize below -- confirm.
RegexLexer_(stream::IStream& inputStream);
~RegexLexer_() = default;
// Writes this lexical analyzer to an output stream.
void Serialize(stream::IStream & outputStream);
};
/***********************************************************************
Template Instantiation
***********************************************************************/
// Explicit instantiation declarations: they tell including translation units not to instantiate these templates implicitly.
// The matching explicit instantiation definitions are expected in a source file that is not part of this header.
// NOTE(review): the template argument lists are not visible in this view, so each group of four repeated lines
// presumably differs only by character type -- confirm against the full file.
extern template class RegexString_;
extern template class RegexString_;
extern template class RegexString_;
extern template class RegexString_;
extern template class RegexMatch_;
extern template class RegexMatch_;
extern template class RegexMatch_;
extern template class RegexMatch_;
// Member function templates of RegexBase_, seven per group.
extern template RegexMatch_::Ref RegexBase_::MatchHead (const ObjectString& text)const;
extern template RegexMatch_::Ref RegexBase_::Match (const ObjectString& text)const;
extern template bool RegexBase_::TestHead (const ObjectString& text)const;
extern template bool RegexBase_::Test (const ObjectString& text)const;
extern template void RegexBase_::Search (const ObjectString& text, RegexMatch_::List& matches)const;
extern template void RegexBase_::Split (const ObjectString& text, bool keepEmptyMatch, RegexMatch_::List& matches)const;
extern template void RegexBase_::Cut (const ObjectString& text, bool keepEmptyMatch, RegexMatch_::List& matches)const;
extern template RegexMatch_::Ref RegexBase_::MatchHead (const ObjectString& text)const;
extern template RegexMatch_::Ref RegexBase_::Match (const ObjectString& text)const;
extern template bool RegexBase_::TestHead (const ObjectString& text)const;
extern template bool RegexBase_::Test (const ObjectString& text)const;
extern template void RegexBase_::Search (const ObjectString& text, RegexMatch_::List& matches)const;
extern template void RegexBase_::Split (const ObjectString& text, bool keepEmptyMatch, RegexMatch_::List& matches)const;
extern template void RegexBase_::Cut (const ObjectString& text, bool keepEmptyMatch, RegexMatch_::List& matches)const;
extern template RegexMatch_::Ref RegexBase_::MatchHead (const ObjectString& text)const;
extern template RegexMatch_::Ref RegexBase_::Match (const ObjectString& text)const;
extern template bool RegexBase_::TestHead (const ObjectString& text)const;
extern template bool RegexBase_::Test (const ObjectString& text)const;
extern template void RegexBase_::Search (const ObjectString& text, RegexMatch_::List& matches)const;
extern template void RegexBase_::Split (const ObjectString& text, bool keepEmptyMatch, RegexMatch_::List& matches)const;
extern template void RegexBase_::Cut (const ObjectString& text, bool keepEmptyMatch, RegexMatch_::List& matches)const;
extern template RegexMatch_::Ref RegexBase_::MatchHead (const ObjectString& text)const;
extern template RegexMatch_::Ref RegexBase_::Match (const ObjectString& text)const;
extern template bool RegexBase_::TestHead (const ObjectString& text)const;
extern template bool RegexBase_::Test (const ObjectString& text)const;
extern template void RegexBase_::Search (const ObjectString& text, RegexMatch_::List& matches)const;
extern template void RegexBase_::Split (const ObjectString& text, bool keepEmptyMatch, RegexMatch_::List& matches)const;
extern template void RegexBase_::Cut (const ObjectString& text, bool keepEmptyMatch, RegexMatch_::List& matches)const;
// Class templates.
extern template class Regex_;
extern template class Regex_;
extern template class Regex_;
extern template class Regex_;
extern template class RegexTokens_;
extern template class RegexTokens_;
extern template class RegexTokens_;
extern template class RegexTokens_;
extern template class RegexLexerWalker_;
extern template class RegexLexerWalker_;
extern template class RegexLexerWalker_;
extern template class RegexLexerWalker_;
extern template class RegexLexerColorizer_;
extern template class RegexLexerColorizer_;
extern template class RegexLexerColorizer_;
extern template class RegexLexerColorizer_;
// Member function templates of RegexLexerBase_, three per group.
extern template RegexTokens_ RegexLexerBase_::Parse (const ObjectString& code, RegexProc_ _proc, vint codeIndex)const;
extern template RegexLexerWalker_ RegexLexerBase_::Walk ()const;
extern template RegexLexerColorizer_ RegexLexerBase_::Colorize (RegexProc_ _proc)const;
extern template RegexTokens_ RegexLexerBase_::Parse (const ObjectString& code, RegexProc_ _proc, vint codeIndex)const;
extern template RegexLexerWalker_ RegexLexerBase_::Walk ()const;
extern template RegexLexerColorizer_ RegexLexerBase_::Colorize (RegexProc_ _proc)const;
extern template RegexTokens_ RegexLexerBase_::Parse (const ObjectString& code, RegexProc_ _proc, vint codeIndex)const;
extern template RegexLexerWalker_ RegexLexerBase_::Walk ()const;
extern template RegexLexerColorizer_ RegexLexerBase_::Colorize (RegexProc_ _proc)const;
extern template RegexTokens_ RegexLexerBase_::Parse (const ObjectString& code, RegexProc_ _proc, vint codeIndex)const;
extern template RegexLexerWalker_ RegexLexerBase_::Walk ()const;
extern template RegexLexerColorizer_ RegexLexerBase_::Colorize (RegexProc_ _proc)const;
extern template class RegexLexer_;
extern template class RegexLexer_;
extern template class RegexLexer_;
extern template class RegexLexer_;
// Convenience aliases that expose the underscore-suffixed types under their plain names.
using RegexString = RegexString_;
using RegexMatch = RegexMatch_;
using Regex = Regex_;
using RegexToken = RegexToken_;
using RegexProc = RegexProc_;
using RegexTokens = RegexTokens_;
using RegexLexerWalker = RegexLexerWalker_;
using RegexLexerColorizer = RegexLexerColorizer_;
using RegexLexer = RegexLexer_;
}
}
#endif
/***********************************************************************
.\REGEXCHARREADER.H
***********************************************************************/
/***********************************************************************
Author: Zihan Chen (vczh)
Licensed under https://github.com/vczh-libraries/License
***********************************************************************/
#ifndef VCZH_REGEX_REGEXCHARREADER
#define VCZH_REGEX_REGEXCHARREADER
namespace vl
{
namespace regex_internal
{
// Reads char32_t values out of a T-encoded buffer by delegating to
// encoding::UtfStringTo32Reader, while remembering the start of the buffer so
// that the current position can be reported as a pointer or as an offset.
template
struct CharReader
{
private:
    encoding::UtfStringTo32Reader reader;
    const T* input;

public:
    // The same pointer feeds the reader and serves as the base for Reading().
    CharReader(const T* _input)
        : reader(_input)
        , input(_input)
    {
    }

    // Returns whatever the underlying reader produces next.
    char32_t Read() { return reader.Read(); }

    // Offset reported by the reader's current source cluster.
    vint Index()
    {
        return reader.SourceCluster().index;
    }

    // Pointer into the original buffer at the offset reported by Index().
    const T* Reading()
    {
        return input + Index();
    }
};
// CharReader variant for input that is already char32_t: no decoding is needed,
// so it walks the zero-terminated buffer directly.
template<>
struct CharReader
{
private:
    const char32_t* input;
    vint index = 0;         // number of non-zero characters consumed so far
    bool finished = false;  // set once the terminating zero has been read

public:
    CharReader(const char32_t* _input)
        : input(_input)
    {
    }

    // Returns the next character, or 0 once the terminator has been reached.
    // After the terminator is seen every further call keeps returning 0 without
    // touching the buffer again.
    char32_t Read()
    {
        if (finished)
        {
            return 0;
        }

        const char32_t current = input[index];
        if (current == 0)
        {
            finished = true;
            return 0;
        }

        index++;
        return current;
    }

    // Offset of the character returned by the latest Read(): index - 1 while
    // characters are still being consumed, or index (the terminator's offset)
    // once the reader has finished. Yields -1 if called before any Read().
    vint Index()
    {
        if (finished)
        {
            return index;
        }
        return index - 1;
    }

    // Pointer to the position reported by Index().
    const char32_t* Reading()
    {
        return input + Index();
    }
};
}
}
#endif
/***********************************************************************
.\AUTOMATON\REGEXDATA.H
***********************************************************************/
/***********************************************************************
Author: Zihan Chen (vczh)
Licensed under https://github.com/vczh-libraries/License
***********************************************************************/
#ifndef VCZH_REGEX_REGEXDATA
#define VCZH_REGEX_REGEXDATA
namespace vl
{
namespace regex_internal
{
/***********************************************************************
CharRange
***********************************************************************/
// Range of code points [begin, end], both ends inclusive (see operator==(char32_t)).
// Comparisons against another CharRange order ranges by position, where "less"
// means "lies entirely before". Comparisons against a single char32_t treat
// equality as membership in the range.
class CharRange
{
public:
typedef collections::SortedList List;
    char32_t begin = 0;
    char32_t end = 0;

    CharRange() = default;
    CharRange(char32_t _begin, char32_t _end) : begin(_begin), end(_end) {}

    // ---- range vs. range ----

    // Same bounds on both ends.
    bool operator==(CharRange item) const
    {
        return !(begin != item.begin || end != item.end);
    }

    bool operator!=(CharRange item) const
    {
        return !(*this == item);
    }

    // This range ends before the other one starts.
    bool operator<(CharRange item) const
    {
        return item.begin > end;
    }

    // This range starts after the other one ends.
    bool operator>(CharRange item) const
    {
        return begin > item.end;
    }

    bool operator<=(CharRange item) const
    {
        if (end < item.begin)
        {
            return true;
        }
        return begin == item.begin && end == item.end;
    }

    bool operator>=(CharRange item) const
    {
        if (item.end < begin)
        {
            return true;
        }
        return begin == item.begin && end == item.end;
    }

    // ---- range vs. single character ----

    // The character lies inside the range.
    bool operator==(char32_t item) const
    {
        return !(item < begin || end < item);
    }

    // The character lies outside the range.
    bool operator!=(char32_t item) const
    {
        return !(*this == item);
    }

    // The whole range lies before the character.
    bool operator<(char32_t item) const
    {
        return item > end;
    }

    // The whole range lies after the character.
    bool operator>(char32_t item) const
    {
        return begin > item;
    }

    // "before or contains": for a range with begin <= end this reduces to begin <= item.
    bool operator<=(char32_t item) const
    {
        return item >= begin;
    }

    // "after or contains": for a range with begin <= end this reduces to item <= end.
    bool operator>=(char32_t item) const
    {
        return end >= item;
    }
};
}
}
#endif
/***********************************************************************
.\AUTOMATON\REGEXAUTOMATON.H
***********************************************************************/
/***********************************************************************
Author: Zihan Chen (vczh)
Licensed under https://github.com/vczh-libraries/License
***********************************************************************/
#ifndef VCZH_REGEX_REGEXAUTOMATON
#define VCZH_REGEX_REGEXAUTOMATON
namespace vl
{
namespace regex_internal
{
// Upper bound for char32_t values handled by the automaton code; 0x10FFFF is the
// highest Unicode code point.
constexpr char32_t MaxChar32 = 0x10FFFF;
// Forward declarations: State and Transition refer to each other through pointers.
class State;
class Transition;
// An edge of the automaton: connects a source state to a target state and
// carries the data that describes what kind of edge it is.
class Transition
{
public:
// Kind of the transition.
enum Type
{
Chars, // Character range transition
Epsilon, // Epsilon transition
BeginString, // String begin (see BeginExpression)
EndString, // String end (see EndExpression)
Nop, // Non-epsilon transition with no input
Capture, // Begin capture transition
Match, // Capture matching transition
Positive, // Begin positive lookahead
Negative, // Begin negative lookahead
NegativeFail, // Negative lookahead failure
End // For Capture, Positive, Negative
};
State* source; // State this transition leaves (non-owning pointer)
State* target; // State this transition enters (non-owning pointer)
CharRange range; // Character range; presumably only meaningful for Chars
Type type;
vint capture; // Capture id, as passed to Automaton::NewCapture / NewMatch
vint index; // Index passed to Automaton::NewMatch (default -1 there)
};
// A node of the automaton.
class State
{
public:
collections::List transitions; // presumably the transitions leaving this state - TODO confirm
collections::List inputs; // presumably the transitions entering this state - TODO confirm
bool finalState; // Whether this state is a final state
void* userData; // Opaque slot; its meaning is defined by whoever sets it
};
// Container for the states and transitions of one automaton, plus factory
// functions for creating them. Only declarations are visible here; the
// definitions live in a source file.
class Automaton
{
public:
typedef Ptr Ref;
collections::List> states;
collections::List> transitions;
collections::List captureNames;
State* startState;
Automaton();
// Creates a new state.
State* NewState();
// Creates a transition from start to end. The NewXxx functions below are named
// after the Transition::Type values; presumably each one creates a transition
// of that type - confirm against the definitions.
Transition* NewTransition(State* start, State* end);
Transition* NewChars(State* start, State* end, CharRange range);
Transition* NewEpsilon(State* start, State* end);
Transition* NewBeginString(State* start, State* end);
Transition* NewEndString(State* start, State* end);
Transition* NewNop(State* start, State* end);
Transition* NewCapture(State* start, State* end, vint capture);
// index defaults to -1 (MatchExpression documents -1 as "all of them").
Transition* NewMatch(State* start, State* end, vint capture, vint index=-1);
Transition* NewPositive(State* start, State* end);
Transition* NewNegative(State* start, State* end);
Transition* NewNegativeFail(State* start, State* end);
Transition* NewEnd(State* start, State* end);
};
// Predicates over a transition; their signature matches the epsilonChecker
// parameter of EpsilonNfaToNfa below.
extern bool PureEpsilonChecker(Transition* transition);
extern bool RichEpsilonChecker(Transition* transition);
// Compares two transitions (definition not visible in this header).
extern bool AreEqual(Transition* transA, Transition* transB);
// Automaton conversions, named epsilon-NFA -> NFA and NFA -> DFA. Each takes an
// output map parameter (nfaStateMap / dfaStateMap), presumably filled with the
// relation between result states and source states - TODO confirm.
extern Automaton::Ref EpsilonNfaToNfa(Automaton::Ref source, bool(*epsilonChecker)(Transition*), collections::Dictionary& nfaStateMap);
extern Automaton::Ref NfaToDfa(Automaton::Ref source, collections::Group& dfaStateMap);
}
}
#endif
/***********************************************************************
.\AST\REGEXEXPRESSION.H
***********************************************************************/
/***********************************************************************
Author: Zihan Chen (vczh)
Licensed under https://github.com/vczh-libraries/License
Classes:
Expression : Base class of expressions |
CharSetExpression : Character set | a, [a-b], [^a-b0_9], \.rnt\/()+*?{}[]<>^$!=SsDdLlWw, [\rnt-[]\/^$]
LoopExpression : Repeat | a{3}, a{3,}, a{1,3}, a+, a*, a?, LOOP?
SequenceExpression : Sequence of two regex | ab
AlternateExpression : Alternative of two regex | a|b
BeginExpression : String begin | ^
EndExpression : String end | $
CaptureExpression : Capture | (expr), (?expr)
MatchExpression : Capture matching | (<$name>), (<$name;i>), (<$i>)
PositiveExpression : Positive lookahead | (=expr)
NegativeExpression : Negative lookahead | (!expr)
UsingExpression : Reference to a named regex | (<#name1>expr)...(<&name1>)...
RegexExpression : Regular Expression
Functions:
ParseRegexExpression : Regex Syntax Analyzer
***********************************************************************/
#ifndef VCZH_REGEX_REGEXEXPRESSION
#define VCZH_REGEX_REGEXEXPRESSION
namespace vl
{
namespace regex_internal
{
class IRegexExpressionAlgorithm;
/***********************************************************************
Regex Expression AST
***********************************************************************/
// Base class of all regex AST nodes. Concrete nodes implement Apply() to
// dispatch to the matching IRegexExpressionAlgorithm::Visit overload.
class Expression : public Object
{
public:
NOT_COPYABLE(Expression);
Expression() = default;
typedef Ptr Ref;
typedef collections::Dictionary Map;
// Visitor entry point.
virtual void Apply(IRegexExpressionAlgorithm& algorithm)=0;
// The members below are only declared here; the notes follow their names and
// should be confirmed against the definitions.
bool IsEqual(Expression* expression); // Comparison with another expression
bool HasNoExtension();
bool CanTreatAsPure();
void NormalizeCharSet(CharRange::List& subsets);
void CollectCharSet(CharRange::List& subsets);
void ApplyCharSet(CharRange::List& subsets);
Automaton::Ref GenerateEpsilonNfa(); // Returns an automaton for this expression
};
// AST node: character set (see the syntax table in the header comment of this file).
class CharSetExpression : public Expression
{
public:
CharRange::List ranges; // Character ranges of the set
bool reverse; // presumably true for a negated set such as [^...] - TODO confirm
// Adds a range; the name suggests the result reports a conflict with existing
// ranges - confirm the meaning of the return value against the definition.
bool AddRangeWithConflict(CharRange range);
void Apply(IRegexExpressionAlgorithm& algorithm);
};
// AST node: repetition of a sub expression.
class LoopExpression : public Expression
{
public:
Expression::Ref expression; // The regex to loop
vint min; // Minimum count of looping
vint max; // Maximum count of looping, -1 for infinite
bool preferLong; // Prefer longer matching
void Apply(IRegexExpressionAlgorithm& algorithm);
};
// AST node: two expressions matched one after the other.
class SequenceExpression : public Expression
{
public:
Expression::Ref left; // First regex to match
Expression::Ref right; // Last regex to match
void Apply(IRegexExpressionAlgorithm& algorithm);
};
// AST node: alternative of two expressions (a|b).
class AlternateExpression : public Expression
{
public:
Expression::Ref left; // First alternative
Expression::Ref right; // Second alternative
void Apply(IRegexExpressionAlgorithm& algorithm);
};
// AST node: string begin (^). Carries no data.
class BeginExpression: public Expression
{
public:
void Apply(IRegexExpressionAlgorithm& algorithm);
};
// AST node: string end ($). Carries no data.
class EndExpression : public Expression
{
public:
void Apply(IRegexExpressionAlgorithm& algorithm);
};
// AST node: capture of a sub expression, optionally named.
class CaptureExpression : public Expression
{
public:
U32String name; // Capture name, empty for anonymous capture
Expression::Ref expression; // Regex to match
void Apply(IRegexExpressionAlgorithm& algorithm);
};
// AST node: matches text that was captured earlier.
class MatchExpression : public Expression
{
public:
U32String name; // Capture name, empty for anonymous
vint index; // Index of the captured text (associated with the name) to match, -1 for all of them
void Apply(IRegexExpressionAlgorithm& algorithm);
};
// AST node: positive lookahead, written (=expr).
class PositiveExpression : public Expression
{
public:
Expression::Ref expression; // Regex to match
void Apply(IRegexExpressionAlgorithm& algorithm);
};
// AST node: negative lookahead, written (!expr).
class NegativeExpression : public Expression
{
public:
Expression::Ref expression; // Regex of the lookahead
void Apply(IRegexExpressionAlgorithm& algorithm);
};
// AST node: reference to a named regex (see RegexExpression::definitions).
class UsingExpression : public Expression
{
public:
U32String name; // Name of the regex to refer to
void Apply(IRegexExpressionAlgorithm& algorithm);
};
// A complete regular expression: the main expression plus the named
// definitions that it can refer to.
class RegexExpression : public Object
{
public:
typedef Ptr Ref;
Expression::Map definitions; // Named regex to be referred
Expression::Ref expression; // Regex to match
NOT_COPYABLE(RegexExpression);
RegexExpression() = default;
// Returns a single expression; presumably the main expression with the named
// definitions substituted in - confirm against the definition.
Expression::Ref Merge();
};
/***********************************************************************
Visitor
***********************************************************************/
// Visitor interface over the regex AST: one Visit overload per concrete
// Expression subclass. Expression::Apply takes a reference to this interface.
class IRegexExpressionAlgorithm : public Interface
{
public:
virtual void Visit(CharSetExpression* expression)=0;
virtual void Visit(LoopExpression* expression)=0;
virtual void Visit(SequenceExpression* expression)=0;
virtual void Visit(AlternateExpression* expression)=0;
virtual void Visit(BeginExpression* expression)=0;
virtual void Visit(EndExpression* expression)=0;
virtual void Visit(CaptureExpression* expression)=0;
virtual void Visit(MatchExpression* expression)=0;
virtual void Visit(PositiveExpression* expression)=0;
virtual void Visit(NegativeExpression* expression)=0;
virtual void Visit(UsingExpression* expression)=0;
};
template
class RegexExpressionAlgorithm : public Object, public IRegexExpressionAlgorithm
{
private:
ReturnType returnValue;
ParameterType* parameterValue;
public:
ReturnType Invoke(Expression* expression, ParameterType parameter)
{
parameterValue=¶meter;
expression->Apply(*this);
return returnValue;
}
ReturnType Invoke(Expression::Ref expression, ParameterType parameter)
{
parameterValue=¶meter;
expression->Apply(*this);
return returnValue;
}
virtual ReturnType Apply(CharSetExpression* expression, ParameterType parameter)=0;
virtual ReturnType Apply(LoopExpression* expression, ParameterType parameter)=0;
virtual ReturnType Apply(SequenceExpression* expression, ParameterType parameter)=0;
virtual ReturnType Apply(AlternateExpression* expression, ParameterType parameter)=0;
virtual ReturnType Apply(BeginExpression* expression, ParameterType parameter)=0;
virtual ReturnType Apply(EndExpression* expression, ParameterType parameter)=0;
virtual ReturnType Apply(CaptureExpression* expression, ParameterType parameter)=0;
virtual ReturnType Apply(MatchExpression* expression, ParameterType parameter)=0;
virtual ReturnType Apply(PositiveExpression* expression, ParameterType parameter)=0;
virtual ReturnType Apply(NegativeExpression* expression, ParameterType parameter)=0;
virtual ReturnType Apply(UsingExpression* expression, ParameterType parameter)=0;
public:
void Visit(CharSetExpression* expression)
{
returnValue=Apply(expression, *parameterValue);
}
void Visit(LoopExpression* expression)
{
returnValue=Apply(expression, *parameterValue);
}
void Visit(SequenceExpression* expression)
{
returnValue=Apply(expression, *parameterValue);
}
void Visit(AlternateExpression* expression)
{
returnValue=Apply(expression, *parameterValue);
}
void Visit(BeginExpression* expression)
{
returnValue=Apply(expression, *parameterValue);
}
void Visit(EndExpression* expression)
{
returnValue=Apply(expression, *parameterValue);
}
void Visit(CaptureExpression* expression)
{
returnValue=Apply(expression, *parameterValue);
}
void Visit(MatchExpression* expression)
{
returnValue=Apply(expression, *parameterValue);
}
void Visit(PositiveExpression* expression)
{
returnValue=Apply(expression, *parameterValue);
}
void Visit(NegativeExpression* expression)
{
returnValue=Apply(expression, *parameterValue);
}
void Visit(UsingExpression* expression)
{
returnValue=Apply(expression, *parameterValue);
}
};
// Variant of RegexExpressionAlgorithm for algorithms that produce no result:
// Invoke() and the Apply overloads return void, so no return value is stored.
// Otherwise the dispatch is the same: Invoke() remembers the address of its
// by-value parameter, calls Expression::Apply, and the selected Visit override
// forwards to the matching Apply overload.
template
class RegexExpressionAlgorithm : public Object, public IRegexExpressionAlgorithm
{
private:
    ParameterType* parameterValue;
public:
    // Runs the algorithm on a raw expression pointer.
    void Invoke(Expression* expression, ParameterType parameter)
    {
        // Points at this call's local copy; only dereferenced by Visit() while
        // Apply() below is running.
        parameterValue=&parameter;
        expression->Apply(*this);
    }
    // Same as above for a reference-counted expression.
    void Invoke(Expression::Ref expression, ParameterType parameter)
    {
        parameterValue=&parameter;
        expression->Apply(*this);
    }
    // One overload per node type, implemented by the concrete algorithm.
    virtual void Apply(CharSetExpression* expression, ParameterType parameter)=0;
    virtual void Apply(LoopExpression* expression, ParameterType parameter)=0;
    virtual void Apply(SequenceExpression* expression, ParameterType parameter)=0;
    virtual void Apply(AlternateExpression* expression, ParameterType parameter)=0;
    virtual void Apply(BeginExpression* expression, ParameterType parameter)=0;
    virtual void Apply(EndExpression* expression, ParameterType parameter)=0;
    virtual void Apply(CaptureExpression* expression, ParameterType parameter)=0;
    virtual void Apply(MatchExpression* expression, ParameterType parameter)=0;
    virtual void Apply(PositiveExpression* expression, ParameterType parameter)=0;
    virtual void Apply(NegativeExpression* expression, ParameterType parameter)=0;
    virtual void Apply(UsingExpression* expression, ParameterType parameter)=0;
public:
    // IRegexExpressionAlgorithm implementation: forward to Apply with a copy of
    // the stored parameter.
    void Visit(CharSetExpression* expression)
    {
        Apply(expression, *parameterValue);
    }
    void Visit(LoopExpression* expression)
    {
        Apply(expression, *parameterValue);
    }
    void Visit(SequenceExpression* expression)
    {
        Apply(expression, *parameterValue);
    }
    void Visit(AlternateExpression* expression)
    {
        Apply(expression, *parameterValue);
    }
    void Visit(BeginExpression* expression)
    {
        Apply(expression, *parameterValue);
    }
    void Visit(EndExpression* expression)
    {
        Apply(expression, *parameterValue);
    }
    void Visit(CaptureExpression* expression)
    {
        Apply(expression, *parameterValue);
    }
    void Visit(MatchExpression* expression)
    {
        Apply(expression, *parameterValue);
    }
    void Visit(PositiveExpression* expression)
    {
        Apply(expression, *parameterValue);
    }
    void Visit(NegativeExpression* expression)
    {
        Apply(expression, *parameterValue);
    }
    void Visit(UsingExpression* expression)
    {
        Apply(expression, *parameterValue);
    }
};
/***********************************************************************
Helper Functions
***********************************************************************/
// Parser entry points, one per grammar level. Each takes the input cursor by
// reference (const char32_t*&); presumably the cursor is advanced past the text
// that was consumed - confirm against the definitions.
extern Ptr ParseLoop(const char32_t*& input);
extern Ptr ParseCharSet(const char32_t*& input);
extern Ptr ParseFunction(const char32_t*& input);
extern Ptr ParseUnit(const char32_t*& input);
extern Ptr ParseJoin(const char32_t*& input);
extern Ptr ParseAlt(const char32_t*& input);
extern Ptr ParseExpression(const char32_t*& input);
// Parses a whole regex (listed as the "Regex Syntax Analyzer" in the header comment).
extern RegexExpression::Ref ParseRegexExpression(const U32String& code);
// Text helpers for converting between literal text and regex-escaped text.
extern U32String EscapeTextForRegex(const U32String& literalString);
extern U32String UnescapeTextForRegex(const U32String& escapedText);
extern U32String NormalizeEscapedTextForRegex(const U32String& escapedText);
extern bool IsRegexEscapedLiteralString(const U32String& regex);
// Exception that, in addition to the message handled by the Exception base,
// stores a U32String `code` and a `position` supplied by the thrower.
class RegexException : public Exception
{
public:
    U32String code;
    vint position;

    RegexException(const WString& _message, const U32String& _code, vint _position)
        : Exception(_message), code(_code), position(_position)
    {
    }

    // Read-only accessor for `position`.
    vint GetPosition() const
    {
        return position;
    }

    // Read-only accessor for `code`.
    const U32String& GetCode() const
    {
        return code;
    }
};
}
}
#endif
/***********************************************************************
.\AST\REGEXWRITER.H
***********************************************************************/
/***********************************************************************
Author: Zihan Chen (vczh)
Licensed under https://github.com/vczh-libraries/License
***********************************************************************/
#ifndef VCZH_REGEX_REGEXWRITER
#define VCZH_REGEX_REGEXWRITER
namespace vl
{
namespace regex
{
// Thin wrapper around an Expression::Ref that allows regex ASTs to be composed
// in code with member functions and operators. Only declarations are visible
// here; the notes on individual members follow their names and signatures and
// should be confirmed against the definitions.
class RegexNode : public Object
{
public:
vl::regex_internal::Expression::Ref expression; // The wrapped AST
RegexNode(vl::regex_internal::Expression::Ref _expression);
// Repetition helpers; presumably they build a LoopExpression around this node.
RegexNode Some()const;
RegexNode Any()const;
RegexNode Opt()const;
RegexNode Loop(vint min, vint max)const;
RegexNode AtLeast(vint min)const;
// Combinators. NOTE(review): binary + and | look like sequence and alternation;
// the meaning of unary +, unary -, ! and % is not visible in this header.
RegexNode operator+(const RegexNode& node)const;
RegexNode operator|(const RegexNode& node)const;
RegexNode operator+()const;
RegexNode operator-()const;
RegexNode operator!()const;
RegexNode operator%(const RegexNode& node)const;
};
// Factory functions returning RegexNode values. The names mirror the AST node
// types (Capture, Using, Match, Begin, End); definitions are not in this header.
extern RegexNode rCapture(const U32String& name, const RegexNode& node);
extern RegexNode rUsing(const U32String& name);
extern RegexNode rMatch(const U32String& name, vint index=-1);
extern RegexNode rMatch(vint index);
extern RegexNode rBegin();
extern RegexNode rEnd();
// Character node; b defaults to 0. NOTE(review): presumably b == 0 means the single
// character a and otherwise the range a-b - confirm.
extern RegexNode rC(char32_t a, char32_t b=0);
// NOTE(review): presumably the \d, \l and \w character classes - confirm.
extern RegexNode r_d();
extern RegexNode r_l();
extern RegexNode r_w();
extern RegexNode rAnyChar();
}
}
#endif
/***********************************************************************
.\REGEXPURE.H
***********************************************************************/
/***********************************************************************
Author: Zihan Chen (vczh)
Licensed under https://github.com/vczh-libraries/License
***********************************************************************/
#ifndef VCZH_REGEX_REGEXPURE
#define VCZH_REGEX_REGEXPURE
namespace vl
{
namespace stream
{
class IStream;
}
namespace regex_internal
{
// Output of PureInterpretor::MatchHead / Match.
class PureResult
{
public:
vint start; // Start of the match; presumably an offset into the input - TODO confirm
vint length; // Length of the match
vint finalState; // State information reported by the interpreter - see the definitions
vint terminateState; // State information reported by the interpreter - see the definitions
};
// Table-driven interpreter built from a DFA and a list of character subsets
// (see the first constructor). It can also be written to and constructed from
// a stream. Only declarations are visible here.
class PureInterpretor : public Object
{
using CharRangeArray = collections::Array;
protected:
// One map entry per value in 0..MaxChar32.
static const vint SupportedCharCount = MaxChar32 + 1;
CharRangeArray charRanges;
// NOTE: this array is embedded in the object and has over a million entries,
// so instances are large.
vint charMap[SupportedCharCount]; // char -> char set index
vint* transitions = nullptr; // (state * charSetCount + charSetIndex) -> state
bool* finalState = nullptr; // state -> bool
vint* relatedFinalState = nullptr; // state -> (finalState or -1)
vint stateCount;
vint charSetCount;
vint startState;
void ExpandCharRanges();
public:
PureInterpretor(Automaton::Ref dfa, CharRange::List& subsets);
PureInterpretor(stream::IStream& inputStream);
// presumably releases the raw tables above - confirm against the definition.
~PureInterpretor();
void Serialize(stream::IStream& outputStream);
// Matching entry points; explicit instantiations for wchar_t, char8_t, char16_t
// and char32_t are declared after the class.
template
bool MatchHead(const TChar* input, const TChar* start, PureResult& result);
template
bool Match(const TChar* input, const TChar* start, PureResult& result);
// Low-level access for driving the state machine one character at a time.
vint GetStartState();
vint Transit(char32_t input, vint state);
bool IsFinalState(vint state);
bool IsDeadState(vint state);
// NOTE(review): presumably must be called before GetRelatedFinalState - confirm.
void PrepareForRelatedFinalStateTable();
vint GetRelatedFinalState(vint state);
};
// Explicit instantiation declarations of MatchHead / Match for the four supported
// character types; the definitions are expected in a source file.
extern template bool PureInterpretor::MatchHead(const wchar_t* input, const wchar_t* start, PureResult& result);
extern template bool PureInterpretor::MatchHead(const char8_t* input, const char8_t* start, PureResult& result);
extern template bool PureInterpretor::MatchHead(const char16_t* input, const char16_t* start, PureResult& result);
extern template bool PureInterpretor::MatchHead(const char32_t* input, const char32_t* start, PureResult& result);
extern template bool PureInterpretor::Match(const wchar_t* input, const wchar_t* start, PureResult& result);
extern template bool PureInterpretor::Match(const char8_t* input, const char8_t* start, PureResult& result);
extern template bool PureInterpretor::Match(const char16_t* input, const char16_t* start, PureResult& result);
extern template bool PureInterpretor::Match(const char32_t* input, const char32_t* start, PureResult& result);
}
}
#endif
/***********************************************************************
.\REGEXRICH.H
***********************************************************************/
/***********************************************************************
Author: Zihan Chen (vczh)
Licensed under https://github.com/vczh-libraries/License
***********************************************************************/
#ifndef VCZH_REGEX_REGEXRICH
#define VCZH_REGEX_REGEXRICH
namespace vl
{
namespace regex_internal
{
// One captured span, as stored in RichResult::captures.
class CaptureRecord
{
public:
vint capture; // Capture id; presumably an index into the capture names - TODO confirm
vint start; // Start of the captured text
vint length; // Length of the captured text
bool operator==(const CaptureRecord& record)const;
};
}
namespace regex_internal
{
// Output of RichInterpretor::MatchHead / Match: the matched span plus the
// captures collected along the way.
class RichResult
{
public:
vint start; // Start of the match
vint length; // Length of the match
collections::List captures;
};
// Interpreter that runs directly on an automaton and reports captures through
// RichResult. Only declarations are visible here.
class RichInterpretor : public Object
{
public:
protected:
// Per-state bookkeeping.
class UserData
{
public:
bool NeedKeepState;
};
Automaton::Ref dfa;
UserData* datas; // Raw pointer; presumably allocated by the constructor and released by the destructor - confirm
public:
RichInterpretor(Automaton::Ref _dfa);
~RichInterpretor();
// Matching entry points; explicit instantiations are declared after the class.
template
bool MatchHead(const TChar* input, const TChar* start, RichResult& result);
template
bool Match(const TChar* input, const TChar* start, RichResult& result);
// Names of the captures; presumably forwarded from the automaton's captureNames.
const collections::List& CaptureNames();
};
// Explicit instantiation declarations of MatchHead / Match per character type;
// the definitions are expected in a source file.
extern template bool RichInterpretor::MatchHead(const wchar_t* input, const wchar_t* start, RichResult& result);
extern template bool RichInterpretor::MatchHead(const char8_t* input, const char8_t* start, RichResult& result);
extern template bool RichInterpretor::MatchHead(const char16_t* input, const char16_t* start, RichResult& result);
extern template bool RichInterpretor::MatchHead(const char32_t* input, const char32_t* start, RichResult& result);
extern template bool RichInterpretor::Match(const wchar_t* input, const wchar_t* start, RichResult& result);
extern template bool RichInterpretor::Match(const char8_t* input, const char8_t* start, RichResult& result);
extern template bool RichInterpretor::Match(const char16_t* input, const char16_t* start, RichResult& result);
extern template bool RichInterpretor::Match