...

2026-06-01 23:06:39 +08:00 · 2018-09-18 01:45:51 -07:00
parent 90c212ee9f
commit df13c62a90
4 changed files with 425 additions and 233 deletions
@@ -17814,7 +17814,12 @@ GuiTextBoxRegexColorizer
 				}
 				else
 				{
-					lexer=new regex::RegexLexer(tokenRegexes);
+					{
 						regex::RegexProc proc;
 						proc.colorizeProc = &GuiTextBoxRegexColorizer::ColorizerProc;
 						proc.argument = colorizerArgument;
 						lexer = new regex::RegexLexer(tokenRegexes, proc);
 					}
 					colors.Resize(1 + tokenRegexes.Count() + extraTokenColors.Count());
 					colors[0] = defaultColor;
 					for (vint i = 0; i < tokenColors.Count(); i++)
@@ -17855,10 +17860,13 @@ GuiTextBoxRegexColorizer
 					data.colors = colors;
 					data.contextState = contextState;
-					colorizer->Reset(lexerState);
+					regex::RegexLexerColorizer::InternalState internalState;
-					colorizer->Colorize(text, length, &GuiTextBoxRegexColorizer::ColorizerProc, &data);
+					internalState.currentState = lexerState;
 					colorizer->SetInternalState(internalState);
 					colorizerArgument[0] = &data;
 					colorizer->Colorize(text, length);
-					lexerState=colorizer->GetCurrentState();
+					lexerState = colorizer->GetInternalState().currentState;
 					contextState = data.contextState;
 				}
 				else
@@ -16207,6 +16207,7 @@ GuiTextBoxRegexColorizer
 			protected:
 				Ptr<regex::RegexLexer>										lexer;
 				Ptr<regex::RegexLexerColorizer>								colorizer;
 				void*														colorizerArgument[1] { nullptr };
 				ColorArray													colors;
 				elements::text::ColorEntry									defaultColor;
@@ -13318,7 +13318,7 @@ ParsingTable
 				{
 					discardTokenInfos[i].regexTokenIndex = regexTokenIndex++;
 				}
-				lexer=new RegexLexer(tokens);
+				lexer = new RegexLexer(tokens, {});
 				ruleMap.Clear();
 				FOREACH_INDEXER(RuleInfo, rule, index, ruleInfos)
@@ -20107,17 +20107,18 @@ RegexTokens
 		{
 		protected:
 			RegexToken				token;
-			vint					index;
+			vint					index = -1;
 			PureInterpretor*		pure;
 			const Array<vint>&		stateTokens;
 			const wchar_t*			start;
 			vint					codeIndex;
 			RegexProc				proc;
 			const wchar_t*			reading;
-			vint					rowStart;
+			vint					rowStart = 0;
-			vint					columnStart;
+			vint					columnStart = 0;
-			bool					cacheAvailable;
+			bool					cacheAvailable = false;
 			RegexToken				cacheToken;
 		public:
@@ -20126,6 +20127,7 @@ RegexTokens
 				, index(enumerator.index)
 				, pure(enumerator.pure)
 				, stateTokens(enumerator.stateTokens)
 				, proc(enumerator.proc)
 				, reading(enumerator.reading)
 				, start(enumerator.start)
 				, rowStart(enumerator.rowStart)
@@ -20136,16 +20138,14 @@ RegexTokens
 			{
 			}
 			// Creates an enumerator positioned at the beginning of the text buffer `_start`.
 			// `_pure` and `_stateTokens` are stored as given (`stateTokens` is a reference member),
 			// `_start` initializes both `start` and `reading`, `_codeIndex` is kept in `codeIndex`
 			// and `_proc` (the callback set) is copied into `proc`.
 			// NOTE(review): as displayed, `reading(_start)` occurs twice in this initializer list and
 			// the entries written in the old `,name` style (no space after the comma) look like removed
 			// lines whose `-` marker is missing - confirm against the real file.
-			RegexTokenEnumerator(PureInterpretor* _pure, const Array<vint>& _stateTokens, const wchar_t* _start, vint _codeIndex)
+			RegexTokenEnumerator(PureInterpretor* _pure, const Array<vint>& _stateTokens, const wchar_t* _start, vint _codeIndex, RegexProc _proc)
 				:index(-1)
 				, pure(_pure)
 				, stateTokens(_stateTokens)
 				,reading(_start)
 				, start(_start)
 				,rowStart(0)
 				,columnStart(0)
 				, codeIndex(_codeIndex)
-				,cacheAvailable(false)
+				, proc(_proc)
 				, reading(_start)
 			{
 			}
@@ -20180,6 +20180,7 @@ RegexTokens
 					token.token = -2;
 					token.completeToken = true;
 				}
 				token.rowStart = rowStart;
 				token.columnStart = columnStart;
 				token.rowEnd = rowStart;
@@ -20217,6 +20218,19 @@ RegexTokens
 					{
 						id = stateTokens.Get(result.finalState);
 					}
 					if (id != -1 && proc.extendProc)
 					{
 						RegexProcessingToken token(result.start, result.length, id, completeToken, nullptr);
 						proc.extendProc(proc.argument, reading, -1, true, token);
 #if _DEBUG
 						CHECK_ERROR(token.interTokenState == nullptr, L"RegexTokenEnumerator::Next()#The extendProc is only allowed to create interTokenState in RegexLexerColorizer.");
 #endif
 						result.length = token.length;
 						id = token.token;
 						completeToken = token.completeToken;
 					}
 					if (token.token == -2)
 					{
 						token.start = result.start;
@@ -20239,6 +20253,7 @@ RegexTokens
 						cacheToken.completeToken = completeToken;
 					}
 					reading += result.length;
 					if (cacheAvailable)
 					{
 						break;
@@ -20283,11 +20298,12 @@ RegexTokens
 			}
 		};
 		// Stores everything needed to create enumerators later: the interpretor pointer, a reference
 		// to the state-to-token table, a copy of the text, the code index and the callback set `_proc`.
 		// The new signature adds `_proc`, which is later handed to every RegexTokenEnumerator.
-		RegexTokens::RegexTokens(PureInterpretor* _pure, const Array<vint>& _stateTokens, const WString& _code, vint _codeIndex)
+		RegexTokens::RegexTokens(PureInterpretor* _pure, const Array<vint>& _stateTokens, const WString& _code, vint _codeIndex, RegexProc _proc)
 			:pure(_pure)
 			, stateTokens(_stateTokens)
 			, code(_code)
 			, codeIndex(_codeIndex)
 			, proc(_proc)
 		{
 		}
@@ -20296,12 +20312,13 @@ RegexTokens
 			, stateTokens(tokens.stateTokens)
 			, code(tokens.code)
 			, codeIndex(tokens.codeIndex)
 			, proc(tokens.proc)
 		{
 		}
 		// Allocates a new RegexTokenEnumerator over `code.Buffer()` with `new` and returns the raw pointer.
 		// The enumerator receives this object's `proc`, so the callbacks are available while enumerating.
 		// NOTE(review): who deletes the returned pointer is not visible here - presumably the caller.
 		IEnumerator<RegexToken>* RegexTokens::CreateEnumerator()const
 		{
-			return new RegexTokenEnumerator(pure, stateTokens, code.Buffer(), codeIndex);
+			return new RegexTokenEnumerator(pure, stateTokens, code.Buffer(), codeIndex, proc);
 		}
 		bool DefaultDiscard(vint token)
@@ -20315,7 +20332,7 @@ RegexTokens
 			{
 				discard=&DefaultDiscard;
 			}
-			RegexTokenEnumerator(pure, stateTokens, code.Buffer(), codeIndex).ReadToEnd(tokens, discard);
+			RegexTokenEnumerator(pure, stateTokens, code.Buffer(), codeIndex, proc).ReadToEnd(tokens, discard);
 		}
 /***********************************************************************
@@ -20328,9 +20345,9 @@ RegexLexerWalker
 		{
 		}
 		// Copy constructor: takes over `pure` and `stateTokens` from the source walker.
 		// This change only renames the parameter (`walker` -> `tokens`) and reformats the initializer list.
 		// NOTE(review): the new parameter name `tokens` refers to a RegexLexerWalker, not to tokens.
-		RegexLexerWalker::RegexLexerWalker(const RegexLexerWalker& walker)
+		RegexLexerWalker::RegexLexerWalker(const RegexLexerWalker& tokens)
-			:pure(walker.pure)
+			: pure(tokens.pure)
-			,stateTokens(walker.stateTokens)
+			, stateTokens(tokens.stateTokens)
 		{
 		}
@@ -20338,6 +20355,10 @@ RegexLexerWalker
 		{
 		}
 		// Destructor with an empty body: no explicit cleanup is performed here.
 		RegexTokens::~RegexTokens()
 		{
 		}
 		vint RegexLexerWalker::GetStartState()const
 		{
 			return pure->GetStartState();
@@ -20418,15 +20439,17 @@ RegexLexerWalker
 RegexLexerColorizer
 ***********************************************************************/
 		// Builds a colorizer from a walker and a callback set.
 		// The DFA state now lives in `internalState.currentState` (the removed `currentState` member
 		// is replaced), and it starts at the walker's start state.
-		RegexLexerColorizer::RegexLexerColorizer(const RegexLexerWalker& _walker)
+		RegexLexerColorizer::RegexLexerColorizer(const RegexLexerWalker& _walker, RegexProc _proc)
 			:walker(_walker)
-			,currentState(_walker.GetStartState())
+			, proc(_proc)
 		{
 			internalState.currentState = walker.GetStartState();
 		}
 		// Copy constructor: copies the walker, the callback set and the whole internal state
 		// (including the `interTokenState` pointer value, which is copied as a plain pointer).
 		RegexLexerColorizer::RegexLexerColorizer(const RegexLexerColorizer& colorizer)
 			:walker(colorizer.walker)
-			,currentState(colorizer.currentState)
+			, proc(colorizer.proc)
 			, internalState(colorizer.internalState)
 		{
 		}
@@ -20434,14 +20457,18 @@ RegexLexerColorizer
 		{
 		}
 		// Returns a copy of the current internal state (DFA state, inter token id and inter token state).
 		// This replaces the removed Reset(vint); restoring a state is done through SetInternalState.
-		void RegexLexerColorizer::Reset(vint state)
+		RegexLexerColorizer::InternalState RegexLexerColorizer::GetInternalState()
 		{
-			currentState=state;
+			return internalState;
 		}
 		// Overwrites the whole internal state with `state`; nothing is validated and the previous
 		// `interTokenState` pointer is simply replaced.
 		void RegexLexerColorizer::SetInternalState(InternalState state)
 		{
 			internalState = state;
 		}
 		// Feeds a single character to the colorizer by treating it as a one-character buffer.
 		// `colorize` is false, so WalkOneToken does not call proc.colorizeProc; the return value
 		// (the next start index) is ignored.
 		void RegexLexerColorizer::Pass(wchar_t input)
 		{
-			currentState=walker.Walk(input, currentState);
+			WalkOneToken(&input, 1, 0, false);
 		}
 		vint RegexLexerColorizer::GetStartState()const
@@ -20449,80 +20476,155 @@ RegexLexerColorizer
 			return walker.GetStartState();
 		}
 		// Gives proc.extendProc the chance to modify a freshly recognized `token`, records a returned
 		// inter token state in `internalState`, and finally reports the token through proc.colorizeProc
 		// when `colorize` is true.
 		// proc.extendProc is called without a null check here; both call sites in WalkOneToken test
 		// `proc.extendProc` before calling this function.
 		// (This replaces the removed GetCurrentState(), whose lines are still shown with `-`.)
-		vint RegexLexerColorizer::GetCurrentState()const
+		void RegexLexerColorizer::CallExtendProcAndColorizeProc(const wchar_t* input, vint length, RegexProcessingToken& token, bool colorize)
 		{
-			return currentState;
+			vint oldTokenLength = token.length;
 			// The callback sees the buffer from the token start onwards; completeText is false.
 			proc.extendProc(proc.argument, input + token.start, length - token.start, false, token);
 #if _DEBUG
 			// Debug-only validation of what the callback did to the token
 			// (oldTokenLength is only read inside this section).
 			{
 				bool pausedAtTheEnd = token.start + token.length == length && !token.completeToken;
 				CHECK_ERROR(
 					token.completeToken || pausedAtTheEnd,
 					L"RegexLexerColorizer::WalkOneToken(const wchar_t*, vint, vint, bool)#The extendProc is not allowed pause before the end of the input."
 				);
 				CHECK_ERROR(
 					token.completeToken || token.token != -1,
 					L"RegexLexerColorizer::WalkOneToken(const wchar_t*, vint, vint, bool)#The extendProc is not allowed to pause without a valid token id."
 				);
 				CHECK_ERROR(
 					oldTokenLength <= token.length,
 					L"RegexLexerColorizer::WalkOneToken(const wchar_t*, vint, vint, bool)#The extendProc is not allowed to decrease the token length."
 				);
 				CHECK_ERROR(
 					(token.interTokenState == nullptr) == !pausedAtTheEnd,
 					L"RegexLexerColorizer::Colorize(const wchar_t*, vint, void*)#The extendProc should return an inter token state object if and only if a valid token does not end at the end of the input."
 				);
 			}
 #endif
 			// Intentional assignment inside the condition: always store the pointer, and when it is
 			// non-null also remember the token id and restart the DFA from the start state.
 			if ((internalState.interTokenState = token.interTokenState))
 			{
 				internalState.interTokenId = token.token;
 				internalState.currentState = walker.GetStartState();
 			}
 			if (colorize)
 			{
 				proc.colorizeProc(proc.argument, token.start, token.length, token.token);
 			}
 		}
 		// Processes one token of `input` and returns an index; Colorize() feeds that index back in
 		// as the next `start`.
 		//   - If an inter token state is pending (internalState.interTokenState is non-null),
 		//     proc.extendProc is called with the whole buffer and a token whose start and length are -1,
 		//     and token.length is returned.
 		//   - Otherwise every character from `start` is passed to walker.Walk together with
 		//     internalState.currentState, remembering the length and id of the last final state in
 		//     lastFinalStateLength / lastFinalStateToken.
 		// proc.colorizeProc is only called when `colorize` is true.
 		// NOTE(review): this hunk interleaves the new function with the removed
 		// Colorize(input, length, tokenProc, tokenProcArgument) body. Statements that use `tokenProc`,
 		// `stop`, `index`, `state` or `currentState` refer to names the new function does not declare and
 		// are presumably removed lines whose `-` marker is missing - verify against the real file.
-		void RegexLexerColorizer::Colorize(const wchar_t* input, vint length, TokenProc tokenProc, void* tokenProcArgument)
+		vint RegexLexerColorizer::WalkOneToken(const wchar_t* input, vint length, vint start, bool colorize)
 		{
-			vint start=0;
+			if (internalState.interTokenState)
-			vint stop=0;
+			{
-			vint state=-1;
+				RegexProcessingToken token(-1, -1, internalState.interTokenId, false, internalState.interTokenState);
-			vint token=-1;
+				proc.extendProc(proc.argument, input, length, false, token);
 #if _DEBUG
 				{
 					bool pausedAtTheEnd = token.length == length && !token.completeToken;
 					CHECK_ERROR(
 						token.completeToken || pausedAtTheEnd,
 						L"RegexLexerColorizer::WalkOneToken(const wchar_t*, vint, vint, bool)#The extendProc is not allowed to pause before the end of the input."
 					);
 					CHECK_ERROR(
 						token.completeToken || token.token == internalState.interTokenId,
 						L"RegexLexerColorizer::WalkOneToken(const wchar_t*, vint, vint, bool)#The extendProc is not allowed to continue pausing with a different token id."
 					);
 					CHECK_ERROR(
 						(token.interTokenState == nullptr) == !pausedAtTheEnd,
 						L"RegexLexerColorizer::Colorize(const wchar_t*, vint, void*)#The extendProc should return an inter token state object if and only if a valid token does not end at the end of the input."
 					);
 				}
 #endif
 				if (colorize)
 				{
 					proc.colorizeProc(proc.argument, 0, token.length, token.token);
 				}
 				// Intentional assignment inside the condition: keep the returned state, and clear the
 				// saved token id once no inter token state remains.
 				if (!(internalState.interTokenState = token.interTokenState))
 				{
 					internalState.interTokenId = -1;
 				}
 				return token.length;
 			}
-			vint index=0;
+			vint lastFinalStateLength = 0;
 			vint lastFinalStateToken = -1;
 			for (vint i = start; i < length; i++)
 			{
 				vint currentToken = -1;
 				bool finalState = false;
 				bool previousTokenStop = false;
-
+				walker.Walk(input[i], internalState.currentState, currentToken, finalState, previousTokenStop);
 			while(index<length)
 			{
 				currentToken=-1;
 				finalState=false;
 				previousTokenStop=false;
 				walker.Walk(input[index], currentState, currentToken, finalState, previousTokenStop);
 				if (previousTokenStop)
 				{
-					vint tokenLength=stop-start;
+					internalState.currentState = walker.GetStartState();
-					if(tokenLength>0)
+					if (proc.extendProc && lastFinalStateToken != -1)
 					{
-						tokenProc(tokenProcArgument, start, tokenLength, token);
+						RegexProcessingToken token(start, lastFinalStateLength, lastFinalStateToken, true, nullptr);
-						currentState=state;
+						CallExtendProcAndColorizeProc(input, length, token, colorize);
-						start=stop;
+						return start + token.length;
 						index=stop-1;
 						state=-1;
 						token=-1;
 						finalState=false;
 					}
-					else if(stop<index)
+					else if (i == start)
 					{
-						stop=index+1;
+						if (colorize)
 						tokenProc(tokenProcArgument, start, stop-start, -1);
 						start=index+1;
 						state=-1;
 						token=-1;
 					}
 				}
 				if(finalState)
 						{
-					stop=index+1;
+							proc.colorizeProc(proc.argument, start, 1, -1);
 					state=currentState;
 					token=currentToken;
 						}
-
+						return i + 1;
 				index++;
 			}
 			if(start<length)
 			{
 				if(finalState)
 				{
 					tokenProc(tokenProcArgument, start, length-start, token);
 					}
 					else
 					{
-					tokenProc(tokenProcArgument, start, length-start, walker.GetRelatedToken(currentState));
+						if (colorize)
 						{
 							proc.colorizeProc(proc.argument, start, lastFinalStateLength, lastFinalStateToken);
 						}
 						return start + lastFinalStateLength;
 					}
 				}
 				if (finalState)
 				{
 					lastFinalStateLength = i + 1 - start;
 					lastFinalStateToken = currentToken;
 				}
 			}
 			// The input ended without the walker reporting a token stop.
 			if (lastFinalStateToken != -1)
 			{
 				if (proc.extendProc)
 				{
 					RegexProcessingToken token(start, lastFinalStateLength, lastFinalStateToken, true, nullptr);
 					CallExtendProcAndColorizeProc(input, length, token, colorize);
 				}
 				else if (colorize)
 				{
 					proc.colorizeProc(proc.argument, start, lastFinalStateLength, lastFinalStateToken);
 				}
 			}
 			else if (colorize)
 			{
 				proc.colorizeProc(proc.argument, start, length - start, walker.GetRelatedToken(internalState.currentState));
 			}
 			return length;
 		}
 		// Colorizes `length` characters of `input` by calling WalkOneToken repeatedly, each call
 		// continuing at the index returned by the previous one, with colorizing enabled.
 		// Returns internalState.interTokenState as it stands after the last token; it is non-null
 		// when proc.extendProc left an inter token state behind.
 		// The loop stops as soon as the index reaches or passes `length`. Comparing with `<` instead of
 		// `!=` keeps a negative `length`, or an extendProc that extends a token beyond the buffer, from
 		// triggering one more WalkOneToken call that would report a negative-length range.
 		void* RegexLexerColorizer::Colorize(const wchar_t* input, vint length)
 		{
 			vint index = 0;
 			while (index < length)
 			{
 				index = WalkOneToken(input, length, index, true);
 			}
 			return internalState.interTokenState;
 		}
 /***********************************************************************
 RegexLexer
 ***********************************************************************/
-		RegexLexer::RegexLexer(const collections::IEnumerable<WString>& tokens)
+		RegexLexer::RegexLexer(const collections::IEnumerable<WString>& tokens, RegexProc _proc)
-			:pure(0)
+			:proc(_proc)
 		{
 			// Build DFA for all tokens
 			List<Expression::Ref> expressions;
@@ -20622,7 +20724,7 @@ RegexLexer
 		// Returns a RegexTokens collection for `code`. The interpretor is asked to prepare its
 		// related-final-state table first, and the lexer's callback set `proc` is forwarded so that
 		// enumerators created from the result can use it.
 		RegexTokens RegexLexer::Parse(const WString& code, vint codeIndex)const
 		{
 			pure->PrepareForRelatedFinalStateTable();
-			return RegexTokens(pure, stateTokens, code, codeIndex);
+			return RegexTokens(pure, stateTokens, code, codeIndex, proc);
 		}
 		RegexLexerWalker RegexLexer::Walk()const
@@ -20633,7 +20735,7 @@ RegexLexer
 		// Creates a colorizer on top of a fresh walker (see Walk()), handing it the lexer's
 		// callback set `proc`.
 		RegexLexerColorizer RegexLexer::Colorize()const
 		{
-			return RegexLexerColorizer(Walk());
+			return RegexLexerColorizer(Walk(), proc);
 		}
 	}
 }
@@ -6820,9 +6820,8 @@ Tokenizer
 ***********************************************************************/
 		/// <summary>A token.</summary>
-		class RegexToken
+		struct RegexToken
 		{
 		public:
 			/// <summary>Position in the input string.</summary>
 			vint										start;
 			/// <summary>Size of this token in characters.</summary>
@@ -6849,6 +6848,75 @@ Tokenizer
 			bool										operator==(const wchar_t* _token)const;
 		};
 		/// <summary>Token information for <see cref="RegexProc::extendProc"/>.</summary>
 		/// <remarks>
 		/// The callback receives this object by reference. <see cref="start"/> is const;
 		/// the other fields may be rewritten by the callback and are read back by the caller.
 		/// </remarks>
 		struct RegexProcessingToken
 		{
 			/// <summary>
 			/// The read only start position of the token.
 			/// This value will be -1 if <see cref="interTokenState"/> is not null.
 			/// </summary>
 			const vint									start;
 			/// <summary>
 			/// The length of the token, could be modified after the callback.
 			/// When the callback returns, the length is not allowed to be decreased.
 			/// This value will be -1 if <see cref="interTokenState"/> is not null.
 			/// </summary>
 			vint										length;
 			/// <summary>
 			/// The id of the token, could be modified after the callback.
 			/// </summary>
 			vint										token;
 			/// <summary>
 			/// The flag indicating if this token is completed, could be modified after the callback.
 			/// </summary>
 			bool										completeToken;
 			/// <summary>
 			/// The inter token state object, could be modified after the callback.
 			/// When the callback returns:
 			///   if the completeText parameter is true in <see cref="RegexProc::extendProc"/>, it should be nullptr.
 			///   if the token is not completed when the input ends (the callback pauses at the end of the input), it should not be nullptr.
 			///   if a token is completed, it should be nullptr.
 			/// </summary>
 			void*										interTokenState;
 			/// <summary>Initialize every field from the argument of the same name.</summary>
 			/// <param name="_start">Value for <see cref="start"/>; it cannot be changed afterwards.</param>
 			/// <param name="_length">Initial value for <see cref="length"/>.</param>
 			/// <param name="_token">Initial value for <see cref="token"/>.</param>
 			/// <param name="_completeToken">Initial value for <see cref="completeToken"/>.</param>
 			/// <param name="_interTokenState">Initial value for <see cref="interTokenState"/>.</param>
 			RegexProcessingToken(vint _start, vint _length, vint _token, bool _completeToken, void* _interTokenState)
 				:start(_start)
 				, length(_length)
 				, token(_token)
 				, completeToken(_completeToken)
 				, interTokenState(_interTokenState)
 			{
 			}
 		};
 		/// <summary>Function pointer type of <see cref="RegexProc::deleter"/>: receives the inter token state object to delete.</summary>
 		using RegexInterTokenStateDeleter = void(*)(void* interTokenState);
 		/// <summary>Function pointer type of <see cref="RegexProc::extendProc"/>: receives the user argument, the text buffer, its length, the completeText flag and the token to process.</summary>
 		using RegexTokenExtendProc = void(*)(void* argument, const wchar_t* reading, vint length, bool completeText, RegexProcessingToken& processingToken);
 		/// <summary>Function pointer type of <see cref="RegexProc::colorizeProc"/>: receives the user argument, then the start, the length and the id of a token.</summary>
 		using RegexTokenColorizeProc =  void(*)(void* argument, vint start, vint length, vint token);
 		/// <summary>Callback procedures. Every member defaults to nullptr.</summary>
 		struct RegexProc
 		{
 			/// <summary>
 			/// The deleter which deletes inter token state objects created by <see cref="extendProc"/>. This callback is not called automatically.
 			/// </summary>
 			RegexInterTokenStateDeleter					deleter = nullptr;
 			/// <summary>
 			/// The token extend callback. It is called after recognizing any token, and run a customized procedure to modify the token based on the given context.
 			/// If the length parameter is -1, it means the caller does not measure the incoming text buffer, which automatically indicates that the buffer is null-terminated.
 			/// If the length parameter is not -1, it means the number of available characters in the buffer.
 			/// The completeText parameter could be true or false. When it is false, it means that the buffer does not contain all the text.
 			/// This member may stay nullptr, in which case recognized tokens are used without modification.
 			/// </summary>
 			RegexTokenExtendProc						extendProc = nullptr;
 			/// <summary>
 			/// The colorizer callback. It is called when a token is recognized.
 			/// The arguments are the start position, the length and the id of the token; the id can be -1
 			/// (presumably for text that matches no token - to be confirmed).
 			/// </summary>
 			RegexTokenColorizeProc						colorizeProc = nullptr;
 			/// <summary>
 			/// The argument object that is the first argument for <see cref="extendProc"/> and <see cref="colorizeProc"/>.
 			/// </summary>
 			void*										argument = nullptr;
 		};
 		/// <summary>Token collection representing the result from the lexical analyzer.</summary>
 		class RegexTokens : public Object, public collections::IEnumerable<RegexToken>
 		{
@@ -6858,10 +6926,12 @@ Tokenizer
 			const collections::Array<vint>&				stateTokens;
 			WString										code;
 			vint										codeIndex;
 			RegexProc									proc;
-			RegexTokens(regex_internal::PureInterpretor* _pure, const collections::Array<vint>& _stateTokens, const WString& _code, vint _codeIndex);
+			RegexTokens(regex_internal::PureInterpretor* _pure, const collections::Array<vint>& _stateTokens, const WString& _code, vint _codeIndex, RegexProc _proc);
 		public:
 			RegexTokens(const RegexTokens& tokens);
 			~RegexTokens();
 			collections::IEnumerator<RegexToken>*		CreateEnumerator()const;
@@ -6881,7 +6951,7 @@ Tokenizer
 			RegexLexerWalker(regex_internal::PureInterpretor* _pure, const collections::Array<vint>& _stateTokens);
 		public:
-			RegexLexerWalker(const RegexLexerWalker& walker);
+			RegexLexerWalker(const RegexLexerWalker& tokens);
 			~RegexLexerWalker();
 			/// <summary>Get the start DFA state number, which represents the correct state before parsing any input.</summary>
@@ -6919,48 +6989,59 @@ Tokenizer
 		{
 			friend class RegexLexer;
 		public:
-			typedef void(*TokenProc)(void* argument, vint start, vint length, vint token);
 			/// <summary>The state of a colorizer that can be read by GetInternalState and restored by SetInternalState.</summary>
+			struct InternalState
 			{
 				/// <summary>The DFA state number that is passed to the walker for the next character.</summary>
 				vint									currentState = -1;
 				/// <summary>The token id remembered while <see cref="interTokenState"/> is not null, otherwise -1.</summary>
 				vint									interTokenId = -1;
 				/// <summary>The inter token state object returned by the extend callback, or nullptr if there is none.</summary>
 				void*									interTokenState = nullptr;
 			};
 		protected:
 			RegexLexerWalker							walker;
-			vint										currentState;
+			RegexProc									proc;
 			InternalState								internalState;
-			RegexLexerColorizer(const RegexLexerWalker& _walker);
+			void										CallExtendProcAndColorizeProc(const wchar_t* input, vint length, RegexProcessingToken& token, bool colorize);
 			vint										WalkOneToken(const wchar_t* input, vint length, vint start, bool colorize);
 			RegexLexerColorizer(const RegexLexerWalker& _walker, RegexProc _proc);
 		public:
 			RegexLexerColorizer(const RegexLexerColorizer& colorizer);
 			~RegexLexerColorizer();
-			/// <summary>Reset the colorizer using the DFA state number.</summary>
+			/// <summary>Get the internal state.</summary>
-			/// <param name="state">The DFA state number.</param>
+			/// <returns>The internal state.</returns>
-			void										Reset(vint state);
+			InternalState								GetInternalState();
 			/// <summary>Restore the colorizer to an internal state.</summary>
 			/// <param name="state">The internal state.</param>
 			void										SetInternalState(InternalState state);
 			/// <summary>Step forward by one character.</summary>
 			/// <param name="input">The input character.</param>
 			void										Pass(wchar_t input);
 			/// <summary>Get the start DFA state number, which represents the correct state before colorizing any characters.</summary>
 			/// <returns>The DFA state number.</returns>
 			vint										GetStartState()const;
-			/// <summary>Get the current DFA state number.</summary>
+			/// <summary>Colorize a text.</summary>
-			/// <returns>The DFA state number.</returns>
+			/// <returns>An inter token state at the end of this line. It could be the same object which is returned from the previous call.</returns>
 			vint										GetCurrentState()const;
 			/// <summary>Colorize a text.</summary>
 			/// <param name="input">The text to colorize.</param>
 			/// <param name="length">Size of the text in characters.</param>
-			/// <param name="tokenProc">Colorizer callback. This callback will be called if any token is found..</param>
+			void*										Colorize(const wchar_t* input, vint length);
 			/// <param name="tokenProcArgument">The argument to call the callback.</param>
 			void										Colorize(const wchar_t* input, vint length, TokenProc tokenProc, void* tokenProcArgument);
 		};
 		/// <summary>Lexical analyzer.</summary>
 		class RegexLexer : public Object, private NotCopyable
 		{
 		protected:
-			regex_internal::PureInterpretor*			pure;
+			regex_internal::PureInterpretor*			pure = nullptr;
 			collections::Array<vint>					ids;
 			collections::Array<vint>					stateTokens;
 			RegexProc									proc;
 		public:
 			/// <summary>Create a lexical analyzer by a set of regular expressions. [F:vl.regex.RegexToken.token] will be the index of the matched regular expression.</summary>
 			/// <param name="tokens">The regular expressions.</param>
-			RegexLexer(const collections::IEnumerable<WString>& tokens);
+			/// <param name="_proc">Callback procedures.</param>
 			RegexLexer(const collections::IEnumerable<WString>& tokens, RegexProc _proc);
 			~RegexLexer();
 			/// <summary>Tokenize a input text.</summary>