Remove redundant logic and simplify string tokenization

spoonmilk · spoonmilk · commit 6e2915d449da · 2025-06-16T17:42:43.000-04:00
diff --git a/formatter/generic/genericformatter.cpp b/formatter/generic/genericformatter.cpp
@@ -19,7 +19,6 @@ enum ItemType
 	StringComponent,
 	StringSeparator,
 	StringWhitespace,
-	StringNewline,
 	FormatSpecifier,
 	EscapeSequence,
 	Group,
@@ -263,16 +262,26 @@ static vector<InstructionTextToken> ParseStringToken(
     const auto& src = unprocessedStringToken.text;
     const size_t tail = src.size();
 
-	// Max parsing length set for performance reasons, increase at your own peril!
+	// Strings longer than maxParsingLength (callers pass the maximum annotation length) are returned unsplit
     if (tail > maxParsingLength)
         return { unprocessedStringToken };
-
     vector<InstructionTextToken> result;
     size_t curStart = 0, curEnd = 0;
+
     auto ConstructToken = [&](size_t start, size_t end) {
-        result.emplace_back(StringToken, string(src.substr(start, end - start)));
+    	InstructionTextToken token = unprocessedStringToken;
+    	const string newTxt = string(src.substr(start, end - start));
+    	token.text = newTxt;
+    	token.width = newTxt.size();
+        result.emplace_back(token);
     };
 
+	auto flushToken = [&](size_t start, size_t end)
+	{
+		if (start < end)
+			ConstructToken(start, end);
+	};
+
 	// We generally split along spaces while keeping words intact, but some cases have
 	// specific splitting behavior:
 	//
@@ -288,8 +297,7 @@ static vector<InstructionTextToken> ParseStringToken(
         if (c == '%')
         {
         	// Flush before format specifier
-            if (curStart < curEnd)
-                ConstructToken(curStart, curEnd);
+        	flushToken(curStart, curEnd);
 
             size_t start = curEnd;
             curEnd++;
@@ -301,8 +309,7 @@ static vector<InstructionTextToken> ParseStringToken(
         else if (c == '\\')
         {
         	// Flush before escape sequence
-            if (curStart < curEnd)
-                ConstructToken(curStart, curEnd);
+			flushToken(curStart, curEnd);
 
             size_t start = curEnd;
             curEnd++;  // consume '\'
@@ -314,8 +321,8 @@ static vector<InstructionTextToken> ParseStringToken(
         else if (c == ',' || c == '.' || c == ':' || c == ';' || isspace(c))
         {
         	// Flush before punctuation
-            if (curStart < curEnd)
-                ConstructToken(curStart, curEnd);
+        	flushToken(curStart, curEnd);
+
 			// Group together repeated punctuation
             size_t start = curEnd;
             while (curEnd < tail && src[curEnd] == c)
@@ -329,9 +336,7 @@ static vector<InstructionTextToken> ParseStringToken(
         }
     }
 
-    if (curStart < curEnd)
-        ConstructToken(curStart, curEnd);
-
+	flushToken(curStart, curEnd);
     return result;
 }
 
@@ -341,7 +346,7 @@ static vector<Item> CreateStringGroups(const vector<Item>& items)
     bool hasStrings = false;
     for (auto& i : items)
     {
-		if ((i.type == StringSeparator) && !i.tokens.empty())
+		if (i.type == StringSeparator && !i.tokens.empty())
 		{
 			// We try to push separators onto a preceding word, otherwise treat as
 			// a singular atom
@@ -370,13 +375,16 @@ static vector<Item> CreateStringGroups(const vector<Item>& items)
     	}
     	else if (i.type == FormatSpecifier || i.type == EscapeSequence)
     	{
+    		// Flush previous tokens before special sequences like format specifiers or
+    		// escape sequences
     		if (!pending.empty())
     		{
     			result.push_back(Item {StringComponent, pending, {}, 0 });
     			pending.clear();
     		}
     		result.push_back(Item { Atom, i.items, i.tokens, i.width});
     	}
+
     	else if (i.type == StartOfContainer && pending.empty())
     	{
     		result.push_back(i);
@@ -739,6 +747,7 @@ vector<DisassemblyTextLine> GenericLineFormatter::FormatLines(
 			switch (token.type)
 			{
 			case BraceToken:
+				// Beginning of string
 				if (tokenIndex + 1 < currentLine.tokens.size()
 					&& currentLine.tokens[tokenIndex + 1].type == StringToken)
 				{
@@ -751,7 +760,7 @@ vector<DisassemblyTextLine> GenericLineFormatter::FormatLines(
 					items.clear();
 					items.push_back(Item {StartOfContainer, {}, {token}, 0});
 				}
-				// Check for end of string - gross!
+				// End of string
 				else if (currentLine.tokens[tokenIndex].type == StringToken
 					&& tokenIndex + 1 < currentLine.tokens.size()
 					&& currentLine.tokens[tokenIndex + 1].type == BraceToken)
@@ -817,24 +826,15 @@ vector<DisassemblyTextLine> GenericLineFormatter::FormatLines(
 			case StringToken:
 			{
 				vector<InstructionTextToken> stringTokens = ParseStringToken(token, settings.maximumAnnotationLength);
-				for (size_t k = 0; k < stringTokens.size(); k++)
+				for (auto subToken : stringTokens)
 				{
-					InstructionTextToken subToken = stringTokens[k];
 					string trimmedSubText = TrimString(subToken.text);
 					if (trimmedSubText.empty())
 						items.push_back(Item {StringWhitespace, {}, {subToken}, 0});
 					if (trimmedSubText[0] == '%')
 						items.push_back(Item {FormatSpecifier, {}, {subToken}, 0});
 					else if (!trimmedSubText.empty() && trimmedSubText[0] == '\\')
-					{
-						if (trimmedSubText.size() > 1)
-						{
-							if (trimmedSubText[1] == 'n')
-								items.push_back(Item {StringNewline, {}, {subToken}, 0});
-							continue;
-						}
 						items.push_back(Item {EscapeSequence, {}, {subToken}, 0});
-					}
 					else if (trimmedSubText[0] == ',' || trimmedSubText[0] == '.' || trimmedSubText[0] == ':' || trimmedSubText[0] == ';')
 						items.push_back(Item {StringSeparator, {}, {subToken}, 0});
 					else
@@ -937,9 +937,16 @@ vector<DisassemblyTextLine> GenericLineFormatter::FormatLines(
 
 			for (auto item = items.begin(); item != items.end();)
 			{
-				if (currentWidth + item->width > desiredWidth && item->type != StringWhitespace)
+				if (item->type == StringComponent && currentWidth + item->width > desiredWidth)
+				{
+					// If a string component is too wide to fit on the current line, start a
+					// new line without additional indentation
+					newLine();
+				}
+				else if (currentWidth + item->width > desiredWidth && item->type != StringWhitespace)
 				{
 					// Current item is too wide to fit on the current line, will need to start a new line.
+					// Whitespace is exempt from this width check: it may overflow, staying with the preceding word when that word is wrapped.
 					auto next = item;
 					++next;
 
@@ -948,7 +955,7 @@ vector<DisassemblyTextLine> GenericLineFormatter::FormatLines(
 					// is a container, always use the splitting behavior.
 					if (currentWidth == 0 || item->width > desiredContinuationWidth || item->type == Container)
 					{
-						if ((item->type == Argument || item->type == StringComponent) && currentWidth != 0)
+						if (item->type == Argument && currentWidth != 0)
 						{
 							// If an argument is too wide to show on a single line all by itself, start the argument
 							// on a new line, and add additional indentation for the continuation of the argument.