Start implementing language annotation support

dyegoaurelio · dyegoaurelio · commit 6830219a733c · 2025-09-22T14:20:22.000-03:00
Add initial support for language annotations such as `/* lua */`.
These remain block comments when they directly precede a string literal,
while other block comments are still converted to line comments.

- Detect language annotations: single-line, non-doc comments with valid language identifiers
- Preserve as `/* lang */` block comment syntax instead of converting to `# lang` line comments
- Works with both regular strings `"..."` and indented strings `''...''`
diff --git a/src/Nixfmt/Lexer.hs b/src/Nixfmt/Lexer.hs
@@ -6,11 +6,13 @@
 module Nixfmt.Lexer (lexeme, pushTrivia, takeTrivia, whole) where
 
 import Control.Monad.State.Strict (MonadState, evalStateT, get, modify, put)
-import Data.Char (isSpace)
+import Data.Char (isAlphaNum, isSpace)
 import Data.List (dropWhileEnd)
 import Data.Maybe (fromMaybe)
 import Data.Text as Text (
   Text,
+  all,
+  any,
   isPrefixOf,
   length,
   lines,
@@ -29,6 +31,7 @@ import Data.Void (Void)
 import Nixfmt.Types (
   Ann (..),
   Parser,
+  Token (TDoubleQuote, TDoubleSingleQuote),
   TrailingComment (..),
   Trivia,
   Trivium (..),
@@ -43,9 +46,11 @@ import Text.Megaparsec (
   chunk,
   getSourcePos,
   hidden,
+  lookAhead,
   many,
   manyTill,
   notFollowedBy,
+  optional,
   some,
   try,
   unPos,
@@ -59,6 +64,8 @@ data ParseTrivium
     PTLineComment Text Pos
   | -- Track whether it is a doc comment
     PTBlockComment Bool [Text]
+  | -- | Language annotation like /* lua */ (single line, non-doc)
+    PTLanguageAnnotation Text
   deriving (Show)
 
 preLexeme :: Parser a -> Parser a
@@ -133,6 +140,7 @@ convertTrailing = toMaybe . join . map toText
   where
     toText (PTLineComment c _) = strip c
     toText (PTBlockComment False [c]) = strip c
+    toText (PTLanguageAnnotation _) = "" -- Language annotations don't become trailing comments
     toText _ = ""
     join = Text.unwords . filter (/= "")
     toMaybe "" = Nothing
@@ -148,6 +156,7 @@ convertLeading =
         PTBlockComment _ [] -> []
         PTBlockComment False [c] -> [LineComment $ " " <> strip c]
         PTBlockComment isDoc cs -> [BlockComment isDoc cs]
+        PTLanguageAnnotation c -> [LanguageAnnotation c]
     )
 
 isTrailing :: ParseTrivium -> Bool
@@ -156,17 +165,75 @@ isTrailing (PTBlockComment False []) = True
 isTrailing (PTBlockComment False [_]) = True
 isTrailing _ = False
 
-convertTrivia :: [ParseTrivium] -> Pos -> (Maybe TrailingComment, Trivia)
-convertTrivia pts nextCol =
+-- | True for a one-line comment body that, once stripped, is a short language tag.
+isLanguageIdentifier :: Text -> Bool
+isLanguageIdentifier content = and [not (Text.null tag), short, Text.all tagChar tag, oneLine]
+  where
+    tag = strip content
+    short = Text.length tag <= 30 -- TODO: make configurable or remove limit
+    tagChar c = isAlphaNum c || c `elem` ['-', '+', '.', '_']
+    oneLine = not (Text.any (`elem` ['\n', '\r']) content)
+
+-- | True only for 'TDoubleQuote' / 'TDoubleSingleQuote'; any other token or 'Nothing' is False.
+isStringToken :: Maybe Token -> Bool
+isStringToken (Just TDoubleQuote) = True
+isStringToken (Just TDoubleSingleQuote) = True
+isStringToken _ = False
+
+-- | Turn a block-comment body into a language annotation when a string follows it.
+toLangAnnotation :: Text -> Maybe Token -> Maybe ParseTrivium
+toLangAnnotation content nextToken =
+  if isStringToken nextToken && isLanguageIdentifier content
+    then Just (PTLanguageAnnotation (strip content))
+    else Nothing
+
+-- | Split the trivia found after a token into that token's trailing comment and the
+-- leading trivia of the next token.
+--
+-- 'nextCol' is the column of the next lexeme and 'nextToken' the string opener that
+-- follows, if any. When a string follows, the first block comment accepted by
+-- 'toLangAnnotation' is kept as a language annotation: a lone trailing one is moved
+-- in front of the leading trivia, a leading one is replaced in place.
+convertTrivia :: [ParseTrivium] -> Pos -> Maybe Token -> (Maybe TrailingComment, Trivia)
+convertTrivia pts nextCol nextToken =
   let (trailing, leading) = span isTrailing pts
-  in case (trailing, leading) of
+      -- Swap the first qualifying block comment for its annotation; 'Nothing' if none.
+      annotateFirst [] = Nothing
+      annotateFirst (PTBlockComment False [content] : rest)
+        | Just langAnnotation <- toLangAnnotation content nextToken =
+            Just (langAnnotation : rest)
+      annotateFirst (x : xs) = (x :) <$> annotateFirst xs
+      (trailing', leading') = case trailing of
+        -- A lone trailing block comment that qualifies belongs to the string after it,
+        -- so it is moved to the front of the leading trivia.
+        [PTBlockComment False [content]]
+          | Just langAnnotation <- toLangAnnotation content nextToken ->
+              ([], langAnnotation : leading)
+        -- Otherwise annotate within the leading trivia, leaving it untouched when
+        -- nothing qualifies. The search covers the head as well as later elements.
+        _ -> (trailing, fromMaybe leading (annotateFirst leading))
+  in case (trailing', leading') of
       -- Special case: if the trailing comment visually forms a block with the start of the following line,
       -- then treat it like part of those comments instead of a distinct trailing comment.
       -- This happens especially often after `{` or `[` tokens, where the comment of the first item
       -- starts on the same line ase the opening token.
-      ([PTLineComment _ pos], (PTNewlines 1) : (PTLineComment _ pos') : _) | pos == pos' -> (Nothing, convertLeading pts)
+      -- The merged trivia are rebuilt from trailing'/leading' rather than taken from
+      -- `pts`, so an annotation found further down the leading trivia is not lost.
+      ([PTLineComment _ pos], (PTNewlines 1) : (PTLineComment _ pos') : _) | pos == pos' -> (Nothing, convertLeading (trailing' ++ leading'))
       ([PTLineComment _ pos], [PTNewlines 1]) | pos == nextCol -> (Nothing, convertLeading pts)
-      _ -> (convertTrailing trailing, convertLeading leading)
+      _ -> (convertTrailing trailing', convertLeading leading')
+
+-- | Recognise the opener of the upcoming string literal, failing on anything else.
+-- The caller wraps this in 'lookAhead', which is what keeps it from consuming input.
+parseNextTokenType :: Parser Token
+parseNextTokenType =
+  skipTrivia *> skipBlanks *> stringOpener
+  where
+    -- Comments and newlines that may still precede the next token
+    skipTrivia = many (hidden $ lineComment <|> blockComment <|> newlines)
+    -- Horizontal whitespace left over after the trivia
+    skipBlanks = manyP (\x -> isSpace x && x /= '\n' && x /= '\r')
+    stringOpener = (TDoubleQuote <$ chunk "\"") <|> (TDoubleSingleQuote <$ chunk "''")
 
 trivia :: Parser [ParseTrivium]
 trivia = many $ hidden $ lineComment <|> blockComment <|> newlines
@@ -188,7 +255,11 @@ lexeme p = do
   parsedTrivia <- trivia
   -- This is the position of the next lexeme after the currently parsed one
   SourcePos{sourceColumn = col} <- getSourcePos
-  let (trailing, nextLeading) = convertTrivia parsedTrivia col
+
+  -- Add lookahead for next token
+  nextToken <- optional (try $ lookAhead $ preLexeme parseNextTokenType)
+
+  let (trailing, nextLeading) = convertTrivia parsedTrivia col nextToken
   pushTrivia nextLeading
   return $
     Ann
diff --git a/src/Nixfmt/Pretty.hs b/src/Nixfmt/Pretty.hs
@@ -86,6 +86,7 @@ instance Pretty TrailingComment where
 instance Pretty Trivium where
   pretty EmptyLine = emptyline
   pretty (LineComment c) = comment ("#" <> c) <> hardline
+  pretty (LanguageAnnotation lang) = comment ("/* " <> lang <> " */") <> hardspace
   pretty (BlockComment isDoc c) =
     comment (if isDoc then "/**" else "/*")
       <> hardline
@@ -109,6 +110,8 @@ prettyItems (Items items) = sepBy hardline items
 
 instance Pretty [Trivium] where
   pretty [] = mempty
+  -- Special case: if trivia consists only of a single language annotation, render it inline without a preceding hardline
+  pretty [langAnnotation@(LanguageAnnotation _)] = pretty langAnnotation
   pretty trivia = hardline <> hcat trivia
 
 instance (Pretty a) => Pretty (Ann a) where
diff --git a/src/Nixfmt/Types.hs b/src/Nixfmt/Types.hs
@@ -72,6 +72,8 @@ data Trivium
   | -- Multi-line comments with /* or /**. Multiple # comments are treated as a list of `LineComment`.
     -- The bool indicates a doc comment (/**)
     BlockComment Bool [Text]
+  | -- | Language annotation comments like /* lua */ that should remain as block comments before strings
+    LanguageAnnotation Text
   deriving (Eq, Show)
 
 type Trivia = [Trivium]