@@ -76,6 +76,7 @@ import Control.Monad.Reader (ask, asks, local)
76
76
import Data.Array qualified as Array
77
77
import Data.ByteString qualified as BS
78
78
import Data.ByteString.Base16 qualified as Base16
79
+ import Data.ByteString.Char8 qualified as BSC
79
80
import Data.Generics.Uniplate.Data (transform , universeBi )
80
81
import Data.List (elemIndex , isPrefixOf , isSuffixOf )
81
82
import Data.Map qualified as Map
@@ -109,17 +110,34 @@ String literals are handled specially, see Note [String literals].
109
110
-}
110
111
111
112
{- Note [unpackFoldrCString#]
112
- This function is introduced by rewrite rules, and usually eliminated by them in concert with `build`.
113
+ This function is introduced by rewrite rules, and usually eliminated by them
114
+ in concert with `build`.
113
115
114
- However, since we often mark things as INLINABLE, we get pre-optimization Core where only the
115
- first transformation has fired. So we need to do something with the function.
116
+ However, since we often mark things as INLINABLE, we get pre-optimization Core
117
+ where only the first transformation has fired. So we need to do something with
118
+ the function.
116
119
117
- - We can't easily turn it into a normal fold expression, since we'd need to make a lambda and
118
- we're not in 'CoreM' so we can't make fresh names.
119
- - We can't easily translate it to a builtin, since we don't support higher-order functions.
120
+ - We can't easily turn it into a normal fold expression, since we'd need to make
121
+ a lambda and we're not in 'CoreM' so we can't make fresh names.
120
122
121
- So we use a horrible hack and match on `build . unpackFoldrCString#` to "undo" the original rewrite
122
- rule.
123
+ - We can't easily translate it to a builtin, since we don't support higher-order
124
+ functions.
125
+
126
+ So we use a horrible hack and match on `build . unpackFoldrCString#` to "undo"
127
+ the original rewrite rule.
128
+
129
+ Moreover, in some cases GHC generates such expressions where the first literal
130
+ character is "un-consed" from its tail, for example:
131
+
132
+ GHC.Types.:
133
+ @GHC.Types.Char
134
+ (GHC.Types.C# 'f'#)
135
+ (GHC.Base.build
136
+ @GHC.Types.Char
137
+ (\@b -> GHC.CString.unpackFoldrCString# @b "0d1"#)
138
+ )
139
+
140
+ Then we re-do the cons after un-doing the original rewrite rule.
123
141
-}
124
142
125
143
compileLiteral ::
@@ -149,14 +167,43 @@ tryStringLiteralAsBytes coreExpr = case coreExpr of
149
167
GHC. Var isUnpackCStringUtf8 `GHC. App ` GHC. Lit (GHC. LitString bytes)
150
168
| GHC. getName isUnpackCStringUtf8 == GHC. unpackCStringUtf8Name ->
151
169
Just bytes
152
- -- See Note [unpackFoldrCString#]
153
- GHC. Var build `GHC. App ` _ `GHC. App ` GHC. Lam _ (GHC. Var unpack `GHC. App ` _ `GHC. App ` expr)
154
- | GHC. getName build == GHC. buildName && GHC. getName unpack == GHC. unpackCStringFoldrName ->
155
- tryStringLiteralAsBytes expr
156
- -- GHC helpfully generates an empty list for the empty string literal instead of a 'LitString'
170
+
171
+ {- See Note [unpackFoldrCString#]
172
+
173
+ Example GHC Core expr this pattern matches:
174
+ GHC.Base.build
175
+ @GHC.Types.Char
176
+ (\@b -> GHC.CString.unpackFoldrCString# @b "0d1"#)
177
+ -}
178
+ GHC. Var build
179
+ `GHC. App ` GHC. Type (GHC. TyConApp charTyCon _kindOrType)
180
+ `GHC. App ` GHC. Lam _ (GHC. Var unpack `GHC. App ` _ `GHC. App ` expr)
181
+ | GHC. getName build == GHC. buildName
182
+ , GHC. getName unpack == GHC. unpackCStringFoldrName
183
+ , GHC. getName charTyCon == GHC. charTyConName ->
184
+ tryStringLiteralAsBytes expr
185
+ {-
186
+ Example GHC Core expr this pattern matches:
187
+ GHC.Types.: @GHC.Types.Char (GHC.Types.C# 'f'#) expr
188
+ -}
189
+ GHC. Var consId
190
+ `GHC. App ` GHC. Type (GHC. TyConApp charTyCon _kindOrType)
191
+ `GHC. App ` (GHC. Var cSharp `GHC. App ` GHC. Lit (GHC. LitChar c))
192
+ `GHC. App ` expr
193
+ | GHC. getName charTyCon == GHC. charTyConName
194
+ , Just consDataCon <- GHC. isDataConId_maybe consId
195
+ , GHC. consDataCon == consDataCon
196
+ , Just charDataCon <- GHC. isDataConId_maybe cSharp
197
+ , GHC. charDataCon == charDataCon ->
198
+ BSC. cons c <$> tryStringLiteralAsBytes expr
199
+
200
+ -- GHC helpfully generates an empty list for the empty string literal instead
201
+ -- of a 'LitString'
157
202
GHC. Var nil `GHC. App ` GHC. Type (GHC. tyConAppTyCon_maybe -> Just tc)
158
- | nil == GHC. dataConWorkId GHC. nilDataCon, GHC. getName tc == GHC. charTyConName ->
203
+ | nil == GHC. dataConWorkId GHC. nilDataCon
204
+ , GHC. getName tc == GHC. charTyConName ->
159
205
Just mempty
206
+
160
207
-- Chase variable references! GHC likes to lift string constants to variables,
161
208
-- that is not good for us!
162
209
GHC. Var (GHC. maybeUnfoldingTemplate . GHC. realIdUnfolding -> Just unfolding) ->
@@ -201,10 +248,13 @@ If 'ByteString' contains a codepoint that is not in this range, the function wil
201
248
-}
202
249
utf8CodePointsAsBytes :: Compiling uni fun m ann => BS. ByteString -> m BS. ByteString
203
250
utf8CodePointsAsBytes bs =
204
- case gracefullyDecodeUtf8Bytes ( BS. unpack bs) of
205
- Just bytes -> pure $ BS. pack bytes
251
+ case tryUtf8CodePointsAsBytes bs of
252
+ Just bytes -> pure bytes
206
253
Nothing -> throwPlain . CompilationError $
207
254
" ByteString literal is expected to contain only codepoints in the range 0 - 255 (0x00 - 0xFF)"
255
+
256
+ tryUtf8CodePointsAsBytes :: BS. ByteString -> Maybe BS. ByteString
257
+ tryUtf8CodePointsAsBytes = fmap BS. pack . gracefullyDecodeUtf8Bytes . BS. unpack
208
258
where
209
259
{-
210
260
Why not use 'Data.Text.Encoding'?
0 commit comments