@@ -76,6 +76,7 @@ import Control.Monad.Reader (ask, asks, local)
76
76
import Data.Array qualified as Array
77
77
import Data.ByteString qualified as BS
78
78
import Data.ByteString.Base16 qualified as Base16
79
+ import Data.ByteString.Char8 qualified as BSC
79
80
import Data.Generics.Uniplate.Data (transform , universeBi )
80
81
import Data.List (elemIndex , isPrefixOf , isSuffixOf )
81
82
import Data.Map qualified as Map
@@ -109,17 +110,34 @@ String literals are handled specially, see Note [String literals].
109
110
-}
110
111
111
112
{- Note [unpackFoldrCString#]
112
- This function is introduced by rewrite rules, and usually eliminated by them in concert with `build`.
113
+ This function is introduced by rewrite rules, and usually eliminated by them
114
+ in concert with `build`.
113
115
114
- However, since we often mark things as INLINABLE, we get pre-optimization Core where only the
115
- first transformation has fired. So we need to do something with the function.
116
+ However, since we often mark things as INLINABLE, we get pre-optimization Core
117
+ where only the first transformation has fired. So we need to do something with
118
+ the function.
116
119
117
- - We can't easily turn it into a normal fold expression, since we'd need to make a lambda and
118
- we're not in 'CoreM' so we can't make fresh names.
119
- - We can't easily translate it to a builtin, since we don't support higher-order functions.
120
+ - We can't easily turn it into a normal fold expression, since we'd need to make
121
+ a lambda and we're not in 'CoreM' so we can't make fresh names.
120
122
121
- So we use a horrible hack and match on `build . unpackFoldrCString#` to "undo" the original rewrite
122
- rule.
123
+ - We can't easily translate it to a builtin, since we don't support higher-order
124
+ functions.
125
+
126
+ So we use a horrible hack and match on `build . unpackFoldrCString#` to "undo"
127
+ the original rewrite rule.
128
+
129
+ Moreover, in some cases GHC generates such expressions where the first literal
130
+ character is "un-consed" from its tail, for example:
131
+
132
+ GHC.Types.:
133
+ @GHC.Types.Char
134
+ (GHC.Types.C# 'f'#)
135
+ (GHC.Base.build
136
+ @GHC.Types.Char
137
+ (\@b -> GHC.CString.unpackFoldrCString# @b "0d1"#)
138
+ )
139
+
140
+ Then we re-do the cons after un-doing the original rewrite rule.
123
141
-}
124
142
125
143
compileLiteral ::
@@ -149,14 +167,44 @@ tryStringLiteralAsBytes coreExpr = case coreExpr of
149
167
GHC. Var isUnpackCStringUtf8 `GHC. App ` GHC. Lit (GHC. LitString bytes)
150
168
| GHC. getName isUnpackCStringUtf8 == GHC. unpackCStringUtf8Name ->
151
169
Just bytes
152
- -- See Note [unpackFoldrCString#]
153
- GHC. Var build `GHC. App ` _ `GHC. App ` GHC. Lam _ (GHC. Var unpack `GHC. App ` _ `GHC. App ` expr)
154
- | GHC. getName build == GHC. buildName && GHC. getName unpack == GHC. unpackCStringFoldrName ->
155
- tryStringLiteralAsBytes expr
156
- -- GHC helpfully generates an empty list for the empty string literal instead of a 'LitString'
170
+
171
+ {- See Note [unpackFoldrCString#]
172
+
173
+ Example GHC Core expr this pattern matches:
174
+ GHC.Base.build
175
+ @GHC.Types.Char
176
+ (\@b -> GHC.CString.unpackFoldrCString# @b "0d1"#)
177
+
178
+ -}
179
+ GHC. Var build
180
+ `GHC. App ` GHC. Type (GHC. TyConApp charTyCon _kindOrType)
181
+ `GHC. App ` GHC. Lam _ (GHC. Var unpack `GHC. App ` _ `GHC. App ` expr)
182
+ | GHC. getName build == GHC. buildName
183
+ , GHC. getName unpack == GHC. unpackCStringFoldrName
184
+ , GHC. getName charTyCon == GHC. charTyConName ->
185
+ tryStringLiteralAsBytes expr
186
+ {-
187
+ Example GHC Core expr this pattern matches:
188
+ GHC.Types.: @GHC.Types.Char (GHC.Types.C# 'f'#) expr
189
+ -}
190
+ ( GHC. Var consId
191
+ `GHC. App ` (GHC. Type (GHC. TyConApp charTyCon _kindOrType))
192
+ `GHC. App ` (GHC. Var cSharp `GHC. App ` GHC. Lit (GHC. LitChar c))
193
+ ) `GHC. App ` expr
194
+ | GHC. getName charTyCon == GHC. charTyConName
195
+ , Just consDataCon <- GHC. isDataConId_maybe consId
196
+ , GHC. consDataCon == consDataCon
197
+ , Just charDataCon <- GHC. isDataConId_maybe cSharp
198
+ , GHC. charDataCon == charDataCon ->
199
+ BSC. cons c <$> tryStringLiteralAsBytes expr
200
+
201
+ -- GHC helpfully generates an empty list for the empty string literal instead
202
+ -- of a 'LitString'
157
203
GHC. Var nil `GHC. App ` GHC. Type (GHC. tyConAppTyCon_maybe -> Just tc)
158
- | nil == GHC. dataConWorkId GHC. nilDataCon, GHC. getName tc == GHC. charTyConName ->
204
+ | nil == GHC. dataConWorkId GHC. nilDataCon
205
+ , GHC. getName tc == GHC. charTyConName ->
159
206
Just mempty
207
+
160
208
-- Chase variable references! GHC likes to lift string constants to variables,
161
209
-- that is not good for us!
162
210
GHC. Var (GHC. maybeUnfoldingTemplate . GHC. realIdUnfolding -> Just unfolding) ->
@@ -201,10 +249,13 @@ If 'ByteString' contains a codepoint that is not in this range, the function wil
201
249
-}
202
250
utf8CodePointsAsBytes :: Compiling uni fun m ann => BS. ByteString -> m BS. ByteString
203
251
utf8CodePointsAsBytes bs =
204
- case gracefullyDecodeUtf8Bytes ( BS. unpack bs) of
205
- Just bytes -> pure $ BS. pack bytes
252
+ case tryUtf8CodePointsAsBytes bs of
253
+ Just bytes -> pure bytes
206
254
Nothing -> throwPlain . CompilationError $
207
255
" ByteString literal is expected to contain only codepoints in the range 0 - 255 (0x00 - 0xFF)"
256
+
257
+ tryUtf8CodePointsAsBytes :: BS. ByteString -> Maybe BS. ByteString
258
+ tryUtf8CodePointsAsBytes = fmap BS. pack . gracefullyDecodeUtf8Bytes . BS. unpack
208
259
where
209
260
{-
210
261
Why not use 'Data.Text.Encoding'?
0 commit comments