forked from svenssonjoel/Obsidian
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathExamplesNoCUDA.hs
378 lines (293 loc) · 11.4 KB
/
ExamplesNoCUDA.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
module ExamplesNoCuda where
import qualified Obsidian.CodeGen.CUDA as CUDA
import Obsidian
import Data.Word
import Data.Int
import Data.Bits
import qualified Data.Vector.Storable as V
import Control.Monad.State
import Prelude hiding (zipWith,sum,replicate,take,drop)
import qualified Prelude as P
---------------------------------------------------------------------------
-- Util
---------------------------------------------------------------------------
quickPrint :: ToProgram prg => prg -> InputList prg -> IO ()
quickPrint prg input =
putStrLn $ CUDA.genKernel "kernel" prg input
---------------------------------------------------------------------------
-- MapFusion example
---------------------------------------------------------------------------
mapFusion :: Pull Word32 EInt -> BProgram (Pull Word32 EInt)
mapFusion arr =
do
imm <- force $ (fmap (+1) . fmap (*2)) arr
force $ (fmap (+3) . fmap (*4)) imm
splitUp :: (ASize l, Num l)
=> l -> Pull (Exp Word32) a -> Pull (Exp Word32) (Pull l a)
splitUp n (Pull m ixf) = Pull (m `div` fromIntegral n) $
\i -> Pull n $ \j -> ixf (i * (sizeConv n) + j)
splitUpS :: Word32 -> Pull Word32 a -> Pull Word32 (Pull Word32 a)
splitUpS n (Pull m ixf) = Pull (m `div` n) $
\i -> Pull n $ \j -> ixf (i * (fromIntegral n) + j)
--test1 :: Pull (Exp Word32) EInt -> GProgram (Push Grid (Exp Word32) EInt)
--test1 input = liftG $ fmap mapFusion (splitUp 256 input)
input1 :: Pull (Exp Word32) EInt
input1 = namedGlobal "apa" (variable "X")
---------------------------------------------------------------------------
-- Scans
---------------------------------------------------------------------------
{-
sklansky :: (Choice a, MemoryOps a)
=> Int
-> (a -> a -> a)
-> Pull Word32 a
-> BProgram (Pull Word32 a)
sklansky 0 op arr = return arr
sklansky n op arr =
do
let arr1 = binSplit (n-1) (fan op) arr
arr2 <- force arr1
sklansky (n-1) op arr2
-- fan :: (Choice a, ASize l) => (a -> a -> a) -> Pull l a -> Pull l a
fan :: Choice a => (a -> a -> a) -> SPull a -> SPull a
fan op arr = a1 `conc` fmap (op c) a2
where
(a1,a2) = halve arr
c = a1 ! sizeConv (len a1 - 1)
sklanskyG logbs op arr =
mapG (sklansky logbs op) (splitUp (2^logbs) arr)
getSklansky =
quickPrint (sklanskyG 8 (+))
((undefined :: Pull (Exp Word32) EInt32) :- ())
-}
---------------------------------------------------------------------------
-- kStone (TEST THAT THIS IS REALLY A SCAN!)
---------------------------------------------------------------------------
{-
kStone :: (Choice a, MemoryOps a)
=> Int -> (a -> a -> a) -> Pull Word32 a -> BProgram (Pull Word32 a)
kStone 0 op arr = return arr
kStone n op arr =
do
res <- kStone (n-1) op arr
let r1 = drop (2^(n-1)) res
r1' = take (2^(n-1)) res
r2 = zipWith op res r1
force (r1' `conc` r2)
-- Push array version
kStoneP :: (Choice a, MemoryOps a)
=> Int -> (a -> a -> a) -> Pull Word32 a -> BProgram (Pull Word32 a)
kStoneP 0 op arr = return arr
kStoneP n op arr =
do
res <- kStoneP (n-1) op arr
let r1 = drop (2^(n-1)) res
r1' = take (2^(n-1)) res
r2 = zipWith op res r1
force (concP Block r1' r2)
-}
--kStoneG logbs op =
--join . liftM forceG . liftG . fmap (kStone logbs op) . splitUp (2^logbs)
--kStonePG logbs op =
-- join . liftM forceG . liftG . fmap (kStoneP logbs op) . splitUp (2^logbs)
--getKStone =
-- quickPrint (kStoneG 8 (+))
-- (undefinedGlobal (variable "X") :: Pull (Exp Word32) EInt32)
--getKStoneP =
-- quickPrint (kStonePG 8 (+))
-- (undefinedGlobal (variable "X") :: Pull (Exp Word32) EInt32)
---------------------------------------------------------------------------
-- Brent Kung
---------------------------------------------------------------------------
bKung :: (Choice a, MemoryOps a)
=> (a -> a -> a) -> Pull Word32 a -> BProgram (Pull Word32 a)
bKung op arr | len arr == 1 = return arr
bKung op arr = undefined
--bKungG op =
-- join . liftM forceG . liftG . fmap (bKung op) . splitUp 256
--getBKung =
-- quickPrint (bKungG (+))
-- (undefinedGlobal (variable "X") :: Pull (Exp Word32) EInt32)
---------------------------------------------------------------------------
-- Go Towards Counting sort again.
---------------------------------------------------------------------------
histogram :: Pull EWord32 EInt32 -> GProgram ()
histogram arr = do
global <- Output $ Pointer Word32
forAllT (len arr) $ \gix -> atomicOp global (i32ToW32 (arr ! gix)) AtomicInc
atomicOp n e1 a = AtomicOp n e1 a >> return ()
getHist =
quickPrint histogram
((undefinedGlobal (variable "X") :: Pull (Exp Word32) EInt32) :- ())
reconstruct :: Pull EWord32 EWord32 -> Push Grid EWord32 EInt32
reconstruct arr = Push (len arr) f
where
f k = do forAllT (len arr) $ \gix ->
let startIx = arr ! gix
in SeqFor (arr ! (gix+1) - startIx) $ \ix ->
do
k (w32ToI32 gix) (ix + startIx)
getRec =
quickPrint reconstruct
((undefinedGlobal (variable "X") :: Pull (EWord32) EWord32) :- ())
---------------------------------------------------------------------------
-- Testing some sequential loop approaches
---------------------------------------------------------------------------
{-
testFold :: Pull Word32 EWord32 -> Pull Word32 (Program Thread EWord32)
testFold arr = fmap (seqFold (+) 0) (splitUpS (32 :: Word32) arr)
testFold2 :: Pull Word32 EWord32 -> BProgram (Pull Word32 EWord32)
testFold2 = liftB . testFold
testFold3 :: Pull EWord32 EWord32
-> Pull EWord32 (BProgram (Pull Word32 EWord32))
testFold3 arr = fmap (testFold2) (splitUp 256 arr)
testFold4 :: Pull EWord32 EWord32 -> Program Grid ()
testFold4 = join . liftM forceG . liftG . testFold3
flatten :: ASize l => Pull EWord32 (Pull l a) -> Pull EWord32 a
flatten pp =
Pull (n*m) $ \ix -> (pp ! (ix `div` m)) ! (ix `mod` m)
where
n = len pp
m = sizeConv (len (pp ! 0))
inputFold :: Pull Word32 EWord32
inputFold = namedPull "apa" 256
inputF :: Pull EWord32 EWord32
inputF = namedPull "apa" (variable "X")
-- reverseglobal
revG :: Pull EWord32 a -> Pull EWord32 a
revG arr = mkPullArray n $ \ix -> arr ! (sizeConv n - 1 - ix)
where
n = len arr
testRev :: Scalar a=> Pull EWord32 (Exp a) -> GProgram ()
testRev = forceG . push Grid . revG
-}
---------------------------------------------------------------------------
-- Simple
---------------------------------------------------------------------------
s1 :: ( Num a, MemoryOps a) =>
Pull Word32 a -> BProgram (Pull Word32 a)
s1 arr = do
a1 <- force (fmap (+3) arr)
a2 <- force (fmap (+2) a1)
force (fmap (+1) a2)
--gs1 :: (Num a, MemoryOps a) =>
-- Pull EWord32 a -> Program Grid (Push Grid EWord32 a)
--gs1 = liftG . (fmap s1) . splitUp 256
--getgs1 =
-- quickPrint (join . liftM forceG . gs1)
-- (undefinedGlobal (variable "X") :: Pull (EWord32) EWord32)
---------------------------------------------------------------------------
-- Matrix Mul
---------------------------------------------------------------------------
type SMatrix a = Pull Word32 (Pull Word32 a)
{-
transpose :: (ASize l1, ASize l2) => Pull l1 (Pull l2 a) -> Pull l2 (Pull l1 a)
transpose arr = mkPullArray m
$ \i -> mkPullArray n
$ \j -> (arr ! j) ! i
where
n = len arr
m = len (arr ! 0)
-}
transpose :: SMatrix a -> SMatrix a
transpose arr = mkPullArray m
$ \i -> mkPullArray n
$ \j -> (arr ! j) ! i
where
n = len arr
m = len (arr ! 0)
{-
matMul :: (Num a1, ASize l1, ASize l, MemoryOps a1, LiftB a1)
=> Pull l1 (Pull l a1)
-> Pull l (Pull Word32 a1) -> Program Grid (Push Grid l1 a1)
matMul x y = liftG
-- Pull l (BProgram (Pull l EFloat))
$ fmap liftB
-- Pull l (Pull l (Program Thread EFloat))
$ mkPullArray n
$ \i -> mkPullArray m
$ \j -> cell i j
where cell i j = seqFold (+) 0 $ zipWith (*) (x ! i) (y' ! j)
y' = transpose y
n = len x
m = len y'
-}
mkMatrix n m f = mkPullArray n $ \i -> mkPullArray m $ \j -> f i j
{-
matMul :: (Num a, MemoryOps a, LiftB a)
=> SMatrix a -> SMatrix a -> Program Grid (Push Grid Word32 a)
matMul x y = liftG
-- :: Pull l (BProgram (Pull l a))
$ fmap liftB
-- :: Pull l (Pull l (Program Thread a))
$ mkMatrix n m cell
where cell i j = seqFold (+) 0 $ zipWith (*) (x ! i) (y' ! j)
y' = transpose y
n = len x
m = len y'
-}
--matMul2 :: Num a
-- => SMatrix a -> SMatrix a -> Push Grid Word32 a
{-
matMul :: (Num c, MemoryOps c)
=> SPull (SPull c)
-> SPull (SPull c) -> SPush Grid c
matMul x y = zipWithG body (replicate n x) (replicate m (transpose y))
where
n = len x
m = len (y ! 0)
body a b = force (zipWithT cell a b)
cell i j = do
let arr = zipWith (*) i j
r <- seqReduce (+) arr
return (singleton r)
-}
-- where cell i j = seqFold (+) 0 $ zipWith (*) (x ! i) (y' ! j)
-- y' = transpose y
-- n = len x
-- m = len y'
{-
matMulIn a b = matMul (toMatrix 256 256 a) (toMatrix 256 256 b)
toMatrix :: Word32 -> Word32 -> Pull Word32 a -> SMatrix a
toMatrix n m arr = Pull n $ \i -> Pull m $ \j -> arr ! (i * (sizeConv m) + j)
getMM =
quickPrint matMulIn
((undefinedGlobal (256*256) {-(variable "X")-} :: Pull Word32 EFloat) :-
(undefinedGlobal (256*256) {-(variable "Y")-} :: Pull Word32 EFloat) :- ())
-}
{-
getMM2 =
quickPrint matMulIn2
((undefinedGlobal (256*256) {-(variable "X")-} :: Pull Word32 EFloat) :->
(undefinedGlobal (256*256) {-(variable "Y")-} :: Pull Word32 EFloat))
-}
{-
inc :: SPull EFloat -> SPull EFloat
inc = fmap (+1)
getIncP = putStrLn $ genKernel "incP" incP (input :- ())
input :: DPull EFloat
input = namedGlobal "apa" (variable "X")
incP :: DPull EFloat -> DPush Grid EFloat
incP arr = mapG (return . inc) ((splitUp 512 . ixMap (vperm2 12 3 1. vperm 11 1 0)) arr)
swapBitBlocks :: Int -> Int -> Int -> Exp Word32 -> Exp Word32
swapBitBlocks l m r i = f .|. (lbs `shiftR` (m-r)) .|. (rbs `shiftL` (l-m))
where
f = i .&. complement (oneBitsFT r l)
lbs = i .&. (oneBitsFT m l)
rbs = i .&. (oneBitsFT r m)
oneBitsFT :: Int -> Int -> Exp Word32
oneBitsFT i j = (1 `shiftL` j) - (1 `shiftL` i)
-- r > 0 xor the bit at r-1 with all bits in the block from r to l
bitBlockXor :: Int -> Int -> Exp Word32 -> Exp Word32
bitBlockXor l r i = i `xor` (((b `shiftL` (l-r)) - b)`shiftL` 1)
where
b = i .&. (1 `shiftL` r)
vperm l m r = bitBlockXor (l-1) (r+l-m-1) . swapBitBlocks l m r
vperm2 l m r = swapBitBlocks l (r+l-m) r . bitBlockXor (l-1) (r+l-m-1)
-}
convToPush :: SPull a -> SPush Block a
convToPush arr =
Push n $ \wf ->
forAll (fromIntegral n) $ \tid -> wf (arr ! tid) tid
where
n = len arr