@@ -4,23 +4,21 @@ use cust::function::{BlockSize, GridSize};
44use cust:: launch;
55use cust:: memory:: { AsyncCopyDestination , DeviceBuffer , LockedBuffer } ;
66use cust:: module:: Module ;
7- use cust:: prelude:: EventStatus ;
87use cust:: stream:: { Stream , StreamFlags } ;
9- use std:: time:: Instant ;
108
119static PTX : & str = include_str ! ( concat!( env!( "OUT_DIR" ) , "/kernels.ptx" ) ) ;
1210
13- fn matrix_multiply ( block_size : usize , dimsA : ( usize , usize , usize ) , dimsB : ( usize , usize , usize ) ) -> Result < ( ) , cust:: error:: CudaError > {
14- let dimsC = ( dimsB . 0 , dimsA . 1 , 1 ) ;
15- let size_a = dimsA . 0 * dimsA . 1 ;
11+ fn matrix_multiply ( block_size : usize , dims_a : ( usize , usize , usize ) , dims_b : ( usize , usize , usize ) ) -> Result < ( ) , cust:: error:: CudaError > {
12+ let dims_c = ( dims_b . 0 , dims_a . 1 , 1 ) ;
13+ let size_a = dims_a . 0 * dims_a . 1 ;
1614 let h_a = LockedBuffer :: new ( & 1.0f32 , size_a) . expect ( "host array couldn't be initialized!" ) ;
1715
18- let size_b = dimsB . 0 * dimsB . 1 ;
16+ let size_b = dims_b . 0 * dims_b . 1 ;
1917 let h_b = LockedBuffer :: new ( & 0.01f32 , size_b) . expect ( "host array couldn't be initialized!" ) ;
2018
2119 let stream = Stream :: new ( StreamFlags :: NON_BLOCKING , None ) . expect ( "Stream couldn't be init!" ) ;
2220
23- let mut size_c = dimsB . 0 * dimsA . 1 ;
21+ let size_c = dims_b . 0 * dims_a . 1 ;
2422 let mut h_c =
2523 LockedBuffer :: new ( & 0.0f32 , size_c) . expect ( "host array couldn't be initialized!" ) ;
2624
@@ -29,11 +27,11 @@ fn matrix_multiply(block_size: usize, dimsA: (usize, usize, usize), dimsB: (usiz
2927
3028 let d_a = DeviceBuffer :: from_slice ( h_a. as_slice ( ) ) . expect ( "device array couldn't be initialized!" ) ;
3129 let d_b = DeviceBuffer :: from_slice ( h_b. as_slice ( ) ) . expect ( "device array couldn't be initialized!" ) ;
32- let mut d_c = DeviceBuffer :: from_slice ( h_c. as_slice ( ) ) . expect ( "device array couldn't be initialized!" ) ;
30+ let d_c = DeviceBuffer :: from_slice ( h_c. as_slice ( ) ) . expect ( "device array couldn't be initialized!" ) ;
3331
3432 stream. synchronize ( ) . expect ( "Stream couldn't synchronize!" ) ;
3533 let threads = BlockSize :: xy ( block_size as u32 , block_size as u32 ) ;
36- let grid = GridSize :: xy ( ( dimsB . 0 / ( threads. x as usize ) ) . try_into ( ) . unwrap ( ) , ( dimsA . 1 / ( threads. y as usize ) ) . try_into ( ) . unwrap ( ) ) ;
34+ let grid = GridSize :: xy ( ( dims_b . 0 / ( threads. x as usize ) ) . try_into ( ) . unwrap ( ) , ( dims_a . 1 / ( threads. y as usize ) ) . try_into ( ) . unwrap ( ) ) ;
3735
3836 println ! ( "Computing result using CUDA Kernel..." ) ;
3937
@@ -47,8 +45,8 @@ fn matrix_multiply(block_size: usize, dimsA: (usize, usize, usize), dimsB: (usiz
4745 d_c. as_device_ptr( ) ,
4846 d_a. as_device_ptr( ) ,
4947 d_b. as_device_ptr( ) ,
50- dimsA . 0 as usize ,
51- dimsB . 0 as usize
48+ dims_a . 0 as usize ,
49+ dims_b . 0 as usize
5250 ) ) ?;
5351 }
5452
@@ -67,8 +65,8 @@ fn matrix_multiply(block_size: usize, dimsA: (usize, usize, usize), dimsB: (usiz
6765 d_c. as_device_ptr( ) ,
6866 d_a. as_device_ptr( ) ,
6967 d_b. as_device_ptr( ) ,
70- dimsA . 0 as usize ,
71- dimsB . 0 as usize ,
68+ dims_a . 0 as usize ,
69+ dims_b . 0 as usize ,
7270 ) ) ?;
7371 }
7472 }
@@ -86,30 +84,31 @@ fn matrix_multiply(block_size: usize, dimsA: (usize, usize, usize), dimsB: (usiz
8684
8785 let avg_time = gpu_time as f32 / N_ITER as f32 ;
8886 println ! ( "Average time spent executing by the GPU: {} microseconds" , avg_time) ;
89- let flopsPerMatrixMul = 2.0 * ( dimsA . 0 as f32 ) * ( dimsA . 1 as f32 ) * ( dimsB . 0 as f32 ) ;
90- let gigaFlops = ( flopsPerMatrixMul / ( avg_time) ) / 1000.0 ;
91- println ! ( "Performance = {} GFlop/s" , gigaFlops ) ;
87+ let flops_per_matrix_mul = 2.0 * ( dims_a . 0 as f32 ) * ( dims_a . 1 as f32 ) * ( dims_b . 0 as f32 ) ;
88+ let giga_flops = ( flops_per_matrix_mul / ( avg_time) ) / 1000.0 ;
89+ println ! ( "Performance = {} GFlop/s" , giga_flops ) ;
9290
9391 unsafe {
9492 d_c
9593 . async_copy_to ( & mut h_c, & stream)
9694 . expect ( "Could not copy from device to host!" ) ;
9795 }
96+ stream. synchronize ( ) . expect ( "Stream couldn't synchronize!" ) ;
9897
9998 // checking computed result
10099 // test relative error by the formula
101100 // |<x, y>_cpu - <x, y>_gpu| / max(|<x, y>_gpu|, dot_length * machine_epsilon)
102101 let machine_epsilon = 1.19209290E-07f32 ;
103102 let mut correct = true ;
104103
105- for i in 0 ..( dimsC . 0 * dimsC . 1 ) {
106- let abs_err = ( h_c[ i] - ( dimsA . 0 as f32 * 0.01f32 ) ) . abs ( ) ;
107- let dot_length = ( dimsA . 0 as f32 ) . abs ( ) ;
104+ for i in 0 ..( dims_c . 0 * dims_c . 1 ) {
105+ let abs_err = ( h_c[ i] - ( dims_a . 0 as f32 * 0.01f32 ) ) . abs ( ) ;
106+ let dot_length = ( dims_a . 0 as f32 ) . abs ( ) ;
108107 let abs_val = h_c[ i] . abs ( ) ;
109108 let rel_err = abs_err / abs_val. max ( dot_length * machine_epsilon) ;
110109
111110 if rel_err > 1e-6 {
112- println ! ( "Error at index {}: CPU = {}, GPU = {}, rel_err = {}" , i, dimsA . 0 as f32 * 0.01f32 , h_c[ i] , rel_err) ;
111+ println ! ( "Error at index {}: CPU = {}, GPU = {}, rel_err = {}" , i, dims_a . 0 as f32 * 0.01f32 , h_c[ i] , rel_err) ;
113112 correct = false ;
114113 }
115114 }
@@ -132,14 +131,13 @@ fn main() -> Result<(), cust::error::CudaError> {
132131 println ! ( "Device Name: {}" , device. name( ) . unwrap( ) ) ;
133132
134133 let block_size: u32 = 32 ;
135- let dimsA : ( usize , usize , usize ) = ( block_size as usize , block_size as usize , 1 ) ;
136- let dimsB : ( usize , usize , usize ) = ( block_size as usize , block_size as usize , 1 ) ;
134+ let dims_a : ( usize , usize , usize ) = ( block_size as usize , block_size as usize , 1 ) ;
135+ let dims_b : ( usize , usize , usize ) = ( block_size as usize , block_size as usize , 1 ) ;
137136
138- if dimsA . 0 != dimsB . 1 {
137+ if dims_a . 0 != dims_b . 1 {
139138 panic ! ( "Matrix multiplication not possible with the given dimensions!" ) ;
140139 }
141140
142- matrix_multiply ( block_size as usize , dimsA, dimsB) ;
143-
141+ matrix_multiply ( block_size as usize , dims_a, dims_b) ?;
144142 Ok ( ( ) )
145143}
0 commit comments