@@ -18,28 +18,28 @@ namespace mim
18
18
uint32_t uint;
19
19
};
20
20
21
- static int32_t const shift = 13 ; // mantissa bits: 13, exponent bits: 16
22
- static int32_t const shiftSign = 16 ; // sign bits: 1, left shift amount: 15
21
+ static int32_t constexpr shift = 13 ; // bit shift between the FP32 and FP16 layouts: 23 - 10 mantissa bits (used to derive inf16/max16/min16)
22
+ static int32_t constexpr shiftSign = 16 ; // right-shift amount applied to sign32 to derive sign16 (sign bit 31 -> bit 15)
23
23
24
- static int32_t const inf32 = 0x7F800000 ; // FP32 infinity bit pattern
25
- static int32_t const max32 = 0x477FE000 ; // max FP16 normal as a FP32 bit pattern
26
- static int32_t const min32 = 0x38800000 ; // min FP16 normal as a FP32 bit pattern
27
- static int32_t const sign32 = 0x80000000 ; // FP32 sign bit pattern
24
+ static int32_t constexpr inf32 = 0x7F800000 ; // FP32 infinity bit pattern
25
+ static int32_t constexpr max32 = 0x477FE000 ; // max FP16 normal as a FP32 bit pattern
26
+ static int32_t constexpr min32 = 0x38800000 ; // min FP16 normal as a FP32 bit pattern
27
+ static int32_t constexpr sign32 = 0x80000000 ; // FP32 sign bit pattern
28
28
29
- static int32_t const inf16 = inf32 >> shift; // FP16 infinity bit pattern
30
- static int32_t const nan16 = (inf16 + 1 ) << shift; // min FP16 NaN as a FP32 bit pattern
31
- static int32_t const max16 = max32 >> shift; // max FP16 normal bit pattern
32
- static int32_t const min16 = min32 >> shift; // min FP16 normal bit pattern
33
- static int32_t const sign16 = sign32 >> shiftSign; // FP16 sign bit pattern
29
+ static int32_t constexpr inf16 = inf32 >> shift; // FP16 infinity bit pattern
30
+ static int32_t constexpr nan16 = (inf16 + 1 ) << shift; // min FP16 NaN as a FP32 bit pattern
31
+ static int32_t constexpr max16 = max32 >> shift; // max FP16 normal bit pattern
32
+ static int32_t constexpr min16 = min32 >> shift; // min FP16 normal bit pattern
33
+ static int32_t constexpr sign16 = sign32 >> shiftSign; // FP16 sign bit pattern; NOTE(review): sign32 is negative as int32_t, so this shift presumably sign-extends (0xFFFF8000 rather than 0x8000) - confirm callers mask accordingly
34
34
35
- static int32_t const mul32to16 = 0x52000000 ; // multiplier to convert FP32 to FP16
36
- static int32_t const mul16to32 = 0x33800000 ; // multiplier to convert FP16 to FP32
35
+ static int32_t constexpr mul32to16 = 0x52000000 ; // FP32 bit pattern of 2^37; loaded via .sint and used as a float multiplier (.fl) for the FP32 -> FP16 subnormal correction
36
+ static int32_t constexpr mul16to32 = 0x33800000 ; // FP32 bit pattern of 2^-24; presumably the float multiplier for the FP16 -> FP32 direction (use not visible here - confirm)
37
37
38
- static int32_t const sub32 = 0x003FF ; // maximum float32 subnormal value, down shifted
39
- static int32_t const nor32 = 0x00400 ; // minimum float32 normal value, down shifted
38
+ static int32_t constexpr sub32 = 0x003FF ; // maximum float32 subnormal value, down shifted
39
+ static int32_t constexpr nor32 = 0x00400 ; // minimum float32 normal value, down shifted
40
40
41
- static int32_t const maxDiff = inf16 - max16 - 1 ; // diff. between max FP16 normal and infinity
42
- static int32_t const minDiff = min16 - sub32 - 1 ; // diff. between min FP16 normal and max float32 subnormal
41
+ static int32_t constexpr maxDiff = inf16 - max16 - 1 ; // diff. between max FP16 normal and infinity
42
+ static int32_t constexpr minDiff = min16 - sub32 - 1 ; // diff. between min FP16 normal and max float32 subnormal
43
43
44
44
45
45
public:
@@ -55,7 +55,7 @@ namespace mim
55
55
56
56
// Calculate the shifted bits of the floating-point number.
57
57
shift_bits.sint = mul32to16;
58
- shift_bits.sint = ( int32_t ) (shift_bits.fl * value_bits.fl ); // correct subnormals
58
+ shift_bits.sint = static_cast < int32_t > (shift_bits.fl * value_bits.fl ); // correct subnormals
59
59
60
60
// Adjust the floating-point number's bits based on certain conditions.
61
61
value_bits.sint ^= (shift_bits.sint ^ value_bits.sint ) & -(min32 > value_bits.sint );
@@ -70,7 +70,7 @@ namespace mim
70
70
value_bits.sint ^= ((value_bits.sint - minDiff) ^ value_bits.sint ) & -(value_bits.sint > sub32);
71
71
72
72
// Combine the sign and bits of the compressed number and return it.
73
- return ( uint16_t ) (value_bits.uint | sign_bits);
73
+ return static_cast < uint16_t > (value_bits.uint | sign_bits);
74
74
}
75
75
76
76
static float Decompress (uint16_t compressed_value)
0 commit comments