-
Notifications
You must be signed in to change notification settings - Fork 24
/
Copy pathintrinsics.h
153 lines (140 loc) · 4.56 KB
/
intrinsics.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
// INTRINSICS
// D. TAYLOR 2014
#ifndef __INTRINSICS_H__
#define __INTRINSICS_H__
#include "config.h"
// issue warnings when not using full hardware acceleration
// Only checked for ARM toolchains (ARMCC, or GCC targeting ARM); other
// compilers silently use the portable C fallbacks further down.
// An identifier that is not defined evaluates as 0 inside #if, so a missing
// __CORTEX_M definition also triggers the "< M3" warning.
// NOTE(review): __CORTEX_M is presumably provided via config.h (CMSIS core
// header) - confirm it is defined before this point.
#if defined(__ARMCC_VERSION) || (defined(__GNUC__) && defined(__arm__))
#if (__CORTEX_M < 0x03)
#warning "Cortex-M core < M3 detected; hardware acceleration for math operations not supported"
#elif (__CORTEX_M < 0x04)
#warning "Cortex-M core < M4 detected; partial hardware acceleration for math operations supported"
#endif
#endif
// Bit reversal of a 32-bit word (ARM: RBIT).
// Bit 0 of x ends up in bit 31 of the return value, bit 1 in bit 30, etc.
// Uses the RBIT instruction on Cortex-M3 and above (or SecurCore SC300);
// otherwise falls back to a portable swap network.
__INLINE
uint32_t rbit(uint32_t x) {
#if defined(__ARMCC_VERSION) && ((__CORTEX_M >= 0x03) || (__CORTEX_SC >= 300))
    uint32_t reversed;
    __asm{ rbit reversed, x }
    return reversed;
#elif defined(__GNUC__) && defined(__arm__) && ((__CORTEX_M >= 0x03) || (__CORTEX_SC >= 300))
    uint32_t reversed;
    __asm("rbit %0, %1" : "=r"(reversed) : "r"(x));
    return reversed;
#else
    // Exchange neighbouring groups of 1, 2, 4, 8 and finally 16 bits.
    uint32_t w = x;
    w = ((w >> 1) & 0x55555555u) | ((w & 0x55555555u) << 1);
    w = ((w >> 2) & 0x33333333u) | ((w & 0x33333333u) << 2);
    w = ((w >> 4) & 0x0f0f0f0fu) | ((w & 0x0f0f0f0fu) << 4);
    w = ((w >> 8) & 0x00ff00ffu) | ((w & 0x00ff00ffu) << 8);
    return (w << 16) | (w >> 16);
#endif
}
// Reverse the low BITS bits of W: the full 32-bit reversal is shifted right
// so the reversed field ends up right-aligned in the result.
// BITS must be in 1..32; BITS == 0 would shift a 32-bit value by 32, which
// is undefined in C.
#define RBITS(W, BITS) (rbit(W) >> (32 - (BITS)))
// Number of leading zero bits in a 32-bit word (ARM: CLZ).
// Returns 32 for x == 0 and 0 when bit 31 is set.
// Uses the CLZ instruction on Cortex-M3 and above (or SecurCore SC300);
// otherwise computes the count in portable C.
__INLINE
uint32_t clz(uint32_t x) {
#if defined(__ARMCC_VERSION) && ((__CORTEX_M >= 0x03) || (__CORTEX_SC >= 300))
    uint32_t zeros;
    __asm{ clz zeros, x }
    return zeros;
#elif defined(__GNUC__) && defined(__arm__) && ((__CORTEX_M >= 0x03) || (__CORTEX_SC >= 300))
    uint32_t zeros;
    __asm("clz %0, %1" : "=r"(zeros) : "r"(x));
    return zeros;
#else
    uint32_t smear = x;
    uint32_t ones;

    // Copy the highest set bit into every lower position.
    smear |= smear >> 1;
    smear |= smear >> 2;
    smear |= smear >> 4;
    smear |= smear >> 8;
    smear |= smear >> 16;

    // Count the set bits of the smeared word (parallel bit count):
    // 2-bit sums, then 4-bit sums, then per-byte sums, then the byte total
    // gathered into the top byte by the multiply.
    ones = smear - ((smear >> 1) & 0x55555555);
    ones = (ones & 0x33333333) + ((ones >> 2) & 0x33333333);
    ones = (ones + (ones >> 4)) & 0x0F0F0F0F;
    ones = (ones * 0x01010101) >> 24;

    // Everything above the highest set bit is a leading zero.
    return 32 - ones;
#endif
}
// Signed 32x32 multiply, keep the rounded most-significant 32 bits of the
// 64-bit product and add a 32-bit accumulator (ARM: SMMLAR).
// floating point equivalent: return c + a * b
// Uses the SMMLAR instruction on Cortex-M4 and above; otherwise the same
// value is computed with 64-bit arithmetic.
__INLINE
int32_t smmlar(int32_t a, int32_t b, int32_t c) {
#if defined(__ARMCC_VERSION) && (__CORTEX_M >= 0x04U)
    int32_t acc;
    __asm{ smmlar acc, a, b, c }
    return acc;
#elif defined(__GNUC__) && defined(__arm__) && (__CORTEX_M >= 0x04U)
    int32_t acc;
    __asm("smmlar %0, %1, %2, %3" : "=r"(acc) : "r"(a), "r"(b), "r"(c));
    return acc;
#else
    // Adding 0x80000000 (half of 2^32) before dropping the low word rounds
    // the product to nearest instead of truncating.
    const int64_t product = (int64_t)a * b;
    const int64_t rounded_high = (product + 0x80000000) >> 32;
    int32_t acc;
    acc = c + rounded_high;
    return acc;
#endif
}
// Signed 32x32 multiply, subtract the 64-bit product from the accumulator
// placed in the upper word, round, and keep the most-significant 32 bits
// (ARM: SMMLSR).
// floating point equivalent: return c - a * b
// Uses the SMMLSR instruction on Cortex-M4 and above; otherwise the same
// value is computed with 64-bit arithmetic.
__INLINE
int32_t smmlsr(int32_t a, int32_t b, int32_t c) {
    int32_t result;
#if defined(__ARMCC_VERSION) && (__CORTEX_M >= 0x04U)
    __asm{ smmlsr result, a, b, c }
#elif defined(__GNUC__) && defined(__arm__) && (__CORTEX_M >= 0x04U)
    __asm("smmlsr %0, %1, %2, %3":"=r"(result):"r"(a),"r"(b),"r"(c));
#else
    // Mirror the instruction: ((c << 32) - a * b + 0x80000000) bits [63:32].
    // The rounding constant must be added AFTER the subtraction; rounding
    // the product first and then subtracting is off by one whenever the low
    // word of a * b is exactly 0x80000000.
    // Unsigned 64-bit arithmetic is used so the accumulation wraps modulo
    // 2^64 instead of overflowing a signed type.
    uint64_t acc = ((uint64_t)(uint32_t)c << 32)
                 - (uint64_t)((int64_t)a * b)
                 + 0x80000000u;
    result = (int32_t)(uint32_t)(acc >> 32);
#endif
    return result;
}
// Signed 32x32 multiply returning the rounded most-significant 32 bits of
// the 64-bit product (ARM: SMMULR).
// floating point equivalent: return a * b
// Uses the SMMULR instruction on Cortex-M4 and above; otherwise the same
// value is computed with 64-bit arithmetic.
__INLINE
int32_t smmulr(int32_t a, int32_t b) {
#if defined(__ARMCC_VERSION) && (__CORTEX_M >= 0x04U)
    int32_t high;
    __asm{ smmulr high, a, b }
    return high;
#elif defined(__GNUC__) && defined(__arm__) && (__CORTEX_M >= 0x04U)
    int32_t high;
    __asm("smmulr %0, %1, %2" : "=r"(high) : "r"(a), "r"(b));
    return high;
#else
    // Adding 0x80000000 (half of 2^32) before dropping the low word rounds
    // the product to nearest instead of truncating.
    const int64_t product = (int64_t)a * b;
    int32_t high;
    high = (product + 0x80000000) >> 32;
    return high;
#endif
}
// saturating add (ARM: QADD)
// Returns a + b clamped to the int32_t range [-2147483648, 2147483647]
// instead of wrapping on overflow.
// floating point equivalent: return max(min(a + b, 1), -1)
// NOTE(review): unlike the smm* helpers there is no __CORTEX_M check here;
// QADD is understood to be a DSP-extension instruction - confirm that every
// ARM target this header is built for provides it.
__INLINE
int32_t qadd(int32_t a, int32_t b) {
    int32_t result;
#if defined(__ARMCC_VERSION)
    __asm{ qadd result, a, b }
#elif defined(__GNUC__) && defined(__arm__)
    // Require an ARM target: GCC/Clang for any other architecture must take
    // the portable path below, since "qadd" is not a valid mnemonic there.
    __asm("qadd %0, %1, %2":"=r"(result):"r"(a),"r"(b));
#else
    // Widen to 64 bits so the exact sum is representable, then clamp.
    int64_t sum = (int64_t)a + b;
    if (sum > 2147483647) {
        sum = 2147483647;
    } else if (sum < -2147483648LL) {
        sum = -2147483648LL;
    }
    result = (int32_t)sum;
#endif
    return result;
}
// saturating subtract (ARM: QSUB)
// Returns a - b clamped to the int32_t range [-2147483648, 2147483647]
// instead of wrapping on overflow.
// floating point equivalent: return max(min(a - b, 1), -1)
// NOTE(review): unlike the smm* helpers there is no __CORTEX_M check here;
// QSUB is understood to be a DSP-extension instruction - confirm that every
// ARM target this header is built for provides it.
__INLINE
int32_t qsub(int32_t a, int32_t b) {
    // Signed, to match the return type: a negative difference no longer goes
    // through an unsigned-to-signed conversion on return.
    int32_t result;
#if defined(__ARMCC_VERSION)
    __asm{ qsub result, a, b }
#elif defined(__GNUC__) && defined(__arm__)
    __asm("qsub %0, %1, %2":"=r"(result):"r"(a),"r"(b));
#else
    // Widen to 64 bits so the exact difference is representable, then clamp.
    int64_t diff = (int64_t)a - b;
    if (diff > 2147483647) {
        diff = 2147483647;
    } else if (diff < -2147483648LL) {
        diff = -2147483648LL;
    }
    result = (int32_t)diff;
#endif
    return result;
}
// 32-bit arithmetic shift right with rounding (ARM: ASRS + ADC)
// Shifts v right by s bits and adds back the last bit shifted out, i.e.
// rounds to nearest with ties going towards +infinity.
// floating point equivalent: return v / pow(2, s)
// s is expected to be in 1..31. The portable path additionally returns v
// unchanged for s <= 0 and 0 for s >= 32, so it never performs an undefined
// shift.
__INLINE
int32_t asrr(int32_t v, int32_t s) {
    int32_t result;
#if defined(__ARMCC_VERSION)
    // Both instructions live in one asm block so that ADC consumes the carry
    // produced by ASRS. The explicit #0 operand matters: the two-operand
    // form "adc result, result" would compute result + result + carry.
    __asm {
        asrs result, v, s
        adc  result, result, #0
    }
#elif defined(__GNUC__) && defined(__arm__)
    // A single asm statement: the compiler cannot schedule flag-clobbering
    // code between the shift and the add-with-carry.
    __asm("asrs %0, %1, %2\n\t"
          "adc %0, %0, #0"
          : "=r"(result)
          : "r"(v), "r"(s)
          : "cc");
#else
    if (s <= 0) {
        // Nothing is shifted out, so there is nothing to round.
        result = v;
    } else if (s >= 32) {
        // Every bit is shifted out: the sign fill (-1 or 0) plus the sign
        // bit as rounding carry always sums to 0.
        result = 0;
    } else {
        // Shift first, then add the last bit shifted out. Unlike
        // (v + (1 << (s - 1))) >> s this cannot overflow for large v.
        result = (v >> s) + ((v >> (s - 1)) & 1);
    }
#endif
    return result;
}
#endif