@@ -185,13 +185,13 @@ static void memset_u32_fast(void *dst, const u32 val)
 }

 template <size_t LENGTH>
-static void stream_copy_fast(void *__restrict dst, void *__restrict src)
+static void stream_copy_fast(void *__restrict dst, const void *__restrict src)
 {
 	MACRODO_N( LENGTH / sizeof(v512s8), _mm512_stream_si512((v512s8 *)dst + (X), _mm512_stream_load_si512((v512s8 *)src + (X))) );
 }

 template <size_t LENGTH>
-static void buffer_copy_fast(void *__restrict dst, void *__restrict src)
+static void buffer_copy_fast(void *__restrict dst, const void *__restrict src)
 {
 	MACRODO_N( LENGTH / sizeof(v512s8), _mm512_store_si512((v512s8 *)dst + (X), _mm512_load_si512((v512s8 *)src + (X))) );
 }
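Every hunk in this commit makes the same change: the source parameter becomes const void *__restrict. A minimal sketch of what that enables at a call site (the buffer names are hypothetical, and the alignas(64) reflects the alignment the aligned AVX-512 load/store intrinsics require):

// Hypothetical caller; u8 stands for the project's 8-bit typedef.
alignas(64) static const u8 srcBuffer[4096] = {}; // read-only source
alignas(64) static u8 dstBuffer[4096];

static void copyFrame()
{
	// With the old signature, passing a pointer-to-const source needed a
	// const_cast; with the new one, this compiles as-is.
	buffer_copy_fast<sizeof(srcBuffer)>(dstBuffer, srcBuffer);
}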
@@ -479,7 +479,7 @@ static void memset_u32_fast(void *dst, const u32 val)
 }

 template <size_t VECLENGTH>
-static void stream_copy_fast(void *__restrict dst, void *__restrict src)
+static void stream_copy_fast(void *__restrict dst, const void *__restrict src)
 {
 #ifdef ENABLE_SSE4_1
 	MACRODO_N( VECLENGTH / sizeof(v128s8), _mm_stream_si128((v128s8 *)dst + (X), _mm_stream_load_si128((v128s8 *)src + (X))) );
@@ -489,7 +489,7 @@ static void stream_copy_fast(void *__restrict dst, void *__restrict src)
 }

 template <size_t VECLENGTH>
-static void buffer_copy_fast(void *__restrict dst, void *__restrict src)
+static void buffer_copy_fast(void *__restrict dst, const void *__restrict src)
 {
 	MACRODO_N( VECLENGTH / sizeof(v128s8), _mm_store_si128((v128s8 *)dst + (X), _mm_load_si128((v128s8 *)src + (X))) );
 }
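For reference, a hedged illustration of what the SSE4.1 path expands to, assuming MACRODO_N(N, EXPR) evaluates EXPR for X = 0 through N-1 and that v128s8 is a 16-byte vector type (__m128i): stream_copy_fast<64> amounts to four non-temporal 16-byte moves that bypass the cache hierarchy.

#include <smmintrin.h> // SSE4.1: _mm_stream_load_si128

// Sketch only; not the actual template instantiation.
static void stream_copy_64(void *__restrict dst, const void *__restrict src)
{
	_mm_stream_si128((__m128i *)dst + 0, _mm_stream_load_si128((__m128i *)src + 0));
	_mm_stream_si128((__m128i *)dst + 1, _mm_stream_load_si128((__m128i *)src + 1));
	_mm_stream_si128((__m128i *)dst + 2, _mm_stream_load_si128((__m128i *)src + 2));
	_mm_stream_si128((__m128i *)dst + 3, _mm_stream_load_si128((__m128i *)src + 3));
}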
@@ -606,13 +606,13 @@ static void memset_u32_fast(void *dst, const u32 val)
 }

 template <size_t VECLENGTH>
-static void buffer_copy_fast(void *__restrict dst, void *__restrict src)
+static void buffer_copy_fast(void *__restrict dst, const void *__restrict src)
 {
 	MACRODO_N( VECLENGTH / sizeof(uint8x16x4_t), vst1q_u8_x4((u8 *)dst + ((X) * sizeof(uint8x16x4_t)), vld1q_u8_x4((u8 *)src + ((X) * sizeof(uint8x16x4_t)))) );
 }

 template <size_t VECLENGTH>
-static void stream_copy_fast(void *__restrict dst, void *__restrict src)
+static void stream_copy_fast(void *__restrict dst, const void *__restrict src)
 {
 	// NEON doesn't have the same temporal/caching distinctions that SSE and AVX do,
 	// so just use buffer_copy_fast() for this function too.
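As the comment says, NEON has no non-temporal load/store variants, so both templates perform ordinary vector copies. One MACRODO_N step moves a whole uint8x16x4_t (64 bytes) via a four-register structure load/store pair; a hedged sketch of a single step:

#include <arm_neon.h>

// One 64-byte step (sizeof(uint8x16x4_t) == 64); sketch only.
static void neon_copy_64(void *__restrict dst, const void *__restrict src)
{
	vst1q_u8_x4((uint8_t *)dst, vld1q_u8_x4((const uint8_t *)src));
}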
@@ -656,10 +656,10 @@ static void buffer_copy_or_constant_s8(void *__restrict dst, const void *__restr
 }

 template <size_t VECLENGTH>
-static void buffer_copy_or_constant_s8_fast(void *__restrict dst, void *__restrict src, const s8 c)
+static void buffer_copy_or_constant_s8_fast(void *__restrict dst, const void *__restrict src, const s8 c)
 {
 	const v128u8 c_vec = vreinterpretq_u8_s8(vdupq_n_s8(c));
-	__buffer_copy_or_constant_fast<VECLENGTH, false>(dst, src, c_vec);
+	__buffer_copy_or_constant_fast<VECLENGTH>(dst, src, c_vec);
 }

 template <bool NEEDENDIANSWAP>
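This hunk carries a second change: the stale false template argument is dropped from __buffer_copy_or_constant_fast. The s16/s32 callers below already pass only VECLENGTH, so the endian-swap decision evidently lives outside the helper (byte data never needs swapping, and the wider constants are swapped before being splatted). A speculative reconstruction of the helper's shape, for illustration only:

// Hypothetical; the real helper is not shown in this diff. Assumes v128u8
// is uint8x16_t and that the helper ORs the splatted constant into each
// 16-byte vector as it copies.
template <size_t VECLENGTH>
static void __buffer_copy_or_constant_fast(void *__restrict dst, const void *__restrict src, const v128u8 c_vec)
{
	for (size_t i = 0; i < VECLENGTH; i += sizeof(v128u8))
		vst1q_u8((u8 *)dst + i, vorrq_u8(vld1q_u8((const u8 *)src + i), c_vec));
}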
@@ -670,7 +670,7 @@ static void buffer_copy_or_constant_s16(void *__restrict dst, const void *__rest
 }

 template <size_t VECLENGTH, bool NEEDENDIANSWAP>
-static void buffer_copy_or_constant_s16_fast(void *__restrict dst, void *__restrict src, const s16 c)
+static void buffer_copy_or_constant_s16_fast(void *__restrict dst, const void *__restrict src, const s16 c)
 {
 	const v128u8 c_vec = vreinterpretq_u8_s16(vdupq_n_s16(c));
 	__buffer_copy_or_constant_fast<VECLENGTH>(dst, src, c_vec);
@@ -684,7 +684,7 @@ static void buffer_copy_or_constant_s32(void *__restrict dst, const void *__rest
 }

 template <size_t VECLENGTH, bool NEEDENDIANSWAP>
-static void buffer_copy_or_constant_s32_fast(void *__restrict dst, void *__restrict src, const s32 c)
+static void buffer_copy_or_constant_s32_fast(void *__restrict dst, const void *__restrict src, const s32 c)
 {
 	const v128u8 c_vec = vreinterpretq_u8_s32(vdupq_n_s32(c));
 	__buffer_copy_or_constant_fast<VECLENGTH>(dst, src, c_vec);
@@ -731,13 +731,13 @@ static void memset_u32_fast(void *dst, const u32 val)
 }

 template <size_t VECLENGTH>
-static void buffer_copy_fast(void *__restrict dst, void *__restrict src)
+static void buffer_copy_fast(void *__restrict dst, const void *__restrict src)
 {
 	MACRODO_N( VECLENGTH / sizeof(v128s8), vec_st(vec_ld((X)*sizeof(v128s8),(u8 *__restrict)src), (X)*sizeof(v128s8), (u8 *__restrict)dst) );
 }

 template <size_t VECLENGTH>
-static void stream_copy_fast(void *__restrict dst, void *__restrict src)
+static void stream_copy_fast(void *__restrict dst, const void *__restrict src)
 {
 	// AltiVec doesn't have the same temporal/caching distinctions that SSE and AVX do,
 	// so just use buffer_copy_fast() for this function too.
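Unlike the pointer-indexed SSE/NEON forms, AltiVec's vec_ld/vec_st take a byte offset plus a base address, which is why the step above passes (X)*sizeof(v128s8) explicitly. The X == 0 step, written out (sketch only):

// One 16-byte AltiVec copy step; subsequent steps advance the offset in
// 16-byte increments.
vec_st(vec_ld(0, (const u8 *)src), 0, (u8 *)dst);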
@@ -782,7 +782,7 @@ static void buffer_copy_or_constant_s8(void *__restrict dst, const void *__restr
 }

 template <size_t VECLENGTH>
-static void buffer_copy_or_constant_s8_fast(void *__restrict dst, void *__restrict src, const s8 c)
+static void buffer_copy_or_constant_s8_fast(void *__restrict dst, const void *__restrict src, const s8 c)
 {
 	const v128s8 c_vec = {c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
 	__buffer_copy_or_constant_fast<v128s8, VECLENGTH>(dst, src, c_vec);
@@ -797,7 +797,7 @@ static void buffer_copy_or_constant_s16(void *__restrict dst, const void *__rest
 }

 template <size_t VECLENGTH, bool NEEDENDIANSWAP>
-static void buffer_copy_or_constant_s16_fast(void *__restrict dst, void *__restrict src, const s16 c)
+static void buffer_copy_or_constant_s16_fast(void *__restrict dst, const void *__restrict src, const s16 c)
 {
 	const s16 c_16 = (NEEDENDIANSWAP) ? LE_TO_LOCAL_16(c) : c;
 	const v128s16 c_vec = {c_16, c_16, c_16, c_16, c_16, c_16, c_16, c_16};
@@ -813,7 +813,7 @@ static void buffer_copy_or_constant_s32(void *__restrict dst, const void *__rest
 }

 template <size_t VECLENGTH, bool NEEDENDIANSWAP>
-static void buffer_copy_or_constant_s32_fast(void *__restrict dst, void *__restrict src, const s32 c)
+static void buffer_copy_or_constant_s32_fast(void *__restrict dst, const void *__restrict src, const s32 c)
 {
 	const s32 c_32 = (NEEDENDIANSWAP) ? LE_TO_LOCAL_32(c) : c;
 	const v128s32 c_vec = {c_32, c_32, c_32, c_32};
@@ -889,13 +889,13 @@ static void memset_u32_fast(void *dst, const u32 val)
 // vector intrinsics to control the temporal/caching behavior.

 template <size_t VECLENGTH>
-static void stream_copy_fast(void *__restrict dst, void *__restrict src)
+static void stream_copy_fast(void *__restrict dst, const void *__restrict src)
 {
 	memcpy(dst, src, VECLENGTH);
 }

 template <size_t VECLENGTH>
-static void buffer_copy_fast(void *__restrict dst, void *__restrict src)
+static void buffer_copy_fast(void *__restrict dst, const void *__restrict src)
 {
 	memcpy(dst, src, VECLENGTH);
 }
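On this generic path, VECLENGTH is still a compile-time constant, so these fixed-size memcpy calls are typically inlined by the optimizer; only the non-temporal hint is lost, as the comment above notes. A hypothetical call, reusing the buffers from the earlier sketch:

// Usually compiles to an inlined, fixed-size copy; an expectation about
// optimizers, not a guarantee made by this code.
stream_copy_fast<256>(dstBuffer, srcBuffer);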
@@ -920,7 +920,7 @@ static void buffer_copy_or_constant_s8(void *__restrict dst, const void *__restr
 }

 template <size_t VECLENGTH>
-static void buffer_copy_or_constant_s8_fast(void *__restrict dst, void *__restrict src, const s8 c)
+static void buffer_copy_or_constant_s8_fast(void *__restrict dst, const void *__restrict src, const s8 c)
 {
 #ifdef HOST_64
 	s64 *src_64 = (s64 *)src;
@@ -980,7 +980,7 @@ static void buffer_copy_or_constant_s16(void *__restrict dst, const void *__rest
 }

 template <size_t VECLENGTH, bool NEEDENDIANSWAP>
-static void buffer_copy_or_constant_s16_fast(void *__restrict dst, void *__restrict src, const s16 c)
+static void buffer_copy_or_constant_s16_fast(void *__restrict dst, const void *__restrict src, const s16 c)
 {
 #ifdef HOST_64
 	s64 *src_64 = (s64 *)src;
@@ -1049,7 +1049,7 @@ static void buffer_copy_or_constant_s32(void *__restrict dst, const void *__rest
 }

 template <size_t VECLENGTH, bool NEEDENDIANSWAP>
-static void buffer_copy_or_constant_s32_fast(void *__restrict dst, void *__restrict src, const s32 c)
+static void buffer_copy_or_constant_s32_fast(void *__restrict dst, const void *__restrict src, const s32 c)
 {
 #ifdef HOST_64
 	s64 *src_64 = (s64 *)src;
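The context lines ending these hunks hint at the scalar HOST_64 path: the buffer is walked in 8-byte strides with the OR-constant broadcast across a 64-bit word. The bodies are cut off by the diff, so here is a hedged sketch of that pattern (u64/u16 stand in for the project's fixed-width typedefs):

// Hypothetical illustration only.
static void or_constant_s16_scalar(void *__restrict dst, const void *__restrict src, const s16 c, const size_t length)
{
	const u64 c_64 = (u64)(u16)c * 0x0001000100010001ULL; // splat c into all four 16-bit lanes
	const u64 *src_64 = (const u64 *)src;
	u64 *dst_64 = (u64 *)dst;
	for (size_t i = 0; i < length / sizeof(u64); i++)
		dst_64[i] = src_64[i] | c_64; // copy and OR in one pass
}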