@@ -185,13 +185,13 @@ static void memset_u32_fast(void *dst, const u32 val)
 }

 template <size_t LENGTH>
-static void stream_copy_fast(void *__restrict dst, void *__restrict src)
+static void stream_copy_fast(void *__restrict dst, const void *__restrict src)
 {
 	MACRODO_N( LENGTH / sizeof(v512s8), _mm512_stream_si512((v512s8 *)dst + (X), _mm512_stream_load_si512((v512s8 *)src + (X))) );
 }

 template <size_t LENGTH>
-static void buffer_copy_fast(void *__restrict dst, void *__restrict src)
+static void buffer_copy_fast(void *__restrict dst, const void *__restrict src)
 {
 	MACRODO_N( LENGTH / sizeof(v512s8), _mm512_store_si512((v512s8 *)dst + (X), _mm512_load_si512((v512s8 *)src + (X))) );
 }
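Every hunk in this commit makes the same change: the source parameter becomes const void *__restrict. A minimal sketch of what that enables at a call site (the buffer names are hypothetical, and the alignas(64) reflects the alignment the aligned AVX-512 load/store intrinsics require):

// Hypothetical caller; u8 stands for the project's 8-bit typedef.
alignas(64) static const u8 srcBuffer[4096] = {}; // read-only source
alignas(64) static u8 dstBuffer[4096];

static void copyFrame()
{
	// With the old signature, passing a pointer-to-const source needed a
	// const_cast; with the new one, this compiles as-is.
	buffer_copy_fast<sizeof(srcBuffer)>(dstBuffer, srcBuffer);
}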
@@ -479,7 +479,7 @@ static void memset_u32_fast(void *dst, const u32 val)
 }

 template <size_t VECLENGTH>
-static void stream_copy_fast(void *__restrict dst, void *__restrict src)
+static void stream_copy_fast(void *__restrict dst, const void *__restrict src)
 {
 #ifdef ENABLE_SSE4_1
 	MACRODO_N( VECLENGTH / sizeof(v128s8), _mm_stream_si128((v128s8 *)dst + (X), _mm_stream_load_si128((v128s8 *)src + (X))) );
@@ -489,7 +489,7 @@ static void stream_copy_fast(void *__restrict dst, void *__restrict src)
 }

 template <size_t VECLENGTH>
-static void buffer_copy_fast(void *__restrict dst, void *__restrict src)
+static void buffer_copy_fast(void *__restrict dst, const void *__restrict src)
 {
 	MACRODO_N( VECLENGTH / sizeof(v128s8), _mm_store_si128((v128s8 *)dst + (X), _mm_load_si128((v128s8 *)src + (X))) );
 }
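For reference, a hedged illustration of what the SSE4.1 path expands to, assuming MACRODO_N(N, EXPR) evaluates EXPR for X = 0 through N-1 and that v128s8 is a 16-byte vector type (__m128i): stream_copy_fast<64> amounts to four non-temporal 16-byte moves that bypass the cache hierarchy.

#include <smmintrin.h> // SSE4.1: _mm_stream_load_si128

// Sketch only; not the actual template instantiation.
static void stream_copy_64(void *__restrict dst, const void *__restrict src)
{
	_mm_stream_si128((__m128i *)dst + 0, _mm_stream_load_si128((__m128i *)src + 0));
	_mm_stream_si128((__m128i *)dst + 1, _mm_stream_load_si128((__m128i *)src + 1));
	_mm_stream_si128((__m128i *)dst + 2, _mm_stream_load_si128((__m128i *)src + 2));
	_mm_stream_si128((__m128i *)dst + 3, _mm_stream_load_si128((__m128i *)src + 3));
}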
@@ -606,13 +606,13 @@ static void memset_u32_fast(void *dst, const u32 val)
 }

 template <size_t VECLENGTH>
-static void buffer_copy_fast(void *__restrict dst, void *__restrict src)
+static void buffer_copy_fast(void *__restrict dst, const void *__restrict src)
 {
 	MACRODO_N( VECLENGTH / sizeof(uint8x16x4_t), vst1q_u8_x4((u8 *)dst + ((X) * sizeof(uint8x16x4_t)), vld1q_u8_x4((u8 *)src + ((X) * sizeof(uint8x16x4_t)))) );
 }

 template <size_t VECLENGTH>
-static void stream_copy_fast(void *__restrict dst, void *__restrict src)
+static void stream_copy_fast(void *__restrict dst, const void *__restrict src)
 {
 	// NEON doesn't have the same temporal/caching distinctions that SSE and AVX do,
 	// so just use buffer_copy_fast() for this function too.
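As the comment says, NEON has no non-temporal load/store variants, so both templates perform ordinary vector copies. One MACRODO_N step moves a whole uint8x16x4_t (64 bytes) via a four-register structure load/store pair; a hedged sketch of a single step:

#include <arm_neon.h>

// One 64-byte step (sizeof(uint8x16x4_t) == 64); sketch only.
static void neon_copy_64(void *__restrict dst, const void *__restrict src)
{
	vst1q_u8_x4((uint8_t *)dst, vld1q_u8_x4((const uint8_t *)src));
}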
@@ -656,10 +656,10 @@ static void buffer_copy_or_constant_s8(void *__restrict dst, const void *__restr
 }

 template <size_t VECLENGTH>
-static void buffer_copy_or_constant_s8_fast(void *__restrict dst, void *__restrict src, const s8 c)
+static void buffer_copy_or_constant_s8_fast(void *__restrict dst, const void *__restrict src, const s8 c)
 {
 	const v128u8 c_vec = vreinterpretq_u8_s8(vdupq_n_s8(c));
-	__buffer_copy_or_constant_fast<VECLENGTH, false>(dst, src, c_vec);
+	__buffer_copy_or_constant_fast<VECLENGTH>(dst, src, c_vec);
 }

 template <bool NEEDENDIANSWAP>
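This hunk carries a second change: the stale false template argument is dropped from __buffer_copy_or_constant_fast. The s16/s32 callers below already pass only VECLENGTH, so the endian-swap decision evidently lives outside the helper (byte data never needs swapping, and the wider constants are swapped before being splatted). A speculative reconstruction of the helper's shape, for illustration only:

// Hypothetical; the real helper is not shown in this diff. Assumes v128u8
// is uint8x16_t and that the helper ORs the splatted constant into each
// 16-byte vector as it copies.
template <size_t VECLENGTH>
static void __buffer_copy_or_constant_fast(void *__restrict dst, const void *__restrict src, const v128u8 c_vec)
{
	for (size_t i = 0; i < VECLENGTH; i += sizeof(v128u8))
		vst1q_u8((u8 *)dst + i, vorrq_u8(vld1q_u8((const u8 *)src + i), c_vec));
}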
@@ -670,7 +670,7 @@ static void buffer_copy_or_constant_s16(void *__restrict dst, const void *__rest
 }

 template <size_t VECLENGTH, bool NEEDENDIANSWAP>
-static void buffer_copy_or_constant_s16_fast(void *__restrict dst, void *__restrict src, const s16 c)
+static void buffer_copy_or_constant_s16_fast(void *__restrict dst, const void *__restrict src, const s16 c)
 {
 	const v128u8 c_vec = vreinterpretq_u8_s16(vdupq_n_s16(c));
 	__buffer_copy_or_constant_fast<VECLENGTH>(dst, src, c_vec);
@@ -684,7 +684,7 @@ static void buffer_copy_or_constant_s32(void *__restrict dst, const void *__rest
 }

 template <size_t VECLENGTH, bool NEEDENDIANSWAP>
-static void buffer_copy_or_constant_s32_fast(void *__restrict dst, void *__restrict src, const s32 c)
+static void buffer_copy_or_constant_s32_fast(void *__restrict dst, const void *__restrict src, const s32 c)
 {
 	const v128u8 c_vec = vreinterpretq_u8_s32(vdupq_n_s32(c));
 	__buffer_copy_or_constant_fast<VECLENGTH>(dst, src, c_vec);
@@ -731,13 +731,13 @@ static void memset_u32_fast(void *dst, const u32 val)
 }

 template <size_t VECLENGTH>
-static void buffer_copy_fast(void *__restrict dst, void *__restrict src)
+static void buffer_copy_fast(void *__restrict dst, const void *__restrict src)
 {
 	MACRODO_N( VECLENGTH / sizeof(v128s8), vec_st(vec_ld((X)*sizeof(v128s8),(u8 *__restrict)src), (X)*sizeof(v128s8), (u8 *__restrict)dst) );
 }

 template <size_t VECLENGTH>
-static void stream_copy_fast(void *__restrict dst, void *__restrict src)
+static void stream_copy_fast(void *__restrict dst, const void *__restrict src)
 {
 	// AltiVec doesn't have the same temporal/caching distinctions that SSE and AVX do,
 	// so just use buffer_copy_fast() for this function too.
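Unlike the pointer-indexed SSE/NEON forms, AltiVec's vec_ld/vec_st take a byte offset plus a base address, which is why the step above passes (X)*sizeof(v128s8) explicitly. The X == 0 step, written out (sketch only):

// One 16-byte AltiVec copy step; subsequent steps advance the offset in
// 16-byte increments.
vec_st(vec_ld(0, (const u8 *)src), 0, (u8 *)dst);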
@@ -782,7 +782,7 @@ static void buffer_copy_or_constant_s8(void *__restrict dst, const void *__restr
 }

 template <size_t VECLENGTH>
-static void buffer_copy_or_constant_s8_fast(void *__restrict dst, void *__restrict src, const s8 c)
+static void buffer_copy_or_constant_s8_fast(void *__restrict dst, const void *__restrict src, const s8 c)
 {
 	const v128s8 c_vec = {c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
 	__buffer_copy_or_constant_fast<v128s8, VECLENGTH>(dst, src, c_vec);
@@ -797,7 +797,7 @@ static void buffer_copy_or_constant_s16(void *__restrict dst, const void *__rest
 }

 template <size_t VECLENGTH, bool NEEDENDIANSWAP>
-static void buffer_copy_or_constant_s16_fast(void *__restrict dst, void *__restrict src, const s16 c)
+static void buffer_copy_or_constant_s16_fast(void *__restrict dst, const void *__restrict src, const s16 c)
 {
 	const s16 c_16 = (NEEDENDIANSWAP) ? LE_TO_LOCAL_16(c) : c;
 	const v128s16 c_vec = {c_16, c_16, c_16, c_16, c_16, c_16, c_16, c_16};
@@ -813,7 +813,7 @@ static void buffer_copy_or_constant_s32(void *__restrict dst, const void *__rest
 }

 template <size_t VECLENGTH, bool NEEDENDIANSWAP>
-static void buffer_copy_or_constant_s32_fast(void *__restrict dst, void *__restrict src, const s32 c)
+static void buffer_copy_or_constant_s32_fast(void *__restrict dst, const void *__restrict src, const s32 c)
 {
 	const s32 c_32 = (NEEDENDIANSWAP) ? LE_TO_LOCAL_32(c) : c;
 	const v128s32 c_vec = {c_32, c_32, c_32, c_32};
@@ -889,13 +889,13 @@ static void memset_u32_fast(void *dst, const u32 val)
 // vector intrinsics to control the temporal/caching behavior.

 template <size_t VECLENGTH>
-static void stream_copy_fast(void *__restrict dst, void *__restrict src)
+static void stream_copy_fast(void *__restrict dst, const void *__restrict src)
 {
 	memcpy(dst, src, VECLENGTH);
 }

 template <size_t VECLENGTH>
-static void buffer_copy_fast(void *__restrict dst, void *__restrict src)
+static void buffer_copy_fast(void *__restrict dst, const void *__restrict src)
 {
 	memcpy(dst, src, VECLENGTH);
 }
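On this generic path, VECLENGTH is still a compile-time constant, so these fixed-size memcpy calls are typically inlined by the optimizer; only the non-temporal hint is lost, as the comment above notes. A hypothetical call, reusing the buffers from the earlier sketch:

// Usually compiles to an inlined, fixed-size copy; an expectation about
// optimizers, not a guarantee made by this code.
stream_copy_fast<256>(dstBuffer, srcBuffer);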
@@ -920,7 +920,7 @@ static void buffer_copy_or_constant_s8(void *__restrict dst, const void *__restr
 }

 template <size_t VECLENGTH>
-static void buffer_copy_or_constant_s8_fast(void *__restrict dst, void *__restrict src, const s8 c)
+static void buffer_copy_or_constant_s8_fast(void *__restrict dst, const void *__restrict src, const s8 c)
 {
 #ifdef HOST_64
 	s64 *src_64 = (s64 *)src;
@@ -980,7 +980,7 @@ static void buffer_copy_or_constant_s16(void *__restrict dst, const void *__rest
 }

 template <size_t VECLENGTH, bool NEEDENDIANSWAP>
-static void buffer_copy_or_constant_s16_fast(void *__restrict dst, void *__restrict src, const s16 c)
+static void buffer_copy_or_constant_s16_fast(void *__restrict dst, const void *__restrict src, const s16 c)
 {
 #ifdef HOST_64
 	s64 *src_64 = (s64 *)src;
@@ -1049,7 +1049,7 @@ static void buffer_copy_or_constant_s32(void *__restrict dst, const void *__rest
 }

 template <size_t VECLENGTH, bool NEEDENDIANSWAP>
-static void buffer_copy_or_constant_s32_fast(void *__restrict dst, void *__restrict src, const s32 c)
+static void buffer_copy_or_constant_s32_fast(void *__restrict dst, const void *__restrict src, const s32 c)
 {
 #ifdef HOST_64
 	s64 *src_64 = (s64 *)src;
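The context lines ending these hunks hint at the scalar HOST_64 path: the buffer is walked in 8-byte strides with the OR-constant broadcast across a 64-bit word. The bodies are cut off by the diff, so here is a hedged sketch of that pattern (u64/u16 stand in for the project's fixed-width typedefs):

// Hypothetical illustration only.
static void or_constant_s16_scalar(void *__restrict dst, const void *__restrict src, const s16 c, const size_t length)
{
	const u64 c_64 = (u64)(u16)c * 0x0001000100010001ULL; // splat c into all four 16-bit lanes
	const u64 *src_64 = (const u64 *)src;
	u64 *dst_64 = (u64 *)dst;
	for (size_t i = 0; i < length / sizeof(u64); i++)
		dst_64[i] = src_64[i] | c_64; // copy and OR in one pass
}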