GNU Radio 3.6.4.2 C++ API
|
#ifndef INCLUDED_volk_16i_permute_and_scalar_add_a_H
#define INCLUDED_volk_16i_permute_and_scalar_add_a_H

/*
 * Permute-and-scalar-add kernels for 16-bit integers.
 *
 * Every kernel in this header computes, for each i in [0, num_points):
 *
 *   target[i] = src0[permute_indexes[i]]
 *             + (cntl0[i] & scalars[0])
 *             + (cntl1[i] & scalars[1])
 *             + (cntl2[i] & scalars[2])
 *             + (cntl3[i] & scalars[3]);
 */

#include<inttypes.h>
#include<stdio.h>

#ifdef LV_HAVE_SSE2

#include<xmmintrin.h>
#include<emmintrin.h>

/*!
  \brief SSE2 implementation: gathers permuted samples and adds four masked scalars.
  \param target Output buffer, num_points elements. Written with aligned 128-bit
         stores (_mm_store_si128), so it must be 16-byte aligned.
  \param src0 Source samples, read at the positions given by permute_indexes.
  \param permute_indexes For each output element, the index into src0 to read.
  \param cntl0 Mask buffer ANDed with scalars[0]; read with aligned 128-bit loads.
  \param cntl1 Mask buffer ANDed with scalars[1]; read with aligned 128-bit loads.
  \param cntl2 Mask buffer ANDed with scalars[2]; read with aligned 128-bit loads.
  \param cntl3 Mask buffer ANDed with scalars[3]; read with aligned 128-bit loads.
  \param scalars The four scalar values; only scalars[0..3] are read.
  \param num_points Number of output elements to produce.

  Elements are processed eight at a time; the remaining (num_points % 8)
  elements are handled by a scalar loop.
*/
static inline void volk_16i_permute_and_scalar_add_a_sse2(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_points) {

  /* Derive the bounds from num_points directly: going through a byte count
     (num_points * 2) wraps around for very large point counts. */
  const unsigned int num_blocks = num_points >> 3;
  const unsigned int first_leftover = num_blocks << 3;
  unsigned int i;

  __m128i *p_target = (__m128i*)target;
  const __m128i *p_cntl0 = (const __m128i*)cntl0;
  const __m128i *p_cntl1 = (const __m128i*)cntl1;
  const __m128i *p_cntl2 = (const __m128i*)cntl2;
  const __m128i *p_cntl3 = (const __m128i*)cntl3;
  const short *p_permute_indexes = permute_indexes;

  /* Broadcast each scalar to all eight 16-bit lanes.  Reading the four values
     individually avoids a 16-byte load from `scalars`, which would touch
     eight shorts and require 16-byte alignment although only four are used. */
  const __m128i scalar0 = _mm_set1_epi16(scalars[0]);
  const __m128i scalar1 = _mm_set1_epi16(scalars[1]);
  const __m128i scalar2 = _mm_set1_epi16(scalars[2]);
  const __m128i scalar3 = _mm_set1_epi16(scalars[3]);

  for(i = 0; i < num_blocks; ++i) {
    /* Gather the eight permuted samples.  The inserts are spread over four
       independent registers (each holding two lanes, zero elsewhere) and then
       summed, which is equivalent to placing all eight lanes in one vector. */
    __m128i lanes04 = _mm_setzero_si128();
    __m128i lanes15 = _mm_setzero_si128();
    __m128i lanes26 = _mm_setzero_si128();
    __m128i lanes37 = _mm_setzero_si128();
    __m128i masked0, masked1, masked2, masked3, sum;

    lanes04 = _mm_insert_epi16(lanes04, src0[p_permute_indexes[0]], 0);
    lanes15 = _mm_insert_epi16(lanes15, src0[p_permute_indexes[1]], 1);
    lanes26 = _mm_insert_epi16(lanes26, src0[p_permute_indexes[2]], 2);
    lanes37 = _mm_insert_epi16(lanes37, src0[p_permute_indexes[3]], 3);
    lanes04 = _mm_insert_epi16(lanes04, src0[p_permute_indexes[4]], 4);
    lanes15 = _mm_insert_epi16(lanes15, src0[p_permute_indexes[5]], 5);
    lanes26 = _mm_insert_epi16(lanes26, src0[p_permute_indexes[6]], 6);
    lanes37 = _mm_insert_epi16(lanes37, src0[p_permute_indexes[7]], 7);

    sum = _mm_add_epi16(_mm_add_epi16(lanes04, lanes15),
                        _mm_add_epi16(lanes26, lanes37));

    /* Mask each control vector with its broadcast scalar. */
    masked0 = _mm_and_si128(_mm_load_si128(p_cntl0), scalar0);
    masked1 = _mm_and_si128(_mm_load_si128(p_cntl1), scalar1);
    masked2 = _mm_and_si128(_mm_load_si128(p_cntl2), scalar2);
    masked3 = _mm_and_si128(_mm_load_si128(p_cntl3), scalar3);

    /* 16-bit lane-wise adds wrap on overflow, so summation order is irrelevant. */
    sum = _mm_add_epi16(sum, masked0);
    sum = _mm_add_epi16(sum, _mm_add_epi16(masked1, masked2));
    sum = _mm_add_epi16(sum, masked3);

    _mm_store_si128(p_target, sum);

    p_permute_indexes += 8;
    p_cntl0 += 1;
    p_cntl1 += 1;
    p_cntl2 += 1;
    p_cntl3 += 1;
    p_target += 1;
  }

  /* Scalar tail for the elements that do not fill a whole 8-lane vector. */
  for(i = first_leftover; i < num_points; ++i) {
    target[i] = src0[permute_indexes[i]]
      + (cntl0[i] & scalars[0])
      + (cntl1[i] & scalars[1])
      + (cntl2[i] & scalars[2])
      + (cntl3[i] & scalars[3]);
  }
}
#endif /*LV_HAVE_SSE2*/


#ifdef LV_HAVE_GENERIC
/*!
  \brief Portable implementation: gathers permuted samples and adds four masked scalars.
  \param target Output buffer, num_points elements.
  \param src0 Source samples, read at the positions given by permute_indexes.
  \param permute_indexes For each output element, the index into src0 to read.
  \param cntl0 Mask buffer ANDed with scalars[0].
  \param cntl1 Mask buffer ANDed with scalars[1].
  \param cntl2 Mask buffer ANDed with scalars[2].
  \param cntl3 Mask buffer ANDed with scalars[3].
  \param scalars The four scalar values; only scalars[0..3] are read.
  \param num_points Number of output elements to produce.
*/
static inline void volk_16i_permute_and_scalar_add_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_points) {

  /* Iterate over num_points directly; a byte count (num_points * 2) would
     wrap around for very large point counts. */
  unsigned int i;

  for(i = 0; i < num_points; ++i) {
    target[i] = src0[permute_indexes[i]]
      + (cntl0[i] & scalars[0])
      + (cntl1[i] & scalars[1])
      + (cntl2[i] & scalars[2])
      + (cntl3[i] & scalars[3]);
  }
}

#endif /*LV_HAVE_GENERIC*/


#endif /*INCLUDED_volk_16i_permute_and_scalar_add_a_H*/