GNU Radio 3.6.4.2 C++ API
volk_16i_permute_and_scalar_add.h
Go to the documentation of this file.
00001 #ifndef INCLUDED_volk_16i_permute_and_scalar_add_a_H
00002 #define INCLUDED_volk_16i_permute_and_scalar_add_a_H
00003 
00004 
00005 #include<inttypes.h>
00006 #include<stdio.h>
00007 
00008 
00009 
00010 
00011 #ifdef LV_HAVE_SSE2
00012 
00013 #include<xmmintrin.h>
00014 #include<emmintrin.h>
00015 
/*
 * Aligned SSE2 kernel.  For each i in [0, num_points) stores
 *
 *   target[i] = src0[permute_indexes[i]]
 *             + (cntl0[i] & scalars[0])
 *             + (cntl1[i] & scalars[1])
 *             + (cntl2[i] & scalars[2])
 *             + (cntl3[i] & scalars[3])
 *
 * Full groups of eight points go through 128-bit SSE2 operations; the
 * remaining (num_points % 8) points use the scalar expression above.
 *
 * target, cntl0, cntl1, cntl2 and cntl3 are accessed with aligned 128-bit
 * loads/stores and therefore must be 16-byte aligned.  src0,
 * permute_indexes and scalars are only read element-wise, so they carry no
 * alignment requirement; only scalars[0..3] are read.
 * Every permute_indexes[i] must be a valid index into src0 (not checked).
 */
static inline void volk_16i_permute_and_scalar_add_a_sse2(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_points) {

  /* Eight 16-bit points fit in one 128-bit register.  The count is derived
     straight from num_points so it cannot wrap the way num_points*2 can. */
  const unsigned int num_blocks = num_points >> 3;
  unsigned int blk;
  unsigned int i;

  const short* idx = permute_indexes;

  __m128i* out = (__m128i*)target;
  const __m128i* c0 = (const __m128i*)cntl0;
  const __m128i* c1 = (const __m128i*)cntl1;
  const __m128i* c2 = (const __m128i*)cntl2;
  const __m128i* c3 = (const __m128i*)cntl3;

  /* Broadcast each scalar to all eight lanes.  Reading the four shorts
     individually avoids a 16-byte vector load from scalars, which would
     touch eight elements and demand 16-byte alignment. */
  const __m128i mask0 = _mm_set1_epi16(scalars[0]);
  const __m128i mask1 = _mm_set1_epi16(scalars[1]);
  const __m128i mask2 = _mm_set1_epi16(scalars[2]);
  const __m128i mask3 = _mm_set1_epi16(scalars[3]);

  for(blk = 0; blk < num_blocks; ++blk) {
    /* Gather the eight permuted source samples; _mm_set_epi16 takes its
       arguments from the highest lane down to lane 0. */
    const __m128i gathered = _mm_set_epi16(src0[idx[7]], src0[idx[6]],
                                           src0[idx[5]], src0[idx[4]],
                                           src0[idx[3]], src0[idx[2]],
                                           src0[idx[1]], src0[idx[0]]);

    const __m128i t0 = _mm_and_si128(_mm_load_si128(c0 + blk), mask0);
    const __m128i t1 = _mm_and_si128(_mm_load_si128(c1 + blk), mask1);
    const __m128i t2 = _mm_and_si128(_mm_load_si128(c2 + blk), mask2);
    const __m128i t3 = _mm_and_si128(_mm_load_si128(c3 + blk), mask3);

    /* 16-bit lane-wise additions (wrap on overflow). */
    __m128i sum = _mm_add_epi16(gathered, t0);
    sum = _mm_add_epi16(sum, _mm_add_epi16(t1, t2));
    sum = _mm_add_epi16(sum, t3);

    _mm_store_si128(out + blk, sum);

    idx += 8;
  }

  /* Scalar tail for the points that do not fill a whole group of eight. */
  for(i = num_blocks * 8; i < num_points; ++i) {
    target[i] = src0[permute_indexes[i]]
      + (cntl0[i] & scalars[0])
      + (cntl1[i] & scalars[1])
      + (cntl2[i] & scalars[2])
      + (cntl3[i] & scalars[3]);
  }
}
00117 #endif /*LV_HAVE_SSE2*/
00118 
00119 
00120 #ifdef LV_HAVE_GENERIC
/*
 * Portable reference kernel.  For each i in [0, num_points) stores
 *
 *   target[i] = src0[permute_indexes[i]]
 *             + (cntl0[i] & scalars[0])
 *             + (cntl1[i] & scalars[1])
 *             + (cntl2[i] & scalars[2])
 *             + (cntl3[i] & scalars[3])
 *
 * The sum is formed in int and converted back to short on assignment.
 * Only scalars[0..3] are read.  Every permute_indexes[i] must be a valid
 * index into src0 (not checked).  No alignment is required.
 */
static inline void volk_16i_permute_and_scalar_add_generic(short* target, short* src0, short* permute_indexes, short* cntl0, short* cntl1, short* cntl2, short* cntl3, short* scalars, unsigned int num_points) {

        /* Iterate on num_points directly with an unsigned index: going
           through a byte count (num_points*2) and a signed int bound
           loses points once num_points exceeds INT_MAX. */
        unsigned int i;

        for(i = 0; i < num_points; ++i) {
                target[i] = src0[permute_indexes[i]]
                        + (cntl0[i] & scalars[0])
                        + (cntl1[i] & scalars[1])
                        + (cntl2[i] & scalars[2])
                        + (cntl3[i] & scalars[3]);
        }
}
00138 
00139 #endif /*LV_HAVE_GENERIC*/
00140 
00141 
00142 #endif /*INCLUDED_volk_16i_permute_and_scalar_add_a_H*/