GNU Radio 3.6.4.2 C++ API
volk_16i_x4_quad_max_star_16i.h
Go to the documentation of this file.
00001 #ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
00002 #define INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
00003 
00004 
00005 #include<inttypes.h>
00006 #include<stdio.h>
00007 
00008 
00009 
00010 
00011 
00012 #ifdef LV_HAVE_SSE2
00013 
00014 #include<emmintrin.h>
00015 
/*!
 * \brief Lane-wise wrapping "greater than" select over eight 16-bit values.
 *
 * Returns \p a in every lane where the wrapping 16-bit difference (a - b) is
 * strictly positive, and \p b in every other lane. This is the same rule the
 * scalar code applies with ((short)(a - b) > 0) ? a : b.
 */
static inline __m128i volk_16i_x4_quad_max_star_16i_select_sse2(__m128i a, __m128i b) {
    /* All-ones in lanes where (a - b) > 0, all-zeros elsewhere. */
    const __m128i take_a = _mm_cmpgt_epi16(_mm_sub_epi16(a, b), _mm_setzero_si128());

    /* Blend: (take_a & a) | (~take_a & b). */
    return _mm_or_si128(_mm_and_si128(take_a, a), _mm_andnot_si128(take_a, b));
}

/*!
 * \brief For each index, selects one value out of src0..src3 using a wrapping
 *        16-bit comparison and writes it to target.
 *
 * Per element i:
 *   temp0     = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]
 *   temp1     = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i]
 *   target[i] = ((short)(temp0 - temp1)     > 0) ? temp0   : temp1
 *
 * Full groups of eight elements are handled with SSE2; the remaining
 * (num_points % 8) elements are handled by the scalar loop. Both paths apply
 * exactly the same selection rule, so the result for a given set of inputs
 * does not depend on where in the buffers it sits.
 *
 * \param target      output buffer, num_points elements
 * \param src0        first input buffer, num_points elements
 * \param src1        second input buffer, num_points elements
 * \param src2        third input buffer, num_points elements
 * \param src3        fourth input buffer, num_points elements
 * \param num_points  number of 16-bit elements to process
 *
 * \note When num_points >= 8 all five buffers are accessed with
 *       _mm_load_si128 / _mm_store_si128, which require 16-byte aligned
 *       addresses.
 */
static inline  void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_points) {

    /* Number of complete 8-element (128-bit) groups. Derived straight from
       num_points so that no intermediate byte count can wrap. */
    const unsigned int num_vectors = num_points / 8;

    __m128i*       p_target = (__m128i*)target;
    const __m128i* p_src0   = (const __m128i*)src0;
    const __m128i* p_src1   = (const __m128i*)src1;
    const __m128i* p_src2   = (const __m128i*)src2;
    const __m128i* p_src3   = (const __m128i*)src3;

    for(unsigned int n = 0; n < num_vectors; ++n) {
        /* All four inputs are loaded before the store, so an output buffer
           that coincides with an input buffer is still read first. */
        const __m128i v0 = _mm_load_si128(p_src0 + n);
        const __m128i v1 = _mm_load_si128(p_src1 + n);
        const __m128i v2 = _mm_load_si128(p_src2 + n);
        const __m128i v3 = _mm_load_si128(p_src3 + n);

        const __m128i best01 = volk_16i_x4_quad_max_star_16i_select_sse2(v0, v1);
        const __m128i best23 = volk_16i_x4_quad_max_star_16i_select_sse2(v2, v3);

        _mm_store_si128(p_target + n,
                        volk_16i_x4_quad_max_star_16i_select_sse2(best01, best23));
    }

    /* Scalar tail for the last (num_points % 8) elements. The subtraction is
       done in int and narrowed to short; on the usual two's-complement
       targets that narrowing wraps, matching _mm_sub_epi16 above. */
    for(unsigned int i = num_vectors * 8; i < num_points; ++i) {
        const short temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
        const short temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
        target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
    }
}
00165 
00166 #endif /*LV_HAVE_SSE2*/
00167 
00168 
00169 #ifdef LV_HAVE_GENERIC
/*!
 * \brief Portable implementation: for each index, selects one value out of
 *        src0..src3 using a wrapping 16-bit comparison and writes it to target.
 *
 * Per element i:
 *   temp0     = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i]
 *   temp1     = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i]
 *   target[i] = ((short)(temp0 - temp1)     > 0) ? temp0   : temp1
 *
 * \param target      output buffer, num_points elements
 * \param src0        first input buffer, num_points elements
 * \param src1        second input buffer, num_points elements
 * \param src2        third input buffer, num_points elements
 * \param src3        fourth input buffer, num_points elements
 * \param num_points  number of 16-bit elements to process
 */
static inline void volk_16i_x4_quad_max_star_16i_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_points) {

    /* Iterate directly over num_points with an unsigned index: going through
       a byte count (num_points * 2) and a signed int bound would wrap for
       very large counts. */
    for(unsigned int i = 0; i < num_points; ++i) {
        /* The difference is computed in int and narrowed to short, so the
           sign test is made on the 16-bit result (wrapping on the usual
           two's-complement targets). */
        const short temp0 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
        const short temp1 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
        target[i] = ((short)(temp0 - temp1) > 0) ? temp0 : temp1;
    }
}
00186 
00187 
00188 
00189 
00190 #endif /*LV_HAVE_GENERIC*/
00191 
00192 #endif /*INCLUDED_volk_16i_x4_quad_max_star_16i_a_H*/