GNU Radio 3.6.4.2 C++ API
|
#ifndef INCLUDED_volk_16i_x4_quad_max_star_16i_a_H
#define INCLUDED_volk_16i_x4_quad_max_star_16i_a_H

#include<inttypes.h>
#include<stdio.h>

#ifdef LV_HAVE_SSE2

#include<emmintrin.h>

/*!
  \brief Element-wise "max star" of four 16-bit vectors (SSE2 version).

  For every index i the larger of (src0[i], src1[i]) and the larger of
  (src2[i], src3[i]) are selected, and the larger of those two winners is
  written to target[i]. "Larger" is decided from the sign of the 16-bit
  difference of the two candidates, not from a direct comparison.

  The first (num_points / 8) * 8 elements are processed eight at a time with
  _mm_load_si128 / _mm_store_si128, which require 16-byte aligned addresses;
  the remaining elements are processed one at a time.

  \note The vector body selects the second operand when (second - first) > 0,
  while the scalar tail selects the first operand when (first - second) > 0.
  Both pick the same value except when the 16-bit difference wraps to -32768
  (the scalar narrowing to short is implementation-defined in that case).

  \param target     output buffer, at least num_points elements
  \param src0       first input buffer
  \param src1       second input buffer
  \param src2       third input buffer
  \param src3       fourth input buffer
  \param num_points number of 16-bit elements to process
*/
static inline void volk_16i_x4_quad_max_star_16i_a_sse2(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_points) {

  /* Eight 16-bit lanes fit in one 128-bit register. Counting in elements
     (not bytes) avoids the unsigned wrap of num_points*2 for huge inputs. */
  const unsigned int num_vectors = num_points / 8;
  const unsigned int tail_start = num_vectors * 8;

  __m128i* p_target = (__m128i*)target;
  const __m128i* p_src0 = (const __m128i*)src0;
  const __m128i* p_src1 = (const __m128i*)src1;
  const __m128i* p_src2 = (const __m128i*)src2;
  const __m128i* p_src3 = (const __m128i*)src3;

  const __m128i zero = _mm_setzero_si128();

  unsigned int n = 0;
  unsigned int i = 0;

  for(n = 0; n < num_vectors; ++n) {
    /* All four inputs are loaded before the store, as in a plain
       read-then-write element loop. */
    const __m128i a = _mm_load_si128(p_src0 + n);
    const __m128i b = _mm_load_si128(p_src1 + n);
    const __m128i c = _mm_load_si128(p_src2 + n);
    const __m128i d = _mm_load_si128(p_src3 + n);

    /* Lane mask is all-ones where the wrapped difference is positive. */
    const __m128i take_b = _mm_cmpgt_epi16(_mm_sub_epi16(b, a), zero);
    const __m128i take_d = _mm_cmpgt_epi16(_mm_sub_epi16(d, c), zero);

    /* Blend: the two masked halves never overlap, so add acts as OR. */
    const __m128i max_ab = _mm_add_epi16(_mm_and_si128(take_b, b),
                                         _mm_andnot_si128(take_b, a));
    const __m128i max_cd = _mm_add_epi16(_mm_and_si128(take_d, d),
                                         _mm_andnot_si128(take_d, c));

    /* Second round: winner of (a,b) against winner of (c,d). */
    const __m128i take_cd = _mm_cmpgt_epi16(_mm_sub_epi16(max_cd, max_ab), zero);

    _mm_store_si128(p_target + n,
                    _mm_add_epi16(_mm_and_si128(take_cd, max_cd),
                                  _mm_andnot_si128(take_cd, max_ab)));
  }

  /* Scalar tail for the (num_points % 8) elements left over. */
  for(i = tail_start; i < num_points; ++i) {
    const short max01 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
    const short max23 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
    target[i] = ((short)(max01 - max23) > 0) ? max01 : max23;
  }
}

#endif /*LV_HAVE_SSE2*/


#ifdef LV_HAVE_GENERIC
/*!
  \brief Element-wise "max star" of four 16-bit vectors (portable version).

  For every index i, picks src0[i] over src1[i] when the difference
  (src0[i] - src1[i]), narrowed to short, is positive (otherwise src1[i]);
  does the same for src2[i] / src3[i]; then applies the same rule to the two
  winners and stores the result in target[i].

  \param target     output buffer, at least num_points elements
  \param src0       first input buffer
  \param src1       second input buffer
  \param src2       third input buffer
  \param src3       fourth input buffer
  \param num_points number of 16-bit elements to process
*/
static inline void volk_16i_x4_quad_max_star_16i_generic(short* target, short* src0, short* src1, short* src2, short* src3, unsigned int num_points) {

  unsigned int i = 0;

  /* Iterate over the element count directly; an unsigned index matches the
     type of num_points and cannot truncate large counts. */
  for(i = 0; i < num_points; ++i) {
    const short max01 = ((short)(src0[i] - src1[i]) > 0) ? src0[i] : src1[i];
    const short max23 = ((short)(src2[i] - src3[i]) > 0) ? src2[i] : src3[i];
    target[i] = ((short)(max01 - max23) > 0) ? max01 : max23;
  }
}

#endif /*LV_HAVE_GENERIC*/

#endif /*INCLUDED_volk_16i_x4_quad_max_star_16i_a_H*/