/* GNU Radio 3.6.4.2 C++ API */
#ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
#define INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H

#include<inttypes.h>
#include<stdio.h>

#ifdef LV_HAVE_SSE2
#include<xmmintrin.h>
#include<emmintrin.h>

/**
 * Adds src0 element-wise to each of src1..src4 (SSE2 implementation).
 *
 * For every i in [0, num_points):
 *     target0[i] = src0[i] + src1[i]
 *     target1[i] = src0[i] + src2[i]
 *     target2[i] = src0[i] + src3[i]
 *     target3[i] = src0[i] + src4[i]
 *
 * Elements are processed eight at a time with aligned 128-bit loads and
 * stores (_mm_load_si128 / _mm_store_si128), so all nine buffers must be
 * 16-byte aligned. The remaining num_points % 8 elements are handled by a
 * scalar loop.
 *
 * The vector path wraps on 16-bit overflow (_mm_add_epi16 does not
 * saturate); the scalar tail narrows the int sum back to short.
 *
 * @param target0    Output: src0 + src1, at least num_points elements.
 * @param target1    Output: src0 + src2, at least num_points elements.
 * @param target2    Output: src0 + src3, at least num_points elements.
 * @param target3    Output: src0 + src4, at least num_points elements.
 * @param src0       Common addend, at least num_points elements.
 * @param src1       Addend for target0.
 * @param src2       Addend for target1.
 * @param src3       Addend for target2.
 * @param src4       Addend for target3.
 * @param num_points Number of 16-bit elements to process; 0 is a no-op.
 */
static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_points) {
    /* Eight 16-bit lanes fit in one 128-bit register. The iteration count is
     * derived straight from num_points: going through a byte count
     * (num_points * 2) would wrap for num_points >= 2^31. */
    const unsigned int vec_iters = num_points / 8;

    __m128i *p_target0 = (__m128i*)target0;
    __m128i *p_target1 = (__m128i*)target1;
    __m128i *p_target2 = (__m128i*)target2;
    __m128i *p_target3 = (__m128i*)target3;

    const __m128i *p_src0 = (const __m128i*)src0;
    const __m128i *p_src1 = (const __m128i*)src1;
    const __m128i *p_src2 = (const __m128i*)src2;
    const __m128i *p_src3 = (const __m128i*)src3;
    const __m128i *p_src4 = (const __m128i*)src4;

    unsigned int i;

    for (i = 0; i < vec_iters; ++i) {
        /* Load every operand of this 8-element group before storing any
         * result, so a target that aliases a source still sees the
         * original input values within the group. */
        const __m128i base = _mm_load_si128(p_src0 + i);
        const __m128i add1 = _mm_load_si128(p_src1 + i);
        const __m128i add2 = _mm_load_si128(p_src2 + i);
        const __m128i add3 = _mm_load_si128(p_src3 + i);
        const __m128i add4 = _mm_load_si128(p_src4 + i);

        _mm_store_si128(p_target0 + i, _mm_add_epi16(base, add1));
        _mm_store_si128(p_target1 + i, _mm_add_epi16(base, add2));
        _mm_store_si128(p_target2 + i, _mm_add_epi16(base, add3));
        _mm_store_si128(p_target3 + i, _mm_add_epi16(base, add4));
    }

    /* Scalar tail for the num_points % 8 elements the vector loop skipped. */
    for (i = vec_iters * 8; i < num_points; ++i) {
        target0[i] = (short)(src0[i] + src1[i]);
        target1[i] = (short)(src0[i] + src2[i]);
        target2[i] = (short)(src0[i] + src3[i]);
        target3[i] = (short)(src0[i] + src4[i]);
    }
}
#endif /*LV_HAVE_SSE2*/


#ifdef LV_HAVE_GENERIC

/**
 * Adds src0 element-wise to each of src1..src4 (portable implementation).
 *
 * For every i in [0, num_points):
 *     target0[i] = src0[i] + src1[i]
 *     target1[i] = src0[i] + src2[i]
 *     target2[i] = src0[i] + src3[i]
 *     target3[i] = src0[i] + src4[i]
 *
 * Each sum is computed in int and narrowed back to short. The four stores
 * for index i happen in the order listed above, and src0[i] is re-read for
 * each of them.
 *
 * @param target0    Output: src0 + src1, at least num_points elements.
 * @param target1    Output: src0 + src2, at least num_points elements.
 * @param target2    Output: src0 + src3, at least num_points elements.
 * @param target3    Output: src0 + src4, at least num_points elements.
 * @param src0       Common addend, at least num_points elements.
 * @param src1       Addend for target0.
 * @param src2       Addend for target1.
 * @param src3       Addend for target2.
 * @param src4       Addend for target3.
 * @param num_points Number of 16-bit elements to process; 0 is a no-op.
 */
static inline void volk_16i_x5_add_quad_16i_x4_generic(short* target0, short* target1, short* target2, short* target3, short* src0, short* src1, short* src2, short* src3, short* src4, unsigned int num_points) {
    unsigned int i;

    /* Iterate over num_points directly; a num_points * 2 byte count would
     * wrap for num_points >= 2^31 and silently shorten the loop. */
    for (i = 0; i < num_points; ++i) {
        target0[i] = (short)(src0[i] + src1[i]);
        target1[i] = (short)(src0[i] + src2[i]);
        target2[i] = (short)(src0[i] + src3[i]);
        target3[i] = (short)(src0[i] + src4[i]);
    }
}

#endif /* LV_HAVE_GENERIC */

#endif /*INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H*/