GNU Radio 3.6.4.2 C++ API
|
00001 #ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H 00002 #define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H 00003 00004 #include<inttypes.h> 00005 #include<stdio.h> 00006 #include<volk/volk_complex.h> 00007 #include <string.h> 00008 00009 #ifdef LV_HAVE_SSE3 00010 #include<xmmintrin.h> 00011 #include<pmmintrin.h> 00012 00013 static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points) { 00014 00015 const unsigned int num_bytes = num_points*8; 00016 00017 __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8; 00018 00019 lv_32fc_t diff; 00020 memset(&diff, 0x0, 2*sizeof(float)); 00021 00022 float sq_dist = 0.0; 00023 int bound = num_bytes >> 5; 00024 int leftovers0 = (num_bytes >> 4) & 1; 00025 int leftovers1 = (num_bytes >> 3) & 1; 00026 int i = 0; 00027 00028 00029 00030 xmm1 = _mm_setzero_ps(); 00031 xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); 00032 xmm2 = _mm_load_ps((float*)&points[0]); 00033 xmm8 = _mm_load1_ps(&scalar); 00034 xmm1 = _mm_movelh_ps(xmm1, xmm1); 00035 xmm3 = _mm_load_ps((float*)&points[2]); 00036 00037 00038 for(; i < bound - 1; ++i) { 00039 00040 xmm4 = _mm_sub_ps(xmm1, xmm2); 00041 xmm5 = _mm_sub_ps(xmm1, xmm3); 00042 points += 4; 00043 xmm6 = _mm_mul_ps(xmm4, xmm4); 00044 xmm7 = _mm_mul_ps(xmm5, xmm5); 00045 00046 xmm2 = _mm_load_ps((float*)&points[0]); 00047 00048 xmm4 = _mm_hadd_ps(xmm6, xmm7); 00049 00050 xmm3 = _mm_load_ps((float*)&points[2]); 00051 00052 xmm4 = _mm_mul_ps(xmm4, xmm8); 00053 00054 _mm_store_ps(target, xmm4); 00055 00056 target += 4; 00057 00058 } 00059 00060 xmm4 = _mm_sub_ps(xmm1, xmm2); 00061 xmm5 = _mm_sub_ps(xmm1, xmm3); 00062 00063 00064 00065 points += 4; 00066 xmm6 = _mm_mul_ps(xmm4, xmm4); 00067 xmm7 = _mm_mul_ps(xmm5, xmm5); 00068 00069 xmm4 = _mm_hadd_ps(xmm6, xmm7); 00070 00071 xmm4 = _mm_mul_ps(xmm4, xmm8); 00072 00073 _mm_store_ps(target, xmm4); 00074 00075 target += 4; 00076 00077 00078 for(i = 0; i < leftovers0; ++i) { 00079 00080 xmm2 = _mm_load_ps((float*)&points[0]); 00081 00082 xmm4 = _mm_sub_ps(xmm1, xmm2); 00083 00084 points += 2; 00085 00086 xmm6 = _mm_mul_ps(xmm4, xmm4); 00087 00088 xmm4 = _mm_hadd_ps(xmm6, xmm6); 00089 00090 xmm4 = _mm_mul_ps(xmm4, xmm8); 00091 00092 _mm_storeh_pi((__m64*)target, xmm4); 00093 00094 target += 2; 00095 } 00096 00097 for(i = 0; i < leftovers1; ++i) { 00098 00099 diff = src0[0] - points[0]; 00100 00101 sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); 00102 00103 target[0] = sq_dist; 00104 } 00105 } 00106 00107 #endif /*LV_HAVE_SSE3*/ 00108 00109 #ifdef LV_HAVE_GENERIC 00110 static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points) { 00111 00112 const unsigned int num_bytes = num_points*8; 00113 00114 lv_32fc_t diff; 00115 float sq_dist; 00116 unsigned int i = 0; 00117 00118 for(; i < num_bytes >> 3; ++i) { 00119 diff = src0[0] - points[i]; 00120 00121 sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff)); 00122 00123 target[i] = sq_dist; 00124 } 00125 } 00126 00127 #endif /*LV_HAVE_GENERIC*/ 00128 00129 00130 #endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H*/