GNU Radio 3.6.4.2 C++ API
|
00001 #ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H 00002 #define INCLUDED_volk_32fc_x2_square_dist_32f_a_H 00003 00004 #include<inttypes.h> 00005 #include<stdio.h> 00006 #include<volk/volk_complex.h> 00007 00008 #ifdef LV_HAVE_SSE3 00009 #include<xmmintrin.h> 00010 #include<pmmintrin.h> 00011 00012 static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) { 00013 00014 const unsigned int num_bytes = num_points*8; 00015 00016 __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; 00017 00018 lv_32fc_t diff; 00019 float sq_dist; 00020 int bound = num_bytes >> 5; 00021 int leftovers0 = (num_bytes >> 4) & 1; 00022 int leftovers1 = (num_bytes >> 3) & 1; 00023 int i = 0; 00024 00025 xmm1 = _mm_setzero_ps(); 00026 xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0); 00027 xmm2 = _mm_load_ps((float*)&points[0]); 00028 xmm1 = _mm_movelh_ps(xmm1, xmm1); 00029 xmm3 = _mm_load_ps((float*)&points[2]); 00030 00031 00032 for(; i < bound - 1; ++i) { 00033 xmm4 = _mm_sub_ps(xmm1, xmm2); 00034 xmm5 = _mm_sub_ps(xmm1, xmm3); 00035 points += 4; 00036 xmm6 = _mm_mul_ps(xmm4, xmm4); 00037 xmm7 = _mm_mul_ps(xmm5, xmm5); 00038 00039 xmm2 = _mm_load_ps((float*)&points[0]); 00040 00041 xmm4 = _mm_hadd_ps(xmm6, xmm7); 00042 00043 xmm3 = _mm_load_ps((float*)&points[2]); 00044 00045 _mm_store_ps(target, xmm4); 00046 00047 target += 4; 00048 00049 } 00050 00051 xmm4 = _mm_sub_ps(xmm1, xmm2); 00052 xmm5 = _mm_sub_ps(xmm1, xmm3); 00053 00054 00055 00056 points += 4; 00057 xmm6 = _mm_mul_ps(xmm4, xmm4); 00058 xmm7 = _mm_mul_ps(xmm5, xmm5); 00059 00060 xmm4 = _mm_hadd_ps(xmm6, xmm7); 00061 00062 _mm_store_ps(target, xmm4); 00063 00064 target += 4; 00065 00066 for(i = 0; i < leftovers0; ++i) { 00067 00068 xmm2 = _mm_load_ps((float*)&points[0]); 00069 00070 xmm4 = _mm_sub_ps(xmm1, xmm2); 00071 00072 points += 2; 00073 00074 xmm6 = _mm_mul_ps(xmm4, xmm4); 00075 00076 xmm4 = _mm_hadd_ps(xmm6, xmm6); 00077 00078 _mm_storeh_pi((__m64*)target, xmm4); 00079 00080 target += 2; 00081 } 00082 00083 for(i = 0; i < leftovers1; ++i) { 00084 00085 diff = src0[0] - points[0]; 00086 00087 sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); 00088 00089 target[0] = sq_dist; 00090 } 00091 } 00092 00093 #endif /*LV_HAVE_SSE3*/ 00094 00095 #ifdef LV_HAVE_GENERIC 00096 static inline void volk_32fc_x2_square_dist_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) { 00097 00098 const unsigned int num_bytes = num_points*8; 00099 00100 lv_32fc_t diff; 00101 float sq_dist; 00102 unsigned int i = 0; 00103 00104 for(; i < num_bytes >> 3; ++i) { 00105 diff = src0[0] - points[i]; 00106 00107 sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff); 00108 00109 target[i] = sq_dist; 00110 } 00111 } 00112 00113 #endif /*LV_HAVE_GENERIC*/ 00114 00115 00116 #endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a_H*/