GNU Radio 3.6.4.2 C++ API
volk_32fc_x2_square_dist_32f.h
Go to the documentation of this file.
00001 #ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H
00002 #define INCLUDED_volk_32fc_x2_square_dist_32f_a_H
00003 
00004 #include<inttypes.h>
00005 #include<stdio.h>
00006 #include<volk/volk_complex.h>
00007 
00008 #ifdef LV_HAVE_SSE3
00009 #include<xmmintrin.h>
00010 #include<pmmintrin.h>
00011 
00012 static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) {
00013 
00014   const unsigned int num_bytes = num_points*8;
00015 
00016   __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
00017 
00018   lv_32fc_t diff;
00019   float sq_dist;
00020   int bound = num_bytes >> 5;
00021   int leftovers0 = (num_bytes >> 4) & 1;
00022   int leftovers1 = (num_bytes >> 3) & 1;
00023   int i = 0;
00024 
00025   xmm1 = _mm_setzero_ps();
00026   xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
00027   xmm2 = _mm_load_ps((float*)&points[0]);
00028   xmm1 = _mm_movelh_ps(xmm1, xmm1);
00029   xmm3 = _mm_load_ps((float*)&points[2]);
00030 
00031 
00032   for(; i < bound - 1; ++i) {
00033     xmm4 = _mm_sub_ps(xmm1, xmm2);
00034     xmm5 = _mm_sub_ps(xmm1, xmm3);
00035     points += 4;
00036     xmm6 = _mm_mul_ps(xmm4, xmm4);
00037     xmm7 = _mm_mul_ps(xmm5, xmm5);
00038 
00039     xmm2 = _mm_load_ps((float*)&points[0]);
00040 
00041     xmm4 = _mm_hadd_ps(xmm6, xmm7);
00042 
00043     xmm3 = _mm_load_ps((float*)&points[2]);
00044 
00045     _mm_store_ps(target, xmm4);
00046 
00047     target += 4;
00048 
00049   }
00050 
00051   xmm4 = _mm_sub_ps(xmm1, xmm2);
00052   xmm5 = _mm_sub_ps(xmm1, xmm3);
00053 
00054 
00055 
00056   points += 4;
00057   xmm6 = _mm_mul_ps(xmm4, xmm4);
00058   xmm7 = _mm_mul_ps(xmm5, xmm5);
00059 
00060   xmm4 = _mm_hadd_ps(xmm6, xmm7);
00061 
00062   _mm_store_ps(target, xmm4);
00063 
00064   target += 4;
00065 
00066   for(i = 0; i < leftovers0; ++i) {
00067 
00068     xmm2 = _mm_load_ps((float*)&points[0]);
00069 
00070     xmm4 = _mm_sub_ps(xmm1, xmm2);
00071 
00072     points += 2;
00073 
00074     xmm6 = _mm_mul_ps(xmm4, xmm4);
00075 
00076     xmm4 = _mm_hadd_ps(xmm6, xmm6);
00077 
00078     _mm_storeh_pi((__m64*)target, xmm4);
00079 
00080     target += 2;
00081   }
00082 
00083   for(i = 0; i < leftovers1; ++i) {
00084 
00085     diff = src0[0] - points[0];
00086 
00087     sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
00088 
00089     target[0] = sq_dist;
00090   }
00091 }
00092 
00093 #endif /*LV_HAVE_SSE3*/
00094 
00095 #ifdef LV_HAVE_GENERIC
00096 static inline void volk_32fc_x2_square_dist_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, unsigned int num_points) {
00097 
00098   const unsigned int num_bytes = num_points*8;
00099 
00100   lv_32fc_t diff;
00101   float sq_dist;
00102   unsigned int i = 0;
00103 
00104   for(; i < num_bytes >> 3; ++i) {
00105     diff = src0[0] - points[i];
00106 
00107     sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
00108 
00109     target[i] = sq_dist;
00110   }
00111 }
00112 
00113 #endif /*LV_HAVE_GENERIC*/
00114 
00115 
00116 #endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a_H*/