GNU Radio 3.6.4.2 C++ API
volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
Go to the documentation of this file.
00001 #ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
00002 #define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
00003 
00004 #include<inttypes.h>
00005 #include<stdio.h>
00006 #include<volk/volk_complex.h>
00007 #include <string.h>
00008 
00009 #ifdef LV_HAVE_SSE3
00010 #include<xmmintrin.h>
00011 #include<pmmintrin.h>
00012 
00013 static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points) {
00014 
00015   const unsigned int num_bytes = num_points*8;
00016 
00017   __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
00018 
00019   lv_32fc_t diff;
00020   memset(&diff, 0x0, 2*sizeof(float));
00021 
00022   float sq_dist = 0.0;
00023   int bound = num_bytes >> 5;
00024   int leftovers0 = (num_bytes >> 4) & 1;
00025   int leftovers1 = (num_bytes >> 3) & 1;
00026   int i = 0;
00027 
00028 
00029 
00030   xmm1 = _mm_setzero_ps();
00031   xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
00032   xmm2 = _mm_load_ps((float*)&points[0]);
00033   xmm8 = _mm_load1_ps(&scalar);
00034   xmm1 = _mm_movelh_ps(xmm1, xmm1);
00035   xmm3 = _mm_load_ps((float*)&points[2]);
00036 
00037 
00038   for(; i < bound - 1; ++i) {
00039 
00040     xmm4 = _mm_sub_ps(xmm1, xmm2);
00041     xmm5 = _mm_sub_ps(xmm1, xmm3);
00042     points += 4;
00043     xmm6 = _mm_mul_ps(xmm4, xmm4);
00044     xmm7 = _mm_mul_ps(xmm5, xmm5);
00045 
00046     xmm2 = _mm_load_ps((float*)&points[0]);
00047 
00048     xmm4 = _mm_hadd_ps(xmm6, xmm7);
00049 
00050     xmm3 = _mm_load_ps((float*)&points[2]);
00051 
00052     xmm4 = _mm_mul_ps(xmm4, xmm8);
00053 
00054     _mm_store_ps(target, xmm4);
00055 
00056     target += 4;
00057 
00058   }
00059 
00060   xmm4 = _mm_sub_ps(xmm1, xmm2);
00061   xmm5 = _mm_sub_ps(xmm1, xmm3);
00062 
00063 
00064 
00065   points += 4;
00066   xmm6 = _mm_mul_ps(xmm4, xmm4);
00067   xmm7 = _mm_mul_ps(xmm5, xmm5);
00068 
00069   xmm4 = _mm_hadd_ps(xmm6, xmm7);
00070 
00071   xmm4 = _mm_mul_ps(xmm4, xmm8);
00072 
00073   _mm_store_ps(target, xmm4);
00074 
00075   target += 4;
00076 
00077 
00078   for(i = 0; i < leftovers0; ++i) {
00079 
00080     xmm2 = _mm_load_ps((float*)&points[0]);
00081 
00082     xmm4 = _mm_sub_ps(xmm1, xmm2);
00083 
00084     points += 2;
00085 
00086     xmm6 = _mm_mul_ps(xmm4, xmm4);
00087 
00088     xmm4 = _mm_hadd_ps(xmm6, xmm6);
00089 
00090     xmm4 = _mm_mul_ps(xmm4, xmm8);
00091 
00092     _mm_storeh_pi((__m64*)target, xmm4);
00093 
00094     target += 2;
00095   }
00096 
00097   for(i = 0; i < leftovers1; ++i) {
00098 
00099     diff = src0[0] - points[0];
00100 
00101     sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
00102 
00103     target[0] = sq_dist;
00104   }
00105 }
00106 
00107 #endif /*LV_HAVE_SSE3*/
00108 
00109 #ifdef LV_HAVE_GENERIC
00110 static inline void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target, lv_32fc_t* src0, lv_32fc_t* points, float scalar, unsigned int num_points) {
00111 
00112   const unsigned int num_bytes = num_points*8;
00113 
00114   lv_32fc_t diff;
00115   float sq_dist;
00116   unsigned int i = 0;
00117 
00118   for(; i < num_bytes >> 3; ++i) {
00119     diff = src0[0] - points[i];
00120 
00121     sq_dist = scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
00122 
00123     target[i] = sq_dist;
00124   }
00125 }
00126 
00127 #endif /*LV_HAVE_GENERIC*/
00128 
00129 
00130 #endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H*/