GNU Radio 3.6.4.2 C++ API
volk_32f_x3_sum_of_poly_32f.h
Go to the documentation of this file.
00001 #ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
00002 #define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H
00003 
00004 #include<inttypes.h>
00005 #include<stdio.h>
00006 #include<volk/volk_complex.h>
00007 
#ifndef MAX
/* Larger of X and Y (Y when they compare equal or unordered).
 * Function-like macro: the selected argument is evaluated twice, so do not
 * pass expressions with side effects. */
#define MAX(X, Y) (((X) > (Y)) ? (X) : (Y))
#endif
00011 
00012 #ifdef LV_HAVE_SSE3
00013 #include<xmmintrin.h>
00014 #include<pmmintrin.h>
00015 
/*!
 * \brief Sum of a clamped 4th-order polynomial over a buffer (SSE3, aligned loads).
 *
 * With c = center_point_array and x_i = max(src0[i], *cutoff), this writes
 *   sum_i ( c[0]*x_i + c[1]*x_i^2 + c[2]*x_i^3 + c[3]*x_i^4 ) + num_points * c[4]
 * to target[0].
 *
 * \param target              receives the single float result in target[0]
 * \param src0                input samples; read with _mm_load_ps, so the buffer
 *                            must be 16-byte aligned
 * \param center_point_array  five coefficients: [0]..[3] scale x..x^4, [4] is
 *                            the constant added once per point
 * \param cutoff              pointer to the lower clamp applied to every sample
 * \param num_points          number of floats in src0
 */
static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points) {

  /* Count in points, not bytes: num_points*4 wraps for very large buffers
     and would silently shorten the sum. */
  const unsigned int quad_count = num_points / 4;
  const unsigned int tail_count = num_points % 4;
  unsigned int i;

  float result = 0.0f;

  /* Each coefficient and the clamp value broadcast to all four lanes. */
  const __m128 coef1 = _mm_load1_ps(&center_point_array[0]);
  const __m128 coef2 = _mm_load1_ps(&center_point_array[1]);
  const __m128 coef3 = _mm_load1_ps(&center_point_array[2]);
  const __m128 coef4 = _mm_load1_ps(&center_point_array[3]);
  const __m128 floor4 = _mm_load1_ps(cutoff);

  /* Two independent per-lane accumulators:
     acc_low  collects c[0]*x   + c[1]*x^2,
     acc_high collects c[2]*x^3 + c[3]*x^4. */
  __m128 acc_low = _mm_setzero_ps();
  __m128 acc_high = _mm_setzero_ps();
  __m128 hsum;

  for(i = 0; i < quad_count; ++i) {
    __m128 x1 = _mm_max_ps(floor4, _mm_load_ps(src0));
    __m128 x2 = _mm_mul_ps(x1, x1);
    __m128 x3 = _mm_mul_ps(x1, x2);
    __m128 x4 = _mm_mul_ps(x2, x2);

    x1 = _mm_mul_ps(x1, coef1);
    x2 = _mm_mul_ps(x2, coef2);
    x3 = _mm_mul_ps(x3, coef3);
    x4 = _mm_mul_ps(x4, coef4);

    acc_low = _mm_add_ps(_mm_add_ps(x1, x2), acc_low);
    acc_high = _mm_add_ps(_mm_add_ps(x3, x4), acc_high);

    src0 += 4;
  }

  /* Three horizontal adds fold all eight accumulator lanes into lane 0. */
  hsum = _mm_hadd_ps(acc_low, acc_high);
  hsum = _mm_hadd_ps(hsum, hsum);
  hsum = _mm_hadd_ps(hsum, hsum);

  _mm_store_ss(&result, hsum);

  /* Scalar pass over the 0..3 points that did not fill a whole vector;
     src0 already points just past the vectorised part. */
  for(i = 0; i < tail_count; ++i) {
    const float raw = src0[i];
    const float x = (raw > *cutoff) ? raw : *cutoff;
    const float x2 = x * x;
    const float x3 = x * x2;
    const float x4 = x2 * x2;

    result += (center_point_array[0] * x +
               center_point_array[1] * x2 +
               center_point_array[2] * x3 +
               center_point_array[3] * x4);
  }

  /* The constant term is added once per input point. */
  result += ((float)num_points) * center_point_array[4];

  target[0] = result;
}
00098 
00099 
00100 #endif /*LV_HAVE_SSE3*/
00101 
00102 #ifdef LV_HAVE_GENERIC
00103 
/*!
 * \brief Sum of a clamped 4th-order polynomial over a buffer (portable C).
 *
 * With c = center_point_array and x_i = max(src0[i], *cutoff), this writes
 *   sum_i ( c[0]*x_i + c[1]*x_i^2 + c[2]*x_i^3 + c[3]*x_i^4 ) + num_points * c[4]
 * to *target.
 *
 * \param target              receives the single float result
 * \param src0                input samples
 * \param center_point_array  five coefficients: [0]..[3] scale x..x^4, [4] is
 *                            the constant added once per point
 * \param cutoff              pointer to the lower clamp applied to every sample
 * \param num_points          number of floats in src0
 */
static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points) {

  float result = 0.0f;
  unsigned int i;

  /* Iterate over points directly: a byte count (num_points*4) wraps for
     very large buffers and would silently shorten the sum. */
  for(i = 0; i < num_points; ++i) {
    const float raw = src0[i];
    /* Clamp from below; a sample that is not greater than the cutoff
       (including NaN) is replaced by the cutoff. */
    const float x = (raw > *cutoff) ? raw : *cutoff;
    const float x2 = x * x;
    const float x3 = x * x2;
    const float x4 = x2 * x2;

    result += (center_point_array[0] * x +
               center_point_array[1] * x2 +
               center_point_array[2] * x3 +
               center_point_array[3] * x4);
  }

  /* The constant term is added once per input point. */
  result += ((float)num_points) * center_point_array[4];

  *target = result;
}
00148 
00149 #endif /*LV_HAVE_GENERIC*/
00150 
00151 
00152 #endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/