/* GNU Radio 3.6.4.2 C++ API */
00001 #ifndef INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H 00002 #define INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H 00003 00004 #include<inttypes.h> 00005 #include<stdio.h> 00006 #include<volk/volk_complex.h> 00007 00008 #ifndef MAX 00009 #define MAX(X,Y) ((X) > (Y)?(X):(Y)) 00010 #endif 00011 00012 #ifdef LV_HAVE_SSE3 00013 #include<xmmintrin.h> 00014 #include<pmmintrin.h> 00015 00016 static inline void volk_32f_x3_sum_of_poly_32f_a_sse3(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points) { 00017 00018 const unsigned int num_bytes = num_points*4; 00019 00020 float result = 0.0; 00021 float fst = 0.0; 00022 float sq = 0.0; 00023 float thrd = 0.0; 00024 float frth = 0.0; 00025 //float fith = 0.0; 00026 00027 00028 00029 __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;// xmm11, xmm12; 00030 00031 xmm9 = _mm_setzero_ps(); 00032 xmm1 = _mm_setzero_ps(); 00033 00034 xmm0 = _mm_load1_ps(¢er_point_array[0]); 00035 xmm6 = _mm_load1_ps(¢er_point_array[1]); 00036 xmm7 = _mm_load1_ps(¢er_point_array[2]); 00037 xmm8 = _mm_load1_ps(¢er_point_array[3]); 00038 //xmm11 = _mm_load1_ps(¢er_point_array[4]); 00039 xmm10 = _mm_load1_ps(cutoff); 00040 00041 int bound = num_bytes >> 4; 00042 int leftovers = (num_bytes >> 2) & 3; 00043 int i = 0; 00044 00045 for(; i < bound; ++i) { 00046 xmm2 = _mm_load_ps(src0); 00047 xmm2 = _mm_max_ps(xmm10, xmm2); 00048 xmm3 = _mm_mul_ps(xmm2, xmm2); 00049 xmm4 = _mm_mul_ps(xmm2, xmm3); 00050 xmm5 = _mm_mul_ps(xmm3, xmm3); 00051 //xmm12 = _mm_mul_ps(xmm3, xmm4); 00052 00053 xmm2 = _mm_mul_ps(xmm2, xmm0); 00054 xmm3 = _mm_mul_ps(xmm3, xmm6); 00055 xmm4 = _mm_mul_ps(xmm4, xmm7); 00056 xmm5 = _mm_mul_ps(xmm5, xmm8); 00057 //xmm12 = _mm_mul_ps(xmm12, xmm11); 00058 00059 xmm2 = _mm_add_ps(xmm2, xmm3); 00060 xmm3 = _mm_add_ps(xmm4, xmm5); 00061 00062 src0 += 4; 00063 00064 xmm9 = _mm_add_ps(xmm2, xmm9); 00065 00066 xmm1 = _mm_add_ps(xmm3, xmm1); 00067 00068 //xmm9 = _mm_add_ps(xmm12, xmm9); 00069 
} 00070 00071 xmm2 = _mm_hadd_ps(xmm9, xmm1); 00072 xmm3 = _mm_hadd_ps(xmm2, xmm2); 00073 xmm4 = _mm_hadd_ps(xmm3, xmm3); 00074 00075 _mm_store_ss(&result, xmm4); 00076 00077 00078 00079 for(i = 0; i < leftovers; ++i) { 00080 fst = src0[i]; 00081 fst = MAX(fst, *cutoff); 00082 sq = fst * fst; 00083 thrd = fst * sq; 00084 frth = sq * sq; 00085 //fith = sq * thrd; 00086 00087 result += (center_point_array[0] * fst + 00088 center_point_array[1] * sq + 00089 center_point_array[2] * thrd + 00090 center_point_array[3] * frth);// + 00091 //center_point_array[4] * fith); 00092 } 00093 00094 result += ((float)((bound * 4) + leftovers)) * center_point_array[4]; //center_point_array[5]; 00095 00096 target[0] = result; 00097 } 00098 00099 00100 #endif /*LV_HAVE_SSE3*/ 00101 00102 #ifdef LV_HAVE_GENERIC 00103 00104 static inline void volk_32f_x3_sum_of_poly_32f_generic(float* target, float* src0, float* center_point_array, float* cutoff, unsigned int num_points) { 00105 00106 const unsigned int num_bytes = num_points*4; 00107 00108 float result = 0.0; 00109 float fst = 0.0; 00110 float sq = 0.0; 00111 float thrd = 0.0; 00112 float frth = 0.0; 00113 //float fith = 0.0; 00114 00115 00116 00117 unsigned int i = 0; 00118 00119 for(; i < num_bytes >> 2; ++i) { 00120 fst = src0[i]; 00121 fst = MAX(fst, *cutoff); 00122 00123 sq = fst * fst; 00124 thrd = fst * sq; 00125 frth = sq * sq; 00126 //fith = sq * thrd; 00127 00128 result += (center_point_array[0] * fst + 00129 center_point_array[1] * sq + 00130 center_point_array[2] * thrd + 00131 center_point_array[3] * frth); //+ 00132 //center_point_array[4] * fith); 00133 /*printf("%f12...%d\n", (center_point_array[0] * fst + 00134 center_point_array[1] * sq + 00135 center_point_array[2] * thrd + 00136 center_point_array[3] * frth) + 00137 //center_point_array[4] * fith) + 00138 (center_point_array[4]), i); 00139 */ 00140 } 00141 00142 result += ((float)(num_bytes >> 2)) * (center_point_array[4]);//(center_point_array[5]); 00143 00144 
00145 00146 *target = result; 00147 } 00148 00149 #endif /*LV_HAVE_GENERIC*/ 00150 00151 00152 #endif /*INCLUDED_volk_32f_x3_sum_of_poly_32f_a_H*/