GNU Radio 3.6.4.2 C++ API
volk_16i_max_star_horizontal_16i_a.h
#ifndef INCLUDED_volk_16i_max_star_horizontal_16i_a_H
#define INCLUDED_volk_16i_max_star_horizontal_16i_a_H

#include <volk/volk_common.h>

#include <inttypes.h>
#include <stdio.h>


#ifdef LV_HAVE_SSSE3

#include <xmmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>

static inline void volk_16i_max_star_horizontal_16i_a_ssse3(int16_t* target, int16_t* src0, unsigned int num_points) {

  const unsigned int num_bytes = num_points*2;

  /* Shuffle masks gathering the even-indexed (shufmask0) or odd-indexed
     (shufmask1) 16-bit lanes; the and-masks add 2 to the byte offsets
     wherever the comparison below selects the second element of a pair. */
  const static uint8_t shufmask0[16] = {0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
  const static uint8_t shufmask1[16] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x04, 0x05, 0x08, 0x09, 0x0c, 0x0d};
  const static uint8_t andmask0[16]  = {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
  const static uint8_t andmask1[16]  = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02};

  __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
  __m128i xmm5, xmm6, xmm7, xmm8;

  xmm4 = _mm_load_si128((__m128i*)shufmask0);
  xmm5 = _mm_load_si128((__m128i*)shufmask1);
  xmm6 = _mm_load_si128((__m128i*)andmask0);
  xmm7 = _mm_load_si128((__m128i*)andmask1);

  __m128i *p_target, *p_src0;

  p_target = (__m128i*)target;
  p_src0 = (__m128i*)src0;

  int bound = num_bytes >> 5;               /* full 32-byte (16-point) iterations */
  int intermediate = (num_bytes >> 4) & 1;  /* at most one extra 16-byte (8-point) block */
  int leftovers = (num_bytes >> 1) & 7;     /* remaining points, handled by the scalar tail */

  int i = 0;

  for(i = 0; i < bound; ++i) {

    xmm0 = _mm_load_si128(p_src0);
    xmm1 = _mm_load_si128(&p_src0[1]);

    xmm2 = _mm_xor_si128(xmm2, xmm2);
    p_src0 += 2;

    /* Differences of adjacent pairs; comparing against zero decides whether
       the first or the second element of each pair is the larger one. */
    xmm3 = _mm_hsub_epi16(xmm0, xmm1);

    xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);

    xmm8 = _mm_and_si128(xmm2, xmm6);
    xmm3 = _mm_and_si128(xmm2, xmm7);

    xmm8 = _mm_add_epi8(xmm8, xmm4);
    xmm3 = _mm_add_epi8(xmm3, xmm5);

    /* Gather the selected element of every pair, then merge the two halves
       into one 8-element result vector. */
    xmm0 = _mm_shuffle_epi8(xmm0, xmm8);
    xmm1 = _mm_shuffle_epi8(xmm1, xmm3);

    xmm3 = _mm_add_epi16(xmm0, xmm1);

    _mm_store_si128(p_target, xmm3);

    p_target += 1;
  }

  for(i = 0; i < intermediate; ++i) {

    xmm0 = _mm_load_si128(p_src0);

    xmm2 = _mm_xor_si128(xmm2, xmm2);
    p_src0 += 1;

    xmm3 = _mm_hsub_epi16(xmm0, xmm1);
    xmm2 = _mm_cmpgt_epi16(xmm2, xmm3);

    xmm8 = _mm_and_si128(xmm2, xmm6);

    xmm3 = _mm_add_epi8(xmm8, xmm4);

    xmm0 = _mm_shuffle_epi8(xmm0, xmm3);

    /* Only the low 8 bytes (four selected values) are stored. */
    _mm_storel_pd((double*)p_target, bit128_p(&xmm0)->double_vec);

    p_target = (__m128i*)((int8_t*)p_target + 8);
  }

  /* Scalar tail for the remaining points. */
  for(i = (bound << 4) + (intermediate << 3); i < (bound << 4) + (intermediate << 3) + leftovers; i += 2) {
    target[i >> 1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
  }
}

#endif /*LV_HAVE_SSSE3*/


#ifdef LV_HAVE_GENERIC

static inline void volk_16i_max_star_horizontal_16i_generic(int16_t* target, int16_t* src0, unsigned int num_points) {

  const unsigned int num_bytes = num_points*2;

  int i = 0;

  int bound = num_bytes >> 1;

  /* Each output is the larger element of one adjacent input pair. */
  for(i = 0; i < bound; i += 2) {
    target[i >> 1] = ((int16_t)(src0[i] - src0[i + 1]) > 0) ? src0[i] : src0[i + 1];
  }
}

#endif /*LV_HAVE_GENERIC*/

#endif /*INCLUDED_volk_16i_max_star_horizontal_16i_a_H*/
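A minimal usage sketch (not part of the header itself): it calls the generic kernel directly on a small buffer, assuming the VOLK headers are on the include path and that LV_HAVE_GENERIC is defined, which the VOLK build system normally takes care of. Note that the aligned _a_ssse3 variant additionally requires 16-byte-aligned input and output buffers, since it uses _mm_load_si128/_mm_store_si128.

/* Usage sketch: reduce eight int16 inputs to four pairwise maxima.
 * Assumption: VOLK include path is set; LV_HAVE_GENERIC is defined here
 * manually only because the header guards the kernel with it. */
#define LV_HAVE_GENERIC
#include <stdio.h>
#include <stdint.h>
#include <volk/volk_16i_max_star_horizontal_16i_a.h>

int main(void)
{
    /* Eight inputs form four adjacent pairs; the kernel writes the larger
     * element of each pair to the corresponding output slot. */
    int16_t src[8] = { 3, -7, 100, 4, -5, -2, 0, 9 };
    int16_t dst[4] = { 0, 0, 0, 0 };

    volk_16i_max_star_horizontal_16i_generic(dst, src, 8);

    for (int i = 0; i < 4; ++i)
        printf("pair %d -> %d\n", i, dst[i]);   /* prints 3, 100, -2, 9 */

    return 0;
}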