// A sample of NEON code
// Author: Internet
#include <iostream>
#include <arm_neon.h> //需包含的头文件
using namespace std;
/**
 * Sums the first `len` floats of `arr`, handling four elements at a time with NEON.
 *
 * Complete groups of four are accumulated lane-wise in one vector register.
 * The four lanes are then added together (lane 0 first) and any remaining
 * 1-3 trailing elements are added one by one.
 *
 * @param arr  Pointer to the input values; a null pointer is rejected.
 * @param len  Number of elements to sum.
 * @return The sum, or 0 (after printing "input error") when `arr` is null or
 *         `len` is less than 1.
 */
float sum_array(float *arr, int len)
{
    if (arr == nullptr || len <= 0)
    {
        std::cout << "input error\n";
        return 0;
    }

    const int full_groups = len / 4; // complete 4-element groups
    const int tail_count = len % 4;  // elements left after the full groups

    // Lane-wise running totals, all lanes starting at zero.
    float32x4_t lane_totals = vdupq_n_f32(0.0f);
    const float *cursor = arr;
    for (int group = 0; group < full_groups; ++group)
    {
        lane_totals = vaddq_f32(lane_totals, vld1q_f32(cursor));
        cursor += 4;
    }

    // Collapse the lanes in order 0, 1, 2, 3 so the additions happen in a fixed sequence.
    float total = vgetq_lane_f32(lane_totals, 0);
    total += vgetq_lane_f32(lane_totals, 1);
    total += vgetq_lane_f32(lane_totals, 2);
    total += vgetq_lane_f32(lane_totals, 3);

    // Scalar pass over the elements that did not fill a whole group.
    for (int i = 0; i < tail_count; ++i)
    {
        total += cursor[i];
    }
    return total;
}
// NEON kernel that combines 8-bit samples read from src with 16-bit samples
// read from src2 and stores clamped 8-bit results to dst, 8 pixels per inner
// iteration, over a fixed 100-row by 200-column region.
//
// NOTE(review): offset, shift, x, y, src, src2, dst, srcstride, dststride and
// MAX_PB_SIZE are used below but are not declared anywhere in the visible
// source, so this function does not compile as shown. Presumably they were
// parameters, locals or macros of the routine this snippet was taken from --
// confirm and declare them before use.
void av_clip_pixel()
{
int height = 100;
int width = 200;
int16x8_t result_16x8;
// Broadcast the scalar offset into all 8 lanes.
int16x8_t offset_16x8 = vmovq_n_s16(offset);
// Negated shift count in every lane: vshlq_s16 treats a negative count as a
// right shift (assuming shift itself is positive -- TODO confirm).
int16x8_t minusshift_16x8 = vmovq_n_s16(-1 * shift);
// Lower and upper clamp bounds applied before narrowing to 8 bits.
int16x8_t min_16x8 = vmovq_n_s16(0);
int16x8_t max_16x8 = vmovq_n_s16(255);
for (y = 0; y < height; y++)
{
// Each step covers 8 pixels; width (200) is a multiple of 8, so no partial
// group is left at the end of a row.
for (x = 0; x < width; x += 8)
{
// Load 8 bytes from src, widen them to 16 bits and shift left by 6.
result_16x8 = vshlq_n_s16(vreinterpretq_s16_u16(vmovl_u8(vld1_u8(&src[x]))), 6);
// Saturating-add the 8 int16 values at src2[x], then the offset, then shift
// every lane by the (negative) count prepared above.
result_16x8 = vshlq_s16(vqaddq_s16(vqaddq_s16(result_16x8, vld1q_s16(&src2[x])), offset_16x8), minusshift_16x8);
// Clamp each lane to [0, 255], narrow to 8 bits and store 8 bytes to dst.
vst1_u8(&dst[x], vqmovn_u16(vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(result_16x8, max_16x8), min_16x8))));
}
// Step each pointer to its next row: src and dst by their own strides,
// src2 by MAX_PB_SIZE elements.
src += srcstride;
dst += dststride;
src2 += MAX_PB_SIZE;
}
}
/**
 * Demo entry point: sums ten sample values with sum_array and prints the result.
 *
 * @return 0 always.
 */
int main()
{
    float samples[] = {1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0};
    const int sample_count = static_cast<int>(sizeof(samples) / sizeof(samples[0]));

    const float total = sum_array(samples, sample_count);
    std::cout << "sum_array result: " << total << std::endl;
    return 0;
}
// Tags: arr, sum, NEON, s16, f32, vec, snippet, 16x8, code
// Source: https://blog.csdn.net/starperfection/article/details/120291541