Issue
The loop containing the scalar reduction pattern can be made faster by offloading it to an FPGA using the Xilinx programming environment.
Action
Implement a version of the scalar reduction loop using an Application Programming Interface (API) provided by the HLS compiler of the Xilinx programming environment.
Relevance
A loop often represents a computationally expensive part of a program, and an FPGA accelerator can be used to speed it up. A loop with loop-carried dependencies can be mapped to an HLS implementation (see the example ‘dot product’ code below). The programmer needs to explicitly provide the HLS compiler with annotations to guide the high-level synthesis process.
Code examples
CPU:
/*
 * Scalar dot product of two float arrays.
 *
 * Walks the first `length` elements of `a` and `b` in index order and
 * accumulates a[k] * b[k] into a single float running total (the scalar
 * reduction). Returns 0 when `length` is zero or negative, because the loop
 * body is never entered.
 */
float ff_dot_productf(const float* a, const float* b, int length)
{
    float total = 0;
    int k = 0;
    while (k < length) {
        /* Loop-carried dependency: every iteration updates the same total. */
        total += a[k] * b[k];
        ++k;
    }
    return total;
}
Xilinx HLS:
#include "../../include/global.hpp"
#include "../../include/common.hpp"
/*
 * Streaming dot-product kernel.
 *
 * Reads one vector word from Xin and one from Yin per iteration of the main
 * loop (the loop advances i by VDATA_SIZE while i < N), multiplies them lane
 * by lane, and accumulates each lane into its own double partial sum. The
 * partial sums are then added together and alpha + sum is written to
 * result[0].
 *
 * Parameters:
 *   Xin, Yin - input streams of v_dt words; lane j of a word is accessed as
 *              .data[j] for j in [0, VDATA_SIZE).
 *   N        - number of scalar elements to multiply. When N is not a
 *              multiple of VDATA_SIZE, only the first N % VDATA_SIZE lanes of
 *              the last word contribute. When N <= 0 nothing is read and
 *              result[0] receives alpha.
 *   result   - output; only result[0] is written.
 *   alpha    - value ADDED to the accumulated sum.
 *              NOTE(review): alpha is added, not used as a scale factor; this
 *              resembles the BLAS sdsdot contract - confirm with the caller.
 */
static void ddot(hls::stream< v_dt>& Xin, hls::stream< v_dt>& Yin,const int N,float* result, const float alpha) {
// One partial sum per lane, so the unrolled lane loop below does not funnel
// every update through a single accumulator.
double temp_result[VDATA_SIZE];
// The `variable=` keyword is a required argument of ARRAY_PARTITION; it names
// the array to partition. `complete` on dim=1 requests that the array be
// split into individual elements.
#pragma HLS ARRAY_PARTITION variable=temp_result dim=1 complete
double final_result = 0;
v_dt x,y;
// Clear every per-lane partial sum before accumulation starts.
init_ddot:
for(int i=0;i<VDATA_SIZE;i++){
#pragma HLS unroll
temp_result[i]=0;
}
// Main accumulation: one word from each stream per iteration.
execute:
for (int i = 0; i < N; i+=VDATA_SIZE) {
#pragma HLS pipeline II=1
x=Xin.read();
y=Yin.read();
for(int j=0;j<VDATA_SIZE;j++){
#pragma HLS unroll
// A full word (i+VDATA_SIZE<=N) uses every lane; the final partial word uses
// only lanes below N%VDATA_SIZE.
if(i+VDATA_SIZE<=N || j<(N%VDATA_SIZE)){
temp_result[j] +=(x.data[j]*y.data[j]);
}
else{
// Lane is past the end of the input: adding zero leaves its sum unchanged.
temp_result[j] +=0;
}
}
}
// Fold the per-lane partial sums into one scalar, in lane order.
// NOTE(review): the label spelling ("excecute") is kept as-is because
// external directive files may refer to it - confirm before renaming.
excecute_final_of_ddot:
for (int i=0;i<VDATA_SIZE;i++){
#pragma HLS pipeline II=1
final_result+=temp_result[i];
}
// alpha + final_result is evaluated in double and narrowed to float on store.
result[0] = alpha+final_result;
}
References