|
|
|
|
|
|
|
|
|
#pragma OPENCL EXTENSION cl_khr_fp16 : enable |
|
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable |
|
|
|
__constant static half log_2_e = (half)1.442695040888963; |
|
|
|
#define ALLOW_EARLY_RETURN 1 |
|
|
|
static void inline logistic_activate_hwc( |
|
__local const half *restrict src, |
|
__local half *restrict dst, |
|
int offset, |
|
int stride) |
|
{ |
|
half val = src[offset]; |
|
val = 1.0h / (1.0h + exp2(val * -log_2_e)); |
|
dst[offset * stride] = val; |
|
} |
|
|
|
__kernel void region_hwc( |
|
__global const half *restrict src, |
|
__global half *restrict dst, |
|
int W, |
|
int H, |
|
int classes, |
|
int coords, |
|
int num, |
|
int maskSize, |
|
int doSoftmax) |
|
{ |
|
__local half local_src[13 * 13 * (4 + 1 + 80)]; |
|
__local half local_dst[13 * 13 * (4 + 1 + 80)]; |
|
|
|
const int pixel_pos = get_local_id(0); |
|
|
|
const int local_C = classes + coords + 1; |
|
const int c = get_group_id(1) * local_C; |
|
const int h = get_group_id(0); |
|
|
|
num = (doSoftmax != 0) * num + (doSoftmax == 0) * maskSize; |
|
const int C = local_C * num; |
|
|
|
event_t e1 = async_work_group_copy_2D2D( |
|
local_src, |
|
src + h * W * C + c, |
|
local_C, |
|
H * W, |
|
C - local_C, |
|
0, |
|
0); |
|
|
|
wait_group_events(1, &e1); |
|
|
|
#if ALLOW_EARLY_RETURN |
|
if (pixel_pos < W * H) |
|
#endif |
|
{ |
|
const int w = pixel_pos % W; |
|
const int h = pixel_pos / W; |
|
|
|
__local const half *restrict src = local_src + h * W * local_C + w * local_C; |
|
__local half *restrict dst = local_dst + h * W + w; |
|
|
|
const int stride = H * W; |
|
logistic_activate_hwc(src, dst, 0, stride); |
|
logistic_activate_hwc(src, dst, 1, stride); |
|
|
|
|
|
dst[2 * stride] = src[2]; |
|
dst[3 * stride] = src[3]; |
|
|
|
logistic_activate_hwc(src, dst, 4, stride); |
|
|
|
src += coords + 1; |
|
dst += (coords + 1) * stride; |
|
|
|
if (doSoftmax) { |
|
half max_val = src[0]; |
|
#pragma unroll 4 |
|
for (int c = 1; c < classes; c++) { |
|
max_val = max(max_val, src[c]); |
|
} |
|
|
|
half expSum = 0.0h; |
|
#pragma unroll 4 |
|
for (int c = 0; c < classes; c++) { |
|
const half e = src[c] - max_val; |
|
const half tmp = exp2(e * log_2_e); |
|
dst[c * stride] = tmp; |
|
expSum += tmp; |
|
} |
|
|
|
const half invExpSum = 1.0h / expSum; |
|
#pragma unroll 4 |
|
for (int c = 0; c < classes; c++) { |
|
dst[c * stride] *= invExpSum; |
|
} |
|
} else { |
|
#pragma unroll 4 |
|
for (int c = 0; c < classes; c++) { |
|
logistic_activate_hwc(src, dst, c, stride); |
|
} |
|
} |
|
} |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
const int box_sz = W * H * (classes + coords + 1); |
|
event_t e2 = async_work_group_copy(dst + get_group_id(1) * box_sz, local_dst, box_sz, 0); |
|
wait_group_events(1, &e2); |
|
} |
|
|