|
|
|
|
|
|
|
|
|
#pragma OPENCL EXTENSION cl_khr_fp16 : enable |
|
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable |
|
|
|
__kernel void reorg_chw( |
|
__global const half *restrict src, |
|
__global half *restrict dst, |
|
int W, |
|
int H, |
|
int C, |
|
int stride) |
|
{ |
|
__local half local_src[8 * 1024]; |
|
__local half local_dst[8 * 1024]; |
|
|
|
event_t e1 = async_work_group_copy_2D2D( |
|
local_src, |
|
src + get_group_id(1) * W * stride |
|
+ get_group_id(0) * W * stride * stride, |
|
W * stride, |
|
get_local_size(0), |
|
W * stride * (stride * get_num_groups(0) - 1), |
|
0, |
|
0); |
|
wait_group_events(1, &e1); |
|
|
|
const int c = get_local_id(0); |
|
const int stride_x = get_local_id(1); |
|
|
|
const int srcIdx = stride_x + c * W * stride; |
|
const int dstIdx = stride_x * W * get_local_size(0) + c * W; |
|
|
|
int x = 0; |
|
for (; x <= W - 8; x += 8) { |
|
half8 data = (half8){ |
|
local_src[srcIdx + (x + 0) * stride], |
|
local_src[srcIdx + (x + 1) * stride], |
|
local_src[srcIdx + (x + 2) * stride], |
|
local_src[srcIdx + (x + 3) * stride], |
|
local_src[srcIdx + (x + 4) * stride], |
|
local_src[srcIdx + (x + 5) * stride], |
|
local_src[srcIdx + (x + 6) * stride], |
|
local_src[srcIdx + (x + 7) * stride]}; |
|
|
|
*((__local half8 *)(&local_dst[dstIdx + x])) = data; |
|
} |
|
|
|
for (; x < W; x++) { |
|
local_dst[dstIdx + x] = local_src[srcIdx + x * stride]; |
|
} |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
event_t e2 = async_work_group_copy_2D2D( |
|
dst + get_group_id(0) * W |
|
+ get_group_id(1) * W * stride * get_global_size(0), |
|
local_dst, |
|
W, |
|
get_local_size(0) * stride, |
|
0, |
|
W * (get_num_groups(0) - 1), |
|
0); |
|
wait_group_events(1, &e2); |
|
} |
|
|