|
|
|
|
|
|
|
|
|
#pragma OPENCL EXTENSION cl_khr_fp16 : enable |
|
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable |
|
|
|
__kernel void reorg_hwc( |
|
__global half const *restrict src, |
|
__global half *restrict dst, |
|
int W, |
|
int H, |
|
int C, |
|
int stride) |
|
{ |
|
__local half local_src[8 * 1024]; |
|
__local half local_dst[8 * 1024]; |
|
|
|
event_t e1 = async_work_group_copy_2D2D( |
|
local_src, |
|
src + get_group_id(0) * stride + get_group_id(1) * C, |
|
stride, |
|
H * W / stride, |
|
(C - 1) * stride, |
|
0, |
|
0); |
|
wait_group_events(1, &e1); |
|
|
|
const int stride_y = get_local_id(1); |
|
const int blocks = get_local_size(0); |
|
const int b = get_local_id(0); |
|
|
|
const int OC = stride * stride; |
|
const int OH = H / stride; |
|
const int OW = W / stride; |
|
const int IC = stride; |
|
const int IH = H; |
|
const int IW = W / stride; |
|
|
|
for (int block_h = 0; block_h < stride; block_h++) { |
|
const int src_line = b * stride * stride + stride_y * stride + block_h; |
|
const int c = src_line / IH; |
|
const int h = src_line % IH; |
|
|
|
const int dst_line = b * stride + stride_y * blocks * stride + block_h; |
|
const int oc = dst_line / OH; |
|
const int oh = dst_line % OH; |
|
|
|
for (int w = 0; w < W / stride; w++) { |
|
local_dst[oh * OW * OC + w * OC + oc] = local_src[h * IW * IC + w * IC + c]; |
|
} |
|
} |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
event_t e2 = async_work_group_copy_2D2D( |
|
dst + get_group_id(1) * C + get_group_id(0) * stride, |
|
local_dst, |
|
stride, |
|
W * H / stride, |
|
0, |
|
C * stride - stride, |
|
0); |
|
wait_group_events(1, &e2); |
|
} |
|
|