|
|
|
|
|
|
|
|
|
#pragma OPENCL EXTENSION cl_khr_fp16 : enable |
|
#pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable |
|
|
|
__kernel void cvtu8f16(__global const uchar *restrict src, __global half *restrict dst, float scale, float bias) |
|
{ |
|
__local uchar local_src[8 * 1024]; |
|
__local half local_dst[8 * 1024]; |
|
|
|
event_t e1 = async_work_group_copy_3D3D( |
|
local_src, |
|
src + get_group_id(0) * get_local_size(0) + get_group_id(1) * get_local_size(1) * get_global_size(0) |
|
+ get_group_id(2) * get_local_size(2) * get_global_size(0) * get_global_size(1), |
|
get_local_size(0), |
|
get_local_size(0) * get_local_size(1) / (get_local_size(0)), |
|
get_global_size(0) - get_local_size(0), |
|
0, |
|
get_local_size(2), |
|
get_global_size(0) * (get_global_size(1) - get_local_size(1)), |
|
0, |
|
0); |
|
wait_group_events(1, &e1); |
|
|
|
size_t idx = get_local_id(0) |
|
+ get_local_id(1) * get_local_size(0) |
|
+ get_local_id(2) * get_local_size(0) * get_local_size(1); |
|
|
|
local_dst[idx] = convert_half(local_src[idx]) * (half)scale + (half)bias; |
|
|
|
barrier(CLK_LOCAL_MEM_FENCE); |
|
|
|
event_t e2 = async_work_group_copy_3D3D( |
|
dst + get_group_id(0) * get_local_size(0) + get_group_id(1) * get_local_size(1) * get_global_size(0) |
|
+ get_group_id(2) * get_local_size(2) * get_global_size(0) * get_global_size(1), |
|
local_dst, |
|
get_local_size(0), |
|
get_local_size(1), |
|
0, |
|
get_global_size(0) - get_local_size(0), |
|
get_local_size(2), |
|
0, |
|
get_global_size(0) * (get_global_size(1) - get_local_size(1)), |
|
0); |
|
wait_group_events(1, &e2); |
|
} |
|
|