|
ViennaCL - The Vienna Computing Library
1.1.2
|
00001 #ifndef _VIENNACL_VECTOR_SOURCE_HPP_ 00002 #define _VIENNACL_VECTOR_SOURCE_HPP_ 00003 //Automatically generated file from aux-directory, do not edit manually! 00004 namespace viennacl 00005 { 00006 namespace linalg 00007 { 00008 namespace kernels 00009 { 00010 const char * const vector_align4_inplace_div_sub = 00011 "__kernel void inplace_div_sub(\n" 00012 " __global float4 * vec1,\n" 00013 " __global const float4 * vec2,\n" 00014 " __global const float * fac, //CPU variant is mapped to mult_add\n" 00015 " unsigned int size\n" 00016 " ) \n" 00017 "{ \n" 00018 " float factor = *fac;\n" 00019 " for (unsigned int i = get_global_id(0); i < size/4; i += get_global_size(0))\n" 00020 " vec1[i] -= vec2[i] / factor;\n" 00021 "}\n" 00022 ; //vector_align4_inplace_div_sub 00023 00024 const char * const vector_align4_mul_add = 00025 "__kernel void mul_add(\n" 00026 " __global const float4 * vec1,\n" 00027 " __global const float * fac,\n" 00028 " __global const float4 * vec2,\n" 00029 " __global float4 * result,\n" 00030 " unsigned int size\n" 00031 " ) \n" 00032 "{ \n" 00033 " float factor = *fac;\n" 00034 " for (unsigned int i = get_global_id(0); i < size/4; i += get_global_size(0))\n" 00035 " result[i] = vec1[i] * factor + vec2[i];\n" 00036 "}\n" 00037 ; //vector_align4_mul_add 00038 00039 const char * const vector_align4_inplace_mul_add = 00040 "__kernel void inplace_mul_add(\n" 00041 " __global float4 * vec1,\n" 00042 " __global const float4 * vec2,\n" 00043 " __global const float * fac,\n" 00044 " unsigned int size\n" 00045 " ) \n" 00046 "{ \n" 00047 " float factor = *fac;\n" 00048 " unsigned int size_div_4 = size >> 2;\n" 00049 " for (unsigned int i = get_global_id(0); i < size_div_4; i += get_global_size(0))\n" 00050 " vec1[i] += vec2[i] * factor;\n" 00051 "}\n" 00052 ; //vector_align4_inplace_mul_add 00053 00054 const char * const vector_align4_inplace_div_add = 00055 "__kernel void inplace_div_add(\n" 00056 " __global float4 * vec1,\n" 00057 " __global const float4 * vec2,\n" 00058 " __global const float * fac, //CPU variant is mapped to mult_add\n" 00059 " unsigned int size\n" 00060 " ) \n" 00061 "{ \n" 00062 " float factor = *fac;\n" 00063 " for (unsigned int i = get_global_id(0); i < size/4; i += get_global_size(0))\n" 00064 " vec1[i] -= vec2[i] / factor;\n" 00065 "}\n" 00066 ; //vector_align4_inplace_div_add 00067 00068 const char * const vector_align4_cpu_mul_add = 00069 "__kernel void cpu_mul_add(\n" 00070 " __global const float4 * vec1,\n" 00071 " float factor,\n" 00072 " __global const float4 * vec2,\n" 00073 " __global float4 * result,\n" 00074 " unsigned int size\n" 00075 " ) \n" 00076 "{ \n" 00077 " for (unsigned int i = get_global_id(0); i < size/4; i += get_global_size(0))\n" 00078 " result[i] = vec1[i] * factor + vec2[i];\n" 00079 "}\n" 00080 ; //vector_align4_cpu_mul_add 00081 00082 const char * const vector_align4_cpu_inplace_mul_add = 00083 "__kernel void cpu_inplace_mul_add(\n" 00084 " __global float4 * vec1,\n" 00085 " __global const float4 * vec2,\n" 00086 " float factor,\n" 00087 " unsigned int size\n" 00088 " ) \n" 00089 "{ \n" 00090 " for (unsigned int i = get_global_id(0); i < size/4; i += get_global_size(0))\n" 00091 " vec1[i] += vec2[i] * factor;\n" 00092 "}\n" 00093 ; //vector_align4_cpu_inplace_mul_add 00094 00095 const char * const vector_align4_inplace_mul_sub = 00096 "__kernel void inplace_mul_sub(\n" 00097 " __global float4 * vec1,\n" 00098 " __global const float4 * vec2,\n" 00099 " __global const float * fac, //CPU variant is mapped to mult_add\n" 00100 " unsigned int size\n" 00101 " ) \n" 00102 "{ \n" 00103 " float factor = *fac;\n" 00104 " for (unsigned int i = get_global_id(0); i < size/4; i += get_global_size(0))\n" 00105 " vec1[i] -= vec2[i] * factor;\n" 00106 "}\n" 00107 ; //vector_align4_inplace_mul_sub 00108 00109 const char * const vector_align1_inplace_divide = 00110 "__kernel void inplace_divide(\n" 00111 " __global float * vec,\n" 00112 " __global const float * fac, //note: CPU variant is mapped to prod_scalar\n" 00113 " unsigned int size) \n" 00114 "{ \n" 00115 " float factor = *fac;\n" 00116 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n" 00117 " vec[i] /= factor;\n" 00118 "}\n" 00119 ; //vector_align1_inplace_divide 00120 00121 const char * const vector_align1_inplace_div_sub = 00122 "///// divide substract:\n" 00123 "__kernel void inplace_div_sub(\n" 00124 " __global float * vec1,\n" 00125 " __global const float * vec2,\n" 00126 " __global const float * fac, //CPU variant is mapped to mult_add\n" 00127 " unsigned int size\n" 00128 " ) \n" 00129 "{ \n" 00130 " float factor = *fac;\n" 00131 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n" 00132 " vec1[i] -= vec2[i] / factor;\n" 00133 "}\n" 00134 ; //vector_align1_inplace_div_sub 00135 00136 const char * const vector_align1_vmax = 00137 "__kernel void vmax(\n" 00138 " __global float * vec1,\n" 00139 " __global float * result,\n" 00140 " unsigned int size) \n" 00141 "{ \n" 00142 " //parallel reduction on global memory:\n" 00143 " for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n" 00144 " {\n" 00145 " if (get_global_id(0) < stride)\n" 00146 " vec1[get_global_id(0)] = fmax(vec1[get_global_id(0)+stride], vec1[get_global_id(0)]);\n" 00147 " barrier(CLK_GLOBAL_MEM_FENCE);\n" 00148 " }\n" 00149 " \n" 00150 " if (get_global_id(0) == 0)\n" 00151 " *result = vec1[0]; \n" 00152 "}\n" 00153 ; //vector_align1_vmax 00154 00155 const char * const vector_align1_index_norm_inf = 00156 "//index_norm_inf:\n" 00157 "unsigned int float_vector1_index_norm_inf_impl(\n" 00158 " __global const float * vec,\n" 00159 " unsigned int size,\n" 00160 " __local float * float_buffer,\n" 00161 " __local unsigned int * index_buffer)\n" 00162 "{\n" 00163 " //step 1: fill buffer:\n" 00164 " float cur_max = 0.0f;\n" 00165 " float tmp;\n" 00166 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n" 00167 " {\n" 00168 " tmp = fabs(vec[i]);\n" 00169 " if (cur_max < tmp)\n" 00170 " {\n" 00171 " float_buffer[get_global_id(0)] = tmp;\n" 00172 " index_buffer[get_global_id(0)] = i;\n" 00173 " cur_max = tmp;\n" 00174 " }\n" 00175 " }\n" 00176 " \n" 00177 " //step 2: parallel reduction:\n" 00178 " for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n" 00179 " {\n" 00180 " barrier(CLK_LOCAL_MEM_FENCE);\n" 00181 " if (get_global_id(0) < stride)\n" 00182 " {\n" 00183 " //find the first occurring index\n" 00184 " if (float_buffer[get_global_id(0)] < float_buffer[get_global_id(0)+stride])\n" 00185 " {\n" 00186 " index_buffer[get_global_id(0)] = index_buffer[get_global_id(0)+stride];\n" 00187 " float_buffer[get_global_id(0)] = float_buffer[get_global_id(0)+stride];\n" 00188 " }\n" 00189 " \n" 00190 " //index_buffer[get_global_id(0)] = float_buffer[get_global_id(0)] < float_buffer[get_global_id(0)+stride] ? index_buffer[get_global_id(0)+stride] : index_buffer[get_global_id(0)];\n" 00191 " //float_buffer[get_global_id(0)] = max(float_buffer[get_global_id(0)], float_buffer[get_global_id(0)+stride]);\n" 00192 " }\n" 00193 " }\n" 00194 " \n" 00195 " return index_buffer[0];\n" 00196 "}\n" 00197 "\n" 00198 "__kernel void index_norm_inf(\n" 00199 " __global float * vec,\n" 00200 " unsigned int size,\n" 00201 " __local float * float_buffer,\n" 00202 " __local unsigned int * index_buffer,\n" 00203 " global unsigned int * result) \n" 00204 "{ \n" 00205 " unsigned int tmp = float_vector1_index_norm_inf_impl(vec, size, float_buffer, index_buffer);\n" 00206 " if (get_global_id(0) == 0) *result = tmp;\n" 00207 "}\n" 00208 "\n" 00209 "\n" 00210 ; //vector_align1_index_norm_inf 00211 00212 const char * const vector_align1_sub = 00213 "__kernel void sub(\n" 00214 " __global const float * vec1,\n" 00215 " __global const float * vec2, \n" 00216 " __global float * result,\n" 00217 " unsigned int size)\n" 00218 "{ \n" 00219 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n" 00220 " result[i] = vec1[i] - vec2[i];\n" 00221 "}\n" 00222 ; //vector_align1_sub 00223 00224 const char * const vector_align1_mul_add = 00225 "__kernel void mul_add(\n" 00226 " __global const float * vec1,\n" 00227 " __global const float * fac,\n" 00228 " __global const float * vec2,\n" 00229 " __global float * result,\n" 00230 " unsigned int size\n" 00231 " ) \n" 00232 "{ \n" 00233 " float factor = *fac;\n" 00234 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n" 00235 " result[i] = vec1[i] * factor + vec2[i];\n" 00236 "}\n" 00237 ; //vector_align1_mul_add 00238 00239 const char * const vector_align1_inplace_sub = 00240 "__kernel void inplace_sub(\n" 00241 " __global float * vec1,\n" 00242 " __global const float * vec2,\n" 00243 " unsigned int size) \n" 00244 "{ \n" 00245 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n" 00246 " vec1[i] -= vec2[i];\n" 00247 "}\n" 00248 ; //vector_align1_inplace_sub 00249 00250 const char * const vector_align1_inner_prod = 00251 "//helper:\n" 00252 "void helper_inner_prod_parallel_reduction( __local float * tmp_buffer )\n" 00253 "{\n" 00254 " for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n" 00255 " {\n" 00256 " barrier(CLK_LOCAL_MEM_FENCE);\n" 00257 " if (get_local_id(0) < stride)\n" 00258 " tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];\n" 00259 " }\n" 00260 "}\n" 00261 "//////// inner products:\n" 00262 "float impl_inner_prod(\n" 00263 " __global const float * vec1,\n" 00264 " __global const float * vec2,\n" 00265 " unsigned int start_index,\n" 00266 " unsigned int end_index,\n" 00267 " __local float * tmp_buffer)\n" 00268 "{\n" 00269 " float tmp = 0;\n" 00270 " for (unsigned int i = start_index + get_local_id(0); i < end_index; i += get_local_size(0))\n" 00271 " tmp += vec1[i] * vec2[i];\n" 00272 " tmp_buffer[get_local_id(0)] = tmp;\n" 00273 " \n" 00274 " helper_inner_prod_parallel_reduction(tmp_buffer);\n" 00275 " \n" 00276 " return tmp_buffer[0];\n" 00277 "}\n" 00278 "__kernel void inner_prod(\n" 00279 " __global const float * vec1,\n" 00280 " __global const float * vec2,\n" 00281 " unsigned int size,\n" 00282 " __local float * tmp_buffer,\n" 00283 " global float * group_buffer)\n" 00284 "{\n" 00285 " float tmp = impl_inner_prod(vec1,\n" 00286 " vec2,\n" 00287 " ( get_group_id(0) * size) / get_num_groups(0),\n" 00288 " ((get_group_id(0) + 1) * size) / get_num_groups(0),\n" 00289 " tmp_buffer);\n" 00290 " \n" 00291 " if (get_local_id(0) == 0)\n" 00292 " group_buffer[get_group_id(0)] = tmp;\n" 00293 " \n" 00294 "}\n" 00295 ; //vector_align1_inner_prod 00296 00297 const char * const vector_align1_mult = 00298 "__kernel void mult(\n" 00299 " __global const float * vec,\n" 00300 " __global const float * fac, \n" 00301 " __global float * result,\n" 00302 " unsigned int size) \n" 00303 "{ \n" 00304 " float factor = *fac;\n" 00305 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n" 00306 " result[i] = vec[i] * factor;\n" 00307 "}\n" 00308 ; //vector_align1_mult 00309 00310 const char * const vector_align1_diag_precond = 00311 "__kernel void diag_precond(\n" 00312 " __global const float * diag_A_inv, \n" 00313 " __global float * x, \n" 00314 " unsigned int size) \n" 00315 "{ \n" 00316 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n" 00317 " x[i] *= diag_A_inv[i];\n" 00318 "}\n" 00319 ; //vector_align1_diag_precond 00320 00321 const char * const vector_align1_inplace_mul_add = 00322 "__kernel void inplace_mul_add(\n" 00323 " __global float * vec1,\n" 00324 " __global const float * vec2,\n" 00325 " __global const float * fac,\n" 00326 " unsigned int size\n" 00327 " ) \n" 00328 "{ \n" 00329 " float factor = *fac;\n" 00330 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n" 00331 " vec1[i] += vec2[i] * factor;\n" 00332 "}\n" 00333 ; //vector_align1_inplace_mul_add 00334 00335 const char * const vector_align1_norm_1 = 00336 "//helper:\n" 00337 "void helper_norm1_parallel_reduction( __local float * tmp_buffer )\n" 00338 "{\n" 00339 " for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n" 00340 " {\n" 00341 " barrier(CLK_LOCAL_MEM_FENCE);\n" 00342 " if (get_global_id(0) < stride)\n" 00343 " tmp_buffer[get_global_id(0)] += tmp_buffer[get_global_id(0)+stride];\n" 00344 " }\n" 00345 "}\n" 00346 "\n" 00347 "////// norm_1\n" 00348 "float impl_norm_1(\n" 00349 " __global const float * vec,\n" 00350 " unsigned int start_index,\n" 00351 " unsigned int end_index,\n" 00352 " __local float * tmp_buffer)\n" 00353 "{\n" 00354 " float tmp = 0;\n" 00355 " for (unsigned int i = start_index + get_local_id(0); i < end_index; i += get_local_size(0))\n" 00356 " tmp += fabs(vec[i]);\n" 00357 " \n" 00358 " tmp_buffer[get_local_id(0)] = tmp;\n" 00359 " \n" 00360 " helper_norm1_parallel_reduction(tmp_buffer);\n" 00361 " \n" 00362 " return tmp_buffer[0];\n" 00363 "};\n" 00364 "\n" 00365 "__kernel void norm_1(\n" 00366 " __global const float * vec,\n" 00367 " unsigned int size,\n" 00368 " __local float * tmp_buffer,\n" 00369 " global float * group_buffer)\n" 00370 "{\n" 00371 " float tmp = impl_norm_1(vec,\n" 00372 " ( get_group_id(0) * size) / get_num_groups(0),\n" 00373 " ((get_group_id(0) + 1) * size) / get_num_groups(0),\n" 00374 " tmp_buffer);\n" 00375 " \n" 00376 " if (get_local_id(0) == 0)\n" 00377 " group_buffer[get_group_id(0)] = tmp; \n" 00378 "}\n" 00379 "\n" 00380 ; //vector_align1_norm_1 00381 00382 const char * const vector_align1_divide = 00383 "// Note: name 'div' is not allowed by the jit-compiler\n" 00384 "__kernel void divide(\n" 00385 " __global const float * vec,\n" 00386 " __global const float * fac, //note: CPU variant is mapped to prod_scalar\n" 00387 " __global float * result,\n" 00388 " unsigned int size) \n" 00389 "{ \n" 00390 " float factor = *fac;\n" 00391 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n" 00392 " result[i] = vec[i] / factor;\n" 00393 "}\n" 00394 ; //vector_align1_divide 00395 00396 const char * const vector_align1_swap = 00397 "////// swap:\n" 00398 "__kernel void swap(\n" 00399 " __global float * vec1,\n" 00400 " __global float * vec2,\n" 00401 " unsigned int size\n" 00402 " ) \n" 00403 "{ \n" 00404 " float tmp;\n" 00405 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n" 00406 " {\n" 00407 " tmp = vec2[i];\n" 00408 " vec2[i] = vec1[i];\n" 00409 " vec1[i] = tmp;\n" 00410 " }\n" 00411 "}\n" 00412 " \n" 00413 ; //vector_align1_swap 00414 00415 const char * const vector_align1_norm_inf = 00416 "\n" 00417 "////// norm_inf\n" 00418 "float impl_norm_inf(\n" 00419 " __global const float * vec,\n" 00420 " unsigned int start_index,\n" 00421 " unsigned int end_index,\n" 00422 " __local float * tmp_buffer)\n" 00423 "{\n" 00424 " float tmp = 0;\n" 00425 " for (unsigned int i = start_index + get_local_id(0); i < end_index; i += get_local_size(0))\n" 00426 " tmp = fmax(fabs(vec[i]), tmp);\n" 00427 " tmp_buffer[get_local_id(0)] = tmp;\n" 00428 " \n" 00429 " //step 2: parallel reduction:\n" 00430 " for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n" 00431 " {\n" 00432 " barrier(CLK_LOCAL_MEM_FENCE);\n" 00433 " if (get_global_id(0) < stride)\n" 00434 " tmp_buffer[get_global_id(0)] = fmax(tmp_buffer[get_global_id(0)], tmp_buffer[get_global_id(0)+stride]);\n" 00435 " }\n" 00436 " \n" 00437 " return tmp_buffer[0];\n" 00438 "}\n" 00439 "\n" 00440 "__kernel void norm_inf(\n" 00441 " __global const float * vec,\n" 00442 " unsigned int size,\n" 00443 " __local float * tmp_buffer,\n" 00444 " global float * group_buffer)\n" 00445 "{\n" 00446 " float tmp = impl_norm_inf(vec,\n" 00447 " ( get_group_id(0) * size) / get_num_groups(0),\n" 00448 " ((get_group_id(0) + 1) * size) / get_num_groups(0),\n" 00449 " tmp_buffer);\n" 00450 " \n" 00451 " if (get_local_id(0) == 0)\n" 00452 " group_buffer[get_group_id(0)] = tmp; \n" 00453 "}\n" 00454 ; //vector_align1_norm_inf 00455 00456 const char * const vector_align1_inplace_div_add = 00457 "///// divide add:\n" 00458 "__kernel void inplace_div_add(\n" 00459 " __global float * vec1,\n" 00460 " __global const float * vec2,\n" 00461 " __global const float * fac, //CPU variant is mapped to mult_add\n" 00462 " unsigned int size\n" 00463 " ) \n" 00464 "{ \n" 00465 " float factor = *fac;\n" 00466 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n" 00467 " vec1[i] -= vec2[i] / factor;\n" 00468 "}\n" 00469 ; //vector_align1_inplace_div_add 00470 00471 const char * const vector_align1_sqrt_sum = 00472 "__kernel void sqrt_sum(\n" 00473 " __global float * vec1,\n" 00474 " __global float * result,\n" 00475 " unsigned int size) \n" 00476 "{ \n" 00477 " //parallel reduction on global memory: \n" 00478 " \n" 00479 " for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n" 00480 " {\n" 00481 " if (get_global_id(0) < stride)\n" 00482 " vec1[get_global_id(0)] += vec1[get_global_id(0)+stride];\n" 00483 " barrier(CLK_GLOBAL_MEM_FENCE);\n" 00484 " }\n" 00485 " \n" 00486 " if (get_global_id(0) == 0)\n" 00487 " *result = sqrt(vec1[0]);\n" 00488 " \n" 00489 "}\n" 00490 ; //vector_align1_sqrt_sum 00491 00492 const char * const vector_align1_inplace_add = 00493 "__kernel void inplace_add(\n" 00494 " __global float * vec1,\n" 00495 " __global const float * vec2,\n" 00496 " unsigned int size) \n" 00497 "{ \n" 00498 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n" 00499 " vec1[i] += vec2[i];\n" 00500 "}\n" 00501 ; //vector_align1_inplace_add 00502 00503 const char * const vector_align1_mul_sub = 00504 "///// multiply subtract:\n" 00505 "__kernel void mul_sub(\n" 00506 " __global const float * vec1,\n" 00507 " __global const float * fac,\n" 00508 " __global const float * vec2,\n" 00509 " __global float * result,\n" 00510 " unsigned int size\n" 00511 " ) \n" 00512 "{ \n" 00513 " float factor = *fac;\n" 00514 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n" 00515 " result[i] = vec1[i] * factor - vec2[i];\n" 00516 "}\n" 00517 ; //vector_align1_mul_sub 00518 00519 const char * const vector_align1_sum = 00520 "__kernel void sum(\n" 00521 " __global float * vec1,\n" 00522 " __global float * result) \n" 00523 "{ \n" 00524 " //parallel reduction on global memory: \n" 00525 " \n" 00526 " for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n" 00527 " {\n" 00528 " if (get_global_id(0) < stride)\n" 00529 " vec1[get_global_id(0)] += vec1[get_global_id(0)+stride];\n" 00530 " barrier(CLK_GLOBAL_MEM_FENCE);\n" 00531 " }\n" 00532 " \n" 00533 " if (get_global_id(0) == 0)\n" 00534 " *result = vec1[0]; \n" 00535 "}\n" 00536 ; //vector_align1_sum 00537 00538 const char * const vector_align1_cpu_mul_add = 00539 "__kernel void cpu_mul_add(\n" 00540 " __global const float * vec1,\n" 00541 " float factor,\n" 00542 " __global const float * vec2,\n" 00543 " __global float * result,\n" 00544 " unsigned int size\n" 00545 " ) \n" 00546 "{ \n" 00547 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n" 00548 " result[i] = vec1[i] * factor + vec2[i];\n" 00549 "}\n" 00550 ; //vector_align1_cpu_mul_add 00551 00552 const char * const vector_align1_cpu_mult = 00553 "__kernel void cpu_mult(\n" 00554 " __global const float * vec,\n" 00555 " float factor, \n" 00556 " __global float * result,\n" 00557 " unsigned int size) \n" 00558 "{ \n" 00559 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n" 00560 " result[i] = vec[i] * factor;\n" 00561 "}\n" 00562 ; //vector_align1_cpu_mult 00563 00564 const char * const vector_align1_cpu_inplace_mul_add = 00565 "__kernel void cpu_inplace_mul_add(\n" 00566 " __global float * vec1,\n" 00567 " __global const float * vec2,\n" 00568 " float factor,\n" 00569 " unsigned int size\n" 00570 " ) \n" 00571 "{ \n" 00572 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n" 00573 " vec1[i] += vec2[i] * factor;\n" 00574 "}\n" 00575 ; //vector_align1_cpu_inplace_mul_add 00576 00577 const char * const vector_align1_cpu_inplace_mult = 00578 "__kernel void cpu_inplace_mult(\n" 00579 " __global float * vec,\n" 00580 " float factor, \n" 00581 " unsigned int size) \n" 00582 "{ \n" 00583 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n" 00584 " vec[i] *= factor;\n" 00585 "}\n" 00586 ; //vector_align1_cpu_inplace_mult 00587 00588 const char * const vector_align1_plane_rotation = 00589 "////// plane rotation: (x,y) <- (\alpha x + \beta y, -\beta x + \alpha y)\n" 00590 "__kernel void plane_rotation(\n" 00591 " __global float * vec1,\n" 00592 " __global float * vec2, \n" 00593 " float alpha,\n" 00594 " float beta,\n" 00595 " unsigned int size) \n" 00596 "{ \n" 00597 " float tmp1 = 0;\n" 00598 " float tmp2 = 0;\n" 00599 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n" 00600 " {\n" 00601 " tmp1 = vec1[i];\n" 00602 " tmp2 = vec2[i];\n" 00603 " \n" 00604 " vec1[i] = alpha * tmp1 + beta * tmp2;\n" 00605 " vec2[i] = alpha * tmp2 - beta * tmp1;\n" 00606 " }\n" 00607 "}\n" 00608 ; //vector_align1_plane_rotation 00609 00610 const char * const vector_align1_clear = 00611 "__kernel void clear(\n" 00612 " __global float * vec,\n" 00613 " unsigned int size) \n" 00614 "{ \n" 00615 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n" 00616 " vec[i] = 0;\n" 00617 "}\n" 00618 ; //vector_align1_clear 00619 00620 const char * const vector_align1_inplace_mult = 00621 "__kernel void inplace_mult(\n" 00622 " __global float * vec,\n" 00623 " __global const float * fac, \n" 00624 " unsigned int size) \n" 00625 "{ \n" 00626 " float factor = *fac;\n" 00627 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n" 00628 " vec[i] *= factor;\n" 00629 "}\n" 00630 ; //vector_align1_inplace_mult 00631 00632 const char * const vector_align1_inplace_mul_sub = 00633 "__kernel void inplace_mul_sub(\n" 00634 " __global float * vec1,\n" 00635 " __global const float * vec2,\n" 00636 " __global const float * fac, //CPU variant is mapped to mult_add\n" 00637 " unsigned int size\n" 00638 " ) \n" 00639 "{ \n" 00640 " float factor = *fac;\n" 00641 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n" 00642 " vec1[i] -= vec2[i] * factor;\n" 00643 "}\n" 00644 ; //vector_align1_inplace_mul_sub 00645 00646 const char * const vector_align1_norm_2 = 00647 "//helper:\n" 00648 "void helper_norm2_parallel_reduction( __local float * tmp_buffer )\n" 00649 "{\n" 00650 " for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n" 00651 " {\n" 00652 " barrier(CLK_LOCAL_MEM_FENCE);\n" 00653 " if (get_global_id(0) < stride)\n" 00654 " tmp_buffer[get_global_id(0)] += tmp_buffer[get_global_id(0)+stride];\n" 00655 " }\n" 00656 "}\n" 00657 "\n" 00658 "////// norm_2\n" 00659 "float impl_norm_2(\n" 00660 " __global const float * vec,\n" 00661 " unsigned int start_index,\n" 00662 " unsigned int end_index,\n" 00663 " __local float * tmp_buffer)\n" 00664 "{\n" 00665 " float tmp = 0;\n" 00666 " float vec_entry = 0;\n" 00667 " for (unsigned int i = start_index + get_local_id(0); i < end_index; i += get_local_size(0))\n" 00668 " {\n" 00669 " vec_entry = vec[i];\n" 00670 " tmp += vec_entry * vec_entry;\n" 00671 " }\n" 00672 " tmp_buffer[get_local_id(0)] = tmp;\n" 00673 " \n" 00674 " helper_norm2_parallel_reduction(tmp_buffer);\n" 00675 " \n" 00676 " return tmp_buffer[0];\n" 00677 "};\n" 00678 "\n" 00679 "__kernel void norm_2(\n" 00680 " __global const float * vec,\n" 00681 " unsigned int size,\n" 00682 " __local float * tmp_buffer,\n" 00683 " global float * group_buffer)\n" 00684 "{\n" 00685 " float tmp = impl_norm_2(vec,\n" 00686 " ( get_group_id(0) * size) / get_num_groups(0),\n" 00687 " ((get_group_id(0) + 1) * size) / get_num_groups(0),\n" 00688 " tmp_buffer);\n" 00689 " \n" 00690 " if (get_local_id(0) == 0)\n" 00691 " group_buffer[get_group_id(0)] = tmp; \n" 00692 "}\n" 00693 "\n" 00694 ; //vector_align1_norm_2 00695 00696 const char * const vector_align1_add = 00697 "__kernel void add(\n" 00698 " __global const float * vec1,\n" 00699 " __global const float * vec2, \n" 00700 " __global float * result,\n" 00701 " unsigned int size) \n" 00702 "{ \n" 00703 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n" 00704 " result[i] = vec1[i] + vec2[i];\n" 00705 "}\n" 00706 ; //vector_align1_add 00707 00708 const char * const vector_align16_inplace_divide = 00709 "__kernel void inplace_divide(\n" 00710 " __global float16 * vec,\n" 00711 " __global const float * fac, //note: CPU variant is mapped to prod_scalar\n" 00712 " unsigned int size) \n" 00713 "{ \n" 00714 " float factor = *fac;\n" 00715 " for (unsigned int i = get_global_id(0); i < size/16; i += get_global_size(0))\n" 00716 " vec[i] /= factor;\n" 00717 "}\n" 00718 ; //vector_align16_inplace_divide 00719 00720 const char * const vector_align16_sub = 00721 "__kernel void sub(\n" 00722 " __global const float16 * vec1,\n" 00723 " __global const float16 * vec2, \n" 00724 " __global float16 * result,\n" 00725 " unsigned int size)\n" 00726 "{ \n" 00727 " for (unsigned int i = get_global_id(0); i < size/16; i += get_global_size(0))\n" 00728 " result[i] = vec1[i] - vec2[i];\n" 00729 "}\n" 00730 ; //vector_align16_sub 00731 00732 const char * const vector_align16_cpu_inplace_mul = 00733 "\n" 00734 "__kernel void cpu_inplace_mult(\n" 00735 " __global float16 * vec,\n" 00736 " float factor, \n" 00737 " unsigned int size) \n" 00738 "{ \n" 00739 " for (unsigned int i = get_global_id(0); i < size/16; i += get_global_size(0))\n" 00740 " vec[i] *= factor;\n" 00741 "}\n" 00742 "\n" 00743 ; //vector_align16_cpu_inplace_mul 00744 00745 const char * const vector_align16_inplace_sub = 00746 "__kernel void inplace_sub(\n" 00747 " __global float16 * vec1,\n" 00748 " __global const float16 * vec2,\n" 00749 " unsigned int size) \n" 00750 "{ \n" 00751 " for (unsigned int i = get_global_id(0); i < size/16; i += get_global_size(0))\n" 00752 " vec1[i] -= vec2[i];\n" 00753 "}\n" 00754 ; //vector_align16_inplace_sub 00755 00756 const char * const vector_align16_mult = 00757 "__kernel void mult(\n" 00758 " __global const float16 * vec,\n" 00759 " __global const float * fac, \n" 00760 " __global float16 * result,\n" 00761 " unsigned int size) \n" 00762 "{ \n" 00763 " float factor = *fac;\n" 00764 " for (unsigned int i = get_global_id(0); i < size/16; i += get_global_size(0))\n" 00765 " result[i] = vec[i] * factor;\n" 00766 "}\n" 00767 ; //vector_align16_mult 00768 00769 const char * const vector_align16_divide = 00770 "//Note: 'div' cannot be used because of complaints by the jit-compiler\n" 00771 "__kernel void divide(\n" 00772 " __global const float16 * vec,\n" 00773 " __global const float * fac, //note: CPU variant is mapped to prod_scalar\n" 00774 " __global float16 * result,\n" 00775 " unsigned int size) \n" 00776 "{ \n" 00777 " float factor = *fac;\n" 00778 " for (unsigned int i = get_global_id(0); i < size/16; i += get_global_size(0))\n" 00779 " result[i] = vec[i] / factor;\n" 00780 "}\n" 00781 ; //vector_align16_divide 00782 00783 const char * const vector_align16_inplace_add = 00784 "__kernel void inplace_add(\n" 00785 " __global float16 * vec1,\n" 00786 " __global const float16 * vec2,\n" 00787 " unsigned int size) \n" 00788 "{ \n" 00789 " for (unsigned int i = get_global_id(0); i < size/16; i += get_global_size(0))\n" 00790 " vec1[i] += vec2[i];\n" 00791 "}\n" 00792 ; //vector_align16_inplace_add 00793 00794 const char * const vector_align16_cpu_mult = 00795 "__kernel void cpu_mult(\n" 00796 " __global const float16 * vec,\n" 00797 " float factor, \n" 00798 " __global float16 * result,\n" 00799 " unsigned int size) \n" 00800 "{ \n" 00801 " for (unsigned int i = get_global_id(0); i < size/16; i += get_global_size(0))\n" 00802 " result[i] = vec[i] * factor;\n" 00803 "}\n" 00804 ; //vector_align16_cpu_mult 00805 00806 const char * const vector_align16_inplace_mult = 00807 "__kernel void inplace_mult(\n" 00808 " __global float16 * vec,\n" 00809 " __global const float * fac, \n" 00810 " unsigned int size) \n" 00811 "{ \n" 00812 " float factor = *fac;\n" 00813 " for (unsigned int i = get_global_id(0); i < size/16; i += get_global_size(0))\n" 00814 " vec[i] *= factor;\n" 00815 "}\n" 00816 ; //vector_align16_inplace_mult 00817 00818 const char * const vector_align16_add = 00819 "__kernel void add(\n" 00820 " __global const float16 * vec1,\n" 00821 " __global const float16 * vec2, \n" 00822 " __global float16 * result,\n" 00823 " unsigned int size)\n" 00824 "{ \n" 00825 " for (unsigned int i = get_global_id(0); i < size/16; i += get_global_size(0))\n" 00826 " result[i] = vec1[i] + vec2[i];\n" 00827 "}\n" 00828 ; //vector_align16_add 00829 00830 } //namespace kernels 00831 } //namespace linalg 00832 } //namespace viennacl 00833 #endif
1.7.6.1