ViennaCL - The Vienna Computing Library  1.1.2
/build/buildd/viennacl-1.1.2/viennacl/linalg/kernels/vector_source.h
Go to the documentation of this file.
00001 #ifndef _VIENNACL_VECTOR_SOURCE_HPP_
00002 #define _VIENNACL_VECTOR_SOURCE_HPP_
00003 //Automatically generated file from aux-directory, do not edit manually!
00004 namespace viennacl
00005 {
00006  namespace linalg
00007  {
00008   namespace kernels
00009   {
00010 const char * const vector_align4_inplace_div_sub = 
00011 "__kernel void inplace_div_sub(\n"
00012 "          __global float4 * vec1,\n"
00013 "          __global const float4 * vec2,\n"
00014 "          __global const float * fac,   //CPU variant is mapped to mult_add\n"
00015 "          unsigned int size\n"
00016 "          ) \n"
00017 "{ \n"
00018 "  float factor = *fac;\n"
00019 "  for (unsigned int i = get_global_id(0); i < size/4; i += get_global_size(0))\n"
00020 "    vec1[i] -= vec2[i] / factor;\n"
00021 "}\n"
00022 ; //vector_align4_inplace_div_sub
00023 
00024 const char * const vector_align4_mul_add = 
00025 "__kernel void mul_add(\n"
00026 "          __global const float4 * vec1,\n"
00027 "          __global const float * fac,\n"
00028 "          __global const float4 * vec2,\n"
00029 "          __global float4 * result,\n"
00030 "          unsigned int size\n"
00031 "          ) \n"
00032 "{ \n"
00033 "  float factor = *fac;\n"
00034 "  for (unsigned int i = get_global_id(0); i < size/4; i += get_global_size(0))\n"
00035 "    result[i] = vec1[i] * factor + vec2[i];\n"
00036 "}\n"
00037 ; //vector_align4_mul_add
00038 
00039 const char * const vector_align4_inplace_mul_add = 
00040 "__kernel void inplace_mul_add(\n"
00041 "          __global float4 * vec1,\n"
00042 "          __global const float4 * vec2,\n"
00043 "          __global const float * fac,\n"
00044 "          unsigned int size\n"
00045 "          ) \n"
00046 "{ \n"
00047 "  float factor = *fac;\n"
00048 "  unsigned int size_div_4 = size >> 2;\n"
00049 "  for (unsigned int i = get_global_id(0); i < size_div_4; i += get_global_size(0))\n"
00050 "    vec1[i] += vec2[i] * factor;\n"
00051 "}\n"
00052 ; //vector_align4_inplace_mul_add
00053 
00054 const char * const vector_align4_inplace_div_add = 
00055 "__kernel void inplace_div_add(\n"
00056 "          __global float4 * vec1,\n"
00057 "          __global const float4 * vec2,\n"
00058 "          __global const float * fac,   //CPU variant is mapped to mult_add\n"
00059 "          unsigned int size\n"
00060 "          ) \n"
00061 "{ \n"
00062 "  float factor = *fac;\n"
00063 "  for (unsigned int i = get_global_id(0); i < size/4; i += get_global_size(0))\n"
00064 "    vec1[i] -= vec2[i] / factor;\n"
00065 "}\n"
00066 ; //vector_align4_inplace_div_add
00067 
00068 const char * const vector_align4_cpu_mul_add = 
00069 "__kernel void cpu_mul_add(\n"
00070 "          __global const float4 * vec1,\n"
00071 "          float factor,\n"
00072 "          __global const float4 * vec2,\n"
00073 "          __global float4 * result,\n"
00074 "          unsigned int size\n"
00075 "          ) \n"
00076 "{ \n"
00077 "  for (unsigned int i = get_global_id(0); i < size/4; i += get_global_size(0))\n"
00078 "    result[i] = vec1[i] * factor + vec2[i];\n"
00079 "}\n"
00080 ; //vector_align4_cpu_mul_add
00081 
00082 const char * const vector_align4_cpu_inplace_mul_add = 
00083 "__kernel void cpu_inplace_mul_add(\n"
00084 "          __global float4 * vec1,\n"
00085 "          __global const float4 * vec2,\n"
00086 "          float factor,\n"
00087 "          unsigned int size\n"
00088 "          ) \n"
00089 "{ \n"
00090 "  for (unsigned int i = get_global_id(0); i < size/4; i += get_global_size(0))\n"
00091 "    vec1[i] += vec2[i] * factor;\n"
00092 "}\n"
00093 ; //vector_align4_cpu_inplace_mul_add
00094 
00095 const char * const vector_align4_inplace_mul_sub = 
00096 "__kernel void inplace_mul_sub(\n"
00097 "          __global float4 * vec1,\n"
00098 "          __global const float4 * vec2,\n"
00099 "          __global const float * fac,   //CPU variant is mapped to mult_add\n"
00100 "          unsigned int size\n"
00101 "          ) \n"
00102 "{ \n"
00103 "  float factor = *fac;\n"
00104 "  for (unsigned int i = get_global_id(0); i < size/4; i += get_global_size(0))\n"
00105 "    vec1[i] -= vec2[i] * factor;\n"
00106 "}\n"
00107 ; //vector_align4_inplace_mul_sub
00108 
00109 const char * const vector_align1_inplace_divide = 
00110 "__kernel void inplace_divide(\n"
00111 "          __global float * vec,\n"
00112 "          __global const float * fac,  //note: CPU variant is mapped to prod_scalar\n"
00113 "          unsigned int size) \n"
00114 "{ \n"
00115 "  float factor = *fac;\n"
00116 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00117 "    vec[i] /= factor;\n"
00118 "}\n"
00119 ; //vector_align1_inplace_divide
00120 
00121 const char * const vector_align1_inplace_div_sub = 
00122 "///// divide substract:\n"
00123 "__kernel void inplace_div_sub(\n"
00124 "          __global float * vec1,\n"
00125 "          __global const float * vec2,\n"
00126 "          __global const float * fac,   //CPU variant is mapped to mult_add\n"
00127 "          unsigned int size\n"
00128 "          ) \n"
00129 "{ \n"
00130 "  float factor = *fac;\n"
00131 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00132 "    vec1[i] -= vec2[i] / factor;\n"
00133 "}\n"
00134 ; //vector_align1_inplace_div_sub
00135 
00136 const char * const vector_align1_vmax = 
00137 "__kernel void vmax(\n"
00138 "          __global float * vec1,\n"
00139 "          __global float * result,\n"
00140 "          unsigned int size) \n"
00141 "{ \n"
00142 "  //parallel reduction on global memory:\n"
00143 "  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
00144 "  {\n"
00145 "    if (get_global_id(0) < stride)\n"
00146 "      vec1[get_global_id(0)] = fmax(vec1[get_global_id(0)+stride], vec1[get_global_id(0)]);\n"
00147 "    barrier(CLK_GLOBAL_MEM_FENCE);\n"
00148 "  }\n"
00149 "  \n"
00150 "  if (get_global_id(0) == 0)\n"
00151 "    *result = vec1[0];  \n"
00152 "}\n"
00153 ; //vector_align1_vmax
00154 
00155 const char * const vector_align1_index_norm_inf = 
00156 "//index_norm_inf:\n"
00157 "unsigned int float_vector1_index_norm_inf_impl(\n"
00158 "          __global const float * vec,\n"
00159 "          unsigned int size,\n"
00160 "          __local float * float_buffer,\n"
00161 "          __local unsigned int * index_buffer)\n"
00162 "{\n"
00163 "  //step 1: fill buffer:\n"
00164 "  float cur_max = 0.0f;\n"
00165 "  float tmp;\n"
00166 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00167 "  {\n"
00168 "    tmp = fabs(vec[i]);\n"
00169 "    if (cur_max < tmp)\n"
00170 "    {\n"
00171 "      float_buffer[get_global_id(0)] = tmp;\n"
00172 "      index_buffer[get_global_id(0)] = i;\n"
00173 "      cur_max = tmp;\n"
00174 "    }\n"
00175 "  }\n"
00176 "  \n"
00177 "  //step 2: parallel reduction:\n"
00178 "  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
00179 "  {\n"
00180 "    barrier(CLK_LOCAL_MEM_FENCE);\n"
00181 "    if (get_global_id(0) < stride)\n"
00182 "    {\n"
00183 "      //find the first occurring index\n"
00184 "      if (float_buffer[get_global_id(0)] < float_buffer[get_global_id(0)+stride])\n"
00185 "      {\n"
00186 "        index_buffer[get_global_id(0)] = index_buffer[get_global_id(0)+stride];\n"
00187 "        float_buffer[get_global_id(0)] = float_buffer[get_global_id(0)+stride];\n"
00188 "      }\n"
00189 "      \n"
00190 "      //index_buffer[get_global_id(0)] = float_buffer[get_global_id(0)] < float_buffer[get_global_id(0)+stride] ? index_buffer[get_global_id(0)+stride] : index_buffer[get_global_id(0)];\n"
00191 "      //float_buffer[get_global_id(0)] = max(float_buffer[get_global_id(0)], float_buffer[get_global_id(0)+stride]);\n"
00192 "    }\n"
00193 "  }\n"
00194 "  \n"
00195 "  return index_buffer[0];\n"
00196 "}\n"
00197 "\n"
00198 "__kernel void index_norm_inf(\n"
00199 "          __global float * vec,\n"
00200 "          unsigned int size,\n"
00201 "          __local float * float_buffer,\n"
00202 "          __local unsigned int * index_buffer,\n"
00203 "          global unsigned int * result) \n"
00204 "{ \n"
00205 "  unsigned int tmp = float_vector1_index_norm_inf_impl(vec, size, float_buffer, index_buffer);\n"
00206 "  if (get_global_id(0) == 0) *result = tmp;\n"
00207 "}\n"
00208 "\n"
00209 "\n"
00210 ; //vector_align1_index_norm_inf
00211 
00212 const char * const vector_align1_sub = 
00213 "__kernel void sub(\n"
00214 "          __global const float * vec1,\n"
00215 "          __global const float * vec2, \n"
00216 "          __global float * result,\n"
00217 "          unsigned int size)\n"
00218 "{ \n"
00219 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00220 "    result[i] = vec1[i] - vec2[i];\n"
00221 "}\n"
00222 ; //vector_align1_sub
00223 
00224 const char * const vector_align1_mul_add = 
00225 "__kernel void mul_add(\n"
00226 "          __global const float * vec1,\n"
00227 "          __global const float * fac,\n"
00228 "          __global const float * vec2,\n"
00229 "          __global float * result,\n"
00230 "          unsigned int size\n"
00231 "          ) \n"
00232 "{ \n"
00233 "  float factor = *fac;\n"
00234 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00235 "    result[i] = vec1[i] * factor + vec2[i];\n"
00236 "}\n"
00237 ; //vector_align1_mul_add
00238 
00239 const char * const vector_align1_inplace_sub = 
00240 "__kernel void inplace_sub(\n"
00241 "          __global float * vec1,\n"
00242 "          __global const float * vec2,\n"
00243 "          unsigned int size) \n"
00244 "{ \n"
00245 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00246 "    vec1[i] -= vec2[i];\n"
00247 "}\n"
00248 ; //vector_align1_inplace_sub
00249 
00250 const char * const vector_align1_inner_prod = 
00251 "//helper:\n"
00252 "void helper_inner_prod_parallel_reduction( __local float * tmp_buffer )\n"
00253 "{\n"
00254 "  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n"
00255 "  {\n"
00256 "    barrier(CLK_LOCAL_MEM_FENCE);\n"
00257 "    if (get_local_id(0) < stride)\n"
00258 "      tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];\n"
00259 "  }\n"
00260 "}\n"
00261 "//////// inner products:\n"
00262 "float impl_inner_prod(\n"
00263 "          __global const float * vec1,\n"
00264 "          __global const float * vec2,\n"
00265 "          unsigned int start_index,\n"
00266 "          unsigned int end_index,\n"
00267 "          __local float * tmp_buffer)\n"
00268 "{\n"
00269 "  float tmp = 0;\n"
00270 "  for (unsigned int i = start_index + get_local_id(0); i < end_index; i += get_local_size(0))\n"
00271 "    tmp += vec1[i] * vec2[i];\n"
00272 "  tmp_buffer[get_local_id(0)] = tmp;\n"
00273 "  \n"
00274 "  helper_inner_prod_parallel_reduction(tmp_buffer);\n"
00275 "  \n"
00276 "  return tmp_buffer[0];\n"
00277 "}\n"
00278 "__kernel void inner_prod(\n"
00279 "          __global const float * vec1,\n"
00280 "          __global const float * vec2,\n"
00281 "          unsigned int size,\n"
00282 "          __local float * tmp_buffer,\n"
00283 "          global float * group_buffer)\n"
00284 "{\n"
00285 "  float tmp = impl_inner_prod(vec1,\n"
00286 "                              vec2,\n"
00287 "                              (      get_group_id(0) * size) / get_num_groups(0),\n"
00288 "                              ((get_group_id(0) + 1) * size) / get_num_groups(0),\n"
00289 "                              tmp_buffer);\n"
00290 "  \n"
00291 "  if (get_local_id(0) == 0)\n"
00292 "    group_buffer[get_group_id(0)] = tmp;\n"
00293 "  \n"
00294 "}\n"
00295 ; //vector_align1_inner_prod
00296 
00297 const char * const vector_align1_mult = 
00298 "__kernel void mult(\n"
00299 "          __global const float * vec,\n"
00300 "          __global const float * fac, \n"
00301 "          __global float * result,\n"
00302 "          unsigned int size) \n"
00303 "{ \n"
00304 "  float factor = *fac;\n"
00305 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00306 "    result[i] = vec[i] * factor;\n"
00307 "}\n"
00308 ; //vector_align1_mult
00309 
00310 const char * const vector_align1_diag_precond = 
00311 "__kernel void diag_precond(\n"
00312 "          __global const float * diag_A_inv, \n"
00313 "          __global float * x, \n"
00314 "          unsigned int size) \n"
00315 "{ \n"
00316 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00317 "    x[i] *= diag_A_inv[i];\n"
00318 "}\n"
00319 ; //vector_align1_diag_precond
00320 
00321 const char * const vector_align1_inplace_mul_add = 
00322 "__kernel void inplace_mul_add(\n"
00323 "          __global float * vec1,\n"
00324 "          __global const float * vec2,\n"
00325 "          __global const float * fac,\n"
00326 "          unsigned int size\n"
00327 "          ) \n"
00328 "{ \n"
00329 "  float factor = *fac;\n"
00330 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00331 "    vec1[i] += vec2[i] * factor;\n"
00332 "}\n"
00333 ; //vector_align1_inplace_mul_add
00334 
00335 const char * const vector_align1_norm_1 = 
00336 "//helper:\n"
00337 "void helper_norm1_parallel_reduction( __local float * tmp_buffer )\n"
00338 "{\n"
00339 "  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
00340 "  {\n"
00341 "    barrier(CLK_LOCAL_MEM_FENCE);\n"
00342 "    if (get_global_id(0) < stride)\n"
00343 "      tmp_buffer[get_global_id(0)] += tmp_buffer[get_global_id(0)+stride];\n"
00344 "  }\n"
00345 "}\n"
00346 "\n"
00347 "////// norm_1\n"
00348 "float impl_norm_1(\n"
00349 "          __global const float * vec,\n"
00350 "          unsigned int start_index,\n"
00351 "          unsigned int end_index,\n"
00352 "          __local float * tmp_buffer)\n"
00353 "{\n"
00354 "  float tmp = 0;\n"
00355 "  for (unsigned int i = start_index + get_local_id(0); i < end_index; i += get_local_size(0))\n"
00356 "    tmp += fabs(vec[i]);\n"
00357 "  \n"
00358 "  tmp_buffer[get_local_id(0)] = tmp;\n"
00359 "  \n"
00360 "  helper_norm1_parallel_reduction(tmp_buffer);\n"
00361 "  \n"
00362 "  return tmp_buffer[0];\n"
00363 "};\n"
00364 "\n"
00365 "__kernel void norm_1(\n"
00366 "          __global const float * vec,\n"
00367 "          unsigned int size,\n"
00368 "          __local float * tmp_buffer,\n"
00369 "          global float * group_buffer)\n"
00370 "{\n"
00371 "  float tmp = impl_norm_1(vec,\n"
00372 "                          (      get_group_id(0) * size) / get_num_groups(0),\n"
00373 "                          ((get_group_id(0) + 1) * size) / get_num_groups(0),\n"
00374 "                          tmp_buffer);\n"
00375 "  \n"
00376 "  if (get_local_id(0) == 0)\n"
00377 "    group_buffer[get_group_id(0)] = tmp;  \n"
00378 "}\n"
00379 "\n"
00380 ; //vector_align1_norm_1
00381 
00382 const char * const vector_align1_divide = 
00383 "// Note: name 'div' is not allowed by the jit-compiler\n"
00384 "__kernel void divide(\n"
00385 "          __global const float * vec,\n"
00386 "          __global const float * fac,  //note: CPU variant is mapped to prod_scalar\n"
00387 "          __global float * result,\n"
00388 "          unsigned int size)  \n"
00389 "{ \n"
00390 "  float factor = *fac;\n"
00391 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00392 "    result[i] = vec[i] / factor;\n"
00393 "}\n"
00394 ; //vector_align1_divide
00395 
00396 const char * const vector_align1_swap = 
00397 "////// swap:\n"
00398 "__kernel void swap(\n"
00399 "          __global float * vec1,\n"
00400 "          __global float * vec2,\n"
00401 "          unsigned int size\n"
00402 "          ) \n"
00403 "{ \n"
00404 "  float tmp;\n"
00405 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00406 "  {\n"
00407 "    tmp = vec2[i];\n"
00408 "    vec2[i] = vec1[i];\n"
00409 "    vec1[i] = tmp;\n"
00410 "  }\n"
00411 "}\n"
00412 " \n"
00413 ; //vector_align1_swap
00414 
00415 const char * const vector_align1_norm_inf = 
00416 "\n"
00417 "////// norm_inf\n"
00418 "float impl_norm_inf(\n"
00419 "          __global const float * vec,\n"
00420 "          unsigned int start_index,\n"
00421 "          unsigned int end_index,\n"
00422 "          __local float * tmp_buffer)\n"
00423 "{\n"
00424 "  float tmp = 0;\n"
00425 "  for (unsigned int i = start_index + get_local_id(0); i < end_index; i += get_local_size(0))\n"
00426 "    tmp = fmax(fabs(vec[i]), tmp);\n"
00427 "  tmp_buffer[get_local_id(0)] = tmp;\n"
00428 "  \n"
00429 "  //step 2: parallel reduction:\n"
00430 "  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
00431 "  {\n"
00432 "    barrier(CLK_LOCAL_MEM_FENCE);\n"
00433 "    if (get_global_id(0) < stride)\n"
00434 "      tmp_buffer[get_global_id(0)] = fmax(tmp_buffer[get_global_id(0)], tmp_buffer[get_global_id(0)+stride]);\n"
00435 "  }\n"
00436 "  \n"
00437 "  return tmp_buffer[0];\n"
00438 "}\n"
00439 "\n"
00440 "__kernel void norm_inf(\n"
00441 "          __global const float * vec,\n"
00442 "          unsigned int size,\n"
00443 "          __local float * tmp_buffer,\n"
00444 "          global float * group_buffer)\n"
00445 "{\n"
00446 "  float tmp = impl_norm_inf(vec,\n"
00447 "                          (      get_group_id(0) * size) / get_num_groups(0),\n"
00448 "                          ((get_group_id(0) + 1) * size) / get_num_groups(0),\n"
00449 "                          tmp_buffer);\n"
00450 "  \n"
00451 "  if (get_local_id(0) == 0)\n"
00452 "    group_buffer[get_group_id(0)] = tmp;  \n"
00453 "}\n"
00454 ; //vector_align1_norm_inf
00455 
00456 const char * const vector_align1_inplace_div_add = 
00457 "///// divide add:\n"
00458 "__kernel void inplace_div_add(\n"
00459 "          __global float * vec1,\n"
00460 "          __global const float * vec2,\n"
00461 "          __global const float * fac,   //CPU variant is mapped to mult_add\n"
00462 "          unsigned int size\n"
00463 "          ) \n"
00464 "{ \n"
00465 "  float factor = *fac;\n"
00466 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00467 "    vec1[i] -= vec2[i] / factor;\n"
00468 "}\n"
00469 ; //vector_align1_inplace_div_add
00470 
00471 const char * const vector_align1_sqrt_sum = 
00472 "__kernel void sqrt_sum(\n"
00473 "          __global float * vec1,\n"
00474 "          __global float * result,\n"
00475 "          unsigned int size) \n"
00476 "{ \n"
00477 "  //parallel reduction on global memory:  \n"
00478 "  \n"
00479 "  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
00480 "  {\n"
00481 "    if (get_global_id(0) < stride)\n"
00482 "      vec1[get_global_id(0)] += vec1[get_global_id(0)+stride];\n"
00483 "    barrier(CLK_GLOBAL_MEM_FENCE);\n"
00484 "  }\n"
00485 "  \n"
00486 "  if (get_global_id(0) == 0)\n"
00487 "    *result = sqrt(vec1[0]);\n"
00488 "  \n"
00489 "}\n"
00490 ; //vector_align1_sqrt_sum
00491 
00492 const char * const vector_align1_inplace_add = 
00493 "__kernel void inplace_add(\n"
00494 "          __global float * vec1,\n"
00495 "          __global const float * vec2,\n"
00496 "          unsigned int size) \n"
00497 "{ \n"
00498 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00499 "    vec1[i] += vec2[i];\n"
00500 "}\n"
00501 ; //vector_align1_inplace_add
00502 
00503 const char * const vector_align1_mul_sub = 
00504 "///// multiply subtract:\n"
00505 "__kernel void mul_sub(\n"
00506 "          __global const float * vec1,\n"
00507 "          __global const float * fac,\n"
00508 "          __global const float * vec2,\n"
00509 "          __global float * result,\n"
00510 "          unsigned int size\n"
00511 "          ) \n"
00512 "{ \n"
00513 "  float factor = *fac;\n"
00514 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00515 "    result[i] = vec1[i] * factor - vec2[i];\n"
00516 "}\n"
00517 ; //vector_align1_mul_sub
00518 
00519 const char * const vector_align1_sum = 
00520 "__kernel void sum(\n"
00521 "          __global float * vec1,\n"
00522 "          __global float * result) \n"
00523 "{ \n"
00524 "  //parallel reduction on global memory:  \n"
00525 "  \n"
00526 "  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
00527 "  {\n"
00528 "    if (get_global_id(0) < stride)\n"
00529 "      vec1[get_global_id(0)] += vec1[get_global_id(0)+stride];\n"
00530 "    barrier(CLK_GLOBAL_MEM_FENCE);\n"
00531 "  }\n"
00532 "  \n"
00533 "  if (get_global_id(0) == 0)\n"
00534 "    *result = vec1[0];  \n"
00535 "}\n"
00536 ; //vector_align1_sum
00537 
00538 const char * const vector_align1_cpu_mul_add = 
00539 "__kernel void cpu_mul_add(\n"
00540 "          __global const float * vec1,\n"
00541 "          float factor,\n"
00542 "          __global const float * vec2,\n"
00543 "          __global float * result,\n"
00544 "          unsigned int size\n"
00545 "          ) \n"
00546 "{ \n"
00547 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00548 "    result[i] = vec1[i] * factor + vec2[i];\n"
00549 "}\n"
00550 ; //vector_align1_cpu_mul_add
00551 
00552 const char * const vector_align1_cpu_mult = 
00553 "__kernel void cpu_mult(\n"
00554 "          __global const float * vec,\n"
00555 "          float factor, \n"
00556 "          __global float * result,\n"
00557 "          unsigned int size) \n"
00558 "{ \n"
00559 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00560 "    result[i] = vec[i] * factor;\n"
00561 "}\n"
00562 ; //vector_align1_cpu_mult
00563 
00564 const char * const vector_align1_cpu_inplace_mul_add = 
00565 "__kernel void cpu_inplace_mul_add(\n"
00566 "          __global float * vec1,\n"
00567 "          __global const float * vec2,\n"
00568 "          float factor,\n"
00569 "          unsigned int size\n"
00570 "          ) \n"
00571 "{ \n"
00572 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00573 "    vec1[i] += vec2[i] * factor;\n"
00574 "}\n"
00575 ; //vector_align1_cpu_inplace_mul_add
00576 
00577 const char * const vector_align1_cpu_inplace_mult = 
00578 "__kernel void cpu_inplace_mult(\n"
00579 "          __global float * vec,\n"
00580 "          float factor, \n"
00581 "          unsigned int size) \n"
00582 "{ \n"
00583 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00584 "    vec[i] *= factor;\n"
00585 "}\n"
00586 ; //vector_align1_cpu_inplace_mult
00587 
00588 const char * const vector_align1_plane_rotation = 
00589 "////// plane rotation: (x,y) <- (\alpha x + \beta y, -\beta x + \alpha y)\n"
00590 "__kernel void plane_rotation(\n"
00591 "          __global float * vec1,\n"
00592 "          __global float * vec2, \n"
00593 "          float alpha,\n"
00594 "          float beta,\n"
00595 "          unsigned int size) \n"
00596 "{ \n"
00597 "  float tmp1 = 0;\n"
00598 "  float tmp2 = 0;\n"
00599 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00600 "  {\n"
00601 "    tmp1 = vec1[i];\n"
00602 "    tmp2 = vec2[i];\n"
00603 "    \n"
00604 "    vec1[i] = alpha * tmp1 + beta * tmp2;\n"
00605 "    vec2[i] = alpha * tmp2 - beta * tmp1;\n"
00606 "  }\n"
00607 "}\n"
00608 ; //vector_align1_plane_rotation
00609 
00610 const char * const vector_align1_clear = 
00611 "__kernel void clear(\n"
00612 "          __global float * vec,\n"
00613 "          unsigned int size) \n"
00614 "{ \n"
00615 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00616 "    vec[i] = 0;\n"
00617 "}\n"
00618 ; //vector_align1_clear
00619 
00620 const char * const vector_align1_inplace_mult = 
00621 "__kernel void inplace_mult(\n"
00622 "          __global float * vec,\n"
00623 "          __global const float * fac, \n"
00624 "          unsigned int size) \n"
00625 "{ \n"
00626 "  float factor = *fac;\n"
00627 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00628 "    vec[i] *= factor;\n"
00629 "}\n"
00630 ; //vector_align1_inplace_mult
00631 
00632 const char * const vector_align1_inplace_mul_sub = 
00633 "__kernel void inplace_mul_sub(\n"
00634 "          __global float * vec1,\n"
00635 "          __global const float * vec2,\n"
00636 "          __global const float * fac,   //CPU variant is mapped to mult_add\n"
00637 "          unsigned int size\n"
00638 "          ) \n"
00639 "{ \n"
00640 "  float factor = *fac;\n"
00641 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00642 "    vec1[i] -= vec2[i] * factor;\n"
00643 "}\n"
00644 ; //vector_align1_inplace_mul_sub
00645 
00646 const char * const vector_align1_norm_2 = 
00647 "//helper:\n"
00648 "void helper_norm2_parallel_reduction( __local float * tmp_buffer )\n"
00649 "{\n"
00650 "  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
00651 "  {\n"
00652 "    barrier(CLK_LOCAL_MEM_FENCE);\n"
00653 "    if (get_global_id(0) < stride)\n"
00654 "      tmp_buffer[get_global_id(0)] += tmp_buffer[get_global_id(0)+stride];\n"
00655 "  }\n"
00656 "}\n"
00657 "\n"
00658 "////// norm_2\n"
00659 "float impl_norm_2(\n"
00660 "          __global const float * vec,\n"
00661 "          unsigned int start_index,\n"
00662 "          unsigned int end_index,\n"
00663 "          __local float * tmp_buffer)\n"
00664 "{\n"
00665 "  float tmp = 0;\n"
00666 "  float vec_entry = 0;\n"
00667 "  for (unsigned int i = start_index + get_local_id(0); i < end_index; i += get_local_size(0))\n"
00668 "  {\n"
00669 "    vec_entry = vec[i];\n"
00670 "    tmp += vec_entry * vec_entry;\n"
00671 "  }\n"
00672 "  tmp_buffer[get_local_id(0)] = tmp;\n"
00673 "  \n"
00674 "  helper_norm2_parallel_reduction(tmp_buffer);\n"
00675 "  \n"
00676 "  return tmp_buffer[0];\n"
00677 "};\n"
00678 "\n"
00679 "__kernel void norm_2(\n"
00680 "          __global const float * vec,\n"
00681 "          unsigned int size,\n"
00682 "          __local float * tmp_buffer,\n"
00683 "          global float * group_buffer)\n"
00684 "{\n"
00685 "  float tmp = impl_norm_2(vec,\n"
00686 "                          (      get_group_id(0) * size) / get_num_groups(0),\n"
00687 "                          ((get_group_id(0) + 1) * size) / get_num_groups(0),\n"
00688 "                          tmp_buffer);\n"
00689 "  \n"
00690 "  if (get_local_id(0) == 0)\n"
00691 "    group_buffer[get_group_id(0)] = tmp;  \n"
00692 "}\n"
00693 "\n"
00694 ; //vector_align1_norm_2
00695 
00696 const char * const vector_align1_add = 
00697 "__kernel void add(\n"
00698 "          __global const float * vec1,\n"
00699 "          __global const float * vec2, \n"
00700 "          __global float * result,\n"
00701 "          unsigned int size) \n"
00702 "{ \n"
00703 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00704 "    result[i] = vec1[i] + vec2[i];\n"
00705 "}\n"
00706 ; //vector_align1_add
00707 
00708 const char * const vector_align16_inplace_divide = 
00709 "__kernel void inplace_divide(\n"
00710 "          __global float16 * vec,\n"
00711 "          __global const float * fac,  //note: CPU variant is mapped to prod_scalar\n"
00712 "          unsigned int size) \n"
00713 "{ \n"
00714 "  float factor = *fac;\n"
00715 "  for (unsigned int i = get_global_id(0); i < size/16; i += get_global_size(0))\n"
00716 "    vec[i] /= factor;\n"
00717 "}\n"
00718 ; //vector_align16_inplace_divide
00719 
00720 const char * const vector_align16_sub = 
00721 "__kernel void sub(\n"
00722 "          __global const float16 * vec1,\n"
00723 "          __global const float16 * vec2, \n"
00724 "          __global float16 * result,\n"
00725 "          unsigned int size)\n"
00726 "{ \n"
00727 "  for (unsigned int i = get_global_id(0); i < size/16; i += get_global_size(0))\n"
00728 "    result[i] = vec1[i] - vec2[i];\n"
00729 "}\n"
00730 ; //vector_align16_sub
00731 
00732 const char * const vector_align16_cpu_inplace_mul = 
00733 "\n"
00734 "__kernel void cpu_inplace_mult(\n"
00735 "          __global float16 * vec,\n"
00736 "          float factor, \n"
00737 "          unsigned int size) \n"
00738 "{ \n"
00739 "  for (unsigned int i = get_global_id(0); i < size/16; i += get_global_size(0))\n"
00740 "    vec[i] *= factor;\n"
00741 "}\n"
00742 "\n"
00743 ; //vector_align16_cpu_inplace_mul
00744 
00745 const char * const vector_align16_inplace_sub = 
00746 "__kernel void inplace_sub(\n"
00747 "          __global float16 * vec1,\n"
00748 "          __global const float16 * vec2,\n"
00749 "          unsigned int size) \n"
00750 "{ \n"
00751 "  for (unsigned int i = get_global_id(0); i < size/16; i += get_global_size(0))\n"
00752 "    vec1[i] -= vec2[i];\n"
00753 "}\n"
00754 ; //vector_align16_inplace_sub
00755 
00756 const char * const vector_align16_mult = 
00757 "__kernel void mult(\n"
00758 "          __global const float16 * vec,\n"
00759 "          __global const float * fac, \n"
00760 "          __global float16 * result,\n"
00761 "          unsigned int size) \n"
00762 "{ \n"
00763 "  float factor = *fac;\n"
00764 "  for (unsigned int i = get_global_id(0); i < size/16; i += get_global_size(0))\n"
00765 "    result[i] = vec[i] * factor;\n"
00766 "}\n"
00767 ; //vector_align16_mult
00768 
00769 const char * const vector_align16_divide = 
00770 "//Note: 'div' cannot be used because of complaints by the jit-compiler\n"
00771 "__kernel void divide(\n"
00772 "          __global const float16 * vec,\n"
00773 "          __global const float * fac,  //note: CPU variant is mapped to prod_scalar\n"
00774 "          __global float16 * result,\n"
00775 "          unsigned int size)  \n"
00776 "{ \n"
00777 "  float factor = *fac;\n"
00778 "  for (unsigned int i = get_global_id(0); i < size/16; i += get_global_size(0))\n"
00779 "    result[i] = vec[i] / factor;\n"
00780 "}\n"
00781 ; //vector_align16_divide
00782 
00783 const char * const vector_align16_inplace_add = 
00784 "__kernel void inplace_add(\n"
00785 "          __global float16 * vec1,\n"
00786 "          __global const float16 * vec2,\n"
00787 "          unsigned int size) \n"
00788 "{ \n"
00789 "  for (unsigned int i = get_global_id(0); i < size/16; i += get_global_size(0))\n"
00790 "    vec1[i] += vec2[i];\n"
00791 "}\n"
00792 ; //vector_align16_inplace_add
00793 
00794 const char * const vector_align16_cpu_mult = 
00795 "__kernel void cpu_mult(\n"
00796 "          __global const float16 * vec,\n"
00797 "          float factor, \n"
00798 "          __global float16 * result,\n"
00799 "          unsigned int size) \n"
00800 "{ \n"
00801 "  for (unsigned int i = get_global_id(0); i < size/16; i += get_global_size(0))\n"
00802 "    result[i] = vec[i] * factor;\n"
00803 "}\n"
00804 ; //vector_align16_cpu_mult
00805 
00806 const char * const vector_align16_inplace_mult = 
00807 "__kernel void inplace_mult(\n"
00808 "          __global float16 * vec,\n"
00809 "          __global const float * fac, \n"
00810 "          unsigned int size) \n"
00811 "{ \n"
00812 "  float factor = *fac;\n"
00813 "  for (unsigned int i = get_global_id(0); i < size/16; i += get_global_size(0))\n"
00814 "    vec[i] *= factor;\n"
00815 "}\n"
00816 ; //vector_align16_inplace_mult
00817 
00818 const char * const vector_align16_add = 
00819 "__kernel void add(\n"
00820 "          __global const float16 * vec1,\n"
00821 "          __global const float16 * vec2, \n"
00822 "          __global float16 * result,\n"
00823 "          unsigned int size)\n"
00824 "{ \n"
00825 "  for (unsigned int i = get_global_id(0); i < size/16; i += get_global_size(0))\n"
00826 "    result[i] = vec1[i] + vec2[i];\n"
00827 "}\n"
00828 ; //vector_align16_add
00829 
00830   }  //namespace kernels
00831  }  //namespace linalg
00832 }  //namespace viennacl
00833 #endif