// ViennaCL - The Vienna Computing Library, version 1.1.2
// (banner from the Doxygen-generated listing this file was recovered from)
00001 /* ======================================================================= 00002 Copyright (c) 2010, Institute for Microelectronics, TU Vienna. 00003 http://www.iue.tuwien.ac.at 00004 ----------------- 00005 ViennaCL - The Vienna Computing Library 00006 ----------------- 00007 00008 authors: Karl Rupp rupp@iue.tuwien.ac.at 00009 Florian Rudolf flo.rudy+viennacl@gmail.com 00010 Josef Weinbub weinbub@iue.tuwien.ac.at 00011 00012 license: MIT (X11), see file LICENSE in the ViennaCL base directory 00013 ======================================================================= */ 00014 00015 #ifndef _VIENNACL_VECTOR_OPERATIONS_HPP_ 00016 #define _VIENNACL_VECTOR_OPERATIONS_HPP_ 00017 00022 #include "viennacl/forwards.h" 00023 #include "viennacl/ocl/device.hpp" 00024 #include "viennacl/ocl/handle.hpp" 00025 #include "viennacl/ocl/kernel.hpp" 00026 #include "viennacl/scalar.hpp" 00027 #include "viennacl/tools/tools.hpp" 00028 #include "viennacl/linalg/kernels/vector_kernels.h" 00029 00030 namespace viennacl 00031 { 00032 namespace linalg 00033 { 00040 template<class SCALARTYPE, unsigned int ALIGNMENT> 00041 void add(const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1, 00042 const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2, 00043 viennacl::vector<SCALARTYPE, ALIGNMENT> & result) 00044 { 00045 assert(vec1.size() == vec2.size() && vec1.size() == result.size()); 00046 00047 unsigned int size = std::min(vec1.internal_size(), vec2.internal_size()); 00048 viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "add"); 00049 00050 viennacl::ocl::enqueue(k(vec1, vec2, result, size)); 00051 } 00052 00060 template<class SCALARTYPE, unsigned int ALIGNMENT> 00061 void inplace_add(viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1, 00062 const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2) 00063 { 00064 assert(vec1.size() == vec2.size()); 00065 unsigned int size = std::min(vec1.internal_size(), 
vec2.internal_size()); 00066 viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "inplace_add"); 00067 00068 viennacl::ocl::enqueue(k(vec1, vec2, size)); 00069 } 00070 00079 template<class SCALARTYPE, unsigned int ALIGNMENT> 00080 void sub(const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1, 00081 const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2, 00082 viennacl::vector<SCALARTYPE, ALIGNMENT> & result) 00083 { 00084 assert(vec1.size() == vec2.size()); 00085 result.resize(vec1.size()); 00086 unsigned int size = std::min(vec1.internal_size(), vec2.internal_size()); 00087 viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "sub"); 00088 00089 viennacl::ocl::enqueue(k(vec1, vec2, result, size)); 00090 } 00091 00099 template<class SCALARTYPE, unsigned int ALIGNMENT> 00100 void inplace_sub(viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1, 00101 const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2) 00102 { 00103 assert(vec1.size() == vec2.size()); 00104 unsigned int size = std::min(vec1.internal_size(), vec2.internal_size()); 00105 viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "inplace_sub"); 00106 00107 viennacl::ocl::enqueue(k(vec1, vec2, size)); 00108 } 00109 00110 00111 //result = vec * scalar 00120 template<class SCALARTYPE, unsigned int ALIGNMENT> 00121 void mult(const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec, 00122 scalar<SCALARTYPE> const & alpha, 00123 viennacl::vector<SCALARTYPE, ALIGNMENT> & result) 00124 { 00125 result.resize(vec.size()); 00126 viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "mult"); 00127 00128 viennacl::ocl::enqueue(k(vec, alpha, result, static_cast<cl_uint>(vec.internal_size()))); 00129 } 00130 00139 template<class 
SCALARTYPE, unsigned int ALIGNMENT> 00140 void mult(const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec, 00141 SCALARTYPE alpha, 00142 viennacl::vector<SCALARTYPE, ALIGNMENT> & result) 00143 { 00144 result.resize(vec.size()); 00145 viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "cpu_mult"); 00146 00147 viennacl::ocl::enqueue(k(vec, alpha, result, static_cast<cl_uint>(vec.internal_size()))); 00148 } 00149 00157 template<class SCALARTYPE, unsigned int ALIGNMENT> 00158 void inplace_mult(viennacl::vector<SCALARTYPE, ALIGNMENT> & vec, 00159 scalar<SCALARTYPE> const & alpha) 00160 { 00161 viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "inplace_mult"); 00162 00163 viennacl::ocl::enqueue(k(vec, alpha, static_cast<cl_uint>(vec.internal_size()))); 00164 } 00165 00173 template<class SCALARTYPE, unsigned int ALIGNMENT> 00174 void inplace_mult(viennacl::vector<SCALARTYPE, ALIGNMENT> & vec, 00175 SCALARTYPE alpha) 00176 { 00177 viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "cpu_inplace_mult"); 00178 00179 viennacl::ocl::enqueue(k(vec, alpha, static_cast<cl_uint>(vec.internal_size()))); 00180 } 00181 00182 //result = vec / scalar 00191 template<class SCALARTYPE, unsigned int ALIGNMENT> 00192 void divide(const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec, 00193 scalar<SCALARTYPE> const & alpha, 00194 viennacl::vector<SCALARTYPE, ALIGNMENT> & result) 00195 { 00196 assert(vec.size() == result.size()); 00197 result.resize(vec.size()); 00198 viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "divide"); 00199 00200 viennacl::ocl::enqueue(k(vec, alpha, result, static_cast<cl_uint>(vec.internal_size()))); 00201 } 00202 00210 template<class SCALARTYPE, unsigned int 
ALIGNMENT> 00211 void inplace_divide(viennacl::vector<SCALARTYPE, ALIGNMENT> & vec, 00212 scalar<SCALARTYPE> const & alpha) 00213 { 00214 viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "inplace_divide"); 00215 00216 viennacl::ocl::enqueue(k(vec, alpha, static_cast<cl_uint>(vec.internal_size()))); 00217 } 00218 00219 //result = factor * vec1 + vec2 00229 template<class SCALARTYPE, unsigned int ALIGNMENT> 00230 void mul_add(const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1, 00231 scalar<SCALARTYPE> const & alpha, 00232 const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2, 00233 viennacl::vector<SCALARTYPE, ALIGNMENT> & result) 00234 { 00235 assert(vec1.size() == vec2.size() && result.size() == vec1.size()); 00236 viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "mul_add"); 00237 cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size())); 00238 00239 viennacl::ocl::enqueue(k(vec1, alpha, vec2, result, size)); 00240 } 00241 00251 template<class SCALARTYPE, unsigned int ALIGNMENT> 00252 void mul_add(const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1, 00253 SCALARTYPE alpha, 00254 const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2, 00255 viennacl::vector<SCALARTYPE, ALIGNMENT> & result) 00256 { 00257 assert(vec1.size() == vec2.size() && result.size() == vec1.size()); 00258 viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "cpu_mul_add"); 00259 cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size())); 00260 00261 viennacl::ocl::enqueue(k(vec1, alpha, vec2, result, size)); 00262 } 00263 00264 //vec1 += factor * vec2 00273 template<class SCALARTYPE, unsigned int ALIGNMENT> 00274 void inplace_mul_add(viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1, 00275 const 
viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2, 00276 scalar<SCALARTYPE> const & alpha) 00277 { 00278 assert(vec1.size() == vec2.size()); 00279 cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size())); 00280 viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "inplace_mul_add"); 00281 00282 viennacl::ocl::enqueue(k(vec1, vec2, alpha, size)); 00283 } 00284 00293 template<class SCALARTYPE, unsigned int ALIGNMENT> 00294 void inplace_mul_add(viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1, 00295 const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2, 00296 SCALARTYPE alpha) 00297 { 00298 assert(vec1.size() == vec2.size()); 00299 viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "cpu_inplace_mul_add"); 00300 cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size())); 00301 00302 viennacl::ocl::enqueue(k(vec1, vec2, alpha, size)); 00303 } 00304 00314 template<class SCALARTYPE, unsigned int ALIGNMENT> 00315 void mul_sub(const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1, 00316 scalar<SCALARTYPE> const & alpha, 00317 const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2, 00318 viennacl::vector<SCALARTYPE, ALIGNMENT> & result) 00319 { 00320 assert(vec1.size() == vec2.size() && result.size() == vec1.size()); 00321 viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "mul_sub"); 00322 cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size())); 00323 00324 viennacl::ocl::enqueue(k(vec1, alpha, vec2, result, size)); 00325 } 00326 00327 00336 template<class SCALARTYPE, unsigned int ALIGNMENT> 00337 void inplace_mul_sub(viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1, 00338 const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2, 00339 scalar<SCALARTYPE> const & 
alpha) 00340 { 00341 assert(vec1.size() == vec2.size()); 00342 viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "inplace_mul_sub"); 00343 cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size())); 00344 00345 viennacl::ocl::enqueue(k(vec1, vec2, alpha, size)); 00346 } 00347 00356 template<class SCALARTYPE, unsigned int ALIGNMENT> 00357 void inplace_div_add(viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1, 00358 const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2, 00359 scalar<SCALARTYPE> const & alpha) 00360 { 00361 assert(vec1.size() == vec2.size()); 00362 viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "inplace_div_add"); 00363 cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size())); 00364 00365 viennacl::ocl::enqueue(k(vec1, vec2, alpha, size)); 00366 } 00367 00376 template<class SCALARTYPE, unsigned int ALIGNMENT> 00377 void inplace_div_sub(viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1, 00378 const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2, 00379 scalar<SCALARTYPE> const & alpha) 00380 { 00381 assert(vec1.size() == vec2.size()); 00382 viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "inplace_div_sub"); 00383 cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size())); 00384 00385 viennacl::ocl::enqueue(k(vec1, vec2, alpha, size)); 00386 } 00387 00388 00390 00391 00392 //implementation of inner product: 00393 //namespace { 00400 template<class SCALARTYPE, unsigned int ALIGNMENT> 00401 void inner_prod_impl(const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1, 00402 const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2, 00403 scalar<SCALARTYPE> & result) 00404 { 00405 assert(vec1.size() == vec2.size()); 00406 viennacl::ocl::kernel & 
k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "inner_prod"); 00407 cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size())); 00408 unsigned int work_groups = k.global_work_size() / k.local_work_size(); 00409 static viennacl::vector<SCALARTYPE> temp(work_groups); 00410 00411 /*unsigned int pos = 0; 00412 k.argument(pos++, vec1.handle()); 00413 k.argument(pos++, vec2.handle()); 00414 k.argument(pos++, size); 00415 k.local_buffer(pos++, static_cast<unsigned int>(sizeof(SCALARTYPE) * k.local_work_size())); 00416 k.argument(pos++, temp.handle());*/ 00417 00418 //Note: Number of work groups MUST be a power of two! 00419 //std::cout << work_groups << ", " << k.local_work_size() << ", " << k.global_work_size() << std::endl; 00420 assert( work_groups * k.local_work_size() == k.global_work_size() ); 00421 assert( (k.global_work_size() / k.local_work_size()) == 1 00422 || (k.global_work_size() / k.local_work_size()) == 2 00423 || (k.global_work_size() / k.local_work_size()) == 4 00424 || (k.global_work_size() / k.local_work_size()) == 8 00425 || (k.global_work_size() / k.local_work_size()) == 16 00426 || (k.global_work_size() / k.local_work_size()) == 32 00427 || (k.global_work_size() / k.local_work_size()) == 64 00428 || (k.global_work_size() / k.local_work_size()) == 128 00429 || (k.global_work_size() / k.local_work_size()) == 256 00430 || (k.global_work_size() / k.local_work_size()) == 512 ); 00431 00432 viennacl::ocl::enqueue(k(vec1, vec2, size, viennacl::ocl::local_mem(sizeof(SCALARTYPE) * k.local_work_size()), temp)); 00433 00434 viennacl::ocl::kernel & ksum = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "sum"); 00435 00436 ksum.local_work_size(0, work_groups); 00437 ksum.global_work_size(0, work_groups); 00438 viennacl::ocl::enqueue(ksum(temp, result)); 00439 } 00440 //} 00441 00442 //public interface of inner product 00449 
template<class SCALARTYPE, unsigned int ALIGNMENT1, unsigned int ALIGNMENT2> 00450 viennacl::scalar_expression< const viennacl::vector<SCALARTYPE, ALIGNMENT1>, 00451 const viennacl::vector<SCALARTYPE, ALIGNMENT2>, 00452 viennacl::op_inner_prod > 00453 inner_prod_impl(const viennacl::vector<SCALARTYPE, ALIGNMENT1> & vec1, 00454 const viennacl::vector<SCALARTYPE, ALIGNMENT2> & vec2) 00455 { 00456 return viennacl::scalar_expression< const viennacl::vector<SCALARTYPE, ALIGNMENT1>, 00457 const viennacl::vector<SCALARTYPE, ALIGNMENT2>, 00458 viennacl::op_inner_prod >(vec1, vec2); 00459 } 00460 00461 00462 00468 template<class SCALARTYPE, unsigned int ALIGNMENT> 00469 void norm_1_impl(const viennacl::vector<SCALARTYPE, ALIGNMENT> & vcl_vec, 00470 scalar<SCALARTYPE> & result) 00471 { 00472 viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "norm_1"); 00473 cl_uint size = static_cast<cl_uint>(vcl_vec.internal_size()); 00474 00475 if (k.local_work_size() != k.global_work_size()) 00476 { 00477 //NOTE: For some reasons the kernel could not be started with several work groups on NVIDIA hardware. This forces us to use as many parallel threads within a single work group as possible 00478 k.local_work_size(0, viennacl::ocl::current_device().max_work_group_size()); 00479 k.global_work_size(0, viennacl::ocl::current_device().max_work_group_size()); 00480 } 00481 00482 00483 unsigned int work_groups = k.global_work_size() / k.local_work_size(); 00484 viennacl::vector<SCALARTYPE> temp(work_groups); 00485 00486 //Note: Number of work groups MUST be a power of two! 
00487 //std::cout << work_groups << ", " << k.local_work_size() << ", " << k.global_work_size() << std::endl; 00488 assert( work_groups * k.local_work_size() == k.global_work_size() ); 00489 assert( (k.global_work_size() / k.local_work_size()) == 1 00490 || (k.global_work_size() / k.local_work_size()) == 2 00491 || (k.global_work_size() / k.local_work_size()) == 4 00492 || (k.global_work_size() / k.local_work_size()) == 8 00493 || (k.global_work_size() / k.local_work_size()) == 16 00494 || (k.global_work_size() / k.local_work_size()) == 32 00495 || (k.global_work_size() / k.local_work_size()) == 64 00496 || (k.global_work_size() / k.local_work_size()) == 128 00497 || (k.global_work_size() / k.local_work_size()) == 256 00498 || (k.global_work_size() / k.local_work_size()) == 512 ); 00499 00500 viennacl::ocl::enqueue(k(vcl_vec, size, viennacl::ocl::local_mem(sizeof(SCALARTYPE) * k.local_work_size()), temp)); 00501 00502 viennacl::ocl::kernel & ksum = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "sum"); 00503 00504 ksum.local_work_size(0, work_groups); 00505 ksum.global_work_size(0, work_groups); 00506 viennacl::ocl::enqueue(ksum(temp, result)); 00507 } 00508 00514 template<class SCALARTYPE, unsigned int ALIGNMENT> 00515 void norm_2_impl(const viennacl::vector<SCALARTYPE, ALIGNMENT> & vcl_vec, 00516 scalar<SCALARTYPE> & result) 00517 { 00518 viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "norm_2"); 00519 cl_uint size = static_cast<cl_uint>(vcl_vec.internal_size()); 00520 00521 if (k.local_work_size() != k.global_work_size()) 00522 { 00523 //NOTE: For some reasons the kernel could not be started with several work groups on NVIDIA hardware. 
This forces us to use as many parallel threads within a single work group as possible 00524 k.local_work_size(0, viennacl::ocl::current_device().max_work_group_size()); 00525 k.global_work_size(0, viennacl::ocl::current_device().max_work_group_size()); 00526 } 00527 00528 unsigned int work_groups = k.global_work_size() / k.local_work_size(); 00529 viennacl::vector<SCALARTYPE> temp(work_groups); 00530 00531 //Note: Number of work groups MUST be a power of two! 00532 //std::cout << work_groups << ", " << k.local_work_size() << ", " << k.global_work_size() << std::endl; 00533 assert( work_groups * k.local_work_size() == k.global_work_size() ); 00534 assert( (k.global_work_size() / k.local_work_size()) == 1 00535 || (k.global_work_size() / k.local_work_size()) == 2 00536 || (k.global_work_size() / k.local_work_size()) == 4 00537 || (k.global_work_size() / k.local_work_size()) == 8 00538 || (k.global_work_size() / k.local_work_size()) == 16 00539 || (k.global_work_size() / k.local_work_size()) == 32 00540 || (k.global_work_size() / k.local_work_size()) == 64 00541 || (k.global_work_size() / k.local_work_size()) == 128 00542 || (k.global_work_size() / k.local_work_size()) == 256 00543 || (k.global_work_size() / k.local_work_size()) == 512 ); 00544 00545 viennacl::ocl::enqueue(k(vcl_vec, size, viennacl::ocl::local_mem(sizeof(SCALARTYPE) * k.local_work_size()), temp)); 00546 00547 viennacl::ocl::kernel & sqrt_sum = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "sqrt_sum"); 00548 00549 sqrt_sum.local_work_size(0, work_groups); 00550 sqrt_sum.global_work_size(0, work_groups); 00551 viennacl::ocl::enqueue(sqrt_sum(temp, result, work_groups)); 00552 } 00553 00559 template<class SCALARTYPE, unsigned int ALIGNMENT> 00560 void norm_inf_impl(const viennacl::vector<SCALARTYPE, ALIGNMENT> & vcl_vec, 00561 scalar<SCALARTYPE> & result) 00562 { 00563 cl_uint size = static_cast<cl_uint>(vcl_vec.internal_size()); 00564 
viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "norm_inf"); 00565 00566 if (k.local_work_size() != k.global_work_size()) 00567 { 00568 //NOTE: For some reasons the kernel could not be started with several work groups on NVIDIA hardware. This forces us to use as many parallel threads within a single work group as possible 00569 k.local_work_size(0, viennacl::ocl::current_device().max_work_group_size()); 00570 k.global_work_size(0, viennacl::ocl::current_device().max_work_group_size()); 00571 } 00572 00573 unsigned int work_groups = k.global_work_size() / k.local_work_size(); 00574 viennacl::vector<SCALARTYPE> temp(work_groups); 00575 00576 //Note: Number of work groups MUST be a power of two! 00577 //std::cout << work_groups << ", " << k.local_work_size() << ", " << k.global_work_size() << std::endl; 00578 assert( work_groups * k.local_work_size() == k.global_work_size() ); 00579 assert( work_groups == 1 00580 || work_groups == 2 00581 || work_groups == 4 00582 || work_groups == 8 00583 || work_groups == 16 00584 || work_groups == 32 00585 || work_groups == 64 00586 || work_groups == 128 00587 || work_groups == 256 00588 || work_groups == 512 ); 00589 00590 viennacl::ocl::enqueue(k(vcl_vec, size, viennacl::ocl::local_mem(sizeof(SCALARTYPE) * k.local_work_size()), temp)); 00591 //viennacl::ocl::get_queue().finish(); 00592 00593 //part 2: parallel reduction of reduced kernel: 00594 viennacl::ocl::kernel & max_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "vmax"); 00595 max_kernel.local_work_size(0, work_groups); 00596 max_kernel.global_work_size(0, work_groups); 00597 00598 viennacl::ocl::enqueue(max_kernel(temp, result, work_groups)); 00599 } 00600 00601 //This function should return a CPU scalar, otherwise statements like 00602 // vcl_rhs[index_norm_inf(vcl_rhs)] 00603 // are ambiguous 00609 template<class SCALARTYPE, 
unsigned int ALIGNMENT> 00610 cl_uint index_norm_inf(const viennacl::vector<SCALARTYPE, ALIGNMENT> & vcl_vec) 00611 { 00612 viennacl::ocl::handle<cl_mem> h = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(cl_uint)); 00613 00614 viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "index_norm_inf"); 00615 cl_uint size = static_cast<cl_uint>(vcl_vec.internal_size()); 00616 00617 k.global_work_size(0, k.local_work_size()); 00618 viennacl::ocl::enqueue(k(vcl_vec, 00619 size, 00620 viennacl::ocl::local_mem(sizeof(SCALARTYPE) * k.local_work_size()), 00621 viennacl::ocl::local_mem(sizeof(cl_uint) * k.local_work_size()), h)); 00622 00623 //read value: 00624 cl_uint result; 00625 cl_int err; 00626 err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), h, CL_TRUE, 0, sizeof(cl_uint), &result, 0, NULL, NULL); 00627 VIENNACL_ERR_CHECK(err); 00628 return result; 00629 } 00630 00631 //TODO: Special case vec1 == vec2 allows improvement!! 00641 template<class SCALARTYPE, unsigned int ALIGNMENT> 00642 void plane_rotation(viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1, 00643 const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2, 00644 SCALARTYPE alpha, 00645 SCALARTYPE beta) 00646 { 00647 assert(vec1.size() == vec2.size()); 00648 viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "plane_rotation"); 00649 00650 viennacl::ocl::enqueue(k(vec1, vec2, alpha, beta, static_cast<cl_uint>(vec1.size()))); 00651 } 00652 00653 } //namespace linalg 00654 } //namespace viennacl 00655 00656 00657 #endif
// (listing generated by Doxygen 1.7.6.1)