ViennaCL - The Vienna Computing Library  1.1.2
/build/buildd/viennacl-1.1.2/viennacl/linalg/kernels/matrix_solve_col_row_source.h
Go to the documentation of this file.
00001 #ifndef _VIENNACL_MATRIX_SOLVE_COL_ROW_SOURCE_HPP_
00002 #define _VIENNACL_MATRIX_SOLVE_COL_ROW_SOURCE_HPP_
00003 //Automatically generated file from aux-directory, do not edit manually!
00004 namespace viennacl
00005 {
00006  namespace linalg
00007  {
00008   namespace kernels
00009   {
/**
 * @brief OpenCL source of the kernel `lower_trans_solve` (in-place solve "A \ B^T",
 *        A stored column-major, B stored row-major, as stated in the kernel banner).
 *
 * Per work-group, get_group_id(0) selects the row of B that is updated in place.
 * The kernel visits row = 0 .. A_rows-1. Work-item 0 divides the current entry of B
 * by the diagonal entry of A, then all work-items of the group subtract
 * temp * A[elim + row * A_internal_rows] from the entries with index elim > row.
 *
 * Fix: the diagonal entry is now addressed as A[row + row*A_internal_rows], i.e. with
 * the same column-major stride as the elimination step. The previous stride
 * (A_internal_cols) is only equivalent when both padded sizes of A are equal.
 * This header is generated; the same change belongs in the generator's kernel source.
 */
const char * const matrix_solve_col_row_align1_lower_trans_solve =
  "// file automatically generated - do not edit!\n"
  "// inplace solve A \\ B^T\n"
  "// matrix layouts: A...col_major, B...row_major\n"
  "__kernel void lower_trans_solve(\n"
  "          __global const float * A,\n"
  "          unsigned int A_rows,\n"
  "          unsigned int A_cols,\n"
  "          unsigned int A_internal_rows,\n"
  "          unsigned int A_internal_cols,\n"
  "          __global float * B,  \n"
  "          unsigned int B_rows,\n"
  "          unsigned int B_cols,\n"
  "          unsigned int B_internal_rows,\n"
  "          unsigned int B_internal_cols)\n"
  "{ \n"
  "  float temp; \n"
  "  for (int row = 0; row < A_rows; ++row) \n"
  "  { \n"
  "    barrier(CLK_GLOBAL_MEM_FENCE); \n"
  "    if (get_local_id(0) == 0) \n"
  "      B[row + get_group_id(0) * B_internal_cols] /= A[row + row*A_internal_rows]; \n"
  "    barrier(CLK_GLOBAL_MEM_FENCE); \n"
  "      temp = B[row + get_group_id(0) * B_internal_cols]; \n"
  "    //eliminate column of op(A) with index 'row' in parallel: \n"
  "    for  (int elim = row + get_local_id(0) + 1; elim < A_rows; elim += get_local_size(0)) \n"
  "      B[elim + get_group_id(0) * B_internal_cols] -= temp * A[elim + row * A_internal_rows];\n"
  "   }\n"
  "}\n"
; //matrix_solve_col_row_align1_lower_trans_solve
00040 
/**
 * @brief OpenCL source of the kernel `unit_lower_solve` (in-place solve "A \ B",
 *        A stored column-major, B stored row-major, as stated in the kernel banner).
 *
 * get_group_id(0) selects the column of B handled by a work-group. Rows are visited
 * in ascending order; no division by a diagonal entry is performed. For each row the
 * work-items of the group subtract temp * A[elim + row * A_internal_rows] from the
 * entries of that column with index elim > row.
 */
const char * const matrix_solve_col_row_align1_unit_lower_solve =
  /* banner emitted into the OpenCL program */
  "// file automatically generated - do not edit!\n"
  "// inplace solve A \\ B\n"
  "// matrix layouts: A...col_major, B...row_major\n"
  /* kernel signature */
  "__kernel void unit_lower_solve(\n"
  "          __global const float * A,\n"
  "          unsigned int A_rows,\n"
  "          unsigned int A_cols,\n"
  "          unsigned int A_internal_rows,\n"
  "          unsigned int A_internal_cols,\n"
  "          __global float * B,  \n"
  "          unsigned int B_rows,\n"
  "          unsigned int B_cols,\n"
  "          unsigned int B_internal_rows,\n"
  "          unsigned int B_internal_cols)\n"
  /* kernel body: ascending sweep over the rows */
  "{ \n"
  "  float temp; \n"
  "  for (int row = 0; row < A_rows; ++row) \n"
  "  { \n"
  "    barrier(CLK_GLOBAL_MEM_FENCE); \n"
  "      temp = B[row * B_internal_cols + get_group_id(0)]; \n"
  "    //eliminate column of op(A) with index 'row' in parallel: \n"
  "    for  (int elim = row + get_local_id(0) + 1; elim < A_rows; elim += get_local_size(0)) \n"
  "      B[elim * B_internal_cols + get_group_id(0)] -= temp * A[elim + row * A_internal_rows];\n"
  "   }\n"
  "}\n"
; //matrix_solve_col_row_align1_unit_lower_solve
00068 
/**
 * @brief OpenCL source of the kernel `trans_unit_upper_trans_solve` (in-place solve
 *        "A^T \ B^T", A stored column-major, B stored row-major, per the kernel banner).
 *
 * get_group_id(0) selects the row of B handled by a work-group. Rows are visited in
 * descending order (A_rows-1 down to 0); no division by a diagonal entry is performed.
 * For each row the work-items of the group subtract temp * A[elim * A_internal_rows + row]
 * from the entries with index elim < row.
 */
const char * const matrix_solve_col_row_align1_trans_unit_upper_trans_solve =
  /* banner emitted into the OpenCL program */
  "// file automatically generated - do not edit!\n"
  "// inplace solve A^T \\ B^T\n"
  "// matrix layouts: A...col_major, B...row_major\n"
  /* kernel signature */
  "__kernel void trans_unit_upper_trans_solve(\n"
  "          __global const float * A,\n"
  "          unsigned int A_rows,\n"
  "          unsigned int A_cols,\n"
  "          unsigned int A_internal_rows,\n"
  "          unsigned int A_internal_cols,\n"
  "          __global float * B,  \n"
  "          unsigned int B_rows,\n"
  "          unsigned int B_cols,\n"
  "          unsigned int B_internal_rows,\n"
  "          unsigned int B_internal_cols)\n"
  /* kernel body: descending sweep over the rows */
  "{ \n"
  "  float temp; \n"
  "  for (int row = A_rows-1; row > -1; --row) \n"
  "  { \n"
  "    barrier(CLK_GLOBAL_MEM_FENCE); \n"
  "      temp = B[row + get_group_id(0) * B_internal_cols]; \n"
  "    //eliminate column of op(A) with index 'row' in parallel: \n"
  "    for  (int elim = get_local_id(0); elim < row; elim += get_local_size(0)) \n"
  "      B[elim + get_group_id(0) * B_internal_cols] -= temp * A[elim * A_internal_rows + row];\n"
  "   }\n"
  "}\n"
; //matrix_solve_col_row_align1_trans_unit_upper_trans_solve
00096 
/**
 * @brief OpenCL source of the kernel `unit_upper_trans_solve` (in-place solve
 *        "A \ B^T", A stored column-major, B stored row-major, per the kernel banner).
 *
 * get_group_id(0) selects the row of B handled by a work-group. Rows are visited in
 * descending order (A_rows-1 down to 0); no division by a diagonal entry is performed.
 * For each row the work-items of the group subtract temp * A[elim + row * A_internal_rows]
 * from the entries with index elim < row.
 */
const char * const matrix_solve_col_row_align1_unit_upper_trans_solve =
  /* banner emitted into the OpenCL program */
  "// file automatically generated - do not edit!\n"
  "// inplace solve A \\ B^T\n"
  "// matrix layouts: A...col_major, B...row_major\n"
  /* kernel signature */
  "__kernel void unit_upper_trans_solve(\n"
  "          __global const float * A,\n"
  "          unsigned int A_rows,\n"
  "          unsigned int A_cols,\n"
  "          unsigned int A_internal_rows,\n"
  "          unsigned int A_internal_cols,\n"
  "          __global float * B,  \n"
  "          unsigned int B_rows,\n"
  "          unsigned int B_cols,\n"
  "          unsigned int B_internal_rows,\n"
  "          unsigned int B_internal_cols)\n"
  /* kernel body: descending sweep over the rows */
  "{ \n"
  "  float temp; \n"
  "  for (int row = A_rows-1; row > -1; --row) \n"
  "  { \n"
  "    barrier(CLK_GLOBAL_MEM_FENCE); \n"
  "      temp = B[row + get_group_id(0) * B_internal_cols]; \n"
  "    //eliminate column of op(A) with index 'row' in parallel: \n"
  "    for  (int elim = get_local_id(0); elim < row; elim += get_local_size(0)) \n"
  "      B[elim + get_group_id(0) * B_internal_cols] -= temp * A[elim + row * A_internal_rows];\n"
  "   }\n"
  "}\n"
; //matrix_solve_col_row_align1_unit_upper_trans_solve
00124 
/**
 * @brief OpenCL source of the kernel `unit_lower_trans_solve` (in-place solve
 *        "A \ B^T", A stored column-major, B stored row-major, per the kernel banner).
 *
 * get_group_id(0) selects the row of B handled by a work-group. Rows are visited in
 * ascending order; no division by a diagonal entry is performed. For each row the
 * work-items of the group subtract temp * A[elim + row * A_internal_rows] from the
 * entries with index elim > row.
 */
const char * const matrix_solve_col_row_align1_unit_lower_trans_solve =
  /* banner emitted into the OpenCL program */
  "// file automatically generated - do not edit!\n"
  "// inplace solve A \\ B^T\n"
  "// matrix layouts: A...col_major, B...row_major\n"
  /* kernel signature */
  "__kernel void unit_lower_trans_solve(\n"
  "          __global const float * A,\n"
  "          unsigned int A_rows,\n"
  "          unsigned int A_cols,\n"
  "          unsigned int A_internal_rows,\n"
  "          unsigned int A_internal_cols,\n"
  "          __global float * B,  \n"
  "          unsigned int B_rows,\n"
  "          unsigned int B_cols,\n"
  "          unsigned int B_internal_rows,\n"
  "          unsigned int B_internal_cols)\n"
  /* kernel body: ascending sweep over the rows */
  "{ \n"
  "  float temp; \n"
  "  for (int row = 0; row < A_rows; ++row) \n"
  "  { \n"
  "    barrier(CLK_GLOBAL_MEM_FENCE); \n"
  "      temp = B[row + get_group_id(0) * B_internal_cols]; \n"
  "    //eliminate column of op(A) with index 'row' in parallel: \n"
  "    for  (int elim = row + get_local_id(0) + 1; elim < A_rows; elim += get_local_size(0)) \n"
  "      B[elim + get_group_id(0) * B_internal_cols] -= temp * A[elim + row * A_internal_rows];\n"
  "   }\n"
  "}\n"
; //matrix_solve_col_row_align1_unit_lower_trans_solve
00152 
/**
 * @brief OpenCL source of the kernel `trans_upper_trans_solve` (in-place solve
 *        "A^T \ B^T", A stored column-major, B stored row-major, per the kernel banner).
 *
 * Per work-group, get_group_id(0) selects the row of B that is updated in place.
 * The kernel visits row = A_rows-1 .. 0. Work-item 0 divides the current entry of B
 * by the diagonal entry of A, then all work-items of the group subtract
 * temp * A[elim * A_internal_rows + row] from the entries with index elim < row.
 *
 * Fix: the diagonal entry is now addressed as A[row + row*A_internal_rows], i.e. with
 * the column-major stride used everywhere else for A. The previous stride
 * (A_internal_cols) is only equivalent when both padded sizes of A are equal.
 * This header is generated; the same change belongs in the generator's kernel source.
 */
const char * const matrix_solve_col_row_align1_trans_upper_trans_solve =
  "// file automatically generated - do not edit!\n"
  "// inplace solve A^T \\ B^T\n"
  "// matrix layouts: A...col_major, B...row_major\n"
  "__kernel void trans_upper_trans_solve(\n"
  "          __global const float * A,\n"
  "          unsigned int A_rows,\n"
  "          unsigned int A_cols,\n"
  "          unsigned int A_internal_rows,\n"
  "          unsigned int A_internal_cols,\n"
  "          __global float * B,  \n"
  "          unsigned int B_rows,\n"
  "          unsigned int B_cols,\n"
  "          unsigned int B_internal_rows,\n"
  "          unsigned int B_internal_cols)\n"
  "{ \n"
  "  float temp; \n"
  "  for (int row = A_rows-1; row > -1; --row) \n"
  "  { \n"
  "    barrier(CLK_GLOBAL_MEM_FENCE); \n"
  "    if (get_local_id(0) == 0) \n"
  "      B[row + get_group_id(0) * B_internal_cols] /= A[row + row*A_internal_rows]; \n"
  "    barrier(CLK_GLOBAL_MEM_FENCE); \n"
  "      temp = B[row + get_group_id(0) * B_internal_cols]; \n"
  "    //eliminate column of op(A) with index 'row' in parallel: \n"
  "    for  (int elim = get_local_id(0); elim < row; elim += get_local_size(0)) \n"
  "      B[elim + get_group_id(0) * B_internal_cols] -= temp * A[elim * A_internal_rows + row];\n"
  "   }\n"
  "}\n"
; //matrix_solve_col_row_align1_trans_upper_trans_solve
00183 
/**
 * @brief OpenCL source of the kernel `upper_solve` (in-place solve "A \ B",
 *        A stored column-major, B stored row-major, per the kernel banner).
 *
 * Per work-group, get_group_id(0) selects the column of B that is updated in place.
 * The kernel visits row = A_rows-1 .. 0. Work-item 0 divides the current entry of B
 * by the diagonal entry of A, then all work-items of the group subtract
 * temp * A[elim + row * A_internal_rows] from the entries with index elim < row.
 *
 * Fix: the diagonal entry is now addressed as A[row + row*A_internal_rows], i.e. with
 * the same column-major stride as the elimination step. The previous stride
 * (A_internal_cols) is only equivalent when both padded sizes of A are equal.
 * This header is generated; the same change belongs in the generator's kernel source.
 */
const char * const matrix_solve_col_row_align1_upper_solve =
  "// file automatically generated - do not edit!\n"
  "// inplace solve A \\ B\n"
  "// matrix layouts: A...col_major, B...row_major\n"
  "__kernel void upper_solve(\n"
  "          __global const float * A,\n"
  "          unsigned int A_rows,\n"
  "          unsigned int A_cols,\n"
  "          unsigned int A_internal_rows,\n"
  "          unsigned int A_internal_cols,\n"
  "          __global float * B,  \n"
  "          unsigned int B_rows,\n"
  "          unsigned int B_cols,\n"
  "          unsigned int B_internal_rows,\n"
  "          unsigned int B_internal_cols)\n"
  "{ \n"
  "  float temp; \n"
  "  for (int row = A_rows-1; row > -1; --row) \n"
  "  { \n"
  "    barrier(CLK_GLOBAL_MEM_FENCE); \n"
  "    if (get_local_id(0) == 0) \n"
  "      B[row * B_internal_cols + get_group_id(0)] /= A[row + row*A_internal_rows]; \n"
  "    barrier(CLK_GLOBAL_MEM_FENCE); \n"
  "      temp = B[row * B_internal_cols + get_group_id(0)]; \n"
  "    //eliminate column of op(A) with index 'row' in parallel: \n"
  "    for  (int elim = get_local_id(0); elim < row; elim += get_local_size(0)) \n"
  "      B[elim * B_internal_cols + get_group_id(0)] -= temp * A[elim + row * A_internal_rows];\n"
  "   }\n"
  "}\n"
; //matrix_solve_col_row_align1_upper_solve
00214 
/**
 * @brief OpenCL source of the kernel `lower_solve` (in-place solve "A \ B",
 *        A stored column-major, B stored row-major, per the kernel banner).
 *
 * Per work-group, get_group_id(0) selects the column of B that is updated in place.
 * The kernel visits row = 0 .. A_rows-1. Work-item 0 divides the current entry of B
 * by the diagonal entry of A, then all work-items of the group subtract
 * temp * A[elim + row * A_internal_rows] from the entries with index elim > row.
 *
 * Fix: the diagonal entry is now addressed as A[row + row*A_internal_rows], i.e. with
 * the same column-major stride as the elimination step. The previous stride
 * (A_internal_cols) is only equivalent when both padded sizes of A are equal.
 * This header is generated; the same change belongs in the generator's kernel source.
 */
const char * const matrix_solve_col_row_align1_lower_solve =
  "// file automatically generated - do not edit!\n"
  "// inplace solve A \\ B\n"
  "// matrix layouts: A...col_major, B...row_major\n"
  "__kernel void lower_solve(\n"
  "          __global const float * A,\n"
  "          unsigned int A_rows,\n"
  "          unsigned int A_cols,\n"
  "          unsigned int A_internal_rows,\n"
  "          unsigned int A_internal_cols,\n"
  "          __global float * B,  \n"
  "          unsigned int B_rows,\n"
  "          unsigned int B_cols,\n"
  "          unsigned int B_internal_rows,\n"
  "          unsigned int B_internal_cols)\n"
  "{ \n"
  "  float temp; \n"
  "  for (int row = 0; row < A_rows; ++row) \n"
  "  { \n"
  "    barrier(CLK_GLOBAL_MEM_FENCE); \n"
  "    if (get_local_id(0) == 0) \n"
  "      B[row * B_internal_cols + get_group_id(0)] /= A[row + row*A_internal_rows]; \n"
  "    barrier(CLK_GLOBAL_MEM_FENCE); \n"
  "      temp = B[row * B_internal_cols + get_group_id(0)]; \n"
  "    //eliminate column of op(A) with index 'row' in parallel: \n"
  "    for  (int elim = row + get_local_id(0) + 1; elim < A_rows; elim += get_local_size(0)) \n"
  "      B[elim * B_internal_cols + get_group_id(0)] -= temp * A[elim + row * A_internal_rows];\n"
  "   }\n"
  "}\n"
; //matrix_solve_col_row_align1_lower_solve
00245 
/**
 * @brief OpenCL source of the kernel `trans_unit_lower_solve` (in-place solve
 *        "A^T \ B", A stored column-major, B stored row-major, per the kernel banner).
 *
 * get_group_id(0) selects the column of B handled by a work-group. Rows are visited
 * in ascending order; no division by a diagonal entry is performed. For each row the
 * work-items of the group subtract temp * A[elim * A_internal_rows + row] from the
 * entries of that column with index elim > row.
 */
const char * const matrix_solve_col_row_align1_trans_unit_lower_solve =
  /* banner emitted into the OpenCL program */
  "// file automatically generated - do not edit!\n"
  "// inplace solve A^T \\ B\n"
  "// matrix layouts: A...col_major, B...row_major\n"
  /* kernel signature */
  "__kernel void trans_unit_lower_solve(\n"
  "          __global const float * A,\n"
  "          unsigned int A_rows,\n"
  "          unsigned int A_cols,\n"
  "          unsigned int A_internal_rows,\n"
  "          unsigned int A_internal_cols,\n"
  "          __global float * B,  \n"
  "          unsigned int B_rows,\n"
  "          unsigned int B_cols,\n"
  "          unsigned int B_internal_rows,\n"
  "          unsigned int B_internal_cols)\n"
  /* kernel body: ascending sweep over the rows */
  "{ \n"
  "  float temp; \n"
  "  for (int row = 0; row < A_rows; ++row) \n"
  "  { \n"
  "    barrier(CLK_GLOBAL_MEM_FENCE); \n"
  "      temp = B[row * B_internal_cols + get_group_id(0)]; \n"
  "    //eliminate column of op(A) with index 'row' in parallel: \n"
  "    for  (int elim = row + get_local_id(0) + 1; elim < A_rows; elim += get_local_size(0)) \n"
  "      B[elim * B_internal_cols + get_group_id(0)] -= temp * A[elim * A_internal_rows + row];\n"
  "   }\n"
  "}\n"
; //matrix_solve_col_row_align1_trans_unit_lower_solve
00273 
/**
 * @brief OpenCL source of the kernel `trans_lower_trans_solve` (in-place solve
 *        "A^T \ B^T", A stored column-major, B stored row-major, per the kernel banner).
 *
 * Per work-group, get_group_id(0) selects the row of B that is updated in place.
 * The kernel visits row = 0 .. A_rows-1. Work-item 0 divides the current entry of B
 * by the diagonal entry of A, then all work-items of the group subtract
 * temp * A[elim * A_internal_rows + row] from the entries with index elim > row.
 *
 * Fix: the diagonal entry is now addressed as A[row + row*A_internal_rows], i.e. with
 * the column-major stride used everywhere else for A. The previous stride
 * (A_internal_cols) is only equivalent when both padded sizes of A are equal.
 * This header is generated; the same change belongs in the generator's kernel source.
 */
const char * const matrix_solve_col_row_align1_trans_lower_trans_solve =
  "// file automatically generated - do not edit!\n"
  "// inplace solve A^T \\ B^T\n"
  "// matrix layouts: A...col_major, B...row_major\n"
  "__kernel void trans_lower_trans_solve(\n"
  "          __global const float * A,\n"
  "          unsigned int A_rows,\n"
  "          unsigned int A_cols,\n"
  "          unsigned int A_internal_rows,\n"
  "          unsigned int A_internal_cols,\n"
  "          __global float * B,  \n"
  "          unsigned int B_rows,\n"
  "          unsigned int B_cols,\n"
  "          unsigned int B_internal_rows,\n"
  "          unsigned int B_internal_cols)\n"
  "{ \n"
  "  float temp; \n"
  "  for (int row = 0; row < A_rows; ++row) \n"
  "  { \n"
  "    barrier(CLK_GLOBAL_MEM_FENCE); \n"
  "    if (get_local_id(0) == 0) \n"
  "      B[row + get_group_id(0) * B_internal_cols] /= A[row + row*A_internal_rows]; \n"
  "    barrier(CLK_GLOBAL_MEM_FENCE); \n"
  "      temp = B[row + get_group_id(0) * B_internal_cols]; \n"
  "    //eliminate column of op(A) with index 'row' in parallel: \n"
  "    for  (int elim = row + get_local_id(0) + 1; elim < A_rows; elim += get_local_size(0)) \n"
  "      B[elim + get_group_id(0) * B_internal_cols] -= temp * A[elim * A_internal_rows + row];\n"
  "   }\n"
  "}\n"
; //matrix_solve_col_row_align1_trans_lower_trans_solve
00304 
/**
 * @brief OpenCL source of the kernel `trans_lower_solve` (in-place solve "A^T \ B",
 *        A stored column-major, B stored row-major, per the kernel banner).
 *
 * Per work-group, get_group_id(0) selects the column of B that is updated in place.
 * The kernel visits row = 0 .. A_rows-1. Work-item 0 divides the current entry of B
 * by the diagonal entry of A, then all work-items of the group subtract
 * temp * A[elim * A_internal_rows + row] from the entries with index elim > row.
 *
 * Fix: the diagonal entry is now addressed as A[row + row*A_internal_rows], i.e. with
 * the column-major stride used everywhere else for A. The previous stride
 * (A_internal_cols) is only equivalent when both padded sizes of A are equal.
 * This header is generated; the same change belongs in the generator's kernel source.
 */
const char * const matrix_solve_col_row_align1_trans_lower_solve =
  "// file automatically generated - do not edit!\n"
  "// inplace solve A^T \\ B\n"
  "// matrix layouts: A...col_major, B...row_major\n"
  "__kernel void trans_lower_solve(\n"
  "          __global const float * A,\n"
  "          unsigned int A_rows,\n"
  "          unsigned int A_cols,\n"
  "          unsigned int A_internal_rows,\n"
  "          unsigned int A_internal_cols,\n"
  "          __global float * B,  \n"
  "          unsigned int B_rows,\n"
  "          unsigned int B_cols,\n"
  "          unsigned int B_internal_rows,\n"
  "          unsigned int B_internal_cols)\n"
  "{ \n"
  "  float temp; \n"
  "  for (int row = 0; row < A_rows; ++row) \n"
  "  { \n"
  "    barrier(CLK_GLOBAL_MEM_FENCE); \n"
  "    if (get_local_id(0) == 0) \n"
  "      B[row * B_internal_cols + get_group_id(0)] /= A[row + row*A_internal_rows]; \n"
  "    barrier(CLK_GLOBAL_MEM_FENCE); \n"
  "      temp = B[row * B_internal_cols + get_group_id(0)]; \n"
  "    //eliminate column of op(A) with index 'row' in parallel: \n"
  "    for  (int elim = row + get_local_id(0) + 1; elim < A_rows; elim += get_local_size(0)) \n"
  "      B[elim * B_internal_cols + get_group_id(0)] -= temp * A[elim * A_internal_rows + row];\n"
  "   }\n"
  "}\n"
; //matrix_solve_col_row_align1_trans_lower_solve
00335 
/**
 * @brief OpenCL source of the kernel `unit_upper_solve` (in-place solve "A \ B",
 *        A stored column-major, B stored row-major, per the kernel banner).
 *
 * get_group_id(0) selects the column of B handled by a work-group. Rows are visited
 * in descending order (A_rows-1 down to 0); no division by a diagonal entry is
 * performed. For each row the work-items of the group subtract
 * temp * A[elim + row * A_internal_rows] from the entries of that column with
 * index elim < row.
 */
const char * const matrix_solve_col_row_align1_unit_upper_solve =
  /* banner emitted into the OpenCL program */
  "// file automatically generated - do not edit!\n"
  "// inplace solve A \\ B\n"
  "// matrix layouts: A...col_major, B...row_major\n"
  /* kernel signature */
  "__kernel void unit_upper_solve(\n"
  "          __global const float * A,\n"
  "          unsigned int A_rows,\n"
  "          unsigned int A_cols,\n"
  "          unsigned int A_internal_rows,\n"
  "          unsigned int A_internal_cols,\n"
  "          __global float * B,  \n"
  "          unsigned int B_rows,\n"
  "          unsigned int B_cols,\n"
  "          unsigned int B_internal_rows,\n"
  "          unsigned int B_internal_cols)\n"
  /* kernel body: descending sweep over the rows */
  "{ \n"
  "  float temp; \n"
  "  for (int row = A_rows-1; row > -1; --row) \n"
  "  { \n"
  "    barrier(CLK_GLOBAL_MEM_FENCE); \n"
  "      temp = B[row * B_internal_cols + get_group_id(0)]; \n"
  "    //eliminate column of op(A) with index 'row' in parallel: \n"
  "    for  (int elim = get_local_id(0); elim < row; elim += get_local_size(0)) \n"
  "      B[elim * B_internal_cols + get_group_id(0)] -= temp * A[elim + row * A_internal_rows];\n"
  "   }\n"
  "}\n"
; //matrix_solve_col_row_align1_unit_upper_solve
00363 
/**
 * @brief OpenCL source of the kernel `trans_upper_solve` (in-place solve "A^T \ B",
 *        A stored column-major, B stored row-major, per the kernel banner).
 *
 * Per work-group, get_group_id(0) selects the column of B that is updated in place.
 * The kernel visits row = A_rows-1 .. 0. Work-item 0 divides the current entry of B
 * by the diagonal entry of A, then all work-items of the group subtract
 * temp * A[elim * A_internal_rows + row] from the entries with index elim < row.
 *
 * Fix: the diagonal entry is now addressed as A[row + row*A_internal_rows], i.e. with
 * the column-major stride used everywhere else for A. The previous stride
 * (A_internal_cols) is only equivalent when both padded sizes of A are equal.
 * This header is generated; the same change belongs in the generator's kernel source.
 */
const char * const matrix_solve_col_row_align1_trans_upper_solve =
  "// file automatically generated - do not edit!\n"
  "// inplace solve A^T \\ B\n"
  "// matrix layouts: A...col_major, B...row_major\n"
  "__kernel void trans_upper_solve(\n"
  "          __global const float * A,\n"
  "          unsigned int A_rows,\n"
  "          unsigned int A_cols,\n"
  "          unsigned int A_internal_rows,\n"
  "          unsigned int A_internal_cols,\n"
  "          __global float * B,  \n"
  "          unsigned int B_rows,\n"
  "          unsigned int B_cols,\n"
  "          unsigned int B_internal_rows,\n"
  "          unsigned int B_internal_cols)\n"
  "{ \n"
  "  float temp; \n"
  "  for (int row = A_rows-1; row > -1; --row) \n"
  "  { \n"
  "    barrier(CLK_GLOBAL_MEM_FENCE); \n"
  "    if (get_local_id(0) == 0) \n"
  "      B[row * B_internal_cols + get_group_id(0)] /= A[row + row*A_internal_rows]; \n"
  "    barrier(CLK_GLOBAL_MEM_FENCE); \n"
  "      temp = B[row * B_internal_cols + get_group_id(0)]; \n"
  "    //eliminate column of op(A) with index 'row' in parallel: \n"
  "    for  (int elim = get_local_id(0); elim < row; elim += get_local_size(0)) \n"
  "      B[elim * B_internal_cols + get_group_id(0)] -= temp * A[elim * A_internal_rows + row];\n"
  "   }\n"
  "}\n"
; //matrix_solve_col_row_align1_trans_upper_solve
00394 
/**
 * @brief OpenCL source of the kernel `upper_trans_solve` (in-place solve "A \ B^T",
 *        A stored column-major, B stored row-major, per the kernel banner).
 *
 * Per work-group, get_group_id(0) selects the row of B that is updated in place.
 * The kernel visits row = A_rows-1 .. 0. Work-item 0 divides the current entry of B
 * by the diagonal entry of A, then all work-items of the group subtract
 * temp * A[elim + row * A_internal_rows] from the entries with index elim < row.
 *
 * Fix: the diagonal entry is now addressed as A[row + row*A_internal_rows], i.e. with
 * the same column-major stride as the elimination step. The previous stride
 * (A_internal_cols) is only equivalent when both padded sizes of A are equal.
 * This header is generated; the same change belongs in the generator's kernel source.
 */
const char * const matrix_solve_col_row_align1_upper_trans_solve =
  "// file automatically generated - do not edit!\n"
  "// inplace solve A \\ B^T\n"
  "// matrix layouts: A...col_major, B...row_major\n"
  "__kernel void upper_trans_solve(\n"
  "          __global const float * A,\n"
  "          unsigned int A_rows,\n"
  "          unsigned int A_cols,\n"
  "          unsigned int A_internal_rows,\n"
  "          unsigned int A_internal_cols,\n"
  "          __global float * B,  \n"
  "          unsigned int B_rows,\n"
  "          unsigned int B_cols,\n"
  "          unsigned int B_internal_rows,\n"
  "          unsigned int B_internal_cols)\n"
  "{ \n"
  "  float temp; \n"
  "  for (int row = A_rows-1; row > -1; --row) \n"
  "  { \n"
  "    barrier(CLK_GLOBAL_MEM_FENCE); \n"
  "    if (get_local_id(0) == 0) \n"
  "      B[row + get_group_id(0) * B_internal_cols] /= A[row + row*A_internal_rows]; \n"
  "    barrier(CLK_GLOBAL_MEM_FENCE); \n"
  "      temp = B[row + get_group_id(0) * B_internal_cols]; \n"
  "    //eliminate column of op(A) with index 'row' in parallel: \n"
  "    for  (int elim = get_local_id(0); elim < row; elim += get_local_size(0)) \n"
  "      B[elim + get_group_id(0) * B_internal_cols] -= temp * A[elim + row * A_internal_rows];\n"
  "   }\n"
  "}\n"
; //matrix_solve_col_row_align1_upper_trans_solve
00425 
/**
 * @brief OpenCL source of the kernel `trans_unit_lower_trans_solve` (in-place solve
 *        "A^T \ B^T", A stored column-major, B stored row-major, per the kernel banner).
 *
 * get_group_id(0) selects the row of B handled by a work-group. Rows are visited in
 * ascending order; no division by a diagonal entry is performed. For each row the
 * work-items of the group subtract temp * A[elim * A_internal_rows + row] from the
 * entries with index elim > row.
 */
const char * const matrix_solve_col_row_align1_trans_unit_lower_trans_solve =
  /* banner emitted into the OpenCL program */
  "// file automatically generated - do not edit!\n"
  "// inplace solve A^T \\ B^T\n"
  "// matrix layouts: A...col_major, B...row_major\n"
  /* kernel signature */
  "__kernel void trans_unit_lower_trans_solve(\n"
  "          __global const float * A,\n"
  "          unsigned int A_rows,\n"
  "          unsigned int A_cols,\n"
  "          unsigned int A_internal_rows,\n"
  "          unsigned int A_internal_cols,\n"
  "          __global float * B,  \n"
  "          unsigned int B_rows,\n"
  "          unsigned int B_cols,\n"
  "          unsigned int B_internal_rows,\n"
  "          unsigned int B_internal_cols)\n"
  /* kernel body: ascending sweep over the rows */
  "{ \n"
  "  float temp; \n"
  "  for (int row = 0; row < A_rows; ++row) \n"
  "  { \n"
  "    barrier(CLK_GLOBAL_MEM_FENCE); \n"
  "      temp = B[row + get_group_id(0) * B_internal_cols]; \n"
  "    //eliminate column of op(A) with index 'row' in parallel: \n"
  "    for  (int elim = row + get_local_id(0) + 1; elim < A_rows; elim += get_local_size(0)) \n"
  "      B[elim + get_group_id(0) * B_internal_cols] -= temp * A[elim * A_internal_rows + row];\n"
  "   }\n"
  "}\n"
; //matrix_solve_col_row_align1_trans_unit_lower_trans_solve
00453 
/**
 * @brief OpenCL source of the kernel `trans_unit_upper_solve` (in-place solve
 *        "A^T \ B", A stored column-major, B stored row-major, per the kernel banner).
 *
 * get_group_id(0) selects the column of B handled by a work-group. Rows are visited
 * in descending order (A_rows-1 down to 0); no division by a diagonal entry is
 * performed. For each row the work-items of the group subtract
 * temp * A[elim * A_internal_rows + row] from the entries of that column with
 * index elim < row.
 */
const char * const matrix_solve_col_row_align1_trans_unit_upper_solve =
  /* banner emitted into the OpenCL program */
  "// file automatically generated - do not edit!\n"
  "// inplace solve A^T \\ B\n"
  "// matrix layouts: A...col_major, B...row_major\n"
  /* kernel signature */
  "__kernel void trans_unit_upper_solve(\n"
  "          __global const float * A,\n"
  "          unsigned int A_rows,\n"
  "          unsigned int A_cols,\n"
  "          unsigned int A_internal_rows,\n"
  "          unsigned int A_internal_cols,\n"
  "          __global float * B,  \n"
  "          unsigned int B_rows,\n"
  "          unsigned int B_cols,\n"
  "          unsigned int B_internal_rows,\n"
  "          unsigned int B_internal_cols)\n"
  /* kernel body: descending sweep over the rows */
  "{ \n"
  "  float temp; \n"
  "  for (int row = A_rows-1; row > -1; --row) \n"
  "  { \n"
  "    barrier(CLK_GLOBAL_MEM_FENCE); \n"
  "      temp = B[row * B_internal_cols + get_group_id(0)]; \n"
  "    //eliminate column of op(A) with index 'row' in parallel: \n"
  "    for  (int elim = get_local_id(0); elim < row; elim += get_local_size(0)) \n"
  "      B[elim * B_internal_cols + get_group_id(0)] -= temp * A[elim * A_internal_rows + row];\n"
  "   }\n"
  "}\n"
; //matrix_solve_col_row_align1_trans_unit_upper_solve
00481 
00482   }  //namespace kernels
00483  }  //namespace linalg
00484 }  //namespace viennacl
00485 #endif