|
ViennaCL - The Vienna Computing Library
1.1.2
|
00001 #ifndef _VIENNACL_COMPRESSED_MATRIX_SOURCE_HPP_ 00002 #define _VIENNACL_COMPRESSED_MATRIX_SOURCE_HPP_ 00003 //Automatically generated file from aux-directory, do not edit manually! 00004 namespace viennacl 00005 { 00006 namespace linalg 00007 { 00008 namespace kernels 00009 { 00010 const char * const compressed_matrix_align4_vec_mul = 00011 "__kernel void vec_mul(\n" 00012 " __global const unsigned int * row_indices,\n" 00013 " __global const uint4 * column_indices, \n" 00014 " __global const float4 * elements,\n" 00015 " __global const float * vector, \n" 00016 " __global float * result,\n" 00017 " unsigned int size)\n" 00018 "{ \n" 00019 " float dot_prod;\n" 00020 " unsigned int start, next_stop;\n" 00021 " uint4 col_idx;\n" 00022 " float4 tmp_vec;\n" 00023 " float4 tmp_entries;\n" 00024 " for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n" 00025 " {\n" 00026 " dot_prod = 0.0f;\n" 00027 " start = row_indices[row] / 4;\n" 00028 " next_stop = row_indices[row+1] / 4;\n" 00029 " for (unsigned int i = start; i < next_stop; ++i)\n" 00030 " {\n" 00031 " col_idx = column_indices[i];\n" 00032 " tmp_entries = elements[i];\n" 00033 " tmp_vec.x = vector[col_idx.x];\n" 00034 " tmp_vec.y = vector[col_idx.y];\n" 00035 " tmp_vec.z = vector[col_idx.z];\n" 00036 " tmp_vec.w = vector[col_idx.w];\n" 00037 " dot_prod += dot(tmp_entries, tmp_vec);\n" 00038 " }\n" 00039 " result[row] = dot_prod;\n" 00040 " }\n" 00041 "}\n" 00042 ; //compressed_matrix_align4_vec_mul 00043 00044 const char * const compressed_matrix_align1_jacobi_precond = 00045 "__kernel void jacobi_precond(\n" 00046 " __global const unsigned int * row_indices,\n" 00047 " __global const unsigned int * column_indices, \n" 00048 " __global const float * elements,\n" 00049 " __global float * diag_M_inv,\n" 00050 " unsigned int size) \n" 00051 "{ \n" 00052 " for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n" 00053 " {\n" 00054 " float diag = 1.0f;\n" 00055 " unsigned int row_end = row_indices[row+1];\n" 00056 " for (unsigned int i = row_indices[row]; i < row_end; ++i)\n" 00057 " {\n" 00058 " if (row == column_indices[i])\n" 00059 " {\n" 00060 " diag = elements[i];\n" 00061 " break;\n" 00062 " }\n" 00063 " }\n" 00064 " diag_M_inv[row] = 1.0f / diag;\n" 00065 " }\n" 00066 "}\n" 00067 ; //compressed_matrix_align1_jacobi_precond 00068 00069 const char * const compressed_matrix_align1_lu_forward = 00070 " \n" 00071 "// compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format\n" 00072 "__kernel void lu_forward(\n" 00073 " __global const unsigned int * row_indices,\n" 00074 " __global const unsigned int * column_indices, \n" 00075 " __global const float * elements,\n" 00076 " __local int * buffer, \n" 00077 " __local float * vec_entries, //a memory block from vector\n" 00078 " __global float * vector,\n" 00079 " unsigned int size) \n" 00080 "{\n" 00081 " int waiting_for; //block index that must be finished before the current thread can start\n" 00082 " unsigned int waiting_for_index;\n" 00083 " int block_offset;\n" 00084 " unsigned int col;\n" 00085 " unsigned int row;\n" 00086 " unsigned int row_index_end;\n" 00087 " \n" 00088 " //backward substitution: one thread per row in blocks of get_global_size(0)\n" 00089 " for (unsigned int block_num = 0; block_num <= size / get_global_size(0); ++block_num)\n" 00090 " {\n" 00091 " block_offset = block_num * get_global_size(0);\n" 00092 " row = block_offset + get_global_id(0);\n" 00093 " buffer[get_global_id(0)] = 0; //set flag to 'undone'\n" 00094 " waiting_for = -1;\n" 00095 " if (row < size)\n" 00096 " {\n" 00097 " vec_entries[get_global_id(0)] = vector[row];\n" 00098 " waiting_for_index = row_indices[row];\n" 00099 " row_index_end = row_indices[row+1];\n" 00100 " }\n" 00101 " \n" 00102 " if (get_global_id(0) == 0)\n" 00103 " buffer[get_global_size(0)] = 1;\n" 00104 " //try to eliminate all lines in the block. \n" 00105 " //in worst case scenarios, in each step only one line can be substituted, thus loop\n" 00106 " for (unsigned int k = 0; k<get_global_size(0); ++k)\n" 00107 " {\n" 00108 " barrier(CLK_LOCAL_MEM_FENCE);\n" 00109 " if (row < size) //valid index?\n" 00110 " {\n" 00111 " if (waiting_for >= 0)\n" 00112 " {\n" 00113 " if (buffer[waiting_for] == 1)\n" 00114 " waiting_for = -1;\n" 00115 " }\n" 00116 " \n" 00117 " if (waiting_for == -1) //substitution not yet done, check whether possible\n" 00118 " {\n" 00119 " //check whether reduction is possible:\n" 00120 " for (unsigned int j = waiting_for_index; j < row_index_end; ++j)\n" 00121 " {\n" 00122 " col = column_indices[j];\n" 00123 " if (col < block_offset) //index valid, but not from current block\n" 00124 " vec_entries[get_global_id(0)] -= elements[j] * vector[col];\n" 00125 " else if (col < row) //index is from current block\n" 00126 " {\n" 00127 " if (buffer[col - block_offset] == 0) //entry is not yet calculated\n" 00128 " {\n" 00129 " waiting_for = col - block_offset;\n" 00130 " waiting_for_index = j;\n" 00131 " break;\n" 00132 " }\n" 00133 " else //updated entry is available in shared memory:\n" 00134 " vec_entries[get_global_id(0)] -= elements[j] * vec_entries[col - block_offset];\n" 00135 " }\n" 00136 " }\n" 00137 " \n" 00138 " if (waiting_for == -1) //this row is done\n" 00139 " {\n" 00140 " buffer[get_global_id(0)] = 1;\n" 00141 " waiting_for = -2; //magic number: thread is finished\n" 00142 " }\n" 00143 " } \n" 00144 " } //row < size\n" 00145 " else\n" 00146 " buffer[get_global_id(0)] = 1; //work done (because there is no work to be done at all...)\n" 00147 " ///////// check whether all threads are done. If yes, exit loop /////////////\n" 00148 " \n" 00149 " if (buffer[get_global_id(0)] == 0)\n" 00150 " buffer[get_global_size(0)] = 0;\n" 00151 " barrier(CLK_LOCAL_MEM_FENCE);\n" 00152 " \n" 00153 " if (buffer[get_global_size(0)] > 0) //all threads break this loop simultaneously\n" 00154 " break;\n" 00155 " if (get_global_id(0) == 0)\n" 00156 " buffer[get_global_size(0)] = 1;\n" 00157 " } //for k\n" 00158 " \n" 00159 " //write to vector:\n" 00160 " if (row < size)\n" 00161 " vector[row] = vec_entries[get_global_id(0)];\n" 00162 " \n" 00163 " barrier(CLK_GLOBAL_MEM_FENCE);\n" 00164 " } //for block_num\n" 00165 "}\n" 00166 ; //compressed_matrix_align1_lu_forward 00167 00168 const char * const compressed_matrix_align1_bicgstab_kernel1 = 00169 "void helper_bicgstab_kernel1_parallel_reduction( __local float * tmp_buffer )\n" 00170 "{\n" 00171 " for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n" 00172 " {\n" 00173 " barrier(CLK_LOCAL_MEM_FENCE);\n" 00174 " if (get_local_id(0) < stride)\n" 00175 " tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];\n" 00176 " }\n" 00177 "}\n" 00178 "//////// inner products:\n" 00179 "float bicgstab_kernel1_inner_prod(\n" 00180 " __global const float * vec1,\n" 00181 " __global const float * vec2,\n" 00182 " unsigned int size,\n" 00183 " __local float * tmp_buffer)\n" 00184 "{\n" 00185 " float tmp = 0;\n" 00186 " unsigned int i_end = ((size - 1) / get_local_size(0) + 1) * get_local_size(0);\n" 00187 " for (unsigned int i = get_local_id(0); i < i_end; i += get_local_size(0))\n" 00188 " {\n" 00189 " if (i < size)\n" 00190 " tmp += vec1[i] * vec2[i];\n" 00191 " }\n" 00192 " tmp_buffer[get_local_id(0)] = tmp;\n" 00193 " \n" 00194 " helper_bicgstab_kernel1_parallel_reduction(tmp_buffer);\n" 00195 " barrier(CLK_LOCAL_MEM_FENCE);\n" 00196 " return tmp_buffer[0];\n" 00197 "}\n" 00198 "__kernel void bicgstab_kernel1(\n" 00199 " __global const float * tmp0,\n" 00200 " __global const float * r0star, \n" 00201 " __global const float * residual,\n" 00202 " __global float * s,\n" 00203 " __global float * alpha,\n" 00204 " __global const float * ip_rr0star,\n" 00205 " __local float * tmp_buffer,\n" 00206 " unsigned int size) \n" 00207 "{ \n" 00208 " float alpha_local = ip_rr0star[0] / bicgstab_kernel1_inner_prod(tmp0, r0star, size, tmp_buffer);\n" 00209 " \n" 00210 " for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n" 00211 " s[i] = residual[i] - alpha_local * tmp0[i];\n" 00212 " \n" 00213 " if (get_global_id(0) == 0)\n" 00214 " alpha[0] = alpha_local;\n" 00215 "}\n" 00216 ; //compressed_matrix_align1_bicgstab_kernel1 00217 00218 const char * const compressed_matrix_align1_lu_backward = 00219 "// compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format\n" 00220 "__kernel void lu_backward(\n" 00221 " __global const unsigned int * row_indices,\n" 00222 " __global const unsigned int * column_indices, \n" 00223 " __global const float * elements,\n" 00224 " __local int * buffer, \n" 00225 " __local float * vec_entries, //a memory block from vector\n" 00226 " __global float * vector,\n" 00227 " unsigned int size) \n" 00228 "{\n" 00229 " int waiting_for; //block index that must be finished before the current thread can start\n" 00230 " unsigned int waiting_for_index;\n" 00231 " unsigned int block_offset;\n" 00232 " unsigned int col;\n" 00233 " unsigned int row;\n" 00234 " unsigned int row_index_end;\n" 00235 " float diagonal_entry = 42;\n" 00236 " \n" 00237 " //forward substitution: one thread per row in blocks of get_global_size(0)\n" 00238 " for (int block_num = size / get_global_size(0); block_num > -1; --block_num)\n" 00239 " {\n" 00240 " block_offset = block_num * get_global_size(0);\n" 00241 " row = block_offset + get_global_id(0);\n" 00242 " buffer[get_global_id(0)] = 0; //set flag to 'undone'\n" 00243 " waiting_for = -1;\n" 00244 " \n" 00245 " if (row < size)\n" 00246 " {\n" 00247 " vec_entries[get_global_id(0)] = vector[row];\n" 00248 " waiting_for_index = row_indices[row];\n" 00249 " row_index_end = row_indices[row+1];\n" 00250 " diagonal_entry = column_indices[waiting_for_index];\n" 00251 " }\n" 00252 " \n" 00253 " if (get_global_id(0) == 0)\n" 00254 " buffer[get_global_size(0)] = 1;\n" 00255 " //try to eliminate all lines in the block. \n" 00256 " //in worst case scenarios, in each step only one line can be substituted, thus loop\n" 00257 " for (unsigned int k = 0; k<get_global_size(0); ++k)\n" 00258 " {\n" 00259 " barrier(CLK_LOCAL_MEM_FENCE);\n" 00260 " if (row < size) //valid index?\n" 00261 " {\n" 00262 " if (waiting_for >= 0)\n" 00263 " {\n" 00264 " if (buffer[waiting_for] == 1)\n" 00265 " waiting_for = -1;\n" 00266 " }\n" 00267 " \n" 00268 " if (waiting_for == -1) //substitution not yet done, check whether possible\n" 00269 " {\n" 00270 " //check whether reduction is possible:\n" 00271 " for (unsigned int j = waiting_for_index; j < row_index_end; ++j)\n" 00272 " {\n" 00273 " col = column_indices[j];\n" 00274 " barrier(CLK_LOCAL_MEM_FENCE);\n" 00275 " if (col >= block_offset + get_global_size(0)) //index valid, but not from current block\n" 00276 " vec_entries[get_global_id(0)] -= elements[j] * vector[col];\n" 00277 " else if (col > row) //index is from current block\n" 00278 " {\n" 00279 " if (buffer[col - block_offset] == 0) //entry is not yet calculated\n" 00280 " {\n" 00281 " waiting_for = col - block_offset;\n" 00282 " waiting_for_index = j;\n" 00283 " break;\n" 00284 " }\n" 00285 " else //updated entry is available in shared memory:\n" 00286 " vec_entries[get_global_id(0)] -= elements[j] * vec_entries[col - block_offset];\n" 00287 " }\n" 00288 " else if (col == row)\n" 00289 " diagonal_entry = elements[j];\n" 00290 " }\n" 00291 " \n" 00292 " if (waiting_for == -1) //this row is done\n" 00293 " {\n" 00294 " if (row == 0)\n" 00295 " vec_entries[get_global_id(0)] /= elements[0];\n" 00296 " else\n" 00297 " vec_entries[get_global_id(0)] /= diagonal_entry;\n" 00298 " buffer[get_global_id(0)] = 1;\n" 00299 " waiting_for = -2; //magic number: thread is finished\n" 00300 " }\n" 00301 " } \n" 00302 " } //row < size\n" 00303 " else\n" 00304 " buffer[get_global_id(0)] = 1; //work done (because there is no work to be done at all...)\n" 00305 " \n" 00306 " ///////// check whether all threads are done. If yes, exit loop /////////////\n" 00307 " if (buffer[get_global_id(0)] == 0)\n" 00308 " buffer[get_global_size(0)] = 0;\n" 00309 " barrier(CLK_LOCAL_MEM_FENCE);\n" 00310 " \n" 00311 " if (buffer[get_global_size(0)] > 0) //all threads break the loop simultaneously\n" 00312 " break;\n" 00313 " if (get_global_id(0) == 0)\n" 00314 " buffer[get_global_size(0)] = 1;\n" 00315 " } //for k\n" 00316 " if (row < size)\n" 00317 " vector[row] = vec_entries[get_global_id(0)];\n" 00318 " //vector[row] = diagonal_entry;\n" 00319 " \n" 00320 " //if (row == 0)\n" 00321 " //vector[0] = diagonal_entry;\n" 00322 " //vector[0] = elements[0];\n" 00323 " barrier(CLK_GLOBAL_MEM_FENCE);\n" 00324 " } //for block_num\n" 00325 "}\n" 00326 ; //compressed_matrix_align1_lu_backward 00327 00328 const char * const compressed_matrix_align1_row_scaling_2 = 00329 "__kernel void row_scaling_2(\n" 00330 " __global const unsigned int * row_indices,\n" 00331 " __global const unsigned int * column_indices, \n" 00332 " __global const float * elements,\n" 00333 " __global float * diag_M_inv,\n" 00334 " unsigned int size) \n" 00335 "{ \n" 00336 " for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n" 00337 " {\n" 00338 " float dot_prod = 0.0f;\n" 00339 " float temp = 0.0f;\n" 00340 " unsigned int row_end = row_indices[row+1];\n" 00341 " for (unsigned int i = row_indices[row]; i < row_end; ++i)\n" 00342 " {\n" 00343 " temp = elements[i];\n" 00344 " dot_prod += temp * temp;\n" 00345 " }\n" 00346 " diag_M_inv[row] = 1.0f / sqrt(dot_prod);\n" 00347 " }\n" 00348 "}\n" 00349 ; //compressed_matrix_align1_row_scaling_2 00350 00351 const char * const compressed_matrix_align1_vec_mul = 00352 "__kernel void vec_mul(\n" 00353 " __global const unsigned int * row_indices,\n" 00354 " __global const unsigned int * column_indices, \n" 00355 " __global const float * elements,\n" 00356 " __global const float * vector, \n" 00357 " __global float * result,\n" 00358 " unsigned int size) \n" 00359 "{ \n" 00360 " for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n" 00361 " {\n" 00362 " float dot_prod = 0.0f;\n" 00363 " unsigned int row_end = row_indices[row+1];\n" 00364 " for (unsigned int i = row_indices[row]; i < row_end; ++i)\n" 00365 " dot_prod += elements[i] * vector[column_indices[i]];\n" 00366 " result[row] = dot_prod;\n" 00367 " }\n" 00368 "}\n" 00369 ; //compressed_matrix_align1_vec_mul 00370 00371 const char * const compressed_matrix_align1_bicgstab_kernel2 = 00372 "void helper_bicgstab_kernel2_parallel_reduction( __local float * tmp_buffer )\n" 00373 "{\n" 00374 " for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n" 00375 " {\n" 00376 " barrier(CLK_LOCAL_MEM_FENCE);\n" 00377 " if (get_local_id(0) < stride)\n" 00378 " tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];\n" 00379 " }\n" 00380 "}\n" 00381 "//////// inner products:\n" 00382 "float bicgstab_kernel2_inner_prod(\n" 00383 " __global const float * vec1,\n" 00384 " __global const float * vec2,\n" 00385 " unsigned int size,\n" 00386 " __local float * tmp_buffer)\n" 00387 "{\n" 00388 " float tmp = 0;\n" 00389 " unsigned int i_end = ((size - 1) / get_local_size(0) + 1) * get_local_size(0);\n" 00390 " for (unsigned int i = get_local_id(0); i < i_end; i += get_local_size(0))\n" 00391 " {\n" 00392 " if (i < size)\n" 00393 " tmp += vec1[i] * vec2[i];\n" 00394 " }\n" 00395 " tmp_buffer[get_local_id(0)] = tmp;\n" 00396 " \n" 00397 " helper_bicgstab_kernel2_parallel_reduction(tmp_buffer);\n" 00398 " barrier(CLK_LOCAL_MEM_FENCE);\n" 00399 " return tmp_buffer[0];\n" 00400 "}\n" 00401 "__kernel void bicgstab_kernel2(\n" 00402 " __global const float * tmp0,\n" 00403 " __global const float * tmp1,\n" 00404 " __global const float * r0star, \n" 00405 " __global const float * s, \n" 00406 " __global float * p, \n" 00407 " __global float * result,\n" 00408 " __global float * residual,\n" 00409 " __global const float * alpha,\n" 00410 " __global float * ip_rr0star,\n" 00411 " __global float * error_estimate,\n" 00412 " __local float * tmp_buffer,\n" 00413 " unsigned int size) \n" 00414 "{ \n" 00415 " float omega_local = bicgstab_kernel2_inner_prod(tmp1, s, size, tmp_buffer) / bicgstab_kernel2_inner_prod(tmp1, tmp1, size, tmp_buffer);\n" 00416 " float alpha_local = alpha[0];\n" 00417 " \n" 00418 " //result += alpha * p + omega * s;\n" 00419 " for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n" 00420 " result[i] += alpha_local * p[i] + omega_local * s[i];\n" 00421 " //residual = s - omega * tmp1;\n" 00422 " for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n" 00423 " residual[i] = s[i] - omega_local * tmp1[i];\n" 00424 " //new_ip_rr0star = viennacl::linalg::inner_prod(residual, r0star);\n" 00425 " float new_ip_rr0star = bicgstab_kernel2_inner_prod(residual, r0star, size, tmp_buffer);\n" 00426 " float beta = (new_ip_rr0star / ip_rr0star[0]) * (alpha_local / omega_local);\n" 00427 " \n" 00428 " //p = residual + beta * (p - omega*tmp0);\n" 00429 " for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n" 00430 " p[i] = residual[i] + beta * (p[i] - omega_local * tmp0[i]);\n" 00431 " //compute norm of residual:\n" 00432 " float new_error_estimate = bicgstab_kernel2_inner_prod(residual, residual, size, tmp_buffer);\n" 00433 " barrier(CLK_GLOBAL_MEM_FENCE);\n" 00434 " //update values:\n" 00435 " if (get_global_id(0) == 0)\n" 00436 " {\n" 00437 " error_estimate[0] = new_error_estimate;\n" 00438 " ip_rr0star[0] = new_ip_rr0star;\n" 00439 " }\n" 00440 "}\n" 00441 ; //compressed_matrix_align1_bicgstab_kernel2 00442 00443 const char * const compressed_matrix_align1_row_scaling_1 = 00444 "__kernel void row_scaling_1(\n" 00445 " __global const unsigned int * row_indices,\n" 00446 " __global const unsigned int * column_indices, \n" 00447 " __global const float * elements,\n" 00448 " __global float * diag_M_inv,\n" 00449 " unsigned int size) \n" 00450 "{ \n" 00451 " for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n" 00452 " {\n" 00453 " float dot_prod = 0.0f;\n" 00454 " unsigned int row_end = row_indices[row+1];\n" 00455 " for (unsigned int i = row_indices[row]; i < row_end; ++i)\n" 00456 " dot_prod += fabs(elements[i]);\n" 00457 " diag_M_inv[row] = 1.0f / dot_prod;\n" 00458 " }\n" 00459 "}\n" 00460 ; //compressed_matrix_align1_row_scaling_1 00461 00462 const char * const compressed_matrix_align8_vec_mul = 00463 "__kernel void vec_mul(\n" 00464 " __global const unsigned int * row_indices,\n" 00465 " __global const uint8 * column_indices, \n" 00466 " __global const float8 * elements,\n" 00467 " __global const float * vector, \n" 00468 " __global float * result,\n" 00469 " unsigned int size)\n" 00470 "{ \n" 00471 " float dot_prod;\n" 00472 " unsigned int start, next_stop;\n" 00473 " uint8 col_idx;\n" 00474 " float8 tmp_vec;\n" 00475 " float8 tmp_entries;\n" 00476 " for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n" 00477 " {\n" 00478 " dot_prod = 0.0f;\n" 00479 " start = row_indices[row] / 8;\n" 00480 " next_stop = row_indices[row+1] / 8;\n" 00481 " for (unsigned int i = start; i < next_stop; ++i)\n" 00482 " {\n" 00483 " col_idx = column_indices[i];\n" 00484 " tmp_entries = elements[i];\n" 00485 " tmp_vec.s0 = vector[col_idx.s0];\n" 00486 " tmp_vec.s1 = vector[col_idx.s1];\n" 00487 " tmp_vec.s2 = vector[col_idx.s2];\n" 00488 " tmp_vec.s3 = vector[col_idx.s3];\n" 00489 " tmp_vec.s4 = vector[col_idx.s4];\n" 00490 " tmp_vec.s5 = vector[col_idx.s5];\n" 00491 " tmp_vec.s6 = vector[col_idx.s6];\n" 00492 " tmp_vec.s7 = vector[col_idx.s7];\n" 00493 " dot_prod += dot(tmp_entries.lo, tmp_vec.lo);\n" 00494 " dot_prod += dot(tmp_entries.hi, tmp_vec.hi);\n" 00495 " }\n" 00496 " result[row] = dot_prod;\n" 00497 " }\n" 00498 "}\n" 00499 ; //compressed_matrix_align8_vec_mul 00500 00501 } //namespace kernels 00502 } //namespace linalg 00503 } //namespace viennacl 00504 #endif
1.7.6.1