|
ViennaCL - The Vienna Computing Library
1.1.2
|
00001 #ifndef _VIENNACL_COORDINATE_MATRIX_SOURCE_HPP_ 00002 #define _VIENNACL_COORDINATE_MATRIX_SOURCE_HPP_ 00003 //Automatically generated file from aux-directory, do not edit manually! 00004 namespace viennacl 00005 { 00006 namespace linalg 00007 { 00008 namespace kernels 00009 { 00010 const char * const coordinate_matrix_align1_vec_mul = 00011 "//segmented parallel reduction. At present restricted to up to 256 threads\n" 00012 "void segmented_parallel_reduction(unsigned int row, \n" 00013 " float val, \n" 00014 " __local unsigned int * shared_rows, \n" 00015 " __local float * inter_results) \n" 00016 "{ \n" 00017 " //barrier(CLK_LOCAL_MEM_FENCE); \n" 00018 " shared_rows[get_local_id(0)] = row; \n" 00019 " inter_results[get_local_id(0)] = val; \n" 00020 " float left = 0;\n" 00021 " \n" 00022 " barrier(CLK_LOCAL_MEM_FENCE); \n" 00023 " if( get_local_id(0) >= 1 && row == shared_rows[get_local_id(0) - 1] ) { left = inter_results[get_local_id(0) - 1]; } \n" 00024 " barrier(CLK_LOCAL_MEM_FENCE); \n" 00025 " inter_results[get_local_id(0)] += left; left = 0;\n" 00026 " barrier(CLK_LOCAL_MEM_FENCE); \n" 00027 " if( get_local_id(0) >= 2 && row == shared_rows[get_local_id(0) - 2] ) { left = inter_results[get_local_id(0) - 2]; } \n" 00028 " barrier(CLK_LOCAL_MEM_FENCE); \n" 00029 " inter_results[get_local_id(0)] += left; left = 0;\n" 00030 " barrier(CLK_LOCAL_MEM_FENCE); \n" 00031 " if( get_local_id(0) >= 4 && row == shared_rows[get_local_id(0) - 4] ) { left = inter_results[get_local_id(0) - 4]; } \n" 00032 " barrier(CLK_LOCAL_MEM_FENCE); \n" 00033 " inter_results[get_local_id(0)] += left; left = 0;\n" 00034 " barrier(CLK_LOCAL_MEM_FENCE); \n" 00035 " if( get_local_id(0) >= 8 && row == shared_rows[get_local_id(0) - 8] ) { left = inter_results[get_local_id(0) - 8]; } \n" 00036 " barrier(CLK_LOCAL_MEM_FENCE); \n" 00037 " inter_results[get_local_id(0)] += left; left = 0;\n" 00038 " barrier(CLK_LOCAL_MEM_FENCE); \n" 00039 " if( get_local_id(0) >= 16 && row == shared_rows[get_local_id(0) - 16] ) { left = inter_results[get_local_id(0) - 16]; } \n" 00040 " barrier(CLK_LOCAL_MEM_FENCE); \n" 00041 " inter_results[get_local_id(0)] += left; left = 0;\n" 00042 " barrier(CLK_LOCAL_MEM_FENCE); \n" 00043 " if( get_local_id(0) >= 32 && row == shared_rows[get_local_id(0) - 32] ) { left = inter_results[get_local_id(0) - 32]; } \n" 00044 " barrier(CLK_LOCAL_MEM_FENCE); \n" 00045 " inter_results[get_local_id(0)] += left; left = 0;\n" 00046 " barrier(CLK_LOCAL_MEM_FENCE); \n" 00047 " if( get_local_id(0) >= 64 && row == shared_rows[get_local_id(0) - 64] ) { left = inter_results[get_local_id(0) - 64]; } \n" 00048 " barrier(CLK_LOCAL_MEM_FENCE); \n" 00049 " inter_results[get_local_id(0)] += left; left = 0;\n" 00050 " barrier(CLK_LOCAL_MEM_FENCE); \n" 00051 " if( get_local_id(0) >= 128 && row == shared_rows[get_local_id(0) - 128] ) { left = inter_results[get_local_id(0) - 128]; } \n" 00052 " barrier(CLK_LOCAL_MEM_FENCE); \n" 00053 " inter_results[get_local_id(0)] += left; left = 0;\n" 00054 " barrier(CLK_LOCAL_MEM_FENCE); \n" 00055 " //if( get_local_id(0) >= 256 && row == shared_rows[get_local_id(0) - 256] ) { left = inter_results[get_local_id(0) - 256]; } \n" 00056 " //barrier(CLK_LOCAL_MEM_FENCE); \n" 00057 " //inter_results[get_local_id(0)] += left; left = 0;\n" 00058 " //barrier(CLK_LOCAL_MEM_FENCE); \n" 00059 "}\n" 00060 "__kernel void vec_mul( \n" 00061 " __global const uint2 * coords, //(row_index, column_index) \n" 00062 " __global const float * elements, \n" 00063 " __global const uint * group_boundaries,\n" 00064 " __global const float * vector, \n" 00065 " __global float * result, \n" 00066 " __local unsigned int * shared_rows, \n" 00067 " __local float * inter_results) \n" 00068 "{ \n" 00069 " uint2 tmp; \n" 00070 " float val;\n" 00071 " uint last_index = get_local_size(0) - 1;\n" 00072 " uint group_start = group_boundaries[get_group_id(0)];\n" 00073 " uint group_end = group_boundaries[get_group_id(0) + 1];\n" 00074 " uint k_end = 1 + (group_end - group_start - 1) / get_local_size(0); // -1 in order to have correct behavior if group_end - group_start == j * get_local_size(0)\n" 00075 " uint local_index = 0;\n" 00076 " for (uint k = 0; k < k_end; ++k)\n" 00077 " { \n" 00078 " barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); \n" 00079 " \n" 00080 " local_index = group_start + k * get_local_size(0) + get_local_id(0); \n" 00081 " \n" 00082 " if (local_index < group_end)\n" 00083 " {\n" 00084 " tmp = coords[local_index]; \n" 00085 " val = elements[local_index] * vector[tmp.y]; \n" 00086 " }\n" 00087 " else\n" 00088 " {\n" 00089 " tmp.x = 0;\n" 00090 " tmp.y = 0;\n" 00091 " val = 0;\n" 00092 " }\n" 00093 " barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); \n" 00094 " //check for carry from previous loop run: \n" 00095 " if (get_local_id(0) == 0 && k > 0)\n" 00096 " { \n" 00097 " if (tmp.x == shared_rows[last_index]) \n" 00098 " val += inter_results[last_index]; \n" 00099 " else \n" 00100 " result[shared_rows[last_index]] += inter_results[last_index]; \n" 00101 " } \n" 00102 " barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); \n" 00103 " segmented_parallel_reduction(tmp.x, val, shared_rows, inter_results); //all threads have to enter this function\n" 00104 " barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); \n" 00105 " if (get_local_id(0) != last_index &&\n" 00106 " shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1] &&\n" 00107 " inter_results[get_local_id(0)] != 0) \n" 00108 " { \n" 00109 " result[tmp.x] += inter_results[get_local_id(0)]; \n" 00110 " }\n" 00111 " \n" 00112 " barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); \n" 00113 " } //for k\n" 00114 " \n" 00115 " if (get_local_id(0) == last_index && inter_results[last_index] != 0) \n" 00116 " result[tmp.x] += inter_results[last_index]; \n" 00117 "}\n" 00118 ; //coordinate_matrix_align1_vec_mul 00119 00120 } //namespace kernels 00121 } //namespace linalg 00122 } //namespace viennacl 00123 #endif
1.7.6.1