|
ViennaCL - The Vienna Computing Library
1.1.2
|
#ifndef _VIENNACL_MATRIX_PROD_COL_COL_ROW_SOURCE_HPP_
#define _VIENNACL_MATRIX_PROD_COL_COL_ROW_SOURCE_HPP_
// NOTE(review): guard names starting with an underscore followed by an uppercase
// letter are reserved for the implementation in C++. Kept unchanged here because
// the name is produced by the generator; fix it there if it is ever renamed.
//Automatically generated file from aux-directory, do not edit manually!

// OpenCL C sources for single-precision matrix-matrix products, stored as string
// constants. Per the header comments inside each kernel, the operands use the
// layouts C...row_major, A...col_major, B...col_major.
//
// All four kernels share the same parameter list and the same tiling scheme:
//   * block_size is taken from get_local_size(0); the two __local buffers bufA and
//     bufB are indexed as [row_thread_id * block_size + col_thread_id].
//     (assumes a square work-group and block_size * block_size floats per buffer
//      -- TODO confirm against the host code that enqueues these kernels)
//   * the number of tiles along the summation dimension is rounded up, and tile
//     entries that fall outside the matrix bounds are loaded as 0, so the matrix
//     sizes do not have to be multiples of block_size.
//   * a barrier(CLK_LOCAL_MEM_FENCE) separates the tile load from the partial dot
//     product, and another one separates the dot product from the next tile load.
//   * the result is written to C[get_global_id(0) * C_internal_cols + get_global_id(1)]
//     only for work-items inside the bounds of the result.
// The kernels differ only in which operand is read transposed, i.e. in the start
// offsets, strides and bounds checks used when loading the tiles.
namespace viennacl
{
namespace linalg
{
namespace kernels
{

// Kernel "prod_TT": C = A^T * B^T.
// Sums over A_rows; writes entries with get_global_id(0) < A_cols and
// get_global_id(1) < B_rows.
const char * const matrix_prod_col_col_row_align1_prod_TT =
"// file automatically generated - do not edit!\n"
"// matrix-matrix multiplication C = A^T * B^T\n"
"// matrix layouts: C...row_major, A...col_major, B...col_major\n"
"__kernel void prod_TT(\n"
" __global const float * A,\n"
" unsigned int A_rows,\n"
" unsigned int A_cols,\n"
" unsigned int A_internal_rows,\n"
" unsigned int A_internal_cols,\n"
" __global const float * B, \n"
" unsigned int B_rows,\n"
" unsigned int B_cols,\n"
" unsigned int B_internal_rows,\n"
" unsigned int B_internal_cols,\n"
" __global float * C,\n"
" unsigned int C_rows,\n"
" unsigned int C_cols,\n"
" unsigned int C_internal_rows,\n"
" unsigned int C_internal_cols,\n"
" __local float * bufA,\n"
" __local float * bufB) \n"
"{ \n"
" int block_size = get_local_size(0);\n"
" int row_block_id = get_group_id(0);\n"
" int col_block_id = get_group_id(1);\n"
" int row_thread_id = get_local_id(0);\n"
" int col_thread_id = get_local_id(1);\n"
" int aBegin = row_block_id * block_size * A_internal_rows;\n"
" int aStep = block_size;\n"
" int bBegin = col_block_id * block_size;\n"
" int bStep = block_size * B_internal_rows;\n"
" int block_num = A_rows / block_size;\n"
" if (block_num * block_size != A_rows)\n"
" ++block_num;\n"
" float Csub = 0;\n"
" int aOffset = row_thread_id * A_internal_rows + col_thread_id;\n"
" int bOffset = row_thread_id * B_internal_rows + col_thread_id;\n"
" for (int block = 0;\n"
" block < block_num;\n"
" ++block)\n"
" {\n"
" if (block * block_size + col_thread_id < A_rows && get_global_id(0) < A_cols)\n"
" bufA[row_thread_id * block_size + col_thread_id] = A[aBegin + aOffset];\n"
" else\n"
" bufA[row_thread_id * block_size + col_thread_id] = 0;\n"
" if ( (block * block_size + row_thread_id < B_cols) && get_global_id(1) < B_rows )\n"
" bufB[row_thread_id * block_size + col_thread_id] = B[bBegin + bOffset]; \n"
" else\n"
" bufB[row_thread_id * block_size + col_thread_id] = 0;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" for (int k = 0; k < block_size; ++k)\n"
" Csub += bufA[row_thread_id * block_size + k] * bufB[k * block_size + col_thread_id];\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" aBegin += aStep;\n"
" bBegin += bStep;\n"
" }\n"
" if (get_global_id(0) < A_cols && get_global_id(1) < B_rows)\n"
" C[get_global_id(0) * C_internal_cols + get_global_id(1)] = Csub;\n"
"}\n"
; //matrix_prod_col_col_row_align1_prod_TT

// Kernel "prod_TA": C = A^T * B.
// Sums over A_rows; writes entries with get_global_id(0) < A_cols and
// get_global_id(1) < B_cols.
const char * const matrix_prod_col_col_row_align1_prod_TA =
"// file automatically generated - do not edit!\n"
"// matrix-matrix multiplication C = A^T * B\n"
"// matrix layouts: C...row_major, A...col_major, B...col_major\n"
"__kernel void prod_TA(\n"
" __global const float * A,\n"
" unsigned int A_rows,\n"
" unsigned int A_cols,\n"
" unsigned int A_internal_rows,\n"
" unsigned int A_internal_cols,\n"
" __global const float * B, \n"
" unsigned int B_rows,\n"
" unsigned int B_cols,\n"
" unsigned int B_internal_rows,\n"
" unsigned int B_internal_cols,\n"
" __global float * C,\n"
" unsigned int C_rows,\n"
" unsigned int C_cols,\n"
" unsigned int C_internal_rows,\n"
" unsigned int C_internal_cols,\n"
" __local float * bufA,\n"
" __local float * bufB) \n"
"{ \n"
" int block_size = get_local_size(0);\n"
" int row_block_id = get_group_id(0);\n"
" int col_block_id = get_group_id(1);\n"
" int row_thread_id = get_local_id(0);\n"
" int col_thread_id = get_local_id(1);\n"
" int aBegin = row_block_id * block_size * A_internal_rows;\n"
" int aStep = block_size;\n"
" int bBegin = col_block_id * block_size * B_internal_rows;\n"
" int bStep = block_size;\n"
" int block_num = A_rows / block_size;\n"
" if (block_num * block_size != A_rows)\n"
" ++block_num;\n"
" float Csub = 0;\n"
" int aOffset = row_thread_id * A_internal_rows + col_thread_id;\n"
" int bOffset = row_thread_id + col_thread_id * B_internal_rows;\n"
" for (int block = 0;\n"
" block < block_num;\n"
" ++block)\n"
" {\n"
" if (block * block_size + col_thread_id < A_rows && get_global_id(0) < A_cols)\n"
" bufA[row_thread_id * block_size + col_thread_id] = A[aBegin + aOffset];\n"
" else\n"
" bufA[row_thread_id * block_size + col_thread_id] = 0;\n"
" if ( (block * block_size + row_thread_id < B_rows) && get_global_id(1) < B_cols )\n"
" bufB[row_thread_id * block_size + col_thread_id] = B[bBegin + bOffset]; \n"
" else\n"
" bufB[row_thread_id * block_size + col_thread_id] = 0;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" for (int k = 0; k < block_size; ++k)\n"
" Csub += bufA[row_thread_id * block_size + k] * bufB[k * block_size + col_thread_id];\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" aBegin += aStep;\n"
" bBegin += bStep;\n"
" }\n"
" if (get_global_id(0) < A_cols && get_global_id(1) < B_cols)\n"
" C[get_global_id(0) * C_internal_cols + get_global_id(1)] = Csub;\n"
"}\n"
; //matrix_prod_col_col_row_align1_prod_TA

// Kernel "prod_AA": C = A * B.
// Sums over A_cols; writes entries with get_global_id(0) < A_rows and
// get_global_id(1) < B_cols.
const char * const matrix_prod_col_col_row_align1_prod_AA =
"// file automatically generated - do not edit!\n"
"// matrix-matrix multiplication C = A * B\n"
"// matrix layouts: C...row_major, A...col_major, B...col_major\n"
"__kernel void prod_AA(\n"
" __global const float * A,\n"
" unsigned int A_rows,\n"
" unsigned int A_cols,\n"
" unsigned int A_internal_rows,\n"
" unsigned int A_internal_cols,\n"
" __global const float * B, \n"
" unsigned int B_rows,\n"
" unsigned int B_cols,\n"
" unsigned int B_internal_rows,\n"
" unsigned int B_internal_cols,\n"
" __global float * C,\n"
" unsigned int C_rows,\n"
" unsigned int C_cols,\n"
" unsigned int C_internal_rows,\n"
" unsigned int C_internal_cols,\n"
" __local float * bufA,\n"
" __local float * bufB) \n"
"{ \n"
" int block_size = get_local_size(0);\n"
" int row_block_id = get_group_id(0);\n"
" int col_block_id = get_group_id(1);\n"
" int row_thread_id = get_local_id(0);\n"
" int col_thread_id = get_local_id(1);\n"
" int aBegin = row_block_id * block_size;\n"
" int aStep = block_size * A_internal_rows;\n"
" int bBegin = col_block_id * block_size * B_internal_rows;\n"
" int bStep = block_size;\n"
" int block_num = A_cols / block_size;\n"
" if (block_num * block_size != A_cols)\n"
" ++block_num;\n"
" float Csub = 0;\n"
" int aOffset = row_thread_id + col_thread_id * A_internal_rows;\n"
" int bOffset = row_thread_id + col_thread_id * B_internal_rows;\n"
" for (int block = 0;\n"
" block < block_num;\n"
" ++block)\n"
" {\n"
" if (block * block_size + col_thread_id < A_cols && get_global_id(0) < A_rows)\n"
" bufA[row_thread_id * block_size + col_thread_id] = A[aBegin + aOffset];\n"
" else\n"
" bufA[row_thread_id * block_size + col_thread_id] = 0;\n"
" if ( (block * block_size + row_thread_id < B_rows) && get_global_id(1) < B_cols )\n"
" bufB[row_thread_id * block_size + col_thread_id] = B[bBegin + bOffset]; \n"
" else\n"
" bufB[row_thread_id * block_size + col_thread_id] = 0;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" for (int k = 0; k < block_size; ++k)\n"
" Csub += bufA[row_thread_id * block_size + k] * bufB[k * block_size + col_thread_id];\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" aBegin += aStep;\n"
" bBegin += bStep;\n"
" }\n"
" if (get_global_id(0) < A_rows && get_global_id(1) < B_cols)\n"
" C[get_global_id(0) * C_internal_cols + get_global_id(1)] = Csub;\n"
"}\n"
; //matrix_prod_col_col_row_align1_prod_AA

// Kernel "prod_AT": C = A * B^T.
// Sums over A_cols; writes entries with get_global_id(0) < A_rows and
// get_global_id(1) < B_rows.
const char * const matrix_prod_col_col_row_align1_prod_AT =
"// file automatically generated - do not edit!\n"
"// matrix-matrix multiplication C = A * B^T\n"
"// matrix layouts: C...row_major, A...col_major, B...col_major\n"
"__kernel void prod_AT(\n"
" __global const float * A,\n"
" unsigned int A_rows,\n"
" unsigned int A_cols,\n"
" unsigned int A_internal_rows,\n"
" unsigned int A_internal_cols,\n"
" __global const float * B, \n"
" unsigned int B_rows,\n"
" unsigned int B_cols,\n"
" unsigned int B_internal_rows,\n"
" unsigned int B_internal_cols,\n"
" __global float * C,\n"
" unsigned int C_rows,\n"
" unsigned int C_cols,\n"
" unsigned int C_internal_rows,\n"
" unsigned int C_internal_cols,\n"
" __local float * bufA,\n"
" __local float * bufB) \n"
"{ \n"
" int block_size = get_local_size(0);\n"
" int row_block_id = get_group_id(0);\n"
" int col_block_id = get_group_id(1);\n"
" int row_thread_id = get_local_id(0);\n"
" int col_thread_id = get_local_id(1);\n"
" int aBegin = row_block_id * block_size;\n"
" int aStep = block_size * A_internal_rows;\n"
" int bBegin = col_block_id * block_size;\n"
" int bStep = block_size * B_internal_rows;\n"
" int block_num = A_cols / block_size;\n"
" if (block_num * block_size != A_cols)\n"
" ++block_num;\n"
" float Csub = 0;\n"
" int aOffset = row_thread_id + col_thread_id * A_internal_rows;\n"
" int bOffset = row_thread_id * B_internal_rows + col_thread_id;\n"
" for (int block = 0;\n"
" block < block_num;\n"
" ++block)\n"
" {\n"
" if (block * block_size + col_thread_id < A_cols && get_global_id(0) < A_rows)\n"
" bufA[row_thread_id * block_size + col_thread_id] = A[aBegin + aOffset];\n"
" else\n"
" bufA[row_thread_id * block_size + col_thread_id] = 0;\n"
" if ( (block * block_size + row_thread_id < B_cols) && get_global_id(1) < B_rows )\n"
" bufB[row_thread_id * block_size + col_thread_id] = B[bBegin + bOffset]; \n"
" else\n"
" bufB[row_thread_id * block_size + col_thread_id] = 0;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" for (int k = 0; k < block_size; ++k)\n"
" Csub += bufA[row_thread_id * block_size + k] * bufB[k * block_size + col_thread_id];\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" aBegin += aStep;\n"
" bBegin += bStep;\n"
" }\n"
" if (get_global_id(0) < A_rows && get_global_id(1) < B_rows)\n"
" C[get_global_id(0) * C_internal_cols + get_global_id(1)] = Csub;\n"
"}\n"
; //matrix_prod_col_col_row_align1_prod_AT

} //namespace kernels
} //namespace linalg
} //namespace viennacl
#endif
1.7.6.1