//###########################################################################// // Fast matrix multiplication with NVIDIA's CUDA // // // // Hongliang Gao, University of Central Florida // // Feb. 2008 // //###########################################################################// // Disclaimer: // // This code is provided on an "AS IS" basis, without warranty. The author // // does not have any liability to you or any other person or entity with // // respect to any liability, loss, or damage caused or alleged to have been // // caused directly or indirectly by this code. // // // // Credits: // // NVIDIA: for samples included in the CUDA SDK. // // Vasily Volkov @ UC Berkeley: for the code of using 16x64 tiles posted // // on NVIDIA's Forum. // // Paul Leventis @ Altera Corp: for prefetching. // // Wladimir J. van der Laan @ the University of Groningen: // // for his cubin disassembler (http://www.cs.rug.nl/~wladimir/decuda/). // //###########################################################################// // // // This code includes kernel code of three algorithms of matrix // // multiplications with CUDA: // // 1. A simple tiled algorithm using 16*16 tiles.The kernel function is // // matrix_simpletile. // // 2. Two tiled algorithms using 16*256 tiles. One is coded for easy // // understanding (matrix_large_tile_base). The other is optimized with // // prefetching (matrix_large_tile_optimized). // // // // The matrix_large_tile_optimized is tuned for peak performance with 4k*4k // // matrix size on a 512MB GEFORCE 8800GT card. // // // // Instructions: // // Please make sure the "common" directory of CUDA SDK samples is located// // at "../../common". If not, the Makefile has to be changed accordingly. // // When matrix size is smaller than 4k*4k: use "make maxregisters=32" // // to compile. Otherwise, just use "make" to compile. // //###########################################################################// // All matrix sizes are MATRIX_WIDTH x MATRIX_WIDTH // The MATRIX_WIDTH has to be a multiple of 256 for the large_tiled algorithms #define MATRIX_WIDTH 4096 // Verify results with those calculated by CPU //#define VERIFY // Including data transfer time between CPU and GPU // If not defined, only computation time on GPU is counted into the // performance calculation. //#define PERFORMANCE_TOTAL #include #include #include #include #include #include #include //cublas library #include #define MATRIX_SIZE (MATRIX_WIDTH * MATRIX_WIDTH) #define MATRIX_MEMSIZE (MATRIX_SIZE * 4) //----kernels---- // Calculation: C = A * B // All matrixes are stored in row-major format. // A tiled matrix multiplication algorithm using 16 * 16 tiles. The subtiles // of source matrixes are stored in shared memory. __global__ void matrix_simpletile( float* C, float* A, float* B) { int bx = blockIdx.x; int by = blockIdx.y; int tx = threadIdx.x; int ty = threadIdx.y; // beginning positions of the first subtile int a = MATRIX_WIDTH * 16 * by; int b = 16 * bx; float Psub = 0; // for each subtile in A and B for (int i = 0; i < MATRIX_WIDTH / 16; i++) { // shared memory arrays for source subtiles __shared__ float As[16][16]; __shared__ float Bs[16][16]; // each thread reads one from each source matrix As[ty][tx] = A[a + MATRIX_WIDTH * ty + tx]; Bs[ty][tx] = B[b + MATRIX_WIDTH * ty + tx]; __syncthreads(); #pragma unroll for (int k = 0; k < 16; ++k) Psub += As[ty][k] * Bs[k][tx]; __syncthreads(); // move to next subtile a += 16; b += 16 *MATRIX_WIDTH; } int c = MATRIX_WIDTH * 16 * by + 16 * bx; C[c + MATRIX_WIDTH * ty + tx] = Psub; } // Compute 16 MADs (Multiply-and-Add). One MAD for each output value. __device__ void comp16(float a, float *b, float *c) { c[0] += a*b[0]; c[1] += a*b[1]; c[2] += a*b[2]; c[3] += a*b[3]; c[4] += a*b[4]; c[5] += a*b[5]; c[6] += a*b[6]; c[7] += a*b[7]; c[8] += a*b[8]; c[9] += a*b[9]; c[10] += a*b[10]; c[11] += a*b[11]; c[12] += a*b[12]; c[13] += a*b[13]; c[14] += a*b[14]; c[15] += a*b[15]; } // A tiled matrix multiplication algorithm using 16 * 256 tiles. The subtiles // of source matrix A are stored in shared memory. The subtiles of source // matrix B are read one value a time into a register. // There are 256 threads for each tile (size: 16 * 256) of the output matrix. // Each thread will calcuate 16 values for the output matrix. __global__ void matrix_large_tile_base(float *A, float *B, float *C) { const int tx = threadIdx.x; const int ty = threadIdx.y; int bx = blockIdx.x * 256; int by = blockIdx.y * 16; A += by * MATRIX_WIDTH + tx + ty * MATRIX_WIDTH; B += bx + tx + ty * 16; C += by * MATRIX_WIDTH + bx + tx + ty * 16; // Subtile of matrix A is stored in shared memory. // A padding row is added to remove bank conflicts. __shared__ float ashare[16][17]; // 16 output values. float c[16] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; // A register to store values from the subtile of matrix B. float b; // for each subtile in A and B for (int i = 0; i < MATRIX_WIDTH/16; i++) { // read in the subtile of A ashare[tx][ty] = A[0]; __syncthreads(); #pragma unroll for (int k = 0; k < 16; k++) { // read in one value of subtile of B, which is shared by // all 16 output values. b = B[k * MATRIX_WIDTH]; comp16(b, &ashare[k][0], c); } // move to next subtile A += 16; B += 16 * MATRIX_WIDTH; __syncthreads(); }; for(int i = 0; i < 16; i++) { C[0] = c[i]; C += MATRIX_WIDTH; } } // An optimized version of the algorithm with 16 * 256 tiles. __global__ void matrix_large_tile_optimized(float *A, float *B, float *C) { const int tx = threadIdx.x; const int ty = threadIdx.y; int bx = blockIdx.x * 256; int by = blockIdx.y * 16; A += by * MATRIX_WIDTH + tx + ty * MATRIX_WIDTH; B += bx + tx + ty * 16; C += by * MATRIX_WIDTH + bx + tx + ty * 16; // a is the register used to prefetch a value from A float a = A[0]; // b is the array to prefetch 4 values a time from B float b[4] = {B[0], B[MATRIX_WIDTH], B[2*MATRIX_WIDTH], B[3*MATRIX_WIDTH]}; float *Alast = A + MATRIX_WIDTH; A += 16; __shared__ float ashare[16][17]; float c[16] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; // bb is used to buffer prefetched values in b float bb[4]; do { ashare[tx][ty] = a; __syncthreads(); // prefetch A[0] a = A[0]; bb[0] = b[0]; bb[1] = b[1]; bb[2] = b[2]; bb[3] = b[3]; b[0] = B[4 * MATRIX_WIDTH]; b[1] = B[5 * MATRIX_WIDTH]; b[2] = B[6 * MATRIX_WIDTH]; b[3] = B[7 * MATRIX_WIDTH]; for (int i = 0; i < 4; i ++) comp16(bb[i], &ashare[i][0], c); bb[0] = b[0]; bb[1] = b[1]; bb[2] = b[2]; bb[3] = b[3]; b[0] = B[8 * MATRIX_WIDTH]; b[1] = B[9 * MATRIX_WIDTH]; b[2] = B[10 * MATRIX_WIDTH]; b[3] = B[11 * MATRIX_WIDTH]; for (int i = 0; i < 4; i ++) comp16(bb[i], &ashare[i + 4][0], c); bb[0] = b[0]; bb[1] = b[1]; bb[2] = b[2]; bb[3] = b[3]; b[0] = B[12 * MATRIX_WIDTH]; b[1] = B[13 * MATRIX_WIDTH]; b[2] = B[14 * MATRIX_WIDTH]; b[3] = B[15 * MATRIX_WIDTH]; for (int i = 0; i < 4; i ++) comp16(bb[i], &ashare[i + 8][0], c); bb[0] = b[0]; bb[1] = b[1]; bb[2] = b[2]; bb[3] = b[3]; A += 16; B += 16 * MATRIX_WIDTH; b[0] = B[0 * MATRIX_WIDTH]; b[1] = B[1 * MATRIX_WIDTH]; b[2] = B[2 * MATRIX_WIDTH]; b[3] = B[3 * MATRIX_WIDTH]; for (int i = 0; i < 4; i ++) comp16(bb[i], &ashare[i + 12][0], c); __syncthreads(); } while( A < Alast ); // to prevent overflow of prefetching instructions in the loop, we // calculate the last iteration here without prefetches: ashare[tx][ty] = a; __syncthreads(); bb[0] = b[0]; bb[1] = b[1]; bb[2] = b[2]; bb[3] = b[3]; b[0] = B[4 * MATRIX_WIDTH]; b[1] = B[5 * MATRIX_WIDTH]; b[2] = B[6 * MATRIX_WIDTH]; b[3] = B[7 * MATRIX_WIDTH]; for (int i = 0; i < 4; i ++) comp16(bb[i], &ashare[i][0], c); bb[0] = b[0]; bb[1] = b[1]; bb[2] = b[2]; bb[3] = b[3]; b[0] = B[8 * MATRIX_WIDTH]; b[1] = B[9 * MATRIX_WIDTH]; b[2] = B[10 * MATRIX_WIDTH]; b[3] = B[11 * MATRIX_WIDTH]; for (int i = 0; i < 4; i ++) comp16(bb[i], &ashare[i + 4][0], c); bb[0] = b[0]; bb[1] = b[1]; bb[2] = b[2]; bb[3] = b[3]; b[0] = B[12 * MATRIX_WIDTH]; b[1] = B[13 * MATRIX_WIDTH]; b[2] = B[14 * MATRIX_WIDTH]; b[3] = B[15 * MATRIX_WIDTH]; for (int i = 0; i < 4; i ++) comp16(bb[i], &ashare[i + 8][0], c); for (int i = 0; i < 4; i ++) comp16(b[i], &ashare[i + 12][0], c); // output for(int i = 0; i < 16; i++, C += MATRIX_WIDTH ) C[0] = c[i]; } void run(int argc, char** argv); void randomInit(float*, int); void printDiff(float*, float*); void matrix_cpu(float*, const float*, const float*); void transpose(float *m); //copy results back to the host and check against CPU results __host__ void copyCheck(float *h_C, float *d_C, float *reference, int tran); __host__ int main(int argc, char** argv) { printf ("Matrix size: %i * %i \nSize:%i (%i bytes)\n", MATRIX_WIDTH, MATRIX_WIDTH, MATRIX_SIZE, MATRIX_MEMSIZE); run(argc, argv); } __host__ void run(int argc, char** argv) { CUT_DEVICE_INIT(); cublasStatus status; status = cublasInit(); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! CUBLAS initialization error\n"); exit (1); } // set seed for rand() srand(2008); // host memory for matrices A and B float* h_A = (float*) malloc(MATRIX_MEMSIZE); float* h_B = (float*) malloc(MATRIX_MEMSIZE); randomInit(h_A, MATRIX_SIZE); randomInit(h_B, MATRIX_SIZE); // device memory for A and B float* d_A; CUDA_SAFE_CALL(cudaMalloc((void**) &d_A, MATRIX_MEMSIZE)); float* d_B; CUDA_SAFE_CALL(cudaMalloc((void**) &d_B, MATRIX_MEMSIZE)); // copy source matrixes to device CUDA_SAFE_CALL(cudaMemcpy(d_A, h_A, MATRIX_MEMSIZE,cudaMemcpyHostToDevice) ); CUDA_SAFE_CALL(cudaMemcpy(d_B, h_B, MATRIX_MEMSIZE,cudaMemcpyHostToDevice) ); // device memory for result matrix C float* d_C; CUDA_SAFE_CALL(cudaMalloc((void**) &d_C, MATRIX_MEMSIZE)); // host memory for the result matrix C float* h_C = (float*) malloc(MATRIX_MEMSIZE); // create and start timer unsigned int timer = 0; CUT_SAFE_CALL(cutCreateTimer(&timer)); //total effective operations double totalops = (double)(2 * MATRIX_WIDTH) * (double)MATRIX_WIDTH * (double)MATRIX_WIDTH; // setup execution parameters dim3 threads; dim3 grid; // compute reference solution float* reference = (float*) malloc(MATRIX_MEMSIZE); #ifdef VERIFY printf ("On CPU, Start ...\n"); CUT_SAFE_CALL(cutStartTimer(timer)); matrix_cpu(reference, h_A, h_B); CUT_SAFE_CALL(cutStopTimer(timer)); printf ("On CPU, End ...\n"); printf("CPU Processing time: %f (ms) %.3f GFlops\n", cutGetTimerValue(timer), totalops / (1e6 * cutGetTimerValue(timer))); #endif string names[4] = {"Tiled(16 * 16)", "large_tile_base", "large_tile_optimized", "cublas"}; threads.x = 16; threads.y = 16; for (int n = 0; n < 4; n++) { if (n == 0) { grid.x = MATRIX_WIDTH / 16; grid.y = MATRIX_WIDTH / 16; }else { grid.x = MATRIX_WIDTH / 256; grid.y = MATRIX_WIDTH / 16; } printf("\n-------%s-------\n", names[n].c_str()); if (n != 3) printf("Threads(%i * %i), Grid(%i * %i)\n", threads.x, threads.y, grid.x, grid.y); CUT_SAFE_CALL(cudaMemset(d_C, 0, MATRIX_MEMSIZE)); CUT_SAFE_CALL(cutResetTimer(timer)); CUT_SAFE_CALL(cutStartTimer(timer)); // all performance results are measured as the average of 5 runs for (int i = 0; i < 5; i ++) { #ifdef PERFORMANCE_TOTAL CUDA_SAFE_CALL(cudaMemcpy(d_A, h_A, MATRIX_MEMSIZE,cudaMemcpyHostToDevice) ); CUDA_SAFE_CALL(cudaMemcpy(d_B, h_B, MATRIX_MEMSIZE,cudaMemcpyHostToDevice) ); #endif switch(n) { case 0: matrix_simpletile<<< grid, threads >>>(d_C, d_A, d_B); break; case 1: matrix_large_tile_base<<< grid, threads >>>(d_A, d_B, d_C); break; case 2: matrix_large_tile_optimized<<< grid, threads >>>(d_A, d_B, d_C); break; case 3: cublasSgemm('t', 't', MATRIX_WIDTH, MATRIX_WIDTH, MATRIX_WIDTH, 1.0f, d_A, MATRIX_WIDTH, d_B, MATRIX_WIDTH, 0.0f, d_C, MATRIX_WIDTH); break; default: assert(0); } cudaThreadSynchronize(); #ifdef PERFORMANCE_TOTAL CUDA_SAFE_CALL(cudaMemcpy(h_C, d_C, MATRIX_MEMSIZE, cudaMemcpyDeviceToHost) ); #endif } CUT_CHECK_ERROR("Kernel execution failed"); CUT_SAFE_CALL(cutStopTimer(timer)); printf("GPU(%s) Processing time: %f (ms) %.3f GFlops\n", names[n].c_str(), cutGetTimerValue(timer), 5*totalops / (1e6 * cutGetTimerValue(timer))); copyCheck(h_C, d_C, reference, (n == 3 ? 1 : 0)); } CUT_SAFE_CALL(cutDeleteTimer(timer)); free(h_A); free(h_B); free(h_C); free(reference); CUDA_SAFE_CALL(cudaFree(d_A)); CUDA_SAFE_CALL(cudaFree(d_B)); CUDA_SAFE_CALL(cudaFree(d_C)); status = cublasShutdown(); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! shutdown error\n"); } } // Allocates a matrix with random float entries. // (From CUDA SDK examples.) __host__ void randomInit(float* data, int size) { for (int i = 0; i < size; ++i) data[i] = rand() / (float)RAND_MAX; } // multiplication on CPU for results verificaiton __host__ void matrix_cpu(float* C, const float* A, const float* B) { if (MATRIX_WIDTH > 2048) printf("Warning: it is slow for large matrix on CPU. Please wait...\n"); for (unsigned int i = 0; i < MATRIX_WIDTH; ++i) for (unsigned int j = 0; j < MATRIX_WIDTH; ++j) { float sum = 0; for (unsigned int k = 0; k < MATRIX_WIDTH; ++k) { float a = A[i * MATRIX_WIDTH + k]; float b = B[k * MATRIX_WIDTH + j]; sum += (float)(a * b); } C[i * MATRIX_WIDTH + j] = (float)sum; } } __host__ void transpose(float *m) { for (unsigned int i = 0; i < MATRIX_WIDTH; i++) for (unsigned int j = 0; j < i; j++) { float f = m[i * MATRIX_WIDTH + j]; m[i * MATRIX_WIDTH + j] = m[j * MATRIX_WIDTH + i]; m[j * MATRIX_WIDTH + i] = f; } } __host__ void copyCheck(float *h_C, float *d_C, float *reference, int tran) { // copy result from device to host CUDA_SAFE_CALL(cudaMemcpy(h_C, d_C, MATRIX_MEMSIZE, cudaMemcpyDeviceToHost) ); //output of cublas is column-major, transpose to row-major if (tran) transpose(h_C); #ifdef VERIFY // check result CUTBoolean res = cutCompareL2fe(reference, h_C, MATRIX_SIZE, 1e-6f); printf("Test %s \n", (1 == res) ? "PASSED" : "FAILED"); #endif }