-
Notifications
You must be signed in to change notification settings - Fork 1
/
multiply.cu
149 lines (122 loc) · 3.66 KB
/
multiply.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#include <stdio.h>
#include <stdlib.h> /* srand, rand */
#include <time.h> /* time */
// CUDA RumTime API
#define TILE_WIDTH 32
void MatrixMultiplyOnHost(float* M, float* N, float* P, int width)
{
for(int i=0; i<width; ++i)
{
for (int j=0; j<width; ++j)
{
float sum = 0;
for(int k=0; k<width; ++k)
{
float a = M[i*width+k];
float b = N[k*width+j];
sum += a*b;
}
P[i*width+j] = sum;
}
}
}
__global__ void MatirxMultiplyKernel(const float* devM, const float* devN, float* devP, const int width, const int tile_width)
{
__shared__ float sM[TILE_WIDTH][TILE_WIDTH];
__shared__ float sN[TILE_WIDTH][TILE_WIDTH];
int bx = blockIdx.x; int by = blockIdx.y;
int tx = threadIdx.x; int ty = threadIdx.y;
int col = bx*tile_width+tx;
int row = by*tile_width+ty;
//Initialize accumulator to 0
float pValue = 0;
//Multiply and add
for(int m=0; m<width/tile_width;m++)
{
sM[ty][tx] = devM[row*width+(m*tile_width+tx)];
sN[ty][tx] = devN[col+(m*tile_width+ty)*width];
//each time bring one element from devM and devN into shared memory
//threads are blocked until all the threads reach this point
__syncthreads();
for(int k=0; k<tile_width; k++)
pValue += sM[ty][k]*sN[k][tx];
__syncthreads();
}
//Write value to device memory - each thread has unique index to write to
devP[row*width+col] = pValue;
}
void MatrixMultiplyOnDevice(const float* hostM, const float* hostN, float* hostP, const int width, const int tile_width)
{
int sizeInBytes = width*width*sizeof(float);
float *devM, *devN, *devP;
//Allocate M and N on devide
cudaMalloc((void**)&devM, sizeInBytes);
cudaMalloc((void**)&devN, sizeInBytes);
//Allocate P
cudaMalloc((void**)&devP, sizeInBytes);
//Copy M and N from host to device
cudaMemcpy(devM, hostM, sizeInBytes, cudaMemcpyHostToDevice);
cudaMemcpy(devN, hostN, sizeInBytes, cudaMemcpyHostToDevice);
//Setup thread/block execution configuration
dim3 dimBlocks(tile_width,tile_width); //Each block has (width,width) threads
dim3 dimGrid(width/tile_width,width/tile_width); //Launch 1 block
//Launch the kernel
clock_t begin = clock();
MatirxMultiplyKernel<<<dimGrid,dimBlocks>>>(devM,devN,devP,width,tile_width);
clock_t end = clock();
float elapsed_secs = float(end - begin) / CLOCKS_PER_SEC;
printf("Matrix Multiply on Device: %fs\n",elapsed_secs);
//Copy P matrix from device to host
cudaMemcpy(hostP, devP, sizeInBytes, cudaMemcpyDeviceToHost);
//Free allocated memory
cudaFree(devM); cudaFree(devN); cudaFree(devP);
}
void PrintMatrix(float* M, int width)
{
for(int i=0; i<width; i++)
{
for(int j=0; j<width; j++)
{
printf("%f ",M[i*width+j]);
}
printf("\n");
}
}
int main()
{
int width = 1024;
int tile_width = 32;
int size = width*width;
float* M = new float[size];
float* N = new float[size];
float* P = new float[size];
float* Q = new float[size];
srand (time(NULL));
for(int i=0; i<size; i++)
{
M[i] = rand() / (RAND_MAX + 1.);
N[i] = rand() / (RAND_MAX + 1.);
}
//multiply on host
clock_t begin = clock();
MatrixMultiplyOnHost(M,N,P,width);
clock_t end = clock();
float elapsed_secs = float(end - begin) / CLOCKS_PER_SEC;
printf("Matrix Multiply on Host: %fs\n",elapsed_secs);
//std::cout << "Matrix Multiply on Host: " << elapsed_secs << std::endl;
//multiply on device
//1. Copy M,N matrices to device
//2. M*N on device
//3. Copy P matrix to host and output
MatrixMultiplyOnDevice(M,N,Q,width,tile_width);
float avg_err = 0;
for(int i=0; i<size; i++)
avg_err += fabs(P[i]-Q[i]);
avg_err /= size;
printf("Average error is: %f\n",avg_err);
//PrintMatrix(M,width);
//PrintMatrix(N,width);
//PrintMatrix(P,width);
//PrintMatrix(Q,width);
return 0;
}