-
Notifications
You must be signed in to change notification settings - Fork 2
/
DCPMM.py
220 lines (197 loc) · 9.55 KB
/
DCPMM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
# Libaries used
import numpy as np
import cupy as cp # CUDA implementation of numpy functionalities (installation requirement for GPU implementation)
import time as time # Used for timing execution
import math
from scipy.io import mmread # For reading matrix files provided by SuiteSparse Matrix Collection
from multiprocessing import cpu_count # For obtaining the number of cpu cores
from scipy.linalg import blas as FB # Scipy implementation of BLAS dgemm
from pynvml import * # Nvidia library for measuring GPU memory status
"""
Matrix Multiplication
Input: Matrices A, B, and a partitioning threshold c
Output: Resultant Matrix from multiplying A and B
This function first checks to see if the number of rows in A (m)
is less than or equal to the partitioning threshold.
If so, it will perform the matrix multiplication on the GPU using CuPy arrays
and return cupy.matmul(A,B).
Else, the matrices have not met the partitioning threshold and will therefore
be split using the Partition function.
"""
def MatrixMultiply(A,B,c):
m,n = A.shape
if m <= c: # Base Case (Partitioning threshold met)
A = cp.array(A) # Move A to GPU
B = cp.array(B) # Move B to GPU
return cp.matmul(A,B) # Perform matrix multiplication on GPU
return Partition(A,B,c)
"""
Partition
Input: Matrices A, B, and a partitioning threshold c
Output: Resultant Matrix from multiplying A and B
This function first checks to see if the partitioning of A and B can be
performed entirely on the GPU.
This condition checks if there is enough GPU memory to store all the elements of A and B,
which is the dimensions of A and B added times 8 bytes for each element.
If there is enough GPU memory, the partitioning will occur on the GPU.
Else, the partitioning occurs on the CPU.
"""
def Partition(A,B,c):
m,n = A.shape
n,p = B.shape
info = nvmlDeviceGetMemoryInfo(h) # Getting GPU device memory
if (((m*n) + (n*p)) * 8 < info.free): # Determining if there is enough free GPU memory to store both matrices' elements
return PartitionGPU(A,B,c)
else:
return PartitionCPU(A,B,c)
"""
Partition GPU
Input: Matrices A, B, and a partitioning threshold c
Output: Resultant Matrix from multiplying A and B
This function partitions A and B into smaller blocks, which will then be multiplied
into intermediate results that are combined into the final result.
If A is "wide" (m <= n), A is split in half by its columns and
B is split in half by its rows.
Else if A is "skinny" (m > n), A is split in half by its rows and
B is split in half by its columns.
The submatrix creation is carefully managed to only store what is absolutely necessary
on the GPU and to delete any matrices that no longer need to be used for calculation.
"""
def PartitionGPU(A,B,c):
m,n = A.shape
if (type(A) == np.ndarray): # If the arrays are coming from the CPU (numpy arrays),
A = cp.array(A) # move them over to the GPU (cupy arrays).
B = cp.array(B)
if m <= n: # Case 1 (A is "wide")
#axis 0 = rows, axis 1 = columns
[A1,A2] = cp.array_split(A, 2, axis=1) # Split A by columns
[B1,B2] = cp.array_split(B, 2, axis=0) # Split B by rows
C1 = cp.array(MatrixMultiply(A1,B1,c)) # Perform A1*B1
del A1 # A1 no longer needed (remove from GPU)
del B1 # B1 no longer needed
C2 = cp.array(MatrixMultiply(A2,B2,c)) # Perform A2*B2
del A2 # A2 no longer needed
del B2 # B2 no longer needed
return C1 + C2 # Return the final result C = C1 + C2 = A1*B1 + A2*B2
else: #m > n Case 2 (A is "skinny")
[A1,A2] = cp.array_split(A, 2, axis=0) # Split A by rows
[B1,B2] = cp.array_split(B, 2, axis=1) # Split B by columns
C1 = MatrixMultiply(A1,B1,c) # Perform A1*B1
C2 = MatrixMultiply(A1,B2,c) # Perform A1*B2
C12 = cp.hstack((C1,C2)) # Stack C1 and C2 horizontally -> C12
del C1 # C1 no longer needed
del C2 # C2 no longer needed
C3 = MatrixMultiply(A2,B1,c) # Perform A2*B1
C4 = MatrixMultiply(A2,B2,c) # Perform A2*B2
C34 = cp.hstack((C3,C4)) # Stack C3 and C4 horizontally -> C34
del C3 # C3 no longer needed
del C4 # C4 no longer needed
return cp.vstack((C12,C34)) # Stack C12 and C34 vertically to form final result C
"""
Partition CPU
Input: Matrices A, B, and a partitioning threshold c
Output: Resultant Matrix from multiplying A and B
This function partitions A and B into smaller blocks, which will then be multiplied
into intermediate results that are combined into the final result.
If A is "wide" (m <= n), A is split in half by its columns and
B is split in half by its rows.
Else if A is "skinny" (m > n), A is split in half by its rows and
B is split in half by its columns.
"""
def PartitionCPU(A,B,c):
m,n = A.shape
if (type(A) == cp.ndarray): # If the arrays are coming from the GPU (cupy arrays),
A = cp.asnumpy(A) # move them over to the CPU (numpy arrays).
B = cp.asnumpy(B)
# Everything is the same as PartitionGPU with numpy functions instead of cupy functions.
# Also, intermediate results of matrix multiplication moved to CPU so no GPU memory management needed.
if m <= n:
#axis 0 = rows, axis 1 = columns
[A1,A2] = np.array_split(A, 2, axis=1)
[B1,B2] = np.array_split(B, 2, axis=0)
C = cp.asnumpy(MatrixMultiply(A1,B1,c)) + cp.asnumpy(MatrixMultiply(A2,B2,c))
return C
else: #m>n
[A1,A2] = np.array_split(A, 2, axis=0)
[B1,B2] = np.array_split(B, 2, axis=1)
C1 = cp.asnumpy(MatrixMultiply(A1,B1,c))
C2 = cp.asnumpy(MatrixMultiply(A1,B2,c))
C3 = cp.asnumpy(MatrixMultiply(A2,B1,c))
C4 = cp.asnumpy(MatrixMultiply(A2,B2,c))
C12 = np.hstack((C1,C2))
C34 = np.hstack((C3,C4))
C = np.vstack((C12,C34))
return C
# Numpy implementation of Matrix Multiplication
# Uses BLAS level 3 routine dgemm under the hood
def numpyMult(A,B):
return np.dot(A,B)
if __name__ == "__main__":
nvmlInit() # Initializing GPU memory tracker
h = nvmlDeviceGetHandleByIndex(0) # GPU device labeled h
info = nvmlDeviceGetMemoryInfo(h) # Get info from GPU
# Establishing the partitioning threshold
# The threshold should be such that we can store both A and B on the GPU to perform matrix multiplication.
# The maximum number of elements would therefore be the amount of free memory / 8 bytes per element.
# The max number of rows would be the square root of the number of elements assuming a square matrix.
# Both A and B must be stored so the threshold for A would be the number of rows divided by 2.
partitionLimit = math.sqrt(info.free / (8)) / 2
# Running the algorithm on 5 matrices from the SSMC and 5 dense matrices of the same dimensions
d = {} # Dictionary for storing execution time of each dataset
fileNames = ['datasets/494_bus.mtx', 'datasets/bcsstk17/bcsstk17.mtx', 'datasets/ex11/ex11.mtx', 'datasets/gupta3/gupta3.mtx', 'human_gene1/human_gene1.mtx', 'human_gene2/human_gene2.mtx']
for fileName in fileNames: # Initialize times to 0
d[fileName] = 0
d[fileName + 'Dense'] = 0
#d[fileName+'Numpy'] = 0
#d[fileName+'NumpyDense'] = 0
for i in range(10): # For 10 iterations (totaling time for 10 and then averaging)
print("Iteration", i)
for fileName in fileNames: # for each dataset
print()
print(fileName)
mat = mmread(fileName) # reads the mtx file
A = mat.todense(None,None) # changes the matrix type to numpy.matrix
A = np.asarray(np.float32(A)) # converting matrices to ndarrays with float32 elements (8 bytes)
B = np.asarray(np.float32(A))
row,col = A.shape
print("Rows:", row)
print("Cols:", col)
start = time.time()
result = MatrixMultiply(A,B,partitionLimit) # Tracking execution time for our algorithm on the sparse dataset
end = time.time()
print("Time Taken:", end - start)
d[fileName] += end - start # Adding to time dictionary
# Numpy executions on sparse datasets (commented out because of long execution times)
# start = time.time()
# result = numpyMult(A, B)
# end = time.time()
#print("Time Taken:", end - start)
#d[fileName+'Numpy'] += end - start
print("Dense:")
A = np.random.randn(row, col)
A = np.asarray(np.float32(A))
B = np.asarray(np.float32(A))
start = time.time()
result = MatrixMultiply(A,B,partitionLimit) # Tracking execution time for our algorithm on the dense dataset
end = time.time()
print("Time Taken:", end - start)
d[fileName + 'Dense'] += end - start
# Numpy executions on dense datasets (commented out because of long execution times)
# start = time.time()
# result = numpyMult(A, B)
# end = time.time()
#print("Time Taken:", end - start)
#d[fileName+'NumpyDense'] += end - start
print("Current Total:", d)
# Averaging execution time across all 10 runs
for fileName in fileNames:
d[fileName] /= 10.0
d[fileName + 'Dense'] /= 10.0
#d[fileName+'Numpy'] /= 10.0
#d[fileName+'NumpyDense'] /= 10.0
print("Avg (10 iterations):", d)
# print()
# import pandas as pd
# pd.set_option("display.max_rows", None, "display.max_columns", None)
# data = pd.DataFrame(d, index=[0])
# print(data)