-
Notifications
You must be signed in to change notification settings - Fork 17
/
gemm.cu
186 lines (153 loc) · 5.25 KB
/
gemm.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#include <unistd.h>
#include <iostream>
#include <stdlib.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include "fp16_conversion.h"
using namespace std;
// #define FP16MM
const char* cublasGetErrorString(cublasStatus_t status)
{
switch(status)
{
case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
}
return "unknown error";
}
// Convenience function for checking CUDA runtime API results
// can be wrapped around any runtime API call. No-op in release builds.
inline
cudaError_t checkCuda(cudaError_t result)
{
if (result != cudaSuccess) {
fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
assert(result == cudaSuccess);
}
return result;
}
inline
cublasStatus_t checkCublas(cublasStatus_t result)
{
if (result != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr, "CUDA Runtime Error: %s\n", cublasGetErrorString(result));
assert(result == CUBLAS_STATUS_SUCCESS);
}
return result;
}
// Fill the array A(nr_rows_A, nr_cols_A) with random numbers on CPU
void CPU_fill_rand(float *A, int nr_rows_A, int nr_cols_A) {
int a=1;
for(int i = 0; i < nr_rows_A * nr_cols_A; i++){
A[i] = (float)rand()/(float)(RAND_MAX/a);
}
}
int main(int argc, char ** argv){
int min_m_k_n = 2;
int max_m_k_n = 4096*4;
int repeats = 2;
int verbose = 1;
#ifndef FP16MM
cout << "\ncublasSgemm test result:\n" << endl;
#else
cout << "\ncublasHgemm test result:\n" << endl;
#endif
if(verbose)
cout << "running with"
<< " min_m_k_n: " << min_m_k_n
<< " max_m_k_n: " << max_m_k_n
<< " repeats: " << repeats
<< endl;
cublasStatus_t stat;
cublasHandle_t handle;
checkCublas(cublasCreate(&handle));
if(verbose) cout << "allocating device variables" << endl;
// Allocate 3 arrays on CPU
float *h_A = (float *)malloc(max_m_k_n * max_m_k_n * sizeof(float));
float *h_B = (float *)malloc(max_m_k_n * max_m_k_n * sizeof(float));
float *h_C = (float *)malloc(max_m_k_n * max_m_k_n * sizeof(float));
CPU_fill_rand(h_A, max_m_k_n, max_m_k_n);
CPU_fill_rand(h_B, max_m_k_n, max_m_k_n);
CPU_fill_rand(h_C, max_m_k_n, max_m_k_n);
#ifndef FP16MM
// Allocate 3 arrays on GPU
float *d_A, *d_B, *d_C;
checkCuda(cudaMallocManaged(&d_A, max_m_k_n * max_m_k_n * sizeof(float)));
checkCuda(cudaMallocManaged(&d_B, max_m_k_n * max_m_k_n * sizeof(float)));
checkCuda(cudaMallocManaged(&d_C, max_m_k_n * max_m_k_n * sizeof(float)));
checkCuda(cudaMemcpy(d_A,h_A,max_m_k_n * max_m_k_n * sizeof(float),cudaMemcpyHostToDevice));
checkCuda(cudaMemcpy(d_B,h_B,max_m_k_n * max_m_k_n * sizeof(float),cudaMemcpyHostToDevice));
checkCuda(cudaMemcpy(d_C,h_C,max_m_k_n * max_m_k_n * sizeof(float),cudaMemcpyHostToDevice));
int lda, ldb, ldc, m, n, k;
const float alf = 1.0f;
const float bet = 0.0f;
const float *alpha = &alf;
const float *beta = &bet;
#else
__half *d_A, *d_B, *d_C;
checkCuda(cudaMallocManaged(&d_A, max_m_k_n * max_m_k_n * sizeof(__half)));
checkCuda(cudaMallocManaged(&d_B, max_m_k_n * max_m_k_n * sizeof(__half)));
checkCuda(cudaMallocManaged(&d_C, max_m_k_n * max_m_k_n * sizeof(__half)));
for (int i = 0; i < max_m_k_n * max_m_k_n; i++) {
d_A[i] = approx_float_to_half(h_A[i]);
d_B[i] = approx_float_to_half(h_B[i]);
d_C[i] = approx_float_to_half(h_C[i]);
}
int lda, ldb, ldc, m, n, k;
const __half alf = approx_float_to_half(1.0);
const __half bet = approx_float_to_half(0.0);
const __half *alpha = &alf;
const __half *beta = &bet;
#endif
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
for(int size = min_m_k_n; size <= max_m_k_n; size=size*2){
double sum = 0.0;
for(int rep = 0; rep < repeats; rep++){
cudaEventRecord(start, 0);
m=n=k=size;
lda = m;
ldb = k;
ldc = m;
#ifndef FP16MM
stat = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, d_A, lda, d_B, ldb, beta, d_C, ldc);
#else
stat = cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, d_A, lda, d_B, ldb, beta, d_C, ldc);
#endif
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
if(stat != CUBLAS_STATUS_SUCCESS){
cerr << "cublasSgemmBatched failed" << endl;
exit(1);
}
assert(!cudaGetLastError());
float elapsed;
cudaEventElapsedTime(&elapsed, start, stop);
elapsed /= 1000.0f;
sum += elapsed;
}
#ifndef FP16MM
cout << "float32: size "
#else
cout << "float16: size "
#endif
<< size << " average: " << sum/repeats << " s "<< endl;
}
//Free GPU memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
// Free CPU memory
free(h_A);
free(h_B);
free(h_C);
return 0;
}