-
Notifications
You must be signed in to change notification settings - Fork 0
/
DFS.cu
127 lines (94 loc) · 3.56 KB
/
DFS.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>
#include <chrono>
#ifdef __INTELLISENSE__
void __syncthreads();
#endif
using namespace std;
using namespace std::chrono;
__global__ void fill_cases(short* result, size_t start, size_t value){
size_t i= threadIdx.x+start;
result[i] = value;
}
__global__ void fillByLevel(short* result, size_t dim, size_t level, size_t fill_size, size_t fill_offset, size_t global_offset) {
if (level <= 0) return;
short value = threadIdx.x;
size_t start = fill_size * value + fill_offset + global_offset;
size_t end = start + fill_size;
for (size_t i = start; i < end; i++) result[i] = value;
fillByLevel <<<1, dim >> > (result, dim, level - 1, fill_size / dim, start, global_offset);
__syncthreads();
}
__global__ void DFSKernel(short *result, size_t dim, size_t level, size_t _itr_size) {
short value = threadIdx.x;
size_t fill_size = powf(dim, level - 1);
size_t start = fill_size * value;
size_t end = start + fill_size;
for (size_t i = start; i < end; i++) result[i] = value;
fillByLevel << <1, dim >> > (result, dim, level - 1, fill_size / dim, start, powf(dim, level));
__syncthreads();
}
cudaError_t DFS(short*, size_t, size_t, size_t, size_t);
int main()
{
const size_t dim = 3; //M
const size_t level = 8; //N
short* combinations;
size_t itr_size = pow(dim, level) * level;
size_t combinations_size = sizeof(short) * itr_size;
std::cout << "Iteration : " << level << std::endl;
std::cout << "nbr Transformations : " << dim << std::endl;
std::cout << "Array length : " << itr_size << std::endl;
std::cout << "memory size : " << combinations_size << std::endl;
combinations = (short*)malloc(combinations_size);
if(combinations == NULL) printf("\n\n ############ Memory allocation failed ############\n\n");
else
{
auto start = high_resolution_clock::now();
DFS(combinations, dim, level, combinations_size, itr_size);
auto stop = high_resolution_clock::now();
auto duration = duration_cast<microseconds>(stop - start)*pow(10,-6);
cout <<"\n\nExecution time : " << duration.count() << "\n\n";
}
}
cudaError_t DFS(short* combinations, size_t dim, size_t level, size_t combinations_size, size_t itr_size) {
short* result = 0;
size_t size_Col=pow(dim,level);
cudaError_t cudaStatus;
// auto start = std::chrono::high_resolution_clock::now();
cudaStatus = cudaMalloc((void**)& result, combinations_size);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(cudaStatus));
//goto Error;
}
DFSKernel<<<1, dim >>> (result, dim, level, itr_size);
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
//goto Error;
}
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
//goto Error;
}
// auto stop = high_resolution_clock::now();
// auto duration = duration_cast<microseconds>(stop - start)*pow(10,-6);
// cout <<"\n\nExecution time : " << duration.count() << "\n\n";
cudaStatus = cudaMemcpy(combinations, result, combinations_size, cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
//goto Error;
}
/*for (int i = 0; i < itr_size; i++) {
if(i%size_Col==0) std::cout << std::endl;
std::cout << combinations[i] << " ";
}*/
//std::cout << "\n\nEnd.\n";
//Error:
cudaFree(result);
return cudaStatus;
}