#!/usr/bin/env python3
#
# Copyright 2021-2022 Clemens Lutz
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Information
# ===========
#
# This script runs a GPU no-partitioning join. The parameters are tuned for an
# IBM AC922 machine, which has two IBM POWER9 CPUs and two Nvidia V100 GPUs.
# The machine also has NVLink 2.0.
#
# The no-partitioning join is able to run on PCI-e and NVLink GPUs.
#
# Setup notes
# ===========
#
# Before running this benchmark, follow our guide to allocate huge pages on
# early boot to avoid page fragmentation:
#
# ./guides/huge_pages.md#reserving-2-mb-huge-pages-on-early-boot
#
# Performance typically degrades for very large data sizes after several runs
# due to page fragmentation. Reboot the machine to defragment memory. A
# minimal check of the reserved huge pages is sketched after the imports.

import subprocess
import socket
import itertools
import shlex
import tempfile
from os import path
import math
import pandas
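

# The helper below is a minimal sanity-check sketch and is not part of the
# benchmark flow (main() never calls it). Assuming a Linux system, it reads
# /proc/meminfo and reports how many 2 MB huge pages are reserved and free,
# so a shortage can be spotted before launching the long-running measurements.
def report_huge_pages(meminfo_path='/proc/meminfo'):
    """Print the reserved and free huge-page counts from /proc/meminfo."""
    counters = {}
    with open(meminfo_path) as f:
        for line in f:
            key, _, value = line.partition(':')
            if key in ('HugePages_Total', 'HugePages_Free', 'Hugepagesize'):
                counters[key] = value.strip()
    print(f"Huge pages total/free: {counters.get('HugePages_Total', '?')}"
          f"/{counters.get('HugePages_Free', '?')}"
          f" (size: {counters.get('Hugepagesize', '?')})")
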

# Benchmark parameters
repeat = 10  # benchmark invocations per configuration
tuples = [128 * x for x in range(1, 17)]  # relation sizes in millions of tuples
tuple_bytes = [16]  # tuple size in bytes
data_location = 0  # NUMA node that holds the input relations
device_mem_bytes = 16 * 2**30  # GPU memory capacity assumed by the spill heuristic (16 GiB)
ht_location_gpu = 255  # hash-table location when the table fits into GPU memory
ht_location_cpu = 0  # hash-table location when the table is placed in CPU memory
hybrid_hash_table = [False]  # whether to enable the spillable (hybrid) hash table
exec_method = ['Gpu']
hashing_scheme = [('Perfect', 1), ('LinearProbing', 2)]  # (scheme, hash-table size multiplier)
page_type = ['Huge2MB']

hostname = socket.gethostname()
def main():
    file_id = 0
    file_list = []
    out_dir = tempfile.mkdtemp()
    out_csv = path.join(out_dir, f'benchmark_gpu_no-partitioning_join_{hostname}.csv')
    print(f"Writing CSV file to {out_csv}")

    for em, ts, (hs, lf), hht, tb, pt in itertools.product(exec_method, tuples, hashing_scheme, hybrid_hash_table, tuple_bytes, page_type):
        print(f"Running {em} with {hs}, hybrid HT: {hht!s}, {ts!s} Mtuples, tuple bytes: {tb!s}, page type: {pt}")

        # Place the hash table in GPU memory by default. When the hybrid
        # (spillable) hash table is disabled, fall back to CPU memory whenever
        # the estimated table size does not fit into GPU memory: perfect
        # hashing is estimated at tuples * tuple_bytes, linear probing at the
        # next power of two of lf * tuples * tuple_bytes.
        ht_loc = ht_location_gpu
        if not hht:
            if hs == 'Perfect' and ts * 10**6 * tb >= device_mem_bytes:
                ht_loc = ht_location_cpu
            elif hs == 'LinearProbing' and 2**math.ceil(math.log2(lf * ts * 10**6 * tb)) >= device_mem_bytes:
                ht_loc = ht_location_cpu

        for count in range(0, repeat):
            print('.', end='', flush=True)

            tmp_csv = path.join(out_dir, f'tmp_{file_id!s}.csv')
            cmd = f'''
                cargo run \
                    --quiet \
                    --package hashjoin \
                    --release \
                    -- \
                    --data-set custom \
                    --execution-method {em} \
                    --hash-table-mem-type Numa \
                    --hash-table-location {ht_loc!s} \
                    --spill-hash-table {str(hht).lower()} \
                    --rel-mem-type Numa \
                    --page-type {pt} \
                    --inner-rel-location {data_location!s} \
                    --outer-rel-location {data_location!s} \
                    --inner-rel-tuples {ts * 10**6!s} \
                    --outer-rel-tuples {ts * 10**6!s} \
                    --hashing-scheme {hs} \
                    --tuple-bytes Bytes{tb!s} \
                    --repeat 2 \
                    --csv {tmp_csv}
                '''
            cmdfuture = subprocess.run(shlex.split(cmd), check=False)
            cmdfuture.check_returncode()

            file_list.append(tmp_csv)
            file_id += 1

        print('')

    csv_append(out_csv, file_list)
    print(f"Finished CSV file at {out_csv}")

def csv_append(accumulator_file, append_files):
    """Concatenate the per-run CSV files and write the result to accumulator_file."""
    df_list = [pandas.read_csv(f) for f in append_files]
    df = pandas.concat(df_list)
    df.to_csv(accumulator_file, index=False)

if __name__ == "__main__":
    main()