-
Notifications
You must be signed in to change notification settings - Fork 0
/
HQL Data Analysis (Performance Tuning).HQL
134 lines (113 loc) · 2.23 KB
/
HQL Data Analysis (Performance Tuning).HQL
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
1. Experiment with different file formats (like ORC, Parquet) and
measure their impact on the performance of your Hive queries.
a. text file :
CREATE EXTERNAL TABLE car_insurance_data (
Id INT,
Age INT,
Job STRING,
Marital STRING,
Education STRING,
Default INT,
Balance INT,
HHInsurance INT,
CarLoan INT,
Communication STRING,
LastContactDay INT,
LastContactMonth INT,
NoOfContacts INT,
DaysPassed INT,
PrevAttempts INT,
Outcome STRING,
CallStart STRING,
CallEnd STRING,
CarInsurance INT)
row format delimited
fields terminated by ','
stored as textfile
location '/tmp/input_data/';
b. orc:
CREATE TABLE car_insurance_data_orc (
Id INT,
Age INT,
Job STRING,
Marital STRING,
Education STRING,
Default INT,
Balance INT,
HHInsurance INT,
CarLoan INT,
Communication STRING,
LastContactDay INT,
LastContactMonth INT,
NoOfContacts INT,
DaysPassed INT,
PrevAttempts INT,
Outcome STRING,
CallStart STRING,
CallEnd STRING,
CarInsurance INT)
row format delimited
fields terminated by ','
stored as orc;
INSERT OVERWRITE TABLE
car_insurance_data_orc
SELECT * FROM car_insurance_data;
c. parquet file :
CREATE TABLE car_insurance_data_parquet (
Id INT,
Age INT,
Job STRING,
Marital STRING,
Education STRING,
Default INT,
Balance INT,
HHInsurance INT,
CarLoan INT,
Communication STRING,
LastContactDay INT,
LastContactMonth INT,
NoOfContacts INT,
DaysPassed INT,
PrevAttempts INT,
Outcome STRING,
CallStart STRING,
CallEnd STRING,
CarInsurance INT)
row format delimited
fields terminated by ','
stored as parquet;
INSERT OVERWRITE TABLE
car_insurance_data_parquet
SELECT * FROM car_insurance_data;
d. avro:
CREATE table car_insurance_data_avro (
Id INT,
Age INT,
Job STRING,
Marital STRING,
Education STRING,
Default INT,
Balance INT,
HHInsurance INT,
CarLoan INT,
Communication STRING,
LastContactDay INT,
LastContactMonth INT,
NoOfContacts INT,
DaysPassed INT,
PrevAttempts INT,
Outcome STRING,
CallStart STRING,
CallEnd STRING,
CarInsurance INT)
row format delimited
fields terminated by ','
stored as avro;
INSERT OVERWRITE TABLE
car_insurance_data_avro
SELECT * FROM car_insurance_data;
and for experiment i only run single query like select* from table and check performance
text - 0.258 seconds
parquet - 0.223 seconds
avro - 0.167 seconds
orc - 0.182 seconds