-
Notifications
You must be signed in to change notification settings - Fork 4
/
Section_3_Mid_Term_Project_Solved.sql
464 lines (348 loc) · 18.8 KB
/
Section_3_Mid_Term_Project_Solved.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
select * from website_pageviews;
select * from website_sessions;
select * from order_items;
select * from orders;
-- nov 27 2012
/*
1. Gsearch seems to be the biggest driver of our business. Could you pull monthly
trends for gsearch sessions and orders so that we can showcase the growth there?
*/
select
*
from website_sessions
left join orders
on website_sessions.website_session_id = orders.website_session_id
where utm_source ='gsearch' and website_sessions.created_at < '2012-11-27'
group by 1,2;
select
year(website_sessions.created_at) as year,
month(website_sessions.created_at) as month,
count(website_sessions.website_session_id) sessions,
count(orders.order_id) as orders,
count(orders.order_id)/ count(website_sessions.website_session_id) as conv_rate
from website_sessions
left join orders
on website_sessions.website_session_id = orders.website_session_id
where utm_source ='gsearch' and website_sessions.created_at < '2012-11-27'
group by 1,2;
-- the amount of orders are different then total sessions is that because the orders_id had null values meaning no order was placed ?
-- part 2
/*
2. Next, it would be great to see a similar monthly trend for Gsearch, but this time splitting out nonbrand
and brand campaigns separately. I am wondering if brand is picking up at all. If so, this is a good story to tell.
*/
select
year(website_sessions.created_at) as year,
month(website_sessions.created_at) as month,
-- count(website_sessions.website_session_id) sessions,
-- count(orders.order_id) as orders,
COUNT(DISTINCT CASE WHEN utm_campaign = 'nonbrand' THEN website_sessions.website_session_id ELSE NULL END) AS nonbrand_sessions,
COUNT(DISTINCT CASE WHEN utm_campaign = 'nonbrand' THEN orders.order_id ELSE NULL END) AS nonbrand_orders,
COUNT(DISTINCT CASE WHEN utm_campaign = 'brand' THEN website_sessions.website_session_id ELSE NULL END) AS brand_sessions,
COUNT(DISTINCT CASE WHEN utm_campaign = 'brand' THEN orders.order_id ELSE NULL END) AS bradn_orders,
COUNT(DISTINCT CASE WHEN utm_campaign = 'nonbrand' THEN orders.order_id ELSE NULL END)
/COUNT(DISTINCT CASE WHEN utm_campaign = 'nonbrand' THEN website_sessions.website_session_id ELSE NULL END) as nonbrand_conv_rate,
COUNT(DISTINCT CASE WHEN utm_campaign = 'brand' THEN orders.order_id ELSE NULL END)
/COUNT(DISTINCT CASE WHEN utm_campaign = 'brand' THEN website_sessions.website_session_id ELSE NULL END) as brand_conv_rate
from website_sessions
left join orders
on website_sessions.website_session_id = orders.website_session_id
where utm_source ='gsearch' and website_sessions.created_at < '2012-11-27'
group by 1,2;
-- part 3
/*
3. While we’re on Gsearch, could you dive into nonbrand, and pull monthly sessions and orders split by device type?
I want to flex our analytical muscles a little and show the board we really know our traffic sources.
*/
select
year(website_sessions.created_at) as year,
month(website_sessions.created_at) as month,
-- count(website_sessions.website_session_id) sessions,
-- count(orders.order_id) as orders,
COUNT(DISTINCT CASE WHEN device_type = 'desktop' THEN website_sessions.website_session_id ELSE NULL END) AS desktop_sessions,
COUNT(DISTINCT CASE WHEN device_type = 'desktop' THEN orders.order_id ELSE NULL END) AS desktop_orders,
COUNT(DISTINCT CASE WHEN device_type = 'mobile' THEN website_sessions.website_session_id ELSE NULL END) AS mobile_sessions,
COUNT(DISTINCT CASE WHEN device_type = 'mobile' THEN orders.order_id ELSE NULL END) AS mobile_orders
from website_sessions
left join orders
on website_sessions.website_session_id = orders.website_session_id
where utm_source ='gsearch' and website_sessions.created_at < '2012-11-27' and utm_campaign = 'nonbrand'
group by 1,2;
-- part 4
/*
4. I’m worried that one of our more pessimistic board members may be concerned about the large % of traffic from Gsearch.
Can you pull monthly trends for Gsearch, alongside monthly trends for each of our other channels?
*/
-- first, finding the various utm sources and referers to see the traffic we're getting
-- finding all the sessions
select
distinct utm_source,
utm_campaign,
utm_content,
http_referer
from website_sessions
where website_sessions.created_at < '2012-11-27';
-- when all 4 are null (including http_refferer) then it means someone typed in directly the company name and accessed the website
-- when all three are null except http refferer then it means that the some search engine was used but its not being tracked by the marketing team or
-- is not controlled by us /paid by us
select
year(website_sessions.created_at) as year,
month(website_sessions.created_at) as month,
COUNT(DISTINCT CASE WHEN utm_source = 'gsearch' THEN website_sessions.website_session_id ELSE NULL END) AS gsearch_paid_sessions,
COUNT(DISTINCT CASE WHEN utm_source = 'bsearch' THEN website_sessions.website_session_id ELSE NULL END) AS bsearch_paid_sessions,
COUNT(DISTINCT CASE WHEN utm_source IS NULL and utm_campaign is null and utm_content is null AND http_referer IS NOT NULL THEN website_sessions.website_session_id ELSE NULL END) as organic_research_session,
COUNT(DISTINCT CASE WHEN utm_source IS NULL and utm_campaign is null and utm_content is null AND http_referer IS NULL THEN website_sessions.website_session_id ELSE NULL END) as direct_typed_in_session
from website_sessions
left join orders
on website_sessions.website_session_id = orders.website_session_id
where website_sessions.created_at < '2012-11-27'
group by 1,2;
#----------------------------------------------------------------------------------what is the point of this join ?
select
year(website_sessions.created_at) as year,
month(website_sessions.created_at) as month,
COUNT(DISTINCT CASE WHEN utm_source = 'gsearch' THEN website_sessions.website_session_id ELSE NULL END) AS gsearch_paid_sessions,
COUNT(DISTINCT CASE WHEN utm_source = 'bsearch' THEN website_sessions.website_session_id ELSE NULL END) AS bsearch_paid_sessions,
COUNT(DISTINCT CASE WHEN utm_source IS NULL and utm_campaign is null and utm_content is null AND http_referer IS NOT NULL THEN website_sessions.website_session_id ELSE NULL END) as organic_research_session,
COUNT(DISTINCT CASE WHEN utm_source IS NULL and utm_campaign is null and utm_content is null AND http_referer IS NULL THEN website_sessions.website_session_id ELSE NULL END) as direct_typed_in_session
from website_sessions
where website_sessions.created_at < '2012-11-27'
group by 1,2;
-- part 5 same as session 5
/*
5. I’d like to tell the story of our website performance improvements over the course of the first 8 months.
Could you pull session to order conversion rates, by month?
*/
-- same as question 1
/*
6. For the gsearch lander test, please estimate the revenue that test earned us
(Hint: Look at the increase in CVR from the test (Jun 19 – Jul 28), and use
nonbrand sessions and revenue since then to calculate incremental value)
*/
-- first we will find the minimum website_session id when the lander-1 test started
select
min(website_pageviews.website_pageview_id) as first_pageview_id_for_lander
from website_pageviews
where website_pageviews.pageview_url = '/lander-1'; -- 23504
SELECT
website_pageviews.website_session_id,
MIN(website_pageviews.website_pageview_id) AS min_pageview_id
FROM website_pageviews
INNER JOIN website_sessions
ON website_sessions.website_session_id = website_pageviews.website_session_id
AND website_sessions.created_at < '2012-07-28' -- prescribed by the assignment
AND website_pageviews.website_pageview_id >= 23504 -- first page_view
AND utm_source = 'gsearch'
AND utm_campaign = 'nonbrand' #---------------------------what is the point of inner join here
GROUP BY
website_pageviews.website_session_id;
CREATE TEMPORARY TABLE first_test_pageviews
SELECT
website_pageviews.website_session_id,
MIN(website_pageviews.website_pageview_id) AS min_pageview_id
FROM website_pageviews
INNER JOIN website_sessions
ON website_sessions.website_session_id = website_pageviews.website_session_id
AND website_sessions.created_at < '2012-07-28' -- prescribed by the assignment restriction
AND website_pageviews.website_pageview_id >= 23504 -- first page_view id for lander, will choose date between this id and the statement above
AND utm_source = 'gsearch'
AND utm_campaign = 'nonbrand' #---------------------------join is required since we don't have the columns utm_source etc in website_pageview
GROUP BY
website_pageviews.website_session_id;
select * from first_test_pageviews;
-- next, we'll bring in the landing page to each session, like last time, but restricting to home or lander-1 this time
CREATE TEMPORARY TABLE nonbrand_test_sessions_w_landing_pages
SELECT
first_test_pageviews.website_session_id,
website_pageviews.pageview_url AS landing_page
FROM first_test_pageviews
LEFT JOIN website_pageviews
ON website_pageviews.website_pageview_id = first_test_pageviews.min_pageview_id
WHERE website_pageviews.pageview_url IN ('/home','/lander-1'); #---------------------------------------------------why did we choose home
select * from nonbrand_test_sessions_w_landing_pages;
-- then we make a table to bring in orders
CREATE TEMPORARY TABLE nonbrand_test_sessions_w_orders
SELECT
nonbrand_test_sessions_w_landing_pages.website_session_id,
nonbrand_test_sessions_w_landing_pages.landing_page,
orders.order_id AS order_id
FROM nonbrand_test_sessions_w_landing_pages
LEFT JOIN orders
ON orders.website_session_id = nonbrand_test_sessions_w_landing_pages.website_session_id
;
select * from nonbrand_test_sessions_w_orders;
-- to find the difference between conversion rates
SELECT
landing_page,
COUNT(DISTINCT website_session_id) AS sessions,
COUNT(DISTINCT order_id) AS orders,
COUNT(DISTINCT order_id)/COUNT(DISTINCT website_session_id) AS conv_rate
FROM nonbrand_test_sessions_w_orders
GROUP BY 1;
-- .0319 for /home, vs .0406 for /lander-1
-- .0087 additional orders per session
-- finding the most reent pageview for gsearch nonbrand where the traffic was sent to /home
SELECT
MAX(website_sessions.website_session_id) AS most_recent_gsearch_nonbrand_home_pageview
FROM website_sessions
LEFT JOIN website_pageviews
ON website_pageviews.website_session_id = website_sessions.website_session_id
WHERE utm_source = 'gsearch'
AND utm_campaign = 'nonbrand'
AND pageview_url = '/home'
AND website_sessions.created_at < '2012-11-27'
;
-- max website_session_id = 17145, since then all the traffic is rerouted to else where/site
SELECT
COUNT(website_session_id) AS sessions_since_test
FROM website_sessions
WHERE created_at < '2012-11-27'
AND website_session_id > 17145 -- last /home session
AND utm_source = 'gsearch'
AND utm_campaign = 'nonbrand'
;
-- 22,972 website sessions since the test
-- X .0087 incremental conversion = 202 incremental orders since 7/29
-- roughly 4 months, so roughly 50 extra orders per month. Not b
-- ############################################## why are we mulitplying it with .0087, if we do this we are assuming that there are no other urls except lander-1 and all theo orders are then via lander-1
SELECT
website_sessions.website_session_id,
website_pageviews.pageview_url,
-- website_pageviews.created_at AS pageview_created_at,
CASE WHEN pageview_url = '/home' THEN 1 ELSE 0 END AS homepage,
CASE WHEN pageview_url = '/lander-1' THEN 1 ELSE 0 END AS custom_lander,
CASE WHEN pageview_url = '/products' THEN 1 ELSE 0 END AS products_page,
CASE WHEN pageview_url = '/the-original-mr-fuzzy' THEN 1 ELSE 0 END AS mrfuzzy_page,
CASE WHEN pageview_url = '/cart' THEN 1 ELSE 0 END AS cart_page,
CASE WHEN pageview_url = '/shipping' THEN 1 ELSE 0 END AS shipping_page,
CASE WHEN pageview_url = '/billing' THEN 1 ELSE 0 END AS billing_page,
CASE WHEN pageview_url = '/thank-you-for-your-order' THEN 1 ELSE 0 END AS thankyou_page
FROM website_sessions
LEFT JOIN website_pageviews
ON website_sessions.website_session_id = website_pageviews.website_session_id
WHERE website_sessions.utm_source = 'gsearch'
AND website_sessions.utm_campaign = 'nonbrand'
AND website_sessions.created_at < '2012-07-28'
AND website_sessions.created_at > '2012-06-19'
ORDER BY
website_sessions.website_session_id,
website_pageviews.created_at;
CREATE TEMPORARY TABLE session_level_made_it_flagged
SELECT
website_session_id,
MAX(homepage) AS saw_homepage,
MAX(custom_lander) AS saw_custom_lander,
MAX(products_page) AS product_made_it,
MAX(mrfuzzy_page) AS mrfuzzy_made_it,
MAX(cart_page) AS cart_made_it,
MAX(shipping_page) AS shipping_made_it,
MAX(billing_page) AS billing_made_it,
MAX(thankyou_page) AS thankyou_made_it
FROM(
SELECT
website_sessions.website_session_id,
website_pageviews.pageview_url,
-- website_pageviews.created_at AS pageview_created_at,
CASE WHEN pageview_url = '/home' THEN 1 ELSE 0 END AS homepage,
CASE WHEN pageview_url = '/lander-1' THEN 1 ELSE 0 END AS custom_lander,
CASE WHEN pageview_url = '/products' THEN 1 ELSE 0 END AS products_page,
CASE WHEN pageview_url = '/the-original-mr-fuzzy' THEN 1 ELSE 0 END AS mrfuzzy_page,
CASE WHEN pageview_url = '/cart' THEN 1 ELSE 0 END AS cart_page,
CASE WHEN pageview_url = '/shipping' THEN 1 ELSE 0 END AS shipping_page,
CASE WHEN pageview_url = '/billing' THEN 1 ELSE 0 END AS billing_page,
CASE WHEN pageview_url = '/thank-you-for-your-order' THEN 1 ELSE 0 END AS thankyou_page
FROM website_sessions
LEFT JOIN website_pageviews
ON website_sessions.website_session_id = website_pageviews.website_session_id
WHERE website_sessions.utm_source = 'gsearch'
AND website_sessions.utm_campaign = 'nonbrand'
AND website_sessions.created_at < '2012-07-28'
AND website_sessions.created_at > '2012-06-19'
ORDER BY
website_sessions.website_session_id,
website_pageviews.created_at
) AS pageview_level
GROUP BY
website_session_id
;
-- then this would produce the final output, part 1
SELECT
CASE
WHEN saw_homepage = 1 THEN 'saw_homepage'
WHEN saw_custom_lander = 1 THEN 'saw_custom_lander' -- these urls are mutually exclusive so we don't have to worry about that someone might have visited both pages - that is not possible
-- cz we ran a split test , can you explain that ???????????????????????????
ELSE 'uh oh... check logic' #--------------------------------- why can't we get others except these ? why would be check logic
END AS segment,
COUNT(DISTINCT website_session_id) AS sessions,
COUNT(DISTINCT CASE WHEN product_made_it = 1 THEN website_session_id ELSE NULL END) AS to_products,
COUNT(DISTINCT CASE WHEN mrfuzzy_made_it = 1 THEN website_session_id ELSE NULL END) AS to_mrfuzzy,
COUNT(DISTINCT CASE WHEN cart_made_it = 1 THEN website_session_id ELSE NULL END) AS to_cart,
COUNT(DISTINCT CASE WHEN shipping_made_it = 1 THEN website_session_id ELSE NULL END) AS to_shipping,
COUNT(DISTINCT CASE WHEN billing_made_it = 1 THEN website_session_id ELSE NULL END) AS to_billing,
COUNT(DISTINCT CASE WHEN thankyou_made_it = 1 THEN website_session_id ELSE NULL END) AS to_thankyou
FROM session_level_made_it_flagged
GROUP BY 1
;
-- then this as final output part 2 - click rates
SELECT
CASE
WHEN saw_homepage = 1 THEN 'saw_homepage'
WHEN saw_custom_lander = 1 THEN 'saw_custom_lander'
ELSE 'uh oh... check logic'
END AS segment,
COUNT(DISTINCT CASE WHEN product_made_it = 1 THEN website_session_id ELSE NULL END)/COUNT(DISTINCT website_session_id) AS lander_click_rt,
COUNT(DISTINCT CASE WHEN mrfuzzy_made_it = 1 THEN website_session_id ELSE NULL END)/COUNT(DISTINCT CASE WHEN product_made_it = 1 THEN website_session_id ELSE NULL END) AS products_click_rt,
COUNT(DISTINCT CASE WHEN cart_made_it = 1 THEN website_session_id ELSE NULL END)/COUNT(DISTINCT CASE WHEN mrfuzzy_made_it = 1 THEN website_session_id ELSE NULL END) AS mrfuzzy_click_rt,
COUNT(DISTINCT CASE WHEN shipping_made_it = 1 THEN website_session_id ELSE NULL END)/COUNT(DISTINCT CASE WHEN cart_made_it = 1 THEN website_session_id ELSE NULL END) AS cart_click_rt,
COUNT(DISTINCT CASE WHEN billing_made_it = 1 THEN website_session_id ELSE NULL END)/COUNT(DISTINCT CASE WHEN shipping_made_it = 1 THEN website_session_id ELSE NULL END) AS shipping_click_rt,
COUNT(DISTINCT CASE WHEN thankyou_made_it = 1 THEN website_session_id ELSE NULL END)/COUNT(DISTINCT CASE WHEN billing_made_it = 1 THEN website_session_id ELSE NULL END) AS billing_click_rt
FROM session_level_made_it_flagged
GROUP BY 1
;
/*
8. I’d love for you to quantify the impact of our billing test, as well. Please analyze the lift generated
from the test (Sep 10 – Nov 10), in terms of revenue per billing page session, and then pull the number
of billing page sessions for the past month to understand monthly impact.
*/
SELECT
website_pageviews.website_session_id,
website_pageviews.pageview_url AS billing_version_seen,
orders.order_id,
orders.price_usd
FROM website_pageviews
LEFT JOIN orders
ON orders.website_session_id = website_pageviews.website_session_id
WHERE website_pageviews.created_at > '2012-09-10' -- prescribed in assignment
AND website_pageviews.created_at < '2012-11-10' -- prescribed in assignment
AND website_pageviews.pageview_url IN ('/billing','/billing-2'); -- # it can't choose both since it can be either billing 1 or billing 2?????????????????????????
SELECT
billing_version_seen,
COUNT(DISTINCT website_session_id) AS sessions,
SUM(price_usd)/COUNT(DISTINCT website_session_id) AS revenue_per_billing_page_seen
FROM(
SELECT
website_pageviews.website_session_id,
website_pageviews.pageview_url AS billing_version_seen,
orders.order_id,
orders.price_usd
FROM website_pageviews
LEFT JOIN orders
ON orders.website_session_id = website_pageviews.website_session_id
WHERE website_pageviews.created_at > '2012-09-10' -- prescribed in assignment
AND website_pageviews.created_at < '2012-11-10' -- prescribed in assignment
AND website_pageviews.pageview_url IN ('/billing','/billing-2')
) AS billing_pageviews_and_order_data
GROUP BY 1
;
-- $22.83 revenue per billing page seen for the old version
-- $31.34 for the new version
-- LIFT: $8.51 per billing page view
SELECT
COUNT(website_session_id) AS billing_sessions_past_month
FROM website_pageviews
WHERE website_pageviews.pageview_url IN ('/billing','/billing-2')
AND created_at BETWEEN '2012-10-27' AND '2012-11-27' -- past month
-- 1,194 billing sessions past month
-- LIFT: $8.51 per billing session
-- VALUE OF BILLING TEST: $10,160 over the past month
-- but this 1,194 can be billing 1 or billing to so why are we multiplitying it with 8.51, that would mean we are assumin that all 1194 billing pages are now billing-2 ??????????????????