-
Notifications
You must be signed in to change notification settings - Fork 0
/
Dispatched and Ordered
39 lines (35 loc) · 1.09 KB
/
Dispatched and Ordered
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# all dispatched orders only which also have ordered status
data = [
(1, "1-Jan", "Ordered"),
(1, "2-Jan", "dispatched"),
(1, "3-Jan", "dispatched"),
(1, "4-Jan", "Shipped"),
(1, "5-Jan", "Shipped"),
(1, "6-Jan", "Delivered"),
(2, "1-Jan", "Ordered"),
(2, "2-Jan", "dispatched"),
(2, "3-Jan", "shipped"),
(3, "4-Jan", "dispatched")]
myschema = ["orderid", "statusdate", "status"]
df = spark.createDataFrame(data,schema=myschema)
# df.show()
#Through SQL
df.createOrReplaceTempView("ordertab")
spark.sql("select * from ordertab where status = 'dispatched' and "
"orderid in(select orderid from ordertab where status = 'Ordered')").show()
#Through DSL
result = df.filter(
(col("status") == "dispatched") &
(col("orderid").isin(
*[row[0] for row in df.filter(col("status") == "Ordered").select("orderid").collect()]
))
)
# result.show()
# -----------------------my solution
df.persist()
df1 = df.filter("status = 'dispatched'")
df1.show()
df2 = df.filter("status = 'Ordered'")
df2.show()
result = df1.join(df2, "orderid", "inner")
result.show()