Load test data in PySpark.py
# Databricks notebook source
# DBTITLE 1,List of States (single dimensional)
states: list = [
    'Colorado',
    'New York',
    'Georgia',
    'Tennessee',
    'Utah'
]
# COMMAND ----------
from pyspark.sql.types import StringType
states_df = spark.createDataFrame(states, schema=StringType()).toDF('State_Name')
states_df.printSchema()
display(states_df)
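# COMMAND ----------

# DBTITLE 1,Illustrative check on states_df
# A minimal sketch of querying the single-column DataFrame built above;
# the filter condition is only an example and is not part of the test data setup.
display(states_df.filter(states_df.State_Name.startswith('New')))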
# COMMAND ----------
# DBTITLE 1,Data as Key-Value pair
states_JSONs: list = [
    {'CO': 'Colorado'},
    {'NY': 'New York'},
    {'GA': 'Georgia'},
    {'TN': 'Tennessee'},
    {'UT': 'Utah'}
]
# COMMAND ----------
from pyspark.sql.types import StringType, StructType, StructField, MapType

# Alternative struct-based key/value schema, kept commented out:
# abbreviation_type = StructType(
#     [
#         StructField("Code", StringType(), False)
#     ]
# )
# state_name_type = StructType(
#     [
#         StructField("State_Name", StringType(), False)
#     ]
# )
# states_dict_schema = MapType(keyType=abbreviation_type, valueType=state_name_type, valueContainsNull=False)

# Each row is a plain {code: name} dict, so a string-to-string MapType fits
states_dict_schema = MapType(keyType=StringType(), valueType=StringType(), valueContainsNull=False)
states_json_df = spark.createDataFrame(data=states_JSONs, schema=states_dict_schema)
states_json_df.printSchema()
display(states_json_df)
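# COMMAND ----------

# DBTITLE 1,Flatten the map column (illustrative)
# A minimal sketch of expanding the single map column into Code/State_Name columns;
# 'exploded_states_df' is a name introduced here only for the example, and the input
# column is referenced by position since its auto-generated name is not set explicitly.
from pyspark.sql.functions import explode

exploded_states_df = states_json_df.select(
    explode(states_json_df[states_json_df.columns[0]]).alias('Code', 'State_Name')
)
display(exploded_states_df)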
# COMMAND ----------
# DBTITLE 1,States info JSON
states_key_values: list = [
    {'Code': 'CO', 'State_Name': 'Colorado', 'Airport': 'DEN'},
    {'Code': 'NY', 'State_Name': 'New York', 'Airport': 'JFK'},
    {'Code': 'GA', 'State_Name': 'Georgia', 'Airport': 'ATL'},
    {'Code': 'TN', 'State_Name': 'Tennessee', 'Airport': 'BNA'},
    {'Code': 'UT', 'State_Name': 'Utah', 'Airport': 'SLC'}
]
# COMMAND ----------
# Column names and types are inferred from the dict keys and values
states_and_abbreviations_df = spark.createDataFrame(data=states_key_values)
states_and_abbreviations_df.printSchema()
display(states_and_abbreviations_df)
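# COMMAND ----------

# DBTITLE 1,Query via a temp view (illustrative)
# A minimal sketch of exposing the inferred-schema DataFrame to SQL;
# the view name 'states_info' is an arbitrary choice made here for the example.
states_and_abbreviations_df.createOrReplaceTempView('states_info')
display(spark.sql('SELECT Code, State_Name, Airport FROM states_info ORDER BY State_Name'))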
# COMMAND ----------
# DBTITLE 1,List of States and their Capitals (explicit StructType schema)
state_and_capital_list: list = [
    ['Colorado', 'Denver'],
    ['New York', 'Albany'],
    ['Georgia', 'Atlanta'],
    ['Tennessee', 'Nashville'],
    ['Utah', 'Salt Lake City']
]
# COMMAND ----------
from pyspark.sql.types import StringType, StructType, StructField

simple_tuple_schema = StructType(
    [
        StructField("State", StringType(), nullable=False),
        StructField("Capital", StringType(), nullable=False)
    ]
)
state_and_capital_df = spark.createDataFrame(state_and_capital_list, schema=simple_tuple_schema)
state_and_capital_df.printSchema()
display(state_and_capital_df)
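# COMMAND ----------

# DBTITLE 1,Join the test DataFrames (illustrative)
# A minimal sketch joining capitals with codes/airports on the state name;
# 'state_overview_df' is a name introduced here only for the example.
state_overview_df = (
    state_and_capital_df
    .join(
        states_and_abbreviations_df,
        state_and_capital_df.State == states_and_abbreviations_df.State_Name,
        'inner'
    )
    .select('State', 'Capital', 'Code', 'Airport')
)
display(state_overview_df)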