-
Notifications
You must be signed in to change notification settings - Fork 1
/
data_wrangling.py
71 lines (65 loc) · 2.59 KB
/
data_wrangling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import pandas as pd
def create_yearly_table(df):
"""reads in source csv file, transforms data and merges with locations table"""
df.rename(columns={"Zählstelle Inbetriebnahme": "timestamp"}, inplace=True)
df.timestamp = pd.to_datetime(df.timestamp).copy()
stacked = df.stack().reset_index()
merged = pd.merge(
stacked, df.timestamp.reset_index(), "left", left_on="level_0", right_on="index"
).drop(columns=["level_0", "index"])
temp_df = (
merged[merged.level_1 != "timestamp"]
.set_index("timestamp")
.rename(columns={"level_1": "station", 0: "total_bikes"})
)
temp_df["total_bikes"] = temp_df["total_bikes"].astype(int).copy()
temp_df.station = temp_df.station.str[:-11].copy()
temp_df.station = temp_df.station.str.strip().copy()
temp_df.station.replace("17-SZ-BRE-O", "17-SK-BRE-O", inplace=True)
temp_df.station.replace("17-SZ-BRE-W", "17-SK-BRE-W", inplace=True)
temp_df["hour"] = temp_df.index.hour
temp_df["hour_str"] = temp_df.hour.astype(str) + " Uhr"
temp_df["weekday"] = temp_df.index.weekday.copy()
temp_df["day_name"] = temp_df.index.day_name().copy()
temp_df["month"] = temp_df.index.month.copy()
temp_df["month_name"] = temp_df.index.month_name().copy()
temp_df["year"] = temp_df.index.year.copy()
final_geo = (
pd.merge(
temp_df.reset_index(),
locations.drop(columns="Installationsdatum"),
"left",
left_on="station",
right_on="Zählstelle",
)
.drop(columns=["Zählstelle"])
.rename(
columns={
"Beschreibung - Fahrtrichtung": "description",
"Breitengrad": "lat",
"Längengrad": "lon",
}
)
.set_index("timestamp")
)
return final_geo
def transform_concat_dataframes(dataframes_list):
"""transform all loaded dataframes and concat them"""
processed_dataframes = []
for dataframe in dataframes_list:
processed_dataframes.append(create_yearly_table(dataframe))
return pd.concat(processed_dataframes)
if __name__ == "__main__":
locations = pd.read_excel(
"gesamtdatei_stundenwerte_2012-2019.xlsx", sheet_name="Standortdaten"
)
dataframes = []
for year in range(2017, 2020):
dataframes.append(
pd.read_excel(
"gesamtdatei_stundenwerte_2012-2019.xlsx",
sheet_name=f"Jahresdatei {year}",
)
)
final_table = transform_concat_dataframes(dataframes)
final_table.to_csv("berlin_bikedata_2017-2019.csv")