-
Notifications
You must be signed in to change notification settings - Fork 0
/
export_accidents.py
124 lines (86 loc) · 4.04 KB
/
export_accidents.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import datetime
import logging
import zipfile
import pandas as pd
import requests
from pyogrio import read_dataframe
from pyogrio.errors import DataSourceError
# Inclusive range of years processed by main(); the Unfallatlas data used
# here covers accident years 2016 through 2022.
MIN_YEAR = 2016
MAX_YEAR = 2022
def get_geojson_path(year):
    """Return the per-year output path for the filtered GeoJSON file."""
    return "./data/accidents/accidents_{}.geojson".format(year)
def get_shp_path(year):
    """Return the path of the extracted .shp file for *year*.

    The archive layout changed over the years: the 2016 release uses a
    different file name, and releases after 2021 store the shapefile in
    a folder called "shp" instead of "Shapefile".
    """
    base = get_zip_extract_directory_path(year)
    if year == 2016:
        # for some reason 2016 has another _ in the .shp file name
        return f"{base}/Shapefile/Unfaelle_{year}_LinRef.shp"
    if year > 2021:
        # after 2021 the accidents in the zip file are stored in a folder called "shp"
        return f"{base}/shp/Unfallorte{year}_LinRef.shp"
    # for years 2017 to 2020
    return f"{base}/Shapefile/Unfallorte{year}_LinRef.shp"
def get_zip_path(year):
    """Return the temp-file location the raw archive for *year* is saved to."""
    return "./data/temp/" + str(year) + ".zip"
def get_zip_extract_directory_path(year):
    """Return the temp directory the archive for *year* is extracted into."""
    return "./data/temp/{}".format(year)
def get_url_by_year(year):
    """Build the opengeodata.nrw.de download URL of the Shape archive for *year*."""
    # example (CSV variant of the same dataset):
    # https://www.opengeodata.nrw.de/produkte/transport_verkehr/unfallatlas/Unfallorte2022_EPSG25832_CSV.zip
    base = "https://www.opengeodata.nrw.de/produkte/transport_verkehr/unfallatlas"
    return f"{base}/Unfallorte{year}_EPSG25832_Shape.zip"
def download_zip(year):
    """Download the accident archive for *year* into the temp directory.

    Returns True on success, False when the server responds with a
    non-200 status code.
    """
    logging.info("Downloading .zip file.")
    url = get_url_by_year(year)
    output_file = get_zip_path(year)
    # fix: without a timeout, a stalled connection would hang the whole
    # export forever; 60s is generous for these archives
    response = requests.get(url, timeout=60)
    if response.status_code != 200:
        return False
    with open(output_file, 'wb') as file:
        file.write(response.content)
    return True
def extract_zip(year):
    """Unpack the downloaded archive for *year* into its temp directory."""
    logging.info("Extracting .zip file.")
    archive_path = get_zip_path(year)
    target_dir = get_zip_extract_directory_path(year)
    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall(target_dir)
def process_shp_file_of_year(year):
    """Read the shapefile for *year* and reduce it to bicycle accidents
    in Hamburg and Dresden, reprojected to EPSG:4326.

    Raises FileNotFoundError when the expected .shp file is missing
    (usually because the archive layout changed upstream).
    """
    logging.info("Reading .shp file.")
    shp_file_path = get_shp_path(year)
    try:
        # using pyogrio for better performance while reading large .shp files
        accidents = read_dataframe(shp_file_path)
    except DataSourceError:
        raise FileNotFoundError(f"The .shp file with the accident data for the year {year} could not be found. It is very likely that the 'Statistische Ämter' have changed the structure of the .zip files (containing the .shp file) they offer for download. This has already happened in the past. Please compare the structure of the folder `data/temp/{year}` with the given path {shp_file_path} (from get_shp_path({year})). Check the README.md for more information.")
    # see documentation (link in readme), 02 is code for hamburg, 14 for dresden
    logging.info("Selecting features from Hamburg and Dresden.")
    in_hamburg = accidents[accidents["ULAND"] == "02"]
    in_dresden = accidents[accidents["ULAND"] == "14"]
    accidents = pd.concat([in_hamburg, in_dresden])
    logging.info("Selecting features where bike was involved.")
    accidents = accidents[accidents["IstRad"] != "0"]
    logging.info("Convert CRS to EPSG:4326")
    return accidents.to_crs("EPSG:4326")
def process_year(year):
    """Download, extract, filter and export the accident data of one year.

    Returns the filtered accident data frame, or None when the download
    failed.
    """
    if not download_zip(year):
        logging.error(f"Failed to download .shp file for year: {year}")
        return None
    extract_zip(year)
    accidents_of_year = process_shp_file_of_year(year)
    # fix: some releases (e.g. 2018) ship without an OBJECTID column;
    # errors='ignore' drops it only where present instead of hard-coding
    # exceptional years, so future layout changes don't crash the export
    accidents_of_year = accidents_of_year.drop('OBJECTID', axis=1, errors='ignore')
    accidents_of_year.to_file(get_geojson_path(year), driver="GeoJSON")
    return accidents_of_year
def main():
    """Process every year from MIN_YEAR to MAX_YEAR and write one combined
    GeoJSON file with all accidents."""
    # fix: without basicConfig the root logger defaults to WARNING, so every
    # logging.info call in this script was silently dropped
    logging.basicConfig(level=logging.INFO)
    accidents = []
    for year in range(MIN_YEAR, MAX_YEAR + 1):
        logging.info(f"Processing year: {year} / {MAX_YEAR}")
        accidents_of_year = process_year(year)
        if accidents_of_year is None:
            continue
        accidents.append(accidents_of_year)
    # fix: pd.concat raises ValueError on an empty list — bail out cleanly
    # when every download failed
    if not accidents:
        logging.error("No accident data was processed; nothing to export.")
        return
    accidents_total = pd.concat(accidents, ignore_index=True)
    accidents_total.to_file("./data/generated/accidents/accidents_total.geojson", driver="GeoJSON")


if __name__ == "__main__":
    main()