diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 53daeb46..bba3789d 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.14.1 +current_version = 0.14.2 parse = (?P\d+)\.(?P\d+)\.(?P\d+)((?P(a|na))+(?P\d+))? serialize = {major}.{minor}.{patch}{release}{build} diff --git a/.github/workflows/ci-production.yml b/.github/workflows/ci-production.yml index d73609ea..87457cf0 100644 --- a/.github/workflows/ci-production.yml +++ b/.github/workflows/ci-production.yml @@ -31,7 +31,7 @@ jobs: - name: create package run: python setup.py sdist - name: import open-mastr - run: python -m pip install ./dist/open_mastr-0.14.1.tar.gz + run: python -m pip install ./dist/open_mastr-0.14.2.tar.gz - name: Create credentials file env: MASTR_TOKEN: ${{ secrets.MASTR_TOKEN }} diff --git a/.gitignore b/.gitignore index 41689726..89d325b4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +# own testing files +tmptest.py + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 764b0476..821c6237 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,4 +3,4 @@ repos: rev: 22.6.0 hooks: - id: black - language_version: python3.10 + language_version: python3.11 diff --git a/CHANGELOG.md b/CHANGELOG.md index bc553f16..2982a01c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,17 @@ For each version important additions, changes and removals are listed here. The format is inspired from [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and the versioning aims to respect [Semantic Versioning](http://semver.org/spec/v2.0.0.html). 
-## [v0.14.1] Hotfix - 2023-01-17 +## [v0.14.2] Maintenance - 2024-04-10 +### Changed +- Fix and add URLs of example projects in readme [#481](https://github.com/OpenEnergyPlatform/open-MaStR/pull/481) +- No longer require web scraping for bulk download [#488](https://github.com/OpenEnergyPlatform/open-MaStR/pull/488) +- Replace deprecated pandas map function [#491](https://github.com/OpenEnergyPlatform/open-MaStR/pull/491) +- Fix the handling of corrupted xml syntax in the downloaded files [#494](https://github.com/OpenEnergyPlatform/open-MaStR/pull/494) +- Implement relevant API WSDL Patchnotes V24.1.128 [#499](https://github.com/OpenEnergyPlatform/open-MaStR/pull/499) +### Removed +- Remove unused Docker File [#501](https://github.com/OpenEnergyPlatform/open-MaStR/pull/501) + +## [v0.14.1] Hotfix - 2024-01-17 ### Changed - Change data type of NetzbetreiberpruefungStatus to string [#483](https://github.com/OpenEnergyPlatform/open-MaStR/pull/483) diff --git a/CITATION.cff b/CITATION.cff index 90399709..69b1ae0a 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -28,7 +28,7 @@ authors: title: "open-MaStR" type: software license: AGPL-3.0 -version: 0.14.1 +version: 0.14.2 doi: -date-released: 2024-01-17 +date-released: 2024-04-10 url: "https://github.com/OpenEnergyPlatform/open-MaStR/" diff --git a/README.rst b/README.rst index 6c71c411..8f28a180 100644 --- a/README.rst +++ b/README.rst @@ -110,8 +110,8 @@ changes in a `Pull Request `_ - `Wasserstoffatlas `_ -- `EE-Status App `_ - +- `EE-Status App `_ +- `Digiplan Anhalt `_ Collaboration diff --git a/open_mastr/utils/Dockerfile.postgis b/open_mastr/utils/Dockerfile.postgis deleted file mode 100644 index a6471ca0..00000000 --- a/open_mastr/utils/Dockerfile.postgis +++ /dev/null @@ -1,7 +0,0 @@ -FROM postgres:12 - -RUN apt-get update -RUN apt-get install -y postgresql-12-postgis-3 - - - diff --git a/open_mastr/utils/constants.py b/open_mastr/utils/constants.py index 52e34917..c1b2cc22 100644 --- 
a/open_mastr/utils/constants.py +++ b/open_mastr/utils/constants.py @@ -310,7 +310,7 @@ "AuflagenAbschaltungTierschutz": "requirementShutdownAnimalProtection", "AnlagenkennzifferAnlagenregister_nv": "plantIdentificationNumberRegister_nv", "BiogasDatumLeistungserhoehung": "biogasCapacityIncreaseDate", - "InAnspruchGenommeneAckerflaeche": "areaOfAgriculturalLandInUse", + "InAnspruchGenommeneLandwirtschaftlichGenutzteFlaeche": "areaOfAgriculturalLandInUse", "Aktenzeichen": "fileReference", "NetzbetreiberpruefungStatus": "gridOperatorCheckStatus", "AnlageBetriebsstatus": "plantOperatingStatus", @@ -511,4 +511,8 @@ "MastrNummer": "mastrNumber", "Kuestenentfernung": "distanceToCoast", "eegAusschreibungZuschlag": "eegAuctionBidAward", + "DatumUeberfuehrungInReserve": "dateTransferToReserve", + "ReserveartNachDemEnWG": "typeOfReserveFromEnWG", + "WebportalDesNetzbetreibers": "webPortalGridOperator", + "RegisternummerPraefix": "registerNumberPrefix", } diff --git a/open_mastr/utils/orm.py b/open_mastr/utils/orm.py index efe24810..5225f2cb 100644 --- a/open_mastr/utils/orm.py +++ b/open_mastr/utils/orm.py @@ -134,6 +134,8 @@ class Extended(object): PraequalifiziertFuerRegelenergie = Column(Boolean) GenMastrNummer = Column(String) Netzbetreiberzuordnungen = Column(String) + ReserveartNachDemEnWG = Column(String) + DatumUeberfuehrungInReserve = Column(Date) # from bulk download Hausnummer_nv = Column(Boolean) Weic_nv = Column(Boolean) @@ -185,7 +187,7 @@ class SolarExtended(Extended, ParentAllTables, Base): NebenausrichtungNeigungswinkel = Column(String) InAnspruchGenommeneFlaeche = Column(Float) ArtDerFlaeche = Column(String) - InAnspruchGenommeneAckerflaeche = Column(Float) + InAnspruchGenommeneLandwirtschaftlichGenutzteFlaeche = Column(Float) Nutzungsbereich = Column(String) Buergerenergie = Column(Boolean) EegMastrNummer = Column(String) @@ -202,16 +204,12 @@ class BiomassExtended(Extended, ParentAllTables, Base): EegMastrNummer = Column(String) KwkMastrNummer = 
Column(String) - class CombustionExtended(Extended, ParentAllTables, Base): __tablename__ = "combustion_extended" NameKraftwerk = Column(String) NameKraftwerksblock = Column(String) DatumBaubeginn = Column(Date) - AnzeigeEinerStilllegung = Column(Boolean) - ArtDerStilllegung = Column(String) - DatumBeginnVorlaeufigenOderEndgueltigenStilllegung = Column(Date) SteigerungNettonennleistungKombibetrieb = Column(Float) AnlageIstImKombibetrieb = Column(Boolean) MastrNummernKombibetrieb = Column(String) @@ -230,7 +228,6 @@ class CombustionExtended(Extended, ParentAllTables, Base): Technologie = Column(String) AusschliesslicheVerwendungImKombibetrieb = Column(Boolean) - class GsgkExtended(Extended, ParentAllTables, Base): __tablename__ = "gsgk_extended" @@ -244,9 +241,6 @@ class HydroExtended(Extended, ParentAllTables, Base): NameKraftwerk = Column(String) ArtDerWasserkraftanlage = Column(String) - AnzeigeEinerStilllegung = Column(Boolean) - ArtDerStilllegung = Column(String) - DatumBeginnVorlaeufigenOderEndgueltigenStilllegung = Column(Date) MinderungStromerzeugung = Column(Boolean) BestandteilGrenzkraftwerk = Column(Boolean) NettonennleistungDeutschland = Column(Float) @@ -274,7 +268,7 @@ class StorageExtended(Extended, ParentAllTables, Base): Notstromaggregat = Column(Boolean) BestandteilGrenzkraftwerk = Column(Boolean) NettonennleistungDeutschland = Column(Float) - ZugeordnenteWirkleistungWechselrichter = Column(Float) + ZugeordneteWirkleistungWechselrichter = Column(Float) NutzbareSpeicherkapazitaet = Column(Float) SpeMastrNummer = Column(String) EegMastrNummer = Column(String) @@ -510,6 +504,7 @@ class GasStorageExtended(ParentAllTables, Base): DatumBeginnVoruebergehendeStilllegung = Column(Date) DatumDesBetreiberwechsels = Column(Date) DatumRegistrierungDesBetreiberwechsels = Column(Date) + DatumEndgueltigeStilllegung = Column(Date) class StorageUnits(ParentAllTables, Base): @@ -570,6 +565,7 @@ class GasProducer(ParentAllTables, Base): FlurFlurstuecknummern = 
Column(String) GeplantesInbetriebnahmedatum = Column(Date) DatumBeginnVoruebergehendeStilllegung = Column(Date) + DatumEndgueltigeStilllegung = Column(Date) class GasConsumer(ParentAllTables, Base): @@ -734,6 +730,8 @@ class MarketActors(ParentAllTables, Base): Stromgrosshaendler = Column(Boolean) MarktakteurVorname = Column(String) MarktakteurNachname = Column(String) + WebportalDesNetzbetreibers = Column(String) + RegisternummerPraefix = Column(String) class Grids(ParentAllTables, Base): diff --git a/open_mastr/xml_download/utils_cleansing_bulk.py b/open_mastr/xml_download/utils_cleansing_bulk.py index 14be3418..2a304ef0 100644 --- a/open_mastr/xml_download/utils_cleansing_bulk.py +++ b/open_mastr/xml_download/utils_cleansing_bulk.py @@ -44,7 +44,7 @@ def replace_mastr_katalogeintraege( .apply(lambda x: x.str.strip()) .replace("", None) .astype("Int64") - .applymap(katalogwerte.get) + .map(katalogwerte.get) .agg(lambda d: ",".join(i for i in d if isinstance(i, str)), axis=1) .replace("", None) ) diff --git a/open_mastr/xml_download/utils_download_bulk.py b/open_mastr/xml_download/utils_download_bulk.py index 11f67eb8..319d1c50 100644 --- a/open_mastr/xml_download/utils_download_bulk.py +++ b/open_mastr/xml_download/utils_download_bulk.py @@ -5,7 +5,6 @@ import numpy as np import requests -from bs4 import BeautifulSoup from tqdm import tqdm # setup logger @@ -13,22 +12,66 @@ log = setup_logger() +def gen_version(when: time.struct_time = time.localtime()) -> str: + """ + Generates the current version. + + The version number is determined according to a fixed release cycle, + which is by convention in sync with the changes to other german regulatory + frameworks of the energy sector such as GeLI Gas and GPKE. + + The release schedule is twice per year on 1st of April and October. + The version number is determined by the year of release and the running + number of the release, i.e. the release on April 1st is release 1, + while the release in October is release 2. 
-def get_url_from_Mastr_website() -> str: - """Get the url of the latest MaStR file from markstammdatenregister.de. + Further, the release happens during the day, so on the day of the + changeover, the exported data will still be in the old version/format. - The file and the corresponding url are updated once per day. - The url has a randomly generated string appended, so it has to be - grabbed from the marktstammdatenregister.de homepage. - For further details visit https://www.marktstammdatenregister.de/MaStR/Datendownload + see + + Examples: + 2024-01-01 = version 23.2 + 2024-04-01 = version 23.2 + 2024-04-02 = version 24.1 + 2024-09-30 = version 24.1 + 2024-10-01 = version 24.1 + 2024-10-02 = version 24.2 + 2024-12-31 = version 24.2 """ - html = requests.get("https://www.marktstammdatenregister.de/MaStR/Datendownload") - soup = BeautifulSoup(html.text, "lxml") - # find the download button element on the website - element = soup.find_all("a", "btn btn-primary text-right")[0] - # extract the url from the html element - return str(element).split('href="')[1].split('" title')[0] + year = when.tm_year + release = 1 + + if when.tm_mon < 4 or (when.tm_mon == 4 and when.tm_mday == 1): + year = year - 1 + release = 2 + elif when.tm_mon > 10 or (when.tm_mon == 10 and when.tm_mday > 1): + release = 2 + + # only the last two digits of the year are used + year = str(year)[-2:] + + return f'{year}.{release}' + +def gen_url(when: time.struct_time = time.localtime()) -> str: + """ + Generates the download URL for the specified date. + + Note that not all dates are archived on the website. + Normally only today is available, the export is usually made + between 02:00 and 04:00, which means before 04:00 the current data may not + yet be available and the download could fail. 
+ + Note also that this function will not be able to generate URLs for dates + before 2024 because a different URL scheme was used then which had some random + data embedded in the name to make it harder to automate downloads. + """ + + version = gen_version(when) + date = time.strftime("%Y%m%d", when) + + return f'https://download.marktstammdatenregister.de/Gesamtdatenexport_{date}_{version}.zip' def download_xml_Mastr( @@ -69,9 +112,21 @@ def download_xml_Mastr( " You may want to download it another time." ) print(print_message) - url = get_url_from_Mastr_website() + + now = time.localtime() + url = gen_url(now) + time_a = time.perf_counter() r = requests.get(url, stream=True) + if r.status_code == 404: + # presumably todays download is not ready yet, retry with yesterdays date + log.warning("Download file was not found. Assuming that the new file was not published yet and retrying with yesterday.") + now = time.localtime(time.mktime(now) - (24 * 60 * 60)) # subtract 1 day from the date + r = requests.get(url, stream=True) + if r.status_code == 404: + log.error("Could not download file: download URL not found") + return + total_length = int(18000 * 1024 * 1024) with open(save_path, "wb") as zfile, tqdm( desc=save_path, total=(total_length / 1024 / 1024), unit="" diff --git a/open_mastr/xml_download/utils_write_to_database.py b/open_mastr/xml_download/utils_write_to_database.py index 709407df..9dba5027 100644 --- a/open_mastr/xml_download/utils_write_to_database.py +++ b/open_mastr/xml_download/utils_write_to_database.py @@ -12,6 +12,7 @@ from open_mastr.utils.helpers import data_to_include_tables from open_mastr.utils.orm import tablename_mapping from open_mastr.xml_download.utils_cleansing_bulk import cleanse_bulk_data +from io import StringIO def write_mastr_xml_to_database( @@ -156,7 +157,7 @@ def preprocess_table_for_writing_to_database( try: df = pd.read_xml(data, encoding="UTF-16", compression="zip") except lxml.etree.XMLSyntaxError as err: - df = 
handle_xml_syntax_error(data, err) + df = handle_xml_syntax_error(data.decode("utf-16"), err) df = add_zero_as_first_character_for_too_short_string(df) df = change_column_names_to_orm_format(df, xml_tablename) @@ -335,19 +336,19 @@ def add_missing_columns_to_table( ) -def delete_wrong_xml_entry(err: Error, df: pd.DataFrame) -> None: +def delete_wrong_xml_entry(err: Error, df: pd.DataFrame) -> pd.DataFrame: delete_entry = str(err).split("«")[0].split("»")[1] print(f"The entry {delete_entry} was deleted due to its false data type.") - df = df.replace(delete_entry, np.nan) + return df.replace(delete_entry, np.nan) -def handle_xml_syntax_error(data: bytes, err: Error) -> pd.DataFrame: +def handle_xml_syntax_error(data: str, err: Error) -> pd.DataFrame: """Deletes entries that cause an xml syntax error and produces DataFrame. Parameters ----------- - data : bytes - Unzipped xml data + data : str + Decoded xml file as one string err : ErrorMessage Error message that appeared when trying to use pd.read_xml on invalid xml file. @@ -356,25 +357,31 @@ def handle_xml_syntax_error(data: bytes, err: Error) -> pd.DataFrame: df : pandas.DataFrame DataFrame which is read from the changed xml data. 
""" - wrong_char_position = int(str(err).split()[-4]) - decoded_data = data.decode("utf-16") - loop_condition = True - - shift = 0 - while loop_condition: - evaluated_string = decoded_data[wrong_char_position + shift] - if evaluated_string == ">": - start_char = wrong_char_position + shift + 1 - break - else: - shift -= 1 - loop_condition_2 = True - while loop_condition_2: - evaluated_string = decoded_data[start_char] - if evaluated_string == "<": - break - else: - decoded_data = decoded_data[:start_char] + decoded_data[start_char + 1 :] - df = pd.read_xml(decoded_data) - print("One invalid xml expression was deleted.") - return df + + def find_nearest_brackets(xml_string: str, position: int) -> tuple[int, int]: + left_bracket_position = xml_string.rfind(">", 0, position) + right_bracket_position = xml_string.find("<", position) + return left_bracket_position, right_bracket_position + + data = data.splitlines() + + for _ in range(100): + # check for maximum of 100 syntax errors, otherwise return an error + wrong_char_row, wrong_char_column = err.position + row_with_error = data[wrong_char_row - 1] + + left_bracket, right_bracket = find_nearest_brackets( + row_with_error, wrong_char_column + ) + data[wrong_char_row - 1] = ( + row_with_error[: left_bracket + 1] + row_with_error[right_bracket:] + ) + try: + print("One invalid xml expression was deleted.") + df = pd.read_xml(StringIO("\n".join(data))) + return df + except lxml.etree.XMLSyntaxError as e: + err = e + continue + + raise Error("An error occured when parsing the xml file. 
Maybe it is corrupted?") diff --git a/setup.py b/setup.py index 738a61d4..870cd9a6 100644 --- a/setup.py +++ b/setup.py @@ -19,14 +19,14 @@ "open_mastr.utils.config", "open_mastr.xml_download", ], - version="0.14.1", + version="0.14.2", description="A package that provides an interface for downloading and" " processing the data of the Marktstammdatenregister (MaStR)", long_description=long_description, long_description_content_type="text/x-rst", url="https://github.com/OpenEnergyPlatform/open-MaStR", download_url="https://github.com/OpenEnergyPlatform/open-MaStR/archive" - "/refs/tags/v0.14.1.tar.gz", + "/refs/tags/v0.14.2.tar.gz", author="Open Energy Family", author_email="datenzentrum@rl-institut.de", maintainer="Ludwig Hülk", @@ -43,7 +43,7 @@ ], python_requires=">=3.8, <4", # 3.8 is needed for pandas 1.4 install_requires=[ - "pandas>=1.4", # pandas 1.4 is needed for pd.read_xml + "pandas>=2.1", # pandas 2.1 is needed for dataframe.map() "numpy", "sqlalchemy", "psycopg2-binary", @@ -52,7 +52,6 @@ "requests", "keyring", "tqdm", - "beautifulsoup4", "pyyaml", "xmltodict", ], @@ -69,6 +68,7 @@ "mkdocs-material", "mkdocs-include-markdown-plugin", "mike", + "black", ] }, package_data={ diff --git a/tests/xml_download/test_utils_download_bulk.py b/tests/xml_download/test_utils_download_bulk.py index 4be2b711..3fe351f6 100644 --- a/tests/xml_download/test_utils_download_bulk.py +++ b/tests/xml_download/test_utils_download_bulk.py @@ -1,8 +1,33 @@ -from open_mastr.xml_download.utils_download_bulk import get_url_from_Mastr_website +import time +from open_mastr.xml_download.utils_download_bulk import gen_url +def test_gen_url(): + when = time.strptime("2024-01-01", "%Y-%m-%d") + url = gen_url(when) + assert type(url) == str + assert url == "https://download.marktstammdatenregister.de/Gesamtdatenexport_20240101_23.2.zip" + + when = time.strptime("2024-04-01", "%Y-%m-%d") + url = gen_url(when) + assert type(url) == str + assert url == 
"https://download.marktstammdatenregister.de/Gesamtdatenexport_20240401_23.2.zip" + + when = time.strptime("2024-04-02", "%Y-%m-%d") + url = gen_url(when) + assert type(url) == str + assert url == "https://download.marktstammdatenregister.de/Gesamtdatenexport_20240402_24.1.zip" + + when = time.strptime("2024-10-01", "%Y-%m-%d") + url = gen_url(when) + assert type(url) == str + assert url == "https://download.marktstammdatenregister.de/Gesamtdatenexport_20241001_24.1.zip" + + when = time.strptime("2024-10-02", "%Y-%m-%d") + url = gen_url(when) + assert type(url) == str + assert url == "https://download.marktstammdatenregister.de/Gesamtdatenexport_20241002_24.2.zip" -def test_get_url_from_Mastr_website(): - url = get_url_from_Mastr_website() - assert len(url) > 10 + when = time.strptime("2024-12-31", "%Y-%m-%d") + url = gen_url(when) assert type(url) == str - assert "marktstammdaten" in url + assert url == "https://download.marktstammdatenregister.de/Gesamtdatenexport_20241231_24.2.zip"