From 9243578757244484d37023f55aed7fec8b3077f7 Mon Sep 17 00:00:00 2001 From: Lucas Date: Mon, 29 May 2023 22:06:01 +0200 Subject: [PATCH] feat: use data from minio hosted s3 buckets --- requirements.txt | 1 - suisa_sendemeldung/acrclient.py | 52 ++++---- suisa_sendemeldung/suisa_sendemeldung.py | 65 +++++----- tests/test_acrclient.py | 48 +++---- tests/test_suisa_sendemeldung.py | 155 ++++++++++++----------- 5 files changed, 155 insertions(+), 166 deletions(-) diff --git a/requirements.txt b/requirements.txt index a264f2da..26fe5ef5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -acrclient==0.3.0 ConfigArgParse==1.5.3 iso3901==0.3.0.post1 openpyxl==3.1.2 diff --git a/suisa_sendemeldung/acrclient.py b/suisa_sendemeldung/acrclient.py index e9bfcd99..a195c71f 100644 --- a/suisa_sendemeldung/acrclient.py +++ b/suisa_sendemeldung/acrclient.py @@ -1,16 +1,20 @@ """module containing the ACRCloud client.""" from datetime import date, datetime, timedelta +import logging import pytz -from acrclient import Client +import requests from tqdm import tqdm -class ACRClient(Client): - """ACRCloud client to fetch metadata. +logger = logging.getLogger(__name__) + +class ACRClient: + """Fetches cached metadata from MinIO. Args: - bearer_token: The bearer token for ACRCloud. + minio_url: URL to a public MinIO bucket containing raw per-day JSON files. + timezone (optional): The timezone to use for localization. """ # format of timestamp in api answer @@ -18,63 +22,57 @@ class ACRClient(Client): # timezone of ACRCloud ACR_TIMEZONE = "UTC" - def __init__(self, bearer_token, base_url="https://eu-api-v2.acrcloud.com"): - super().__init__(bearer_token=bearer_token, base_url=base_url) + def __init__(self, minio_url: str, timezone=ACR_TIMEZONE): + self.minio_url = minio_url + self.timezone = timezone self.default_date = date.today() - timedelta(days=1) def get_data( - self, project_id, stream_id, requested_date=None, timezone=ACR_TIMEZONE + self, requested_date=None ): - """Fetch metadata from ACRCloud for `stream_id`. + """Fetch ACRCloud metadata from MinIO. Args: - project_id: The Project ID of the stream. - stream_id: The ID of the stream. requested_date (optional): The date of the entries you want (default: yesterday). - timezone (optional): The timezone to use for localization. Returns: json: The ACR data from date """ if requested_date is None: requested_date = self.default_date - data = self.get_bm_cs_projects_results( - project_id=project_id, - stream_id=stream_id, - params={ - "date": requested_date.strftime("%Y%m%d"), - }, - ) + url = f"{self.minio_url}{requested_date.strftime('%Y-%m-%d')}.json" + resp = requests.get(url, timeout=10) + if resp.ok: + data = resp.json() + else: # pragma: no cover + raise RuntimeError(f"💀 failed to load data from {url}") for entry in data: metadata = entry.get("metadata") ts_utc = pytz.utc.localize( datetime.strptime(metadata.get("timestamp_utc"), ACRClient.TS_FMT) ) - ts_local = ts_utc.astimezone(pytz.timezone(timezone)) + ts_local = ts_utc.astimezone(pytz.timezone(self.timezone)) metadata.update({"timestamp_local": ts_local.strftime(ACRClient.TS_FMT)}) return data def get_interval_data( - self, project_id, stream_id, start, end, timezone=ACR_TIMEZONE - ): # pylint: disable-msg=too-many-locals,too-many-arguments + self, start, end + ): """Get data specified by interval from start to end. Args: - project_id: The ID of the project. - stream_id: The ID of the stream. start: The start date of the interval. end: The end date of the interval. - timezone (optional): will be passed to `get_data()`. Returns: json: The ACR data from start to end. """ trim = False # if we have to localize the timestamps we may need more data - if timezone != ACRClient.ACR_TIMEZONE: + if self.timezone != ACRClient.ACR_TIMEZONE: # compute utc offset - offset = pytz.timezone(timezone).utcoffset(datetime.now()) + offset = pytz.timezone(self.timezone).utcoffset(datetime.now()) # decrease start by 1 day if we're ahead of utc if offset > timedelta(seconds=1): computed_start = start - timedelta(days=1) @@ -99,7 +97,7 @@ def get_interval_data( ljust_amount: int = 27 for ptr in tqdm(dates, desc="load ACRCloud data".ljust(ljust_amount)): data += self.get_data( - project_id, stream_id, requested_date=ptr, timezone=timezone + requested_date=ptr ) # if timestamps are localized we will have to removed the unneeded entries. diff --git a/suisa_sendemeldung/suisa_sendemeldung.py b/suisa_sendemeldung/suisa_sendemeldung.py index 2eef757a..9748f904 100644 --- a/suisa_sendemeldung/suisa_sendemeldung.py +++ b/suisa_sendemeldung/suisa_sendemeldung.py @@ -20,6 +20,7 @@ import cridlib import pytz +import requests from babel.dates import format_date from configargparse import ArgumentParser from dateutil.relativedelta import relativedelta @@ -89,21 +90,6 @@ def validate_arguments(parser, args): args: the arguments to validate """ msgs = [] - # check length of bearer_token - if not len(args.bearer_token) >= 32: - msgs.append( - "".join( - ( - "wrong format on bearer_token, ", - f"expected larger than 32 characters but got {len(args.bearer_token)}", - ) - ) - ) - # check length of stream_id - if not len(args.stream_id) == 9: - msgs.append( - f"wrong format on stream_id, expected 9 characters but got {len(args.stream_id)}" - ) # one output option has to be set if not (args.file or args.email or args.stdout): msgs.append( @@ -130,22 +116,25 @@ def get_arguments(parser: ArgumentParser): # pragma: no cover args: the parsed args from the parser """ parser.add_argument( - "--bearer-token", - env_var="BEARER_TOKEN", - help="the bearer token for ACRCloud (required)", - required=True, + "--minio", + dest="minio", + env_var="MINIO", + help="URL to MinIO", + default="https://minio.service.int.rabe.ch:9000", ) parser.add_argument( - "--project-id", - env_var="PROJECT_ID", - help="the id of the project at ACRCloud (required)", - required=True, + "--minio-raw-bucket", + dest="minio_raw", + env_var="MINIO_RAW_BUCKET", + help="world readable bucket with daily data exports from ACRCloud", + default="acrcloud.raw", ) parser.add_argument( - "--stream-id", - env_var="STREAM_ID", - help="the id of the stream at ACRCloud (required)", - required=True, + "--minio-music-bucket", + dest="minio_music", + env_var="MINIO_MUSIC_BUCKET", + help="world readable bucket with deduplicated music info", + default="acrcloud.music", ) parser.add_argument( "--station-name", @@ -444,7 +433,7 @@ def get_isrc(music): # all local vars are required, eight are already used for the csv entries # pylint: disable-msg=too-many-locals -def get_csv(data, station_name=""): +def get_csv(data, station_name="", minio_url=""): """Create SUISA compatible csv data. Arguments: @@ -498,6 +487,12 @@ def get_csv(data, station_name=""): try: music = metadata.get("music")[0] + url = f"{minio_url}{music.get('acrid')}" + resp = requests.get(url, timeout=10) + if resp.ok: + music = resp.json() + else: # pragma: no cover + raise RuntimeError(f"💀 failed to load data from {url}") except TypeError: music = metadata.get("custom_files")[0] title = music.get("title") @@ -574,7 +569,7 @@ def get_csv(data, station_name=""): return csv.getvalue() -def get_xlsx(data, station_name=""): +def get_xlsx(data, station_name="", minio_url=""): """Create SUISA compatible xlsx data. Arguments: @@ -583,7 +578,7 @@ def get_xlsx(data, station_name=""): Returns: xlsx: The converted data as BytesIO object """ - csv = get_csv(data, station_name=station_name) + csv = get_csv(data, station_name=station_name, minio_url=minio_url) csv_reader = reader(StringIO(csv)) xlsx = BytesIO() @@ -745,16 +740,18 @@ def main(): # pragma: no cover start_date, end_date = parse_date(args) filename = parse_filename(args, start_date) + minio_raw_url=f"{args.minio}/{args.minio_raw}/" + minio_music_url=f"{args.minio}/{args.minio_music}/" - client = ACRClient(bearer_token=args.bearer_token) + client = ACRClient(minio_url=minio_raw_url, timezone=args.timezone) data = client.get_interval_data( - args.project_id, args.stream_id, start_date, end_date, timezone=args.timezone + start_date, end_date ) data = merge_duplicates(data) if args.filetype == "xlsx": - data = get_xlsx(data, station_name=args.station_name) + data = get_xlsx(data, station_name=args.station_name, minio_url=minio_music_url) elif args.filetype == "csv": - data = get_csv(data, station_name=args.station_name) + data = get_csv(data, station_name=args.station_name, minio_url=minio_music_url) if args.email: email_subject = Template(args.email_subject).substitute( { diff --git a/tests/test_acrclient.py b/tests/test_acrclient.py index 1d57a529..e19be2d7 100644 --- a/tests/test_acrclient.py +++ b/tests/test_acrclient.py @@ -6,73 +6,61 @@ from suisa_sendemeldung import acrclient -_ACR_URL = "https://eu-api-v2.acrcloud.com/api/bm-cs-projects/project-id/streams/stream-id/results" +_MINIO_RAW_URL = "http://minio.example.com/acrcloud.raw/" def test_init(): """Test ACRClient.__init__.""" - bearer_token = "secret-key" with freeze_time("1993-03-02"): - acr = acrclient.ACRClient(bearer_token) + acr = acrclient.ACRClient(minio_url=_MINIO_RAW_URL) assert acr.default_date == date(1993, 3, 1) def test_get_data(): """Test ACRClient.get_data.""" - bearer_token = "secret-key" - project_id = "project-id" - stream_id = "stream-id" - data = {"data": [{"metadata": {"timestamp_utc": "1993-03-01 13:12:00"}}]} + data = [{"metadata": {"timestamp_utc": "1993-03-01 13:12:00"}}] with freeze_time("1993-03-02"): - acr = acrclient.ACRClient(bearer_token) + acr = acrclient.ACRClient(minio_url=_MINIO_RAW_URL) with requests_mock.Mocker() as mock: mock.get( - _ACR_URL, + f"{_MINIO_RAW_URL}1993-03-01.json", json=data, ) - acr.get_data(project_id, stream_id) + acr.get_data() def test_get_interval_data(): """Test ACRClient.get_interval_data.""" - bearer_token = "secret-key" - project_id = "project-id" - stream_id = "stream-id" - data = {"data": [{"metadata": {"timestamp_utc": "1993-03-01 13:12:00"}}]} + data = [{"metadata": {"timestamp_utc": "1993-03-01 13:12:00"}}] with freeze_time("1993-03-02"): - acr = acrclient.ACRClient(bearer_token) + acr = acrclient.ACRClient(minio_url=_MINIO_RAW_URL) with requests_mock.Mocker() as mock: mock.get( - _ACR_URL, + requests_mock.ANY, json=data, ) - acr.get_interval_data( - project_id, stream_id, date(1993, 3, 1), date(1993, 3, 31) - ) + acr.get_interval_data(date(1993, 3, 1), date(1993, 3, 31)) + # ahead of UTC with freeze_time("1993-03-02"): - acr = acrclient.ACRClient(bearer_token) + acr = acrclient.ACRClient(minio_url=_MINIO_RAW_URL, timezone="Europe/Zurich") with requests_mock.Mocker() as mock: - data["data"][0]["metadata"]["timestamp_utc"] = "1993-03-01 00:00:00" + data[0]["metadata"]["timestamp_utc"] = "1993-03-01 00:00:00" mock.get( - _ACR_URL, + requests_mock.ANY, json=data, ) - acr.get_interval_data( - project_id, stream_id, date(1993, 3, 1), date(1993, 3, 31), "Europe/Zurich" - ) + acr.get_interval_data(date(1993, 3, 1), date(1993, 3, 31)) # behind UTC with freeze_time("1993-03-02"): - acr = acrclient.ACRClient(bearer_token) + acr = acrclient.ACRClient(minio_url=_MINIO_RAW_URL, timezone="America/Nuuk") with requests_mock.Mocker() as mock: mock.get( - _ACR_URL, + requests_mock.ANY, json=data, ) - acr.get_interval_data( - project_id, stream_id, date(1993, 3, 1), date(1993, 3, 31), "America/Nuuk" - ) + acr.get_interval_data(date(1993, 3, 1), date(1993, 3, 31)) diff --git a/tests/test_suisa_sendemeldung.py b/tests/test_suisa_sendemeldung.py index bdb45335..733add59 100644 --- a/tests/test_suisa_sendemeldung.py +++ b/tests/test_suisa_sendemeldung.py @@ -3,7 +3,10 @@ from email.message import Message from io import BytesIO from unittest.mock import call, patch +from urllib.parse import urlparse +from pathlib import Path +import requests_mock from configargparse import ArgumentParser from freezegun import freeze_time from openpyxl import load_workbook @@ -16,10 +19,6 @@ def test_validate_arguments(): """Test validate_arguments.""" args = ArgumentParser() - # length of bearer_token should be 32 or more chars long - args.bearer_token = "iamclearlynotthirtytwocharslong" - # check length of stream_id - args.stream_id = "iamnot9chars" # one output option has to be set (but none is) args.file = False args.email = False @@ -31,8 +30,6 @@ def test_validate_arguments(): suisa_sendemeldung.validate_arguments(mock, args) mock.error.assert_called_once_with( "\n" - "- wrong format on bearer_token, expected larger than 32 characters but got 31\n" - "- wrong format on stream_id, expected 9 characters but got 12\n" "- no output option has been set, specify one of --file, --email or --stdout\n" "- argument --last_month not allowed with --start_date or --end_date" ) @@ -43,8 +40,6 @@ def test_validate_arguments(): suisa_sendemeldung.validate_arguments(mock, args) mock.error.assert_called_once_with( "\n" - "- wrong format on bearer_token, expected larger than 32 characters but got 31\n" - "- wrong format on stream_id, expected 9 characters but got 12\n" "- xlsx cannot be printed to stdout, please set --filetype to csv\n" "- argument --last_month not allowed with --start_date or --end_date" ) @@ -199,7 +194,11 @@ def test_get_csv(mock_cridlib_get): # empty data data = [] - csv = suisa_sendemeldung.get_csv(data) + with requests_mock.Mocker() as mock: + mock.get(requests_mock.ANY, json={}) + csv = suisa_sendemeldung.get_csv( + data, minio_url="http://minio.example.org/acrcloud.music/" + ) # pylint: disable=line-too-long assert csv == ( "Titel,Komponist,Interpret,Interpreten-Info,Sender,Sendedatum,Sendedauer,Sendezeit,Werkverzeichnisangaben,ISRC,Label,CD ID / Katalog-Nummer,Aufnahmedatum,Aufnahmeland,Erstveröffentlichungsdatum,Titel des Tonträgers (Albumtitel),Autor Text,Track Nummer,Genre,Programm,Bestellnummer,Marke,Label Code,EAN/GTIN,Identifikationsnummer\r\n" @@ -209,13 +208,69 @@ def test_get_csv(mock_cridlib_get): # bunch of data mock_cridlib_get.reset_mock() + musics = { + "a1": {"title": "Uhrenvergleich", "acrid": "a1"}, + "a2": { + "acrid": "a2", + "title": "Meme Dub", + "artist": "Da Gang", + "album": "album, but string", + "contributors": { + "composers": [ + "Da Composah", + ] + }, + "release_date": "2023", + "external_ids": {"isrc": "DEZ650710376"}, + }, + "a3": { + "acrid": "a3", + "title": "Bubbles", + "album": { + "name": "Da Alboom", + }, + "release_date": "2022-12-13", + "artists": [ + { + "name": "Mary's Surprise Act", + }, + { + "name": "Climmy Jiff", + }, + ], + "isrc": "DEZ650710376", + "label": "Jane Records", + "external_ids": { + "upc": "greedy-capitalist-number", + }, + }, + "a4": { + "acrid": "a4", + "artists": "Artists as string not list", + }, + "a5": {"title": "Long Playing", "acrid": "a5"}, + "a6": { + "title": "composer in works", + "acrid": "a6", + "works": [{"creators": [{"name": "Worker", "role": "W"}]}], + }, + "a7": { + "title": "composer better in works", + "artists": [{"name": "same"}], + "contributors": { + "composers": ["same"], + }, + "acrid": "a7", + "works": [{"creators": [{"name": "composer", "role": "C"}]}], + }, + } data = [ { "metadata": { "timestamp_local": "1993-03-01 13:12:00", "timestamp_utc": "1993-03-01 13:12:00", "played_duration": 60, - "music": [{"title": "Uhrenvergleich", "acrid": "a1"}], + "music": [musics.get("a1")], } }, { @@ -223,21 +278,7 @@ def test_get_csv(mock_cridlib_get): "timestamp_local": "1993-03-01 13:37:00", "timestamp_utc": "1993-03-01 13:37:00", "played_duration": 60, - "custom_files": [ - { - "acrid": "a2", - "title": "Meme Dub", - "artist": "Da Gang", - "album": "album, but string", - "contributors": { - "composers": [ - "Da Composah", - ] - }, - "release_date": "2023", - "external_ids": {"isrc": "DEZ650710376"}, - } - ], + "custom_files": [musics.get("a2")], } }, { @@ -245,29 +286,7 @@ def test_get_csv(mock_cridlib_get): "timestamp_local": "1993-03-01 16:20:00", "timestamp_utc": "1993-03-01 16:20:00", "played_duration": 60, - "music": [ - { - "acrid": "a3", - "title": "Bubbles", - "album": { - "name": "Da Alboom", - }, - "release_date": "2022-12-13", - "artists": [ - { - "name": "Mary's Surprise Act", - }, - { - "name": "Climmy Jiff", - }, - ], - "isrc": "DEZ650710376", - "label": "Jane Records", - "external_ids": { - "upc": "greedy-capitalist-number", - }, - } - ], + "music": [musics.get("a3")], } }, { @@ -275,12 +294,7 @@ def test_get_csv(mock_cridlib_get): "timestamp_local": "1993-03-01 17:17:17", "timestamp_utc": "1993-03-01 17:17:17", "played_duration": 60, - "custom_files": [ - { - "acrid": "a4", - "artists": "Artists as string not list", - } - ], + "custom_files": [musics.get("a4")], } }, { @@ -288,7 +302,7 @@ def test_get_csv(mock_cridlib_get): "timestamp_local": "1993-03-01 18:18:18", "timestamp_utc": "1993-03-01 18:18:18", "played_duration": 71337, - "music": [{"title": "Long Playing", "acrid": "a5"}], + "music": [musics.get("a5")], } }, { @@ -296,13 +310,7 @@ def test_get_csv(mock_cridlib_get): "timestamp_local": "1993-03-01 18:18:18", "timestamp_utc": "1993-03-01 18:18:18", "played_duration": 71337, - "music": [ - { - "title": "composer in works", - "acrid": "a6", - "works": [{"creators": [{"name": "Worker", "role": "W"}]}], - } - ], + "music": [musics.get("a6")], } }, { @@ -310,21 +318,20 @@ def test_get_csv(mock_cridlib_get): "timestamp_local": "1993-03-01 18:18:18", "timestamp_utc": "1993-03-01 18:18:18", "played_duration": 71337, - "music": [ - { - "title": "composer better in works", - "artists": [{"name": "same"}], - "contributors": { - "composers": ["same"], - }, - "acrid": "a6", - "works": [{"creators": [{"name": "composer", "role": "C"}]}], - } - ], + "music": [musics.get("a7")], } }, ] - csv = suisa_sendemeldung.get_csv(data, station_name="Station Name") + with requests_mock.Mocker() as mock: + mock.get( + requests_mock.ANY, + json=lambda req, _: musics.get(Path(urlparse(req.url).path).stem, {}), + ) + csv = suisa_sendemeldung.get_csv( + data, + station_name="Station Name", + minio_url="http://minio.example.org/acrcloud.music/", + ) # pylint: disable=line-too-long assert csv == ( "Titel,Komponist,Interpret,Interpreten-Info,Sender,Sendedatum,Sendedauer,Sendezeit,Werkverzeichnisangaben,ISRC,Label,CD ID / Katalog-Nummer,Aufnahmedatum,Aufnahmeland,Erstveröffentlichungsdatum,Titel des Tonträgers (Albumtitel),Autor Text,Track Nummer,Genre,Programm,Bestellnummer,Marke,Label Code,EAN/GTIN,Identifikationsnummer\r\n"