From 9243578757244484d37023f55aed7fec8b3077f7 Mon Sep 17 00:00:00 2001
From: Lucas <hairmare@rabe.ch>
Date: Mon, 29 May 2023 22:06:01 +0200
Subject: [PATCH] feat: use data from minio hosted s3 buckets

---
 requirements.txt                         |   1 -
 suisa_sendemeldung/acrclient.py          |  52 ++++----
 suisa_sendemeldung/suisa_sendemeldung.py |  65 +++++-----
 tests/test_acrclient.py                  |  48 +++----
 tests/test_suisa_sendemeldung.py         | 155 ++++++++++++-----------
 5 files changed, 155 insertions(+), 166 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index a264f2da..26fe5ef5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,3 @@
-acrclient==0.3.0
 ConfigArgParse==1.5.3
 iso3901==0.3.0.post1
 openpyxl==3.1.2
diff --git a/suisa_sendemeldung/acrclient.py b/suisa_sendemeldung/acrclient.py
index e9bfcd99..a195c71f 100644
--- a/suisa_sendemeldung/acrclient.py
+++ b/suisa_sendemeldung/acrclient.py
@@ -1,16 +1,20 @@
 """module containing the ACRCloud client."""
 from datetime import date, datetime, timedelta
+import logging
 
 import pytz
-from acrclient import Client
+import requests
 from tqdm import tqdm
 
 
-class ACRClient(Client):
-    """ACRCloud client to fetch metadata.
+logger = logging.getLogger(__name__)
+
+class ACRClient:
+    """Fetches cached metadata from MinIO.
 
     Args:
-        bearer_token: The bearer token for ACRCloud.
+        minio_url: URL to a public MinIO bucket containing raw per-day JSON files.
+        timezone (optional): The timezone to use for localization.
     """
 
     # format of timestamp in api answer
@@ -18,63 +22,57 @@ class ACRClient(Client):
     # timezone of ACRCloud
     ACR_TIMEZONE = "UTC"
 
-    def __init__(self, bearer_token, base_url="https://eu-api-v2.acrcloud.com"):
-        super().__init__(bearer_token=bearer_token, base_url=base_url)
+    def __init__(self, minio_url: str, timezone=ACR_TIMEZONE):
+        self.minio_url = minio_url
+        self.timezone = timezone
         self.default_date = date.today() - timedelta(days=1)
 
     def get_data(
-        self, project_id, stream_id, requested_date=None, timezone=ACR_TIMEZONE
+        self, requested_date=None
     ):
-        """Fetch metadata from ACRCloud for `stream_id`.
+        """Fetch ACRCloud metadata from MinIO.
 
         Args:
-            project_id: The Project ID of the stream.
-            stream_id: The ID of the stream.
             requested_date (optional): The date of the entries you want (default: yesterday).
-            timezone (optional): The timezone to use for localization.
 
         Returns:
             json: The ACR data from date
         """
         if requested_date is None:
             requested_date = self.default_date
-        data = self.get_bm_cs_projects_results(
-            project_id=project_id,
-            stream_id=stream_id,
-            params={
-                "date": requested_date.strftime("%Y%m%d"),
-            },
-        )
+        url = f"{self.minio_url.rstrip('/')}/{requested_date.strftime('%Y-%m-%d')}.json"
+        resp = requests.get(url, timeout=10)
+        if resp.ok:
+            data = resp.json()
+        else:  # pragma: no cover
+            raise RuntimeError(f"💀 failed to load data from {url}")
         for entry in data:
             metadata = entry.get("metadata")
             ts_utc = pytz.utc.localize(
                 datetime.strptime(metadata.get("timestamp_utc"), ACRClient.TS_FMT)
             )
-            ts_local = ts_utc.astimezone(pytz.timezone(timezone))
+            ts_local = ts_utc.astimezone(pytz.timezone(self.timezone))
             metadata.update({"timestamp_local": ts_local.strftime(ACRClient.TS_FMT)})
 
         return data
 
     def get_interval_data(
-        self, project_id, stream_id, start, end, timezone=ACR_TIMEZONE
-    ):  # pylint: disable-msg=too-many-locals,too-many-arguments
+        self, start, end
+    ):
         """Get data specified by interval from start to end.
 
         Args:
-            project_id: The ID of the project.
-            stream_id: The ID of the stream.
             start: The start date of the interval.
             end: The end date of the interval.
-            timezone (optional): will be passed to `get_data()`.
 
         Returns:
             json: The ACR data from start to end.
         """
         trim = False
         # if we have to localize the timestamps we may need more data
-        if timezone != ACRClient.ACR_TIMEZONE:
+        if self.timezone != ACRClient.ACR_TIMEZONE:
             # compute utc offset
-            offset = pytz.timezone(timezone).utcoffset(datetime.now())
+            offset = pytz.timezone(self.timezone).utcoffset(datetime.now())
             # decrease start by 1 day if we're ahead of utc
             if offset > timedelta(seconds=1):
                 computed_start = start - timedelta(days=1)
@@ -99,7 +97,7 @@ def get_interval_data(
         ljust_amount: int = 27
         for ptr in tqdm(dates, desc="load ACRCloud data".ljust(ljust_amount)):
             data += self.get_data(
-                project_id, stream_id, requested_date=ptr, timezone=timezone
+                requested_date=ptr
             )
 
         # if timestamps are localized we will have to removed the unneeded entries.
diff --git a/suisa_sendemeldung/suisa_sendemeldung.py b/suisa_sendemeldung/suisa_sendemeldung.py
index 2eef757a..9748f904 100644
--- a/suisa_sendemeldung/suisa_sendemeldung.py
+++ b/suisa_sendemeldung/suisa_sendemeldung.py
@@ -20,6 +20,7 @@
 
 import cridlib
 import pytz
+import requests
 from babel.dates import format_date
 from configargparse import ArgumentParser
 from dateutil.relativedelta import relativedelta
@@ -89,21 +90,6 @@ def validate_arguments(parser, args):
         args: the arguments to validate
     """
     msgs = []
-    # check length of bearer_token
-    if not len(args.bearer_token) >= 32:
-        msgs.append(
-            "".join(
-                (
-                    "wrong format on bearer_token, ",
-                    f"expected larger than 32 characters but got {len(args.bearer_token)}",
-                )
-            )
-        )
-    # check length of stream_id
-    if not len(args.stream_id) == 9:
-        msgs.append(
-            f"wrong format on stream_id, expected 9 characters but got {len(args.stream_id)}"
-        )
     # one output option has to be set
     if not (args.file or args.email or args.stdout):
         msgs.append(
@@ -130,22 +116,25 @@ def get_arguments(parser: ArgumentParser):  # pragma: no cover
         args: the parsed args from the parser
     """
     parser.add_argument(
-        "--bearer-token",
-        env_var="BEARER_TOKEN",
-        help="the bearer token for ACRCloud (required)",
-        required=True,
+        "--minio",
+        dest="minio",
+        env_var="MINIO",
+        help="URL to MinIO",
+        default="https://minio.service.int.rabe.ch:9000",
     )
     parser.add_argument(
-        "--project-id",
-        env_var="PROJECT_ID",
-        help="the id of the project at ACRCloud (required)",
-        required=True,
+        "--minio-raw-bucket",
+        dest="minio_raw",
+        env_var="MINIO_RAW_BUCKET",
+        help="world readable bucket with daily data exports from ACRCloud",
+        default="acrcloud.raw",
     )
     parser.add_argument(
-        "--stream-id",
-        env_var="STREAM_ID",
-        help="the id of the stream at ACRCloud (required)",
-        required=True,
+        "--minio-music-bucket",
+        dest="minio_music",
+        env_var="MINIO_MUSIC_BUCKET",
+        help="world readable bucket with deduplicated music info",
+        default="acrcloud.music",
     )
     parser.add_argument(
         "--station-name",
@@ -444,7 +433,7 @@ def get_isrc(music):
 
 # all local vars are required, eight are already used for the csv entries
 # pylint: disable-msg=too-many-locals
-def get_csv(data, station_name=""):
+def get_csv(data, station_name="", minio_url=""):
     """Create SUISA compatible csv data.
 
     Arguments:
@@ -498,6 +487,12 @@ def get_csv(data, station_name=""):
 
         try:
             music = metadata.get("music")[0]
+            if minio_url:  # no bucket configured: keep the inline ACRCloud entry
+                url = f"{minio_url}{music.get('acrid')}"
+                resp = requests.get(url, timeout=10)
+                if not resp.ok:  # pragma: no cover
+                    raise RuntimeError(f"💀 failed to load data from {url}")
+                music = resp.json()
         except TypeError:
             music = metadata.get("custom_files")[0]
         title = music.get("title")
@@ -574,7 +569,7 @@ def get_csv(data, station_name=""):
     return csv.getvalue()
 
 
-def get_xlsx(data, station_name=""):
+def get_xlsx(data, station_name="", minio_url=""):
     """Create SUISA compatible xlsx data.
 
     Arguments:
@@ -583,7 +578,7 @@ def get_xlsx(data, station_name=""):
     Returns:
         xlsx: The converted data as BytesIO object
     """
-    csv = get_csv(data, station_name=station_name)
+    csv = get_csv(data, station_name=station_name, minio_url=minio_url)
     csv_reader = reader(StringIO(csv))
 
     xlsx = BytesIO()
@@ -745,16 +740,18 @@ def main():  # pragma: no cover
 
     start_date, end_date = parse_date(args)
     filename = parse_filename(args, start_date)
+    minio_raw_url = f"{args.minio.rstrip('/')}/{args.minio_raw}/"
+    minio_music_url = f"{args.minio.rstrip('/')}/{args.minio_music}/"
 
-    client = ACRClient(bearer_token=args.bearer_token)
+    client = ACRClient(minio_url=minio_raw_url, timezone=args.timezone)
     data = client.get_interval_data(
-        args.project_id, args.stream_id, start_date, end_date, timezone=args.timezone
+        start_date, end_date
     )
     data = merge_duplicates(data)
     if args.filetype == "xlsx":
-        data = get_xlsx(data, station_name=args.station_name)
+        data = get_xlsx(data, station_name=args.station_name, minio_url=minio_music_url)
     elif args.filetype == "csv":
-        data = get_csv(data, station_name=args.station_name)
+        data = get_csv(data, station_name=args.station_name, minio_url=minio_music_url)
     if args.email:
         email_subject = Template(args.email_subject).substitute(
             {
diff --git a/tests/test_acrclient.py b/tests/test_acrclient.py
index 1d57a529..e19be2d7 100644
--- a/tests/test_acrclient.py
+++ b/tests/test_acrclient.py
@@ -6,73 +6,61 @@
 
 from suisa_sendemeldung import acrclient
 
-_ACR_URL = "https://eu-api-v2.acrcloud.com/api/bm-cs-projects/project-id/streams/stream-id/results"
+_MINIO_RAW_URL = "http://minio.example.com/acrcloud.raw/"
 
 
 def test_init():
     """Test ACRClient.__init__."""
-    bearer_token = "secret-key"
     with freeze_time("1993-03-02"):
-        acr = acrclient.ACRClient(bearer_token)
+        acr = acrclient.ACRClient(minio_url=_MINIO_RAW_URL)
 
     assert acr.default_date == date(1993, 3, 1)
 
 
 def test_get_data():
     """Test ACRClient.get_data."""
-    bearer_token = "secret-key"
-    project_id = "project-id"
-    stream_id = "stream-id"
-    data = {"data": [{"metadata": {"timestamp_utc": "1993-03-01 13:12:00"}}]}
+    data = [{"metadata": {"timestamp_utc": "1993-03-01 13:12:00"}}]
     with freeze_time("1993-03-02"):
-        acr = acrclient.ACRClient(bearer_token)
+        acr = acrclient.ACRClient(minio_url=_MINIO_RAW_URL)
     with requests_mock.Mocker() as mock:
         mock.get(
-            _ACR_URL,
+            f"{_MINIO_RAW_URL}1993-03-01.json",
             json=data,
         )
-        acr.get_data(project_id, stream_id)
+        acr.get_data()
 
 
 def test_get_interval_data():
     """Test ACRClient.get_interval_data."""
-    bearer_token = "secret-key"
-    project_id = "project-id"
-    stream_id = "stream-id"
-    data = {"data": [{"metadata": {"timestamp_utc": "1993-03-01 13:12:00"}}]}
+    data = [{"metadata": {"timestamp_utc": "1993-03-01 13:12:00"}}]
 
     with freeze_time("1993-03-02"):
-        acr = acrclient.ACRClient(bearer_token)
+        acr = acrclient.ACRClient(minio_url=_MINIO_RAW_URL)
     with requests_mock.Mocker() as mock:
         mock.get(
-            _ACR_URL,
+            requests_mock.ANY,
             json=data,
         )
-        acr.get_interval_data(
-            project_id, stream_id, date(1993, 3, 1), date(1993, 3, 31)
-        )
+        acr.get_interval_data(date(1993, 3, 1), date(1993, 3, 31))
+
 
     # ahead of UTC
     with freeze_time("1993-03-02"):
-        acr = acrclient.ACRClient(bearer_token)
+        acr = acrclient.ACRClient(minio_url=_MINIO_RAW_URL, timezone="Europe/Zurich")
     with requests_mock.Mocker() as mock:
-        data["data"][0]["metadata"]["timestamp_utc"] = "1993-03-01 00:00:00"
+        data[0]["metadata"]["timestamp_utc"] = "1993-03-01 00:00:00"
         mock.get(
-            _ACR_URL,
+            requests_mock.ANY,
             json=data,
         )
-        acr.get_interval_data(
-            project_id, stream_id, date(1993, 3, 1), date(1993, 3, 31), "Europe/Zurich"
-        )
+        acr.get_interval_data(date(1993, 3, 1), date(1993, 3, 31))
 
     # behind UTC
     with freeze_time("1993-03-02"):
-        acr = acrclient.ACRClient(bearer_token)
+        acr = acrclient.ACRClient(minio_url=_MINIO_RAW_URL, timezone="America/Nuuk")
     with requests_mock.Mocker() as mock:
         mock.get(
-            _ACR_URL,
+            requests_mock.ANY,
             json=data,
         )
-        acr.get_interval_data(
-            project_id, stream_id, date(1993, 3, 1), date(1993, 3, 31), "America/Nuuk"
-        )
+        acr.get_interval_data(date(1993, 3, 1), date(1993, 3, 31))
diff --git a/tests/test_suisa_sendemeldung.py b/tests/test_suisa_sendemeldung.py
index bdb45335..733add59 100644
--- a/tests/test_suisa_sendemeldung.py
+++ b/tests/test_suisa_sendemeldung.py
@@ -3,7 +3,10 @@
 from email.message import Message
 from io import BytesIO
 from unittest.mock import call, patch
+from urllib.parse import urlparse
+from pathlib import Path
 
+import requests_mock
 from configargparse import ArgumentParser
 from freezegun import freeze_time
 from openpyxl import load_workbook
@@ -16,10 +19,6 @@ def test_validate_arguments():
     """Test validate_arguments."""
 
     args = ArgumentParser()
-    # length of bearer_token should be 32 or more chars long
-    args.bearer_token = "iamclearlynotthirtytwocharslong"
-    # check length of stream_id
-    args.stream_id = "iamnot9chars"
     # one output option has to be set (but none is)
     args.file = False
     args.email = False
@@ -31,8 +30,6 @@ def test_validate_arguments():
         suisa_sendemeldung.validate_arguments(mock, args)
         mock.error.assert_called_once_with(
             "\n"
-            "- wrong format on bearer_token, expected larger than 32 characters but got 31\n"
-            "- wrong format on stream_id, expected 9 characters but got 12\n"
             "- no output option has been set, specify one of --file, --email or --stdout\n"
             "- argument --last_month not allowed with --start_date or --end_date"
         )
@@ -43,8 +40,6 @@ def test_validate_arguments():
         suisa_sendemeldung.validate_arguments(mock, args)
         mock.error.assert_called_once_with(
             "\n"
-            "- wrong format on bearer_token, expected larger than 32 characters but got 31\n"
-            "- wrong format on stream_id, expected 9 characters but got 12\n"
             "- xlsx cannot be printed to stdout, please set --filetype to csv\n"
             "- argument --last_month not allowed with --start_date or --end_date"
         )
@@ -199,7 +194,11 @@ def test_get_csv(mock_cridlib_get):
 
     # empty data
     data = []
-    csv = suisa_sendemeldung.get_csv(data)
+    with requests_mock.Mocker() as mock:
+        mock.get(requests_mock.ANY, json={})
+        csv = suisa_sendemeldung.get_csv(
+            data, minio_url="http://minio.example.org/acrcloud.music/"
+        )
     # pylint: disable=line-too-long
     assert csv == (
         "Titel,Komponist,Interpret,Interpreten-Info,Sender,Sendedatum,Sendedauer,Sendezeit,Werkverzeichnisangaben,ISRC,Label,CD ID / Katalog-Nummer,Aufnahmedatum,Aufnahmeland,Erstveröffentlichungsdatum,Titel des Tonträgers (Albumtitel),Autor Text,Track Nummer,Genre,Programm,Bestellnummer,Marke,Label Code,EAN/GTIN,Identifikationsnummer\r\n"
@@ -209,13 +208,69 @@ def test_get_csv(mock_cridlib_get):
 
     # bunch of data
     mock_cridlib_get.reset_mock()
+    musics = {
+        "a1": {"title": "Uhrenvergleich", "acrid": "a1"},
+        "a2": {
+            "acrid": "a2",
+            "title": "Meme Dub",
+            "artist": "Da Gang",
+            "album": "album, but string",
+            "contributors": {
+                "composers": [
+                    "Da Composah",
+                ]
+            },
+            "release_date": "2023",
+            "external_ids": {"isrc": "DEZ650710376"},
+        },
+        "a3": {
+            "acrid": "a3",
+            "title": "Bubbles",
+            "album": {
+                "name": "Da Alboom",
+            },
+            "release_date": "2022-12-13",
+            "artists": [
+                {
+                    "name": "Mary's Surprise Act",
+                },
+                {
+                    "name": "Climmy Jiff",
+                },
+            ],
+            "isrc": "DEZ650710376",
+            "label": "Jane Records",
+            "external_ids": {
+                "upc": "greedy-capitalist-number",
+            },
+        },
+        "a4": {
+            "acrid": "a4",
+            "artists": "Artists as string not list",
+        },
+        "a5": {"title": "Long Playing", "acrid": "a5"},
+        "a6": {
+            "title": "composer in works",
+            "acrid": "a6",
+            "works": [{"creators": [{"name": "Worker", "role": "W"}]}],
+        },
+        "a7": {
+            "title": "composer better in works",
+            "artists": [{"name": "same"}],
+            "contributors": {
+                "composers": ["same"],
+            },
+            "acrid": "a7",
+            "works": [{"creators": [{"name": "composer", "role": "C"}]}],
+        },
+    }
     data = [
         {
             "metadata": {
                 "timestamp_local": "1993-03-01 13:12:00",
                 "timestamp_utc": "1993-03-01 13:12:00",
                 "played_duration": 60,
-                "music": [{"title": "Uhrenvergleich", "acrid": "a1"}],
+                "music": [musics.get("a1")],
             }
         },
         {
@@ -223,21 +278,7 @@ def test_get_csv(mock_cridlib_get):
                 "timestamp_local": "1993-03-01 13:37:00",
                 "timestamp_utc": "1993-03-01 13:37:00",
                 "played_duration": 60,
-                "custom_files": [
-                    {
-                        "acrid": "a2",
-                        "title": "Meme Dub",
-                        "artist": "Da Gang",
-                        "album": "album, but string",
-                        "contributors": {
-                            "composers": [
-                                "Da Composah",
-                            ]
-                        },
-                        "release_date": "2023",
-                        "external_ids": {"isrc": "DEZ650710376"},
-                    }
-                ],
+                "custom_files": [musics.get("a2")],
             }
         },
         {
@@ -245,29 +286,7 @@ def test_get_csv(mock_cridlib_get):
                 "timestamp_local": "1993-03-01 16:20:00",
                 "timestamp_utc": "1993-03-01 16:20:00",
                 "played_duration": 60,
-                "music": [
-                    {
-                        "acrid": "a3",
-                        "title": "Bubbles",
-                        "album": {
-                            "name": "Da Alboom",
-                        },
-                        "release_date": "2022-12-13",
-                        "artists": [
-                            {
-                                "name": "Mary's Surprise Act",
-                            },
-                            {
-                                "name": "Climmy Jiff",
-                            },
-                        ],
-                        "isrc": "DEZ650710376",
-                        "label": "Jane Records",
-                        "external_ids": {
-                            "upc": "greedy-capitalist-number",
-                        },
-                    }
-                ],
+                "music": [musics.get("a3")],
             }
         },
         {
@@ -275,12 +294,7 @@ def test_get_csv(mock_cridlib_get):
                 "timestamp_local": "1993-03-01 17:17:17",
                 "timestamp_utc": "1993-03-01 17:17:17",
                 "played_duration": 60,
-                "custom_files": [
-                    {
-                        "acrid": "a4",
-                        "artists": "Artists as string not list",
-                    }
-                ],
+                "custom_files": [musics.get("a4")],
             }
         },
         {
@@ -288,7 +302,7 @@ def test_get_csv(mock_cridlib_get):
                 "timestamp_local": "1993-03-01 18:18:18",
                 "timestamp_utc": "1993-03-01 18:18:18",
                 "played_duration": 71337,
-                "music": [{"title": "Long Playing", "acrid": "a5"}],
+                "music": [musics.get("a5")],
             }
         },
         {
@@ -296,13 +310,7 @@ def test_get_csv(mock_cridlib_get):
                 "timestamp_local": "1993-03-01 18:18:18",
                 "timestamp_utc": "1993-03-01 18:18:18",
                 "played_duration": 71337,
-                "music": [
-                    {
-                        "title": "composer in works",
-                        "acrid": "a6",
-                        "works": [{"creators": [{"name": "Worker", "role": "W"}]}],
-                    }
-                ],
+                "music": [musics.get("a6")],
             }
         },
         {
@@ -310,21 +318,20 @@ def test_get_csv(mock_cridlib_get):
                 "timestamp_local": "1993-03-01 18:18:18",
                 "timestamp_utc": "1993-03-01 18:18:18",
                 "played_duration": 71337,
-                "music": [
-                    {
-                        "title": "composer better in works",
-                        "artists": [{"name": "same"}],
-                        "contributors": {
-                            "composers": ["same"],
-                        },
-                        "acrid": "a6",
-                        "works": [{"creators": [{"name": "composer", "role": "C"}]}],
-                    }
-                ],
+                "music": [musics.get("a7")],
             }
         },
     ]
-    csv = suisa_sendemeldung.get_csv(data, station_name="Station Name")
+    with requests_mock.Mocker() as mock:
+        mock.get(
+            requests_mock.ANY,
+            json=lambda req, _: musics.get(Path(urlparse(req.url).path).stem, {}),
+        )
+        csv = suisa_sendemeldung.get_csv(
+            data,
+            station_name="Station Name",
+            minio_url="http://minio.example.org/acrcloud.music/",
+        )
     # pylint: disable=line-too-long
     assert csv == (
         "Titel,Komponist,Interpret,Interpreten-Info,Sender,Sendedatum,Sendedauer,Sendezeit,Werkverzeichnisangaben,ISRC,Label,CD ID / Katalog-Nummer,Aufnahmedatum,Aufnahmeland,Erstveröffentlichungsdatum,Titel des Tonträgers (Albumtitel),Autor Text,Track Nummer,Genre,Programm,Bestellnummer,Marke,Label Code,EAN/GTIN,Identifikationsnummer\r\n"