feat: use data from minio hosted s3 buckets
hairmare committed May 29, 2023
1 parent 3dcb741 commit 9243578
Showing 5 changed files with 155 additions and 166 deletions.
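At a glance, the change replaces direct ACRCloud API access with reads from two world-readable MinIO buckets. A rough sketch of the object layout implied by the new defaults and f-strings below; the date and the acrid placeholder are illustrative, not taken from the commit:

# Assumed bucket layout, derived from the new --minio* defaults; values are examples only.
MINIO = "https://minio.service.int.rabe.ch:9000"

# one JSON file per day with raw recognition results, fetched by ACRClient.get_data()
raw_day_url = f"{MINIO}/acrcloud.raw/2023-05-28.json"

# one JSON document per recognized track, keyed by acrid, fetched in get_csv()
music_url = f"{MINIO}/acrcloud.music/<acrid>"  # <acrid> is a placeholder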
1 change: 0 additions & 1 deletion requirements.txt
@@ -1,4 +1,3 @@
-acrclient==0.3.0
 ConfigArgParse==1.5.3
 iso3901==0.3.0.post1
 openpyxl==3.1.2
52 changes: 25 additions & 27 deletions suisa_sendemeldung/acrclient.py
@@ -1,80 +1,78 @@
 """module containing the ACRCloud client."""
 from datetime import date, datetime, timedelta
+import logging
 
 import pytz
-from acrclient import Client
+import requests
 from tqdm import tqdm
 
 
-class ACRClient(Client):
-    """ACRCloud client to fetch metadata.
+logger = logging.getLogger(__name__)
+
+class ACRClient:
+    """Fetches cached metadata from MinIO.
     Args:
-        bearer_token: The bearer token for ACRCloud.
+        minio_url: URL to a public MinIO bucket containing raw per-day JSON files.
+        timezone (optional): The timezone to use for localization.
     """
 
     # format of timestamp in api answer
     TS_FMT = "%Y-%m-%d %H:%M:%S"
     # timezone of ACRCloud
    ACR_TIMEZONE = "UTC"
 
-    def __init__(self, bearer_token, base_url="https://eu-api-v2.acrcloud.com"):
-        super().__init__(bearer_token=bearer_token, base_url=base_url)
+    def __init__(self, minio_url: str, timezone=ACR_TIMEZONE):
+        self.minio_url = minio_url
+        self.timezone = timezone
         self.default_date = date.today() - timedelta(days=1)
 
     def get_data(
-        self, project_id, stream_id, requested_date=None, timezone=ACR_TIMEZONE
+        self, requested_date=None
     ):
-        """Fetch metadata from ACRCloud for `stream_id`.
+        """Fetch ACRCloud metadata from MinIO.
         Args:
-            project_id: The Project ID of the stream.
-            stream_id: The ID of the stream.
             requested_date (optional): The date of the entries you want (default: yesterday).
-            timezone (optional): The timezone to use for localization.
         Returns:
            json: The ACR data from date
        """
        if requested_date is None:
            requested_date = self.default_date
-        data = self.get_bm_cs_projects_results(
-            project_id=project_id,
-            stream_id=stream_id,
-            params={
-                "date": requested_date.strftime("%Y%m%d"),
-            },
-        )
+        url = f"{self.minio_url}{requested_date.strftime('%Y-%m-%d')}.json"
+        resp = requests.get(url, timeout=10)
+        if resp.ok:
+            data = resp.json()
+        else:  # pragma: no cover
+            raise RuntimeError(f"💀 failed to load data from {url}")
        for entry in data:
            metadata = entry.get("metadata")
            ts_utc = pytz.utc.localize(
                datetime.strptime(metadata.get("timestamp_utc"), ACRClient.TS_FMT)
            )
-            ts_local = ts_utc.astimezone(pytz.timezone(timezone))
+            ts_local = ts_utc.astimezone(pytz.timezone(self.timezone))
            metadata.update({"timestamp_local": ts_local.strftime(ACRClient.TS_FMT)})
 
        return data
 
    def get_interval_data(
-        self, project_id, stream_id, start, end, timezone=ACR_TIMEZONE
-    ):  # pylint: disable-msg=too-many-locals,too-many-arguments
+        self, start, end
+    ):
        """Get data specified by interval from start to end.
        Args:
-            project_id: The ID of the project.
-            stream_id: The ID of the stream.
            start: The start date of the interval.
            end: The end date of the interval.
-            timezone (optional): will be passed to `get_data()`.
        Returns:
            json: The ACR data from start to end.
        """
        trim = False
        # if we have to localize the timestamps we may need more data
-        if timezone != ACRClient.ACR_TIMEZONE:
+        if self.timezone != ACRClient.ACR_TIMEZONE:
            # compute utc offset
-            offset = pytz.timezone(timezone).utcoffset(datetime.now())
+            offset = pytz.timezone(self.timezone).utcoffset(datetime.now())
            # decrease start by 1 day if we're ahead of utc
            if offset > timedelta(seconds=1):
                computed_start = start - timedelta(days=1)
@@ -99,7 +97,7 @@ def get_interval_data(
        ljust_amount: int = 27
        for ptr in tqdm(dates, desc="load ACRCloud data".ljust(ljust_amount)):
            data += self.get_data(
-                project_id, stream_id, requested_date=ptr, timezone=timezone
+                requested_date=ptr
            )
 
        # if timestamps are localized we will have to removed the unneeded entries.
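Put together, the rewritten client is used roughly like this. A minimal sketch, assuming a raw bucket that serves one YYYY-MM-DD.json object per day; the bucket URL below is made up, the real one is assembled in main() from the new --minio flags:

from datetime import date

from suisa_sendemeldung.acrclient import ACRClient

# hypothetical raw-bucket URL; it must end with a slash so get_data() can append "<date>.json"
client = ACRClient(
    minio_url="https://minio.example.org:9000/acrcloud.raw/",
    timezone="Europe/Zurich",
)

yesterday = client.get_data()  # GET <minio_url><yesterday>.json
march = client.get_interval_data(date(2023, 3, 1), date(2023, 3, 31))  # one GET per day, timestamps localized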
65 changes: 31 additions & 34 deletions suisa_sendemeldung/suisa_sendemeldung.py
@@ -20,6 +20,7 @@
 
 import cridlib
 import pytz
+import requests
 from babel.dates import format_date
 from configargparse import ArgumentParser
 from dateutil.relativedelta import relativedelta
@@ -89,21 +90,6 @@ def validate_arguments(parser, args):
        args: the arguments to validate
    """
    msgs = []
-    # check length of bearer_token
-    if not len(args.bearer_token) >= 32:
-        msgs.append(
-            "".join(
-                (
-                    "wrong format on bearer_token, ",
-                    f"expected larger than 32 characters but got {len(args.bearer_token)}",
-                )
-            )
-        )
-    # check length of stream_id
-    if not len(args.stream_id) == 9:
-        msgs.append(
-            f"wrong format on stream_id, expected 9 characters but got {len(args.stream_id)}"
-        )
    # one output option has to be set
    if not (args.file or args.email or args.stdout):
        msgs.append(
@@ -130,22 +116,25 @@ def get_arguments(parser: ArgumentParser):  # pragma: no cover
        args: the parsed args from the parser
    """
    parser.add_argument(
-        "--bearer-token",
-        env_var="BEARER_TOKEN",
-        help="the bearer token for ACRCloud (required)",
-        required=True,
+        "--minio",
+        dest="minio",
+        env_var="MINIO",
+        help="URL to MinIO",
+        default="https://minio.service.int.rabe.ch:9000",
    )
    parser.add_argument(
-        "--project-id",
-        env_var="PROJECT_ID",
-        help="the id of the project at ACRCloud (required)",
-        required=True,
+        "--minio-raw-bucket",
+        dest="minio_raw",
+        env_var="MINIO_RAW_BUCKET",
+        help="world readable bucket with daily data exports from ACRCloud",
+        default="acrcloud.raw",
    )
    parser.add_argument(
-        "--stream-id",
-        env_var="STREAM_ID",
-        help="the id of the stream at ACRCloud (required)",
-        required=True,
+        "--minio-music-bucket",
+        dest="minio_music",
+        env_var="MINIO_MUSIC_BUCKET",
+        help="world readable bucket with deduplicated music info",
+        default="acrcloud.music",
    )
    parser.add_argument(
        "--station-name",
@@ -444,7 +433,7 @@ def get_isrc(music):
 
 # all local vars are required, eight are already used for the csv entries
 # pylint: disable-msg=too-many-locals
-def get_csv(data, station_name=""):
+def get_csv(data, station_name="", minio_url=""):
    """Create SUISA compatible csv data.
    Arguments:
@@ -498,6 +487,12 @@ def get_csv(data, station_name=""):
 
        try:
            music = metadata.get("music")[0]
+            url = f"{minio_url}{music.get('acrid')}"
+            resp = requests.get(url, timeout=10)
+            if resp.ok:
+                music = resp.json()
+            else:  # pragma: no cover
+                raise RuntimeError(f"💀 failed to load data from {url}")
        except TypeError:
            music = metadata.get("custom_files")[0]
        title = music.get("title")
@@ -574,7 +569,7 @@ def get_csv(data, station_name=""):
    return csv.getvalue()
 
 
-def get_xlsx(data, station_name=""):
+def get_xlsx(data, station_name="", minio_url=""):
    """Create SUISA compatible xlsx data.
    Arguments:
@@ -583,7 +578,7 @@ def get_xlsx(data, station_name=""):
    Returns:
        xlsx: The converted data as BytesIO object
    """
-    csv = get_csv(data, station_name=station_name)
+    csv = get_csv(data, station_name=station_name, minio_url=minio_url)
    csv_reader = reader(StringIO(csv))
 
    xlsx = BytesIO()
@@ -745,16 +740,18 @@ def main():  # pragma: no cover
 
    start_date, end_date = parse_date(args)
    filename = parse_filename(args, start_date)
+    minio_raw_url = f"{args.minio}/{args.minio_raw}/"
+    minio_music_url = f"{args.minio}/{args.minio_music}/"
 
-    client = ACRClient(bearer_token=args.bearer_token)
+    client = ACRClient(minio_url=minio_raw_url, timezone=args.timezone)
    data = client.get_interval_data(
-        args.project_id, args.stream_id, start_date, end_date, timezone=args.timezone
+        start_date, end_date
    )
    data = merge_duplicates(data)
    if args.filetype == "xlsx":
-        data = get_xlsx(data, station_name=args.station_name)
+        data = get_xlsx(data, station_name=args.station_name, minio_url=minio_music_url)
    elif args.filetype == "csv":
-        data = get_csv(data, station_name=args.station_name)
+        data = get_csv(data, station_name=args.station_name, minio_url=minio_music_url)
    if args.email:
        email_subject = Template(args.email_subject).substitute(
            {
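The wiring in main() then boils down to the following flow. This is a condensed sketch rather than the verbatim code, with illustrative bucket URLs and station name:

from datetime import date

from suisa_sendemeldung.acrclient import ACRClient
from suisa_sendemeldung.suisa_sendemeldung import get_csv, merge_duplicates

# assembled from --minio, --minio-raw-bucket and --minio-music-bucket in the real code
minio_raw_url = "https://minio.service.int.rabe.ch:9000/acrcloud.raw/"
minio_music_url = "https://minio.service.int.rabe.ch:9000/acrcloud.music/"

client = ACRClient(minio_url=minio_raw_url, timezone="Europe/Zurich")
data = client.get_interval_data(date(2023, 5, 1), date(2023, 5, 28))
data = merge_duplicates(data)

# get_csv() now resolves each entry's acrid against the music bucket (GET <minio_music_url><acrid>)
report = get_csv(data, station_name="Radio Example", minio_url=minio_music_url)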
48 changes: 18 additions & 30 deletions tests/test_acrclient.py
@@ -6,73 +6,61 @@
 
 from suisa_sendemeldung import acrclient
 
-_ACR_URL = "https://eu-api-v2.acrcloud.com/api/bm-cs-projects/project-id/streams/stream-id/results"
+_MINIO_RAW_URL = "http://minio.example.com/acrcloud.raw/"
 
 
 def test_init():
    """Test ACRClient.__init__."""
-    bearer_token = "secret-key"
    with freeze_time("1993-03-02"):
-        acr = acrclient.ACRClient(bearer_token)
+        acr = acrclient.ACRClient(minio_url=_MINIO_RAW_URL)
 
    assert acr.default_date == date(1993, 3, 1)
 
 
 def test_get_data():
    """Test ACRClient.get_data."""
-    bearer_token = "secret-key"
-    project_id = "project-id"
-    stream_id = "stream-id"
-    data = {"data": [{"metadata": {"timestamp_utc": "1993-03-01 13:12:00"}}]}
+    data = [{"metadata": {"timestamp_utc": "1993-03-01 13:12:00"}}]
    with freeze_time("1993-03-02"):
-        acr = acrclient.ACRClient(bearer_token)
+        acr = acrclient.ACRClient(minio_url=_MINIO_RAW_URL)
        with requests_mock.Mocker() as mock:
            mock.get(
-                _ACR_URL,
+                f"{_MINIO_RAW_URL}1993-03-01.json",
                json=data,
            )
-            acr.get_data(project_id, stream_id)
+            acr.get_data()
 
 
 def test_get_interval_data():
    """Test ACRClient.get_interval_data."""
-    bearer_token = "secret-key"
-    project_id = "project-id"
-    stream_id = "stream-id"
-    data = {"data": [{"metadata": {"timestamp_utc": "1993-03-01 13:12:00"}}]}
+    data = [{"metadata": {"timestamp_utc": "1993-03-01 13:12:00"}}]
 
    with freeze_time("1993-03-02"):
-        acr = acrclient.ACRClient(bearer_token)
+        acr = acrclient.ACRClient(minio_url=_MINIO_RAW_URL)
        with requests_mock.Mocker() as mock:
            mock.get(
-                _ACR_URL,
+                requests_mock.ANY,
                json=data,
            )
-            acr.get_interval_data(
-                project_id, stream_id, date(1993, 3, 1), date(1993, 3, 31)
-            )
+            acr.get_interval_data(date(1993, 3, 1), date(1993, 3, 31))
 
 
    # ahead of UTC
    with freeze_time("1993-03-02"):
-        acr = acrclient.ACRClient(bearer_token)
+        acr = acrclient.ACRClient(minio_url=_MINIO_RAW_URL, timezone="Europe/Zurich")
        with requests_mock.Mocker() as mock:
-            data["data"][0]["metadata"]["timestamp_utc"] = "1993-03-01 00:00:00"
+            data[0]["metadata"]["timestamp_utc"] = "1993-03-01 00:00:00"
            mock.get(
-                _ACR_URL,
+                requests_mock.ANY,
                json=data,
            )
-            acr.get_interval_data(
-                project_id, stream_id, date(1993, 3, 1), date(1993, 3, 31), "Europe/Zurich"
-            )
+            acr.get_interval_data(date(1993, 3, 1), date(1993, 3, 31))
 
    # behind UTC
    with freeze_time("1993-03-02"):
-        acr = acrclient.ACRClient(bearer_token)
+        acr = acrclient.ACRClient(minio_url=_MINIO_RAW_URL, timezone="America/Nuuk")
        with requests_mock.Mocker() as mock:
            mock.get(
-                _ACR_URL,
+                requests_mock.ANY,
                json=data,
            )
-            acr.get_interval_data(
-                project_id, stream_id, date(1993, 3, 1), date(1993, 3, 31), "America/Nuuk"
-            )
+            acr.get_interval_data(date(1993, 3, 1), date(1993, 3, 31))
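The else branch in the new get_data() (a non-OK bucket response) is marked # pragma: no cover and stays untested here. If one wanted to exercise it anyway, a possible extra test, written as a standalone sketch in the style of this file and assuming pytest as the runner, might look like:

import pytest
import requests_mock
from freezegun import freeze_time

from suisa_sendemeldung import acrclient

_MINIO_RAW_URL = "http://minio.example.com/acrcloud.raw/"


def test_get_data_failure():
    """Sketch: get_data() raises when the per-day object cannot be fetched."""
    with freeze_time("1993-03-02"):
        acr = acrclient.ACRClient(minio_url=_MINIO_RAW_URL)
        with requests_mock.Mocker() as mock:
            # default_date is 1993-03-01 under the frozen clock; serve a 404 for that object
            mock.get(f"{_MINIO_RAW_URL}1993-03-01.json", status_code=404)
            with pytest.raises(RuntimeError):
                acr.get_data()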