From 88188fe96cf26f512a5daa0f49881675bf34b019 Mon Sep 17 00:00:00 2001
From: Steve Berardi <6608085+steveberardi@users.noreply.github.com>
Date: Wed, 24 Jul 2024 06:20:52 -0700
Subject: [PATCH] clean up

---
 scripts/bigsky_stars.py     | 52 -------------------------
 src/starplot/data/bigsky.py | 78 +++++++++++++++++++++----------
 src/starplot/data/stars.py  | 21 ++--------
 3 files changed, 49 insertions(+), 102 deletions(-)
 delete mode 100644 scripts/bigsky_stars.py

diff --git a/scripts/bigsky_stars.py b/scripts/bigsky_stars.py
deleted file mode 100644
index 1d4f738..0000000
--- a/scripts/bigsky_stars.py
+++ /dev/null
@@ -1,52 +0,0 @@
-import pandas as pd
-
-DIGITS = 4
-
-df = pd.read_csv(
-    # "raw/bigsky.stars.csv.gz",
-    "raw/bigsky.stars.mag11.csv.gz",
-    header=0,
-    names=[
-        "tyc_id",
-        "hip_id",
-        "ccdm",
-        "magnitude",
-        "bv",
-        "ra_degrees_j2000",
-        "dec_degrees_j2000",
-        "ra_mas_per_year",
-        "dec_mas_per_year",
-        "parallax_mas",
-        # "hip",
-        # "magnitude",
-        # "ra_hours",
-        # "dec_degrees",
-        # "parallax_mas",
-        # "ra_mas_per_year",
-        # "dec_mas_per_year",
-        # "bv",
-    ],
-    compression="gzip",
-)
-
-df["ra_hours"] = df.apply(lambda row: round(row.ra_degrees_j2000 / 15, DIGITS), axis=1)
-
-df = df.assign(
-    # hip=df["hip_id"],
-    # ra_degrees=df["ra_degrees_j2000"],
-    # dec_degrees=df["dec_degrees_j2000"],
-    epoch_year=2000,
-)
-
-df = df.rename(
-    columns={
-        "hip_id": "hip",
-        "ra_degrees_j2000": "ra_degrees",
-        "dec_degrees_j2000": "dec_degrees",
-    }
-)
-
-print(df)
-
-# df.to_parquet("temp/stars.bigsky.parquet", compression="gzip")
-df.to_parquet("temp/stars.bigsky.mag11.parquet", compression="gzip")
diff --git a/src/starplot/data/bigsky.py b/src/starplot/data/bigsky.py
index 965bee6..286d450 100644
--- a/src/starplot/data/bigsky.py
+++ b/src/starplot/data/bigsky.py
@@ -1,51 +1,54 @@
-import sys
 import os
-import requests
 
-from starplot.data import DATA_PATH, DataFiles
+import pandas as pd
+
+from starplot.data import DATA_PATH, DataFiles, utils
 
 BIG_SKY_VERSION = "0.1.0"
 
-BIG_SKY_URL = f"https://github.com/steveberardi/bigsky/releases/download/v{BIG_SKY_VERSION}/bigsky.stars.csv.gz"
+BIG_SKY_FILENAME = "bigsky.stars.csv.gz"
+
+BIG_SKY_URL = f"https://github.com/steveberardi/bigsky/releases/download/v{BIG_SKY_VERSION}/{BIG_SKY_FILENAME}"
 
-DOWNLOADED_PATH = DATA_PATH / "bigsky.stars.csv.gz"
+DOWNLOADED_PATH = DATA_PATH / BIG_SKY_FILENAME
 
 DIGITS = 4
 
-# TODO : refactor this to make it re-usable for different filenames
-# TODO : delete the SCRIPT for this in scripts/
+BIG_SKY_ASSETS = {
+    DataFiles.BIG_SKY: "bigsky.stars.csv.gz",
+    DataFiles.BIG_SKY_MAG11: "bigsky.stars.mag11.csv.gz",
+}
 
 
-def download():
-    with open(DOWNLOADED_PATH, "wb") as f:
-        print("Downloading Big Sky Catalog...")
-        response = requests.get(BIG_SKY_URL, stream=True)
-        total_size = response.headers.get("content-length")
+def url(filename: str, version: str):
+    return f"https://github.com/steveberardi/bigsky/releases/download/v{version}/{filename}"
 
-        if total_size is None:
-            f.write(response.content)
-            return
-
-        bytes_written = 0
-        total_size = int(total_size)
-        for chunk in response.iter_content(chunk_size=4096):
-            bytes_written += len(chunk)
-            f.write(chunk)
-            progress = int(25 * bytes_written / total_size)
-            sys.stdout.write("\r[%s%s]" % ("=" * progress, " " * (25 - progress)))
-            sys.stdout.flush()
-
-        print("Download complete!")
+def download(
+    filename: str = BIG_SKY_FILENAME,
+    version: str = BIG_SKY_VERSION,
+    download_path: str = None,
+    digits: int = 4,
+):
+    download_path = download_path or str(DATA_PATH / filename)
+    utils.download(
+        url(filename, version),
+        download_path,
+        "Big Sky Star Catalog",
+    )
+    to_parquet(
+        download_path,
+        DataFiles.BIG_SKY,
+        digits,
+    )
 
 
-def to_parquet():
-    import pandas as pd
+def to_parquet(source_path: str, destination_path: str, digits: int = DIGITS):
     print("Preparing Big Sky Catalog for Starplot...")
     df = pd.read_csv(
-        DOWNLOADED_PATH,
+        source_path,
         header=0,
         names=[
            "tyc_id",
@@ -63,7 +66,7 @@ def to_parquet():
     )
 
     df["ra_hours"] = df.apply(
-        lambda row: round(row.ra_degrees_j2000 / 15, DIGITS), axis=1
+        lambda row: round(row.ra_degrees_j2000 / 15, digits), axis=1
     )
 
     df = df.assign(epoch_year=2000)
@@ -76,8 +79,17 @@ def to_parquet():
         }
     )
 
-    df.to_parquet(DataFiles.BIG_SKY, compression="gzip")
+    df.to_parquet(destination_path, compression="gzip")
+
+
+def load(path):
+    if not exists(path):
+        download(filename=BIG_SKY_ASSETS.get(path))
+
+    df = pd.read_parquet(path)
+
+    return df.set_index("tyc_id")
 
 
-def exists() -> bool:
-    return os.path.isfile(DataFiles.BIG_SKY)
+def exists(path) -> bool:
+    return os.path.isfile(path)
diff --git a/src/starplot/data/stars.py b/src/starplot/data/stars.py
index 4a32752..2df0ae2 100644
--- a/src/starplot/data/stars.py
+++ b/src/starplot/data/stars.py
@@ -124,32 +124,19 @@ class StarCatalog(str, Enum):
     """Big Sky Catalog ~ 2.5M stars"""
 
 
-def load_hipparcos():
-    return read_parquet(DataFiles.HIPPARCOS)
-
-
-def load_bigsky_mag11():
-    df = read_parquet(DataFiles.BIG_SKY_MAG11)
-
-    return df.set_index("tyc_id")
-
-
 def load_bigsky():
     if not bigsky.exists():
         bigsky.download()
-        bigsky.to_parquet()
-
-    df = read_parquet(DataFiles.BIG_SKY)
 
-    return df.set_index("tyc_id")
+    return bigsky.load(DataFiles.BIG_SKY)
 
 
 def load(catalog: StarCatalog = StarCatalog.HIPPARCOS):
     if catalog == StarCatalog.HIPPARCOS:
-        return load_hipparcos()
+        return read_parquet(DataFiles.HIPPARCOS)
     elif catalog == StarCatalog.BIG_SKY_MAG11:
-        return load_bigsky_mag11()
+        return bigsky.load(DataFiles.BIG_SKY_MAG11)
     elif catalog == StarCatalog.BIG_SKY:
-        return load_bigsky()
+        return bigsky.load(DataFiles.BIG_SKY)
     else:
         raise ValueError("Unrecognized star catalog.")
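
Usage sketch (illustrative, not part of the patch): after this change, star data
is loaded through starplot.data.stars.load(), which dispatches to bigsky.load()
for the Big Sky catalogs, and bigsky.load() downloads and converts the CSV
release asset on first use. The module paths, StarCatalog members, and DataFiles
entries below come from the diff; the call pattern itself is an assumption about
how the refactored API is intended to be used.

    from starplot.data import stars
    from starplot.data.stars import StarCatalog

    # Hipparcos is read straight from the bundled parquet file
    hip = stars.load(StarCatalog.HIPPARCOS)

    # Big Sky (mag 11 subset) is downloaded and converted to parquet on
    # first use by bigsky.load(), then returned indexed by Tycho id ("tyc_id")
    mag11 = stars.load(StarCatalog.BIG_SKY_MAG11)

    print(len(hip), len(mag11))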