Skip to content

Commit

Permalink
2M setup
Browse files Browse the repository at this point in the history
  • Loading branch information
stelais committed Jul 25, 2024
1 parent d5c753b commit 06b61b9
Show file tree
Hide file tree
Showing 6 changed files with 273 additions and 63 deletions.
96 changes: 96 additions & 0 deletions microlensing/moa_data_interface_2M.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
"""
Code for interacting with MOA light curve files and metadata.
"""
from __future__ import annotations

from collections import defaultdict
from pathlib import Path
from typing import ClassVar

import pandas as pd
import requests
from bs4 import BeautifulSoup


class MoaDataInterface2M:
    """
    A class for interacting with MOA light curve files and metadata.
    """

    # Default locations of the metadata table and the light curve files. Kept as overridable
    # class-level constants instead of hard-coded literals buried inside the property.
    nine_year_events_table_path: ClassVar[Path] = Path(
        "/local/data/fugu3/sishitan/qusi_project/qusi/data/moa_microlensing_550k/candlist_2023Oct12.txt"
    )
    light_curve_root_directory_path: ClassVar[Path] = Path(
        "/local/data/fugu3/sishitan/qusi_project/merida/data/microlensing_2M"
    )

    # Lazily-populated cache backing the `survey_tag_to_path_list_dictionary` property.
    survey_tag_to_path_list_dictionary_: dict[str, list[Path]] | None = None
    # Sentinel tag assigned to events that do not appear in the events table.
    no_tag_string = "no_tag"
    all_survey_tags: ClassVar[list[str]] = ["c", "cf", "cp", "cw", "cs", "cb", "v", "n", "nr", "m", "j", no_tag_string]

    @property
    def survey_tag_to_path_list_dictionary(self) -> dict[str, list[Path]]:
        """
        Property allowing the survey tag to path list dictionary to only be loaded once.

        :return: The survey tag to path list dictionary.
        """
        if self.survey_tag_to_path_list_dictionary_ is None:
            takahiro_sumi_nine_year_events_data_frame = self.read_takahiro_sumi_nine_year_events_table_as_data_frame(
                self.nine_year_events_table_path
            )
            self.survey_tag_to_path_list_dictionary_ = self.group_paths_by_tag_in_events_data_frame(
                list(self.light_curve_root_directory_path.glob("**/*.feather")),
                takahiro_sumi_nine_year_events_data_frame,
            )
        return self.survey_tag_to_path_list_dictionary_

    @staticmethod
    def read_takahiro_sumi_nine_year_events_table_as_data_frame(path: Path) -> pd.DataFrame:
        """
        Reads Takahiro Sumi's 9-year events table as a Pandas data frame.

        :param path: The path to the events table file.
        :return: The data frame, indexed by (field, clr, chip, subfield, id).
        """
        named_column_names = ["field", "clr", "chip", "subfield", "id", "tag", "x", "y"]
        # The number of columns in the file are inconsistent, so here we add extra unnamed columns to match the
        # largest number of columns in any row.
        largest_column_count = 33
        unnamed_column_names = [f"unnamed{index}" for index in range(largest_column_count - len(named_column_names))]
        column_names = named_column_names + unnamed_column_names
        # `sep=r"\s+"` replaces `delim_whitespace=True`, which is deprecated since pandas 2.1
        # and removed in pandas 3.0; the two are equivalent.
        data_frame = pd.read_csv(
            path, comment="#", names=column_names, sep=r"\s+", skipinitialspace=True, skiprows=23
        )
        data_frame = data_frame.set_index(["field", "clr", "chip", "subfield", "id"], drop=False)
        data_frame = data_frame.sort_index()
        return data_frame

    def get_tag_for_path_from_data_frame(self, path: Path, events_data_frame: pd.DataFrame) -> str:
        """
        Gets the event tag of a light curve from the events data frame.

        :param path: The path of the light curve whose event tag should be retrieved.
        :param events_data_frame: Takahiro Sumi's 9-year events data frame.
        :return: The string of the tag of the event, or `no_tag_string` if no tag exists.
        """
        file_name = path.name
        file_name_without_extension = file_name.split(".")[0]
        moa_identifier = file_name_without_extension.split("_")[-1]  # Remove duplicate identifier string.
        field, clr, chip_string, subfield_string, id_string = moa_identifier.split("-")
        chip, subfield, id_ = int(chip_string), int(subfield_string), int(id_string)
        # EAFP: a missing index entry means the event is untagged.
        try:
            row = events_data_frame.loc[(field, clr, chip, subfield, id_)]
        except KeyError:
            return self.no_tag_string
        tag = row["tag"]
        return tag

    def group_paths_by_tag_in_events_data_frame(
        self, paths: list[Path], events_data_frame: pd.DataFrame
    ) -> dict[str, list[Path]]:
        """
        Groups paths into a dictionary based on their tags.

        :param paths: The paths to group.
        :param events_data_frame: The events data frame to look for a tag in.
        :return: A dictionary mapping each tag to the list of paths carrying that tag.
        """
        tag_path_list_dictionary = defaultdict(list)
        for path in paths:
            tag = self.get_tag_for_path_from_data_frame(path, events_data_frame)
            tag_path_list_dictionary[tag].append(path)
        return tag_path_list_dictionary
78 changes: 19 additions & 59 deletions microlensing/moa_dataset_2M.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,28 @@
from functools import partial

import numpy as np
import pandas as pd

from moa_data_interface_2M import MoaDataInterface2M
from moa_survey_light_curve_collection_2M import MoaSurveyLightCurveCollection2M
from ramjet.photometric_database.standard_and_injected_light_curve_database import StandardAndInjectedLightCurveDatabase

from qusi.light_curve_collection import LabeledLightCurveCollection
from qusi.light_curve_dataset import LightCurveDataset, default_light_curve_observation_post_injection_transform
from qusi.light_curve_collection import LightCurveCollection



def positive_label_function(path):
    """Return the positive (microlensing) class label; *path* is ignored."""
    del path  # The label is constant for every light curve in this collection.
    return 1


def negative_label_function(path):
    """Return the negative (non-microlensing) class label; *path* is ignored."""
    del path  # The label is constant for every light curve in this collection.
    return 0

class MoaSurveyMicrolensingAndNonMicrolensingDatabase(StandardAndInjectedLightCurveDatabase):
class MoaSurveyMicrolensingAndNonMicrolensingDatabase2M(StandardAndInjectedLightCurveDatabase):
"""
A class for a database of MOA light curves including non-microlensing, and microlensing collections.
"""
moa_data_interface_2M = MoaDataInterface2M()

def __init__(self, test_split: int):
super().__init__()
Expand All @@ -30,35 +31,39 @@ def __init__(self, test_split: int):
train_splits.remove(validation_split)
train_splits.remove(test_split)

self.negative_training = MoaSurveyLightCurveCollection(
survey_tags=['v', 'n', 'nr', 'm', 'j', 'no_tag'],
# Note that the NN has number_of_splits: int = 10 already set.
# Creating the training collection | splits [0, 1, 2, 3, 4, 5, 6, 7] = 80% of the data
self.negative_training = MoaSurveyLightCurveCollection2M(
survey_tags=['v', 'n', 'nr', 'm', 'j', self.moa_data_interface_2M.no_tag_string],
label=0,
dataset_splits=train_splits)
self.positive_training = MoaSurveyLightCurveCollection(
self.positive_training = MoaSurveyLightCurveCollection2M(
survey_tags=['c', 'cf', 'cp', 'cw', 'cs', 'cb'],
label=1,
dataset_splits=train_splits)

self.negative_validation = MoaSurveyLightCurveCollection(
survey_tags=['v', 'n', 'nr', 'm', 'j', 'no_tag'],
# Creating the validation collection | split [8] = 10% of the data
self.negative_validation = MoaSurveyLightCurveCollection2M(
survey_tags=['v', 'n', 'nr', 'm', 'j', self.moa_data_interface_2M.no_tag_string],
label=0,
dataset_splits=[validation_split])
self.positive_validation = MoaSurveyLightCurveCollection(
self.positive_validation = MoaSurveyLightCurveCollection2M(
survey_tags=['c', 'cf', 'cp', 'cw', 'cs', 'cb'],
label=1,
dataset_splits=[validation_split])

self.negative_inference = MoaSurveyLightCurveCollection(
survey_tags=['v', 'n', 'nr', 'm', 'j', 'no_tag'],
# Creating the inference collection | split [9] = 10% of the data
self.negative_inference = MoaSurveyLightCurveCollection2M(
survey_tags=['v', 'n', 'nr', 'm', 'j', self.moa_data_interface_2M.no_tag_string],
label=0,
dataset_splits=[test_split])
self.positive_inference = MoaSurveyLightCurveCollection(
self.positive_inference = MoaSurveyLightCurveCollection2M(
survey_tags=['c', 'cf', 'cp', 'cw', 'cs', 'cb'],
label=1,
dataset_splits=[test_split])
self.all_inference = MoaSurveyLightCurveCollection(
self.all_inference = MoaSurveyLightCurveCollection2M(
survey_tags=['c', 'cf', 'cp', 'cw', 'cs', 'cb',
'v', 'n', 'nr', 'm', 'j', 'no_tag'],
'v', 'n', 'nr', 'm', 'j', self.moa_data_interface_2M.no_tag_string],
label=np.nan,
dataset_splits=[test_split])

Expand All @@ -77,7 +82,6 @@ def get_microlensing_train_dataset(self):
negative_train_light_curve_collection],
post_injection_transform=partial(
default_light_curve_observation_post_injection_transform, length=18_000))
# print('check "properties" of the train_light_curve_dataset', train_light_curve_dataset)
return train_light_curve_dataset

def get_microlensing_validation_dataset(self):
Expand All @@ -101,47 +105,3 @@ def get_microlensing_infer_collection(self):
get_paths_function=self.all_inference.get_paths,
load_times_and_fluxes_from_path_function=self.all_inference.load_times_and_fluxes_from_path)
return infer_light_curve_collection

class MoaSurveyLightCurveCollection(LightCurveCollection):
    """
    A collection of light curves based on the MOA 9-year survey.
    """

    def __init__(
        self,
        survey_tags: list[str],
        dataset_splits: list[int] | None = None,
        label: float | list[float] | np.ndarray | None = None,
    ):
        """
        :param survey_tags: The survey tags selecting which events belong to this collection.
        :param dataset_splits: The dataset splits to keep; None keeps all splits.
        :param label: The label (or labels) assigned to the collection's light curves.
        """
        super().__init__()
        self.label = label
        self.survey_tags: list[str] = survey_tags
        self.dataset_splits: list[int] | None = dataset_splits

    def get_paths(self):
        """
        Gets the paths for the light curves in the collection.
        :return: An iterable of the light curve paths.
        """
        # NOTE(review): `self.moa_data_interface` is never defined on this class — presumably it
        # was meant to be a class attribute holding a MOA data interface instance (compare
        # MoaSurveyLightCurveCollection2M, which defines `moa_data_interface = MoaDataInterface2M()`).
        # As written, this method raises AttributeError when called — verify before reuse.
        paths: list[Path] = []
        for tag in self.survey_tags:
            tag_paths = self.moa_data_interface.survey_tag_to_path_list_dictionary[tag]
            if self.dataset_splits is not None:
                # Split on each tag, so that the splitting remains across collections with different tag selections.
                tag_paths = self.shuffle_and_split_paths(tag_paths, self.dataset_splits)
            paths.extend(tag_paths)
        return paths

    def load_times_and_fluxes_from_path(self, path) -> (np.ndarray, np.ndarray):
        """
        Loads the times and fluxes from a given light curve path.
        :param path: The path to the light curve file.
        :return: The times and the fluxes of the light curve.
        """
        # Feather files store the light curve with HJD timestamps and flux measurements.
        light_curve_dataframe = pd.read_feather(path)
        times = light_curve_dataframe["HJD"].values
        fluxes = light_curve_dataframe["flux"].values
        return times, fluxes

2 changes: 1 addition & 1 deletion microlensing/moa_infer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import torch
import pandas as pd

from moa_dataset import MoaSurveyMicrolensingAndNonMicrolensingDatabase
from moa_dataset_550 import MoaSurveyMicrolensingAndNonMicrolensingDatabase
from functools import partial
from qusi.finite_standard_light_curve_dataset import FiniteStandardLightCurveDataset
from qusi.light_curve_dataset import default_light_curve_observation_post_injection_transform, \
Expand Down
108 changes: 108 additions & 0 deletions microlensing/moa_survey_light_curve_collection_2M.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
from __future__ import annotations

import re
import shutil
import socket
from pathlib import Path
from typing import TYPE_CHECKING

import pandas as pd
import scipy.stats
from filelock import FileLock

from moa_data_interface_2M import MoaDataInterface2M
from ramjet.photometric_database.light_curve_collection import LightCurveCollection

if TYPE_CHECKING:
from collections.abc import Iterable

import numpy as np


class MoaSurveyLightCurveCollection2M(LightCurveCollection):
    """
    A collection of light curves based on the MOA 9-year survey.
    """

    # Shared interface instance; the tag-to-path dictionary it caches is loaded once and reused
    # by every collection instance.
    moa_data_interface = MoaDataInterface2M()

    def __init__(
        self,
        survey_tags: list[str],
        dataset_splits: list[int] | None = None,
        label: float | list[float] | np.ndarray | None = None,
    ):
        """
        :param survey_tags: The survey tags selecting which events belong to this collection.
        :param dataset_splits: The dataset splits to keep; None keeps all splits.
        :param label: The label (or labels) assigned to the collection's light curves.
        """
        super().__init__()
        self.label = label
        self.survey_tags: list[str] = survey_tags
        self.dataset_splits: list[int] | None = dataset_splits

    def get_paths(self) -> Iterable[Path]:
        """
        Gets the paths for the light curves in the collection.

        :return: An iterable of the light curve paths.
        """
        paths: list[Path] = []
        for tag in self.survey_tags:
            tag_paths = self.moa_data_interface.survey_tag_to_path_list_dictionary[tag]
            if self.dataset_splits is not None:
                # Split on each tag, so that the splitting remains across collections with different tag selections.
                tag_paths = self.shuffle_and_split_paths(tag_paths, self.dataset_splits)
            paths.extend(tag_paths)
        return paths

    def move_path_to_nvme(self, path: Path) -> Path:
        """
        Copies the file to node-local scratch space when running on a `gpuNNN` host and returns
        the local path; on any other host, returns the original path unchanged.

        :param path: The path of the file to (possibly) relocate.
        :return: The path the file should be read from.
        """
        match = re.match(r"gpu\d{3}", socket.gethostname())
        if match is not None:
            nvme_path = Path("/lscratch/golmsche").joinpath(path)
            if not nvme_path.exists():
                nvme_path.parent.mkdir(exist_ok=True, parents=True)
                # A file lock guards against concurrent workers copying the same file.
                nvme_lock_path = nvme_path.parent.joinpath(nvme_path.name + ".lock")
                lock = FileLock(str(nvme_lock_path))
                with lock.acquire():
                    if not nvme_path.exists():
                        # Copy to a temporary name, then rename, so readers never observe a
                        # partially-written file.
                        nvme_tmp_path = nvme_path.parent.joinpath(nvme_path.name + ".tmp")
                        shutil.copy(path, nvme_tmp_path)
                        nvme_tmp_path.rename(nvme_path)
            return nvme_path
        return path

    def load_times_and_fluxes_from_path(self, path: Path) -> tuple[np.ndarray, np.ndarray]:
        """
        Loads the times and fluxes from a given light curve path.

        :param path: The path to the light curve file.
        :return: The times and the fluxes of the light curve.
        """
        path = self.move_path_to_nvme(path)
        light_curve_dataframe = pd.read_feather(path)
        times = light_curve_dataframe["HJD"].values
        fluxes = light_curve_dataframe["flux"].values
        return times, fluxes

    def load_times_and_magnifications_from_path(self, path: Path) -> tuple[np.ndarray, np.ndarray]:
        """
        Loads the times and magnifications from a given path as an injectable signal.

        :param path: The path to the light curve/signal file.
        :return: The times and the magnifications of the light curve/signal.
        """
        path = self.move_path_to_nvme(path)
        times, fluxes = self.load_times_and_fluxes_from_path(path)
        magnifications, times = self.generate_synthetic_signal_from_real_data(fluxes, times)
        return times, magnifications

    @staticmethod
    def generate_synthetic_signal_from_real_data(fluxes: np.ndarray, times: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
        """
        Takes real light curve data and converts it to a form that can be used for synthetic light curve injection.

        :param fluxes: The real light curve fluxes.
        :param times: The real light curve times.
        :return: Fake synthetic magnifications and times.
        """
        # NOTE(review): a constant flux series has a median absolute deviation of zero, which
        # would divide by zero here — confirm upstream data never produces constant light curves.
        flux_median_absolute_deviation = scipy.stats.median_abs_deviation(fluxes)
        normalized_fluxes = (fluxes / flux_median_absolute_deviation) * 0.25
        return normalized_fluxes, times
5 changes: 2 additions & 3 deletions microlensing/moa_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
from qusi.train_logging_configuration import TrainLoggingConfiguration
from qusi.train_session import train_session

from torchmetrics.classification import (BinaryAccuracy, BinaryAUROC, BinaryF1Score, BinarySpecificity,
BinaryStatScores)
from torchmetrics.classification import (BinaryAccuracy, BinaryAUROC, BinaryF1Score, BinarySpecificity)

from moa_dataset_550 import MoaSurveyMicrolensingAndNonMicrolensingDatabase
from wrapped_metrics import WrappedBinaryPrecision, WrappedBinaryRecall
Expand All @@ -13,7 +12,7 @@

def main(test_split):
# WAND
logging_configuration = TrainLoggingConfiguration.new(wandb_project='qusi_moa', wandb_entity='ramjet')
logging_configuration = TrainLoggingConfiguration.new(wandb_project='qusi_moa_2M', wandb_entity='ramjet')

# Database
database = MoaSurveyMicrolensingAndNonMicrolensingDatabase(test_split=test_split)
Expand Down
Loading

0 comments on commit 06b61b9

Please sign in to comment.