Skip to content

Commit

Permalink
Merge pull request #32 from francois-drielsma/develop
Browse files Browse the repository at this point in the history
Significant changes ahead of release 0.2.0
  • Loading branch information
francois-drielsma authored Nov 18, 2024
2 parents 30083b4 + d09435b commit 84a81ce
Show file tree
Hide file tree
Showing 126 changed files with 4,908 additions and 1,788 deletions.
2 changes: 1 addition & 1 deletion bin/larcv_check_valid.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def main(source, source_list, output):
(set(keys_list[idx]) != set(all_keys))):
print(f"- Bad file: {file_path}")
out_file.write(f'{file_path}\n')
bad_files += file_path
bad_files.append(file_path)

suffix = ':' if len(bad_files) > 0 else '.'
print(f"\nFound {len(bad_files)} bad files{suffix}")
Expand Down
54 changes: 49 additions & 5 deletions spine/ana/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,19 @@ class AnaBase(ABC):
units : str
Units in which the coordinates are expressed
"""

# Name of the analysis script (as specified in the configuration)
name = None

# Alternative allowed names of the analysis script
aliases = ()
keys = None

# Units in which the analysis script expects objects to be expressed in
units = 'cm'

# Set of data keys needed for this analysis script to operate
_keys = ()

# List of recognized object types
_obj_types = ('fragment', 'particle', 'interaction')

Expand Down Expand Up @@ -58,9 +66,7 @@ def __init__(self, obj_type=None, run_mode=None, append=False,
Name to prefix every output CSV file with
"""
# Initialize default keys
if self.keys is None:
self.keys = {}
self.keys.update({
self.update_keys({
'index': True, 'file_index': True,
'file_entry_index': False, 'run_info': False
})
Expand Down Expand Up @@ -104,7 +110,9 @@ def __init__(self, obj_type=None, run_mode=None, append=False,
self.obj_keys = (self.fragment_keys
+ self.particle_keys
+ self.interaction_keys)
self.keys.update({k:True for k in self.obj_keys})

# Update underlying keys, if needed
self.update_keys({k:True for k in self.obj_keys})

# Store the append flag
self.append_file = append
Expand Down Expand Up @@ -136,6 +144,42 @@ def initialize_writer(self, name):
file_name, append=self.append_file,
overwrite=self.overwrite_file)

@property
def keys(self):
    """Mapping from data product key to whether it is required.

    Each (key, necessity) pair determines which data keys are needed
    (`True`) or merely optional (`False`) for this analysis script to run.

    Returns
    -------
    Dict[str, bool]
        Dictionary of (key, necessity) pairs to be used
    """
    # Rebuild a fresh, mutable dictionary from the immutable tuple store
    return {key: needed for key, needed in self._keys}

@keys.setter
def keys(self, keys):
    """Store the (key, necessity) mapping as an immutable tuple.

    Parameters
    ----------
    keys : Dict[str, bool]
        Dictionary of (key, necessity) pairs to be used
    """
    # Freeze the dictionary into a tuple of pairs so that the stored
    # default cannot be mutated in place by accident
    self._keys = tuple((key, needed) for key, needed in keys.items())

def update_keys(self, update_dict):
    """Update the underlying set of keys and their necessity in place.

    Parameters
    ----------
    update_dict : Dict[str, bool]
        Dictionary of (key, necessity) pairs to update the keys with
    """
    # Nothing to do when there is no update to apply
    if not update_dict:
        return

    # Merge the update into a mutable copy of the mapping, then freeze
    # it back into the immutable tuple representation
    merged = self.keys
    merged.update(update_dict)
    self._keys = tuple(merged.items())

def get_base_dict(self, data):
"""Builds the entry information dictionary.
Expand Down
10 changes: 10 additions & 0 deletions spine/ana/diag/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
'''Diagnostic analysis scripts.
This submodule is used to run basic diagnostics analyses such as:
- Track dE/dx profile
- Track energy reconstruction
- Shower start dE/dx
- ...
'''

from .shower import *
64 changes: 64 additions & 0 deletions spine/ana/diag/shower.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
'''Module to evaluate diagnostic metrics on showers.'''

from spine.ana.base import AnaBase

__all__ = ['ShowerStartDEdxAna']


class ShowerStartDEdxAna(AnaBase):
    """This analysis script computes the dE/dx value within some distance
    from the start point of an EM shower object.

    This is a useful diagnostic tool to evaluate the calorimetric separability
    of different EM shower types (electron vs photon), which are expected to
    have different dE/dx patterns near their start point.
    """

    # Name of the analysis script (as specified in the configuration)
    name = 'shower_start_dedx'

    def __init__(self, radius, obj_type='particle', run_mode='both',
                 truth_point_mode='points', truth_dep_mode='depositions',
                 **kwargs):
        """Initialize the analysis script.

        Parameters
        ----------
        radius : Union[float, List[float]]
            Radius around the start point for which to evaluate dE/dx
        obj_type : str, default 'particle'
            Type of object(s) this analysis script operates on
        run_mode : str, default 'both'
            Whether to run on reconstructed objects, truth objects or both
        truth_point_mode : str, default 'points'
            Point attribute to use when processing truth objects
        truth_dep_mode : str, default 'depositions'
            Deposition attribute to use when processing truth objects
        **kwargs : dict, optional
            Additional arguments to pass to :class:`AnaBase`
        """
        # Initialize the parent class
        # NOTE(review): truth_point_mode and truth_dep_mode are accepted but
        # neither stored nor forwarded to the parent class — confirm intent
        super().__init__(obj_type, run_mode, **kwargs)

        # Store the radius around the start point used to evaluate dE/dx
        self.radius = radius

        # Initialize one CSV writer per requested object type
        for obj in self.obj_type:
            self.initialize_writer(obj)

    def process(self, data):
        """Evaluate shower start dE/dx for one entry.

        Parameters
        ----------
        data : dict
            Dictionary of data products
        """
        # Narrow down to the data products of interest
        # NOTE(review): assumes the products live under the 'prod' key of the
        # data dictionary — confirm against the driver that fills `data`
        data = data['prod']

        # Loop over all requested object types
        for key in self.obj_keys:
            # Loop over all objects of that type
            for obj in data[key]:
                # Displacement between the object end and start points
                # (fixes a NameError: the loop variable is `obj`, not `p`)
                disp = obj.end_point - obj.start_point

                # Build the output row from the displacement components
                out = {'disp_x': disp[0], 'disp_y': disp[1], 'disp_z': disp[2]}

                # Write the row to file
                # NOTE(review): 'template' is not one of the writers
                # initialized in __init__ (those are named per object type) —
                # confirm the intended writer name
                self.append('template', **out)
12 changes: 8 additions & 4 deletions spine/ana/factories.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
ANA_DICT.update(**module_dict(module))


def ana_script_factory(name, cfg, overwrite=False, log_dir=None, prefix=None):
def ana_script_factory(name, cfg, overwrite=None, log_dir=None, prefix=None):
"""Instantiates an analyzer module from a configuration dictionary.
Parameters
Expand All @@ -22,7 +22,7 @@ def ana_script_factory(name, cfg, overwrite=False, log_dir=None, prefix=None):
parent_path : str
Path to the parent directory of the main analysis configuration. This
allows for the use of relative paths in the analyzers.
overwrite : bool, default False
overwrite : bool, optional
If `True`, overwrite the CSV logs if they already exist
log_dir : str, optional
Output CSV file directory (shared with driver log)
Expand All @@ -39,5 +39,9 @@ def ana_script_factory(name, cfg, overwrite=False, log_dir=None, prefix=None):
cfg['name'] = name

# Instantiate the analysis script module
return instantiate(
ANA_DICT, cfg, overwrite=overwrite, log_dir=log_dir, prefix=prefix)
if overwrite is not None:
return instantiate(
ANA_DICT, cfg, overwrite=overwrite, log_dir=log_dir, prefix=prefix)
else:
return instantiate(
ANA_DICT, cfg, log_dir=log_dir, prefix=prefix)
6 changes: 4 additions & 2 deletions spine/ana/manager.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Manages the operation of analysis scripts."""

from copy import deepcopy
from collections import defaultdict, OrderedDict

import numpy as np
Expand Down Expand Up @@ -35,7 +36,7 @@ def __init__(self, cfg, log_dir=None, prefix=None):
# Parse the analysis block configuration
self.parse_config(log_dir, prefix, **cfg)

def parse_config(self, log_dir, prefix, overwrite=False,
def parse_config(self, log_dir, prefix, overwrite=None,
prefix_output=False, **modules):
"""Parse the analysis tool configuration.
Expand All @@ -46,14 +47,15 @@ def parse_config(self, log_dir, prefix, overwrite=False,
prefix : str
Input file prefix. If requested, it will be used to prefix
all the output CSV files.
overwrite : bool, default False
overwrite : bool, optional
If `True`, overwrite the CSV logs if they already exist
prefix_output : bool, optional
If `True`, will prefix the output CSV names with the input file name
**modules : dict
List of analysis script modules
"""
# Loop over the analyzer modules and get their priorities
modules = deepcopy(modules)
keys = np.array(list(modules.keys()))
priorities = -np.ones(len(keys), dtype=np.int32)
for i, k in enumerate(keys):
Expand Down
57 changes: 40 additions & 17 deletions spine/ana/metric/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,16 @@ class ClusterAna(AnaBase):
- particles
- interactions
"""

# Name of the analysis script (as specified in the configuration)
name = 'cluster_eval'

# Label column to use for each clustering label_col
_label_cols = {
'fragment': CLUST_COL, 'particle': GROUP_COL,
'interaction': INTER_COL
}
_label_cols = (
('fragment', CLUST_COL),
('particle', GROUP_COL),
('interaction', INTER_COL)
)

def __init__(self, obj_type=None, use_objects=False, per_object=True,
per_shape=True, metrics=('pur', 'eff', 'ari'),
Expand All @@ -53,7 +56,7 @@ def __init__(self, obj_type=None, use_objects=False, per_object=True,
label_key : str, default 'clust_label_adapt'
Name of the tensor which contains the cluster labels, when
using the raw reconstruction output
label_col : str
label_col : str, optional
Column name in the label tensor specifying the aggregation label_col
**kwargs : dict, optional
Additional arguments to pass to :class:`AnaBase`
Expand All @@ -71,9 +74,9 @@ def __init__(self, obj_type=None, use_objects=False, per_object=True,

# Initialize the parent class
super().__init__(obj_type, 'both', **kwargs)
if not use_objects:
for key in self.obj_keys:
del self.keys[key]


# If the clustering is not done per object, fix target
if not per_object:
self.obj_type = [label_col]

Expand All @@ -90,27 +93,47 @@ def __init__(self, obj_type=None, use_objects=False, per_object=True,
# Convert metric strings to functions
self.metrics = {m: getattr(spine.utils.metrics, m) for m in metrics}

# List the necessary data products
# If objects are not used, remove them from the required keys
keys = self.keys
if not use_objects:
for key in self.obj_keys:
del keys[key]

# List other necessary data products
if self.per_object:
if not self.use_objects:
# Store the labels and the clusters output by the reco chain
self.keys[label_key] = True
keys[label_key] = True
for obj in self.obj_type:
self.keys[f'{obj}_clusts'] = True
self.keys[f'{obj}_shapes'] = True
keys[f'{obj}_clusts'] = True
keys[f'{obj}_shapes'] = True

else:
self.keys['points'] = True
keys['points'] = True

else:
self.keys[label_key] = True
self.keys['clusts'] = True
self.keys['group_pred'] = True
keys[label_key] = True
keys['clusts'] = True
keys['group_pred'] = True

self.keys = keys

# Initialize the output
for obj in self.obj_type:
self.initialize_writer(obj)

@property
def label_cols(self):
    """Mapping from clustering target name to label tensor column.

    Returns
    -------
    Dict[str, int]
        Dictionary of (key, column_id) mapping from name to label column
    """
    # Convert the immutable tuple of pairs into a lookup dictionary
    return {name: col for name, col in self._label_cols}

def process(self, data):
"""Store the clustering metrics for one entry.
Expand All @@ -124,7 +147,7 @@ def process(self, data):
# Build the cluster labels for this object type
if not self.use_objects:
# Fetch the right label column
label_col = self.label_col or self._label_cols[obj_type]
label_col = self.label_col or self.label_cols[obj_type]
num_points = len(data[self.label_key])
labels = data[self.label_key][:, label_col]
shapes = data[self.label_key][:, SHAPE_COL]
Expand Down
8 changes: 6 additions & 2 deletions spine/ana/metric/point.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,13 @@ class PointProposalAna(AnaBase):
- Point type classification accuracy
- Point end classification accuracy
"""

# Name of the analysis script (as specified in the configuration)
name = 'point_eval'

# Set of data keys needed for this analysis script to operate
_keys = (('ppn_pred', True),)

def __init__(self, num_classes=LOWES_SHP, label_key='ppn_label',
endpoints=False, **kwargs):
"""Initialize the analysis script.
Expand All @@ -51,8 +56,7 @@ def __init__(self, num_classes=LOWES_SHP, label_key='ppn_label',
self.endpoints = endpoints

# Append other required key
self.keys['ppn_pred'] = True
self.keys[self.label_key] = True
self.update_keys({self.label_key: True})

# Initialize the output
self.initialize_writer('truth_to_reco')
Expand Down
Loading

0 comments on commit 84a81ce

Please sign in to comment.