Merge pull request #221 from brandynlucca/brandynlucca_WIP_transect_analysis_tests

Add unit tests for `init`, `transect_analysis`, and `stratified_summary` Survey-class methods
OSOceanAcoustics · Apr 16, 2024 · 9548528 · 9548528
2 parents 19e6a98 + 4378029
commit 9548528
Show file tree

Hide file tree

Showing 9 changed files with 2,153 additions and 70 deletions.
diff --git a/docs/glossary.md b/docs/glossary.md
@@ -33,4 +33,4 @@
 <!-- Glossary of all symbols, indices, and notations used for mathematical equations and variables contained within the `Survey` class object. 
 
 
-![ text ](images/symbols.jpeg) -->
+![ text ](images/symbols.jpeg) -->
diff --git a/echopop/computation/operations.py b/echopop/computation/operations.py
@@ -104,7 +104,8 @@ def bin_stats( dataframe: pd.DataFrame ,
     return (
         dataframe # input dataframe 
         .bin_variable( bin_values , bin_variable ) # discretize variable into bins )
-        .groupby( [f'{bin_variable}_bin'] + con_lst ) # group by these variables/contrasts
+        .groupby( [f'{bin_variable}_bin'] + con_lst ,
+                  observed = False ) # group by these variables/contrasts
         .agg( aggregation_dict ) # apply specified functions
         .replace( np.nan , 0 ) # replace NaN w/ 0's
         .droplevel( level = 0 , axis = 1 ) # drop the column indices 
@@ -132,7 +133,7 @@ def count_variable( dataframe: pd.DataFrame ,
     return (
         dataframe # input dataframe
         .reset_index( drop=True )
-        .groupby( contrasts ) 
+        .groupby( contrasts , observed = False ) 
         .agg({variable: [('count' , fun)]})
         .replace(np.nan, 0 )
         .droplevel( level = 0 , axis = 1 )
@@ -158,15 +159,16 @@ def meld( specimen_dataframe: pd.DataFrame ,
     specimen_stacked = (
         specimen_dataframe 
         .copy()
-        .groupby(['stratum_num' , 'species_id' , 'sex' , 'group' , 'station' , 'length' , 'length_bin' ])
-        .apply(lambda x: len(x['length']))
+        .groupby( ['stratum_num' , 'species_id' , 'sex' , 'group' , 'station' , 'length' , 'length_bin' ] ,
+                  observed = False )[ [ 'length' ] ]
+        .apply(lambda x: len( x ) , include_groups = True )
         .reset_index(name='length_count')
     )
 
     # Concatenate the data frames and return
     return pd.concat( [ specimen_stacked ,
                         length_dataframe ] ,
-                        join = 'inner' )
+                        join = 'inner' ).reset_index( drop = True )
 
 @patch_method_to_DataFrame( pd.DataFrame )    
 def stretch( dataframe ,             

diff --git a/echopop/survey.py b/echopop/survey.py
@@ -672,7 +672,7 @@ def strata_sex_weight_proportions( self ,
         station_length_aggregate = (
             station_sex_length
             # calculate the within-sample sum and proportions (necessary for the downstream dot product calculation)
-            .pipe( lambda x: x.assign( within_station_n = x.groupby( [ 'sex' , 'station' , 'stratum_num' ] )[ 'count' ].transform( sum ) ,
+            .pipe( lambda x: x.assign( within_station_n = x.groupby( [ 'sex' , 'station' , 'stratum_num' ] )[ 'count' ].transform( 'sum' ) ,
                                         within_station_p = lambda x: x[ 'count' ] / x[ 'within_station_n' ] ) )
             .replace( np.nan, 0 ) # remove erroneous NaN (divide by 0 or invalid values)
             .merge( total_n , on = 'stratum_num' ) # merge station_sex_length with total_n
@@ -688,8 +688,9 @@ def strata_sex_weight_proportions( self ,
             .loc[ station_length_aggregate.sex.isin( [ 'male' , 'female' ] ) ] # only parse 'male' and 'female'
             # create a pivot that will reorient data to the desired shape
             .pivot_table( index = [ 'sex' , 'station' ] , 
-                        columns = [ 'stratum_num' ] , 
-                        values = [ 'overall_station_p' ] )
+                          columns = [ 'stratum_num' ] , 
+                          values = [ 'overall_station_p' ] ,
+                          observed = False )
             .groupby( 'sex' )
             .sum( )
         )
@@ -701,7 +702,8 @@ def strata_sex_weight_proportions( self ,
             # create a pivot that will reorient data to the desired shape
             .pivot_table( index = [ 'sex' , 'station' ] , 
                           columns = 'stratum_num' , 
-                          values = 'overall_station_p' )
+                          values = 'overall_station_p' ,
+                          observed = False )
             .groupby( 'station' )
             .sum()
         )
@@ -713,7 +715,8 @@ def strata_sex_weight_proportions( self ,
             # create a pivot that will reorient data to the desired shape
             .pivot_table( index = [ 'sex' , 'station' ] , 
                           columns = 'stratum_num' , 
-                          values = 'overall_station_p' )
+                          values = 'overall_station_p'  ,
+                          observed = False )
             .groupby( [ 'sex' , 'station' ] )
             .sum()
         )
@@ -728,7 +731,8 @@ def strata_sex_weight_proportions( self ,
                 .reset_index( name = 'stn_p' ) , on = [ 'stratum_num' , 'station' ] )
             .pivot_table( columns = 'stratum_num' ,
                           index = [ 'station' , 'sex' ] ,
-                          values = [ 'stn_p' , 'sex_stn_p' ] )    
+                          values = [ 'stn_p' , 'sex_stn_p' ] ,
+                          observed = False )    
         )
 
         ### Format the length bin proportions so they resemble a similar table/matrix shape as the above metrics
@@ -737,7 +741,8 @@ def strata_sex_weight_proportions( self ,
             station_length_aggregate
             .pivot_table( columns = [ 'sex' , 'station' , 'stratum_num' ] , 
                           index = [ 'length_bin' ] ,
-                          values = [ 'within_station_p' ] )[ 'within_station_p' ]
+                          values = [ 'within_station_p' ] ,
+                          observed = False )[ 'within_station_p' ]
         )
 
         ### Calculate combined station fraction means
@@ -837,13 +842,13 @@ def strata_age_binned_weight_proportions( self ,
             .count_variable( variable = 'length' ,
                              contrasts = [ 'stratum_num' , 'age' ] ,
                              fun = 'size' )
-            .pipe( lambda x: x.assign( stratum_count_all = x.groupby( [ 'stratum_num' ] )[ 'count' ].transform( sum ) ,
-                                       stratum_count_total  = x.loc[ x.age > 1 ].groupby( [ 'stratum_num' ] )[ 'count' ].transform( sum ) ) )
-            .groupby( [ 'stratum_num' , 'age' ] )
+            .pipe( lambda x: x.assign( stratum_count_all = x.groupby( [ 'stratum_num' ] )[ 'count' ].transform( 'sum' ) ,
+                                       stratum_count_total  = x.loc[ x.age > 1 ].groupby( [ 'stratum_num' ] )[ 'count' ].transform( 'sum' ) ) )
+            .groupby( [ 'stratum_num' , 'age' ] , observed = False )[ [ 'age' , 'count' , 'stratum_count_all' , 'stratum_count_total' ] ]
             .apply( lambda df: pd.Series( {
                 'count_age_proportion_all': ( df[ 'count' ] / df.stratum_count_all ).sum() ,
                 'count_age_proportion_adult': ( df.loc[ df.age > 1 ][ 'count' ] / df.stratum_count_total ).sum( )
-            } ) )
+            } ) , include_groups = True )
             .reset_index( )
         )
 
@@ -864,17 +869,17 @@ def strata_age_binned_weight_proportions( self ,
             .dropna( how = 'any' )
             .pipe( lambda df: df.assign( weight_stratum_all = df
                                                         .groupby( [ 'stratum_num' ] )[ 'weight' ]
-                                                        .transform( sum ) ,
+                                                        .transform( 'sum' ) ,
                                         weight_stratum_adult = df
                                                             .loc[ lambda x: x.age > 1 ]
                                                             .groupby( [ 'stratum_num' ] )[ 'weight' ]
-                                                            .transform( sum ) ) )
+                                                            .transform( 'sum' ) ) )
             .groupby( [ 'stratum_num' , 'age' ] )
             .apply( lambda df: pd.Series( {
                 'weight_age_proportion_all': ( df.weight / df.weight_stratum_all ).sum( ) ,
                 'weight_age_proportion_adult': ( df.weight / df.weight_stratum_adult ).sum( )
-            } ) )
-            .reset_index()
+            } ) , include_groups = False )
+            .reset_index( )
         )
 
         # Calculate adult proportions/contributions (in terms of summed weight) for each stratum
@@ -888,14 +893,15 @@ def strata_age_binned_weight_proportions( self ,
             .count_variable( contrasts = [ 'stratum_num' , 'age' , 'length_bin' , 'sex' ] ,
                             variable = 'weight' ,
                             fun = 'sum' )
-            .pipe( lambda df: df.assign( weight_total_all = df.groupby( [ 'stratum_num' , 'sex' ] )[ 'count' ].transform( sum ) ,
-                                         weight_total_adult = df.loc[ df.age > 1 ].groupby( [ 'stratum_num' , 'sex' ] )[ 'count' ].transform( sum ) ) )
+            .pipe( lambda df: df.assign( weight_total_all = df.groupby( [ 'stratum_num' , 'sex' ] )[ 'count' ].transform( 'sum' ) ,
+                                         weight_total_adult = df.loc[ df.age > 1 ].groupby( [ 'stratum_num' , 'sex' ] )[ 'count' ].transform( 'sum' ) ) )
             .groupby( [ 'stratum_num' , 'age' , 'sex' ] )
             .apply( lambda x: pd.Series( {
                 'weight_sex_proportion_all': ( x[ 'count' ] / x.weight_total_all ).sum() ,
                 'weight_sex_proportion_adult': ( x[ 'count' ] / x.weight_total_adult ).sum()
-            } ) )
+            } ) , include_groups = False )
             .reset_index( )
+            .fillna( 0 )
         )
 
         length_sex_age_weight_proportions = (
@@ -908,10 +914,11 @@ def strata_age_binned_weight_proportions( self ,
             .count_variable( contrasts = [ 'stratum_num' , 'age' , 'length_bin' , 'sex' ] ,
                             variable = 'weight' ,
                             fun = 'sum' )
-            .pipe( lambda df: df.assign( weight_total_all = df.groupby( [ 'stratum_num' , 'sex' ] )[ 'count' ].transform( sum ) ,
-                                         weight_total_adult = df.loc[ df.age > 1 ].groupby( [ 'stratum_num' , 'sex' ] )[ 'count' ].transform( sum ) ) )
+            .pipe( lambda df: df.assign( weight_total_all = df.groupby( [ 'stratum_num' , 'sex' ] )[ 'count' ].transform( 'sum' ) ,
+                                         weight_total_adult = df.loc[ df.age > 1 ].groupby( [ 'stratum_num' , 'sex' ] )[ 'count' ].transform( 'sum' ) ) )
             .assign( weight_length_sex_proportion_all = lambda x: x[ 'count' ] / x.weight_total_all ,
                      weight_length_sex_proportion_adult = lambda x: x[ 'count' ] / x.weight_total_adult )
+            .replace( np.nan , 0 )
         )
 
         ### Add these dataframes to the appropriate data attribute