Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

integrate smaup into tobler #139

Closed
wants to merge 9 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .ci/37.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,4 @@ dependencies:
- pygeos
- h3-py
- joblib
- esda
1 change: 1 addition & 0 deletions .ci/38.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,4 @@ dependencies:
- pygeos
- h3-py
- joblib
- esda
1 change: 1 addition & 0 deletions .ci/39.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,4 @@ dependencies:
- numpydoc
- nbsphinx
- joblib
- esda
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,4 @@ dependencies:
- mapclassify
- descartes
- joblib
- esda
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ libpysal
tqdm
pygeos
joblib
esda
3 changes: 2 additions & 1 deletion requirements_tests.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,5 @@ pytest-cov
twine
xgboost
shap >=0.33
quilt3 >=3.1.11
quilt3 >=3.1.11
esda
1 change: 1 addition & 0 deletions tobler/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@
from . import dasymetric
from . import model
from . import util
from . import diagnostics
31 changes: 29 additions & 2 deletions tobler/area_weighted/area_interpolate.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,13 @@
import numpy as np
import geopandas as gpd
from ._vectorized_raster_interpolation import _fast_append_profile_in_gdf
import warnings
from warnings import warn
from scipy.sparse import dok_matrix, diags, coo_matrix
import pandas as pd
import os

from tobler.util.util import _check_crs, _nan_check, _inf_check, _check_presence_of_crs
from tobler.diagnostics import _smaup


def _chunk_dfs(geoms_to_chunk, geoms_full, n_jobs):
Expand Down Expand Up @@ -268,6 +269,7 @@ def _area_interpolate_binning(
spatial_index="auto",
n_jobs=1,
categorical_variables=None,
smaup_weight=None
):
"""
Area interpolation for extensive, intensive and categorical variables.
Expand Down Expand Up @@ -307,6 +309,9 @@ def _area_interpolate_binning(
of `pygeos` and `geopandas`.
categorical_variables : list
[Optional. Default=None] Columns in dataframes for categorical variables
smaup_weight : libpysal.weights
[Optional. Default = None] Argument for tobler's smaup wrapper
w to calculate Moran's I. Will use Rook if nothing is passed.

Returns
-------
Expand Down Expand Up @@ -348,6 +353,28 @@ def _area_interpolate_binning(
source_df = source_df.copy()
target_df = target_df.copy()

if smaup_weight is not None:
for var in intensive_variables:
stat = _smaup(
source_df=source_df,
target_df=target_df,
y=source_df[var].to_numpy(),
w=smaup_weight)
if stat.summary.find('H0 is rejected'):
warn(f"{var} is affected by the MAUP. Interpolations of this variable may not be accourate!")
else:
print(f"{var} is not affected by the MAUP.")
else:
for var in intensive_variables:
stat = _smaup(
source_df=source_df,
target_df=target_df,
y=source_df[var].to_numpy())
if stat.summary.find('H0 is rejected'):
warn(f"{var} is affected by the MAUP. Interpolations of this variable may not be accourate!")
else:
print(f"{var} is not affected by the MAUP.")

if _check_crs(source_df, target_df):
pass
else:
Expand Down Expand Up @@ -614,7 +641,7 @@ def _area_tables_raster(
res_union_pre = gpd.overlay(source_df, target_df, how="union")

# Establishing a CRS for the generated union
warnings.warn(
warn(
"The CRS for the generated union will be set to be the same as source_df."
)
res_union_pre.crs = source_df.crs
Expand Down
31 changes: 30 additions & 1 deletion tobler/dasymetric/masked_area_interpolate.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

from ..area_weighted._vectorized_raster_interpolation import *

from tobler.diagnostics import _smaup
from warnings import warn


def masked_area_interpolate(
source_df,
Expand All @@ -13,6 +16,7 @@ def masked_area_interpolate(
intensive_variables=None,
allocate_total=True,
tables=None,
smaup_weight=None,
):
"""Interpolate data between two polygonal datasets using an auxiliary raster to mask out uninhabited land.

Expand All @@ -38,7 +42,10 @@ def masked_area_interpolate(
whether to allocate the total from the source geometries (the default is True).
tables : tuple of two numpy.array (optional)
As generated from `tobler.area_weighted.area_tables_raster` (the default is None).

smaup_weight : libpysal.weights
[Optional. Default = None] Argument for tobler's smaup wrapper
w to calculate Moran's I. Will use Rook if nothing is passed.

Returns
-------
geopandas.GeoDataFrame
Expand All @@ -53,6 +60,28 @@ def masked_area_interpolate(
"You must pass the path to a raster that can be read with rasterio"
)

if smaup_weight is not None:
for var in intensive_variables:
stat = _smaup(
source_df=source_df,
target_df=target_df,
y=source_df[var].to_numpy(),
w=smaup_weight)
if stat.summary.find('H0 is rejected'):
warn(f"{var} is affected by the MAUP. Interpolations of this variable may not be accourate!")
else:
print(f"{var} is not affected by the MAUP.")
else:
for var in intensive_variables:
stat = _smaup(
source_df=source_df,
target_df=target_df,
y=source_df[var].to_numpy())
if stat.summary.find('H0 is rejected'):
warn(f"{var} is affected by the MAUP. Interpolations of this variable may not be accourate!")
else:
print(f"{var} is not affected by the MAUP.")

if not tables:
tables = _area_tables_raster(
source_df,
Expand Down
1 change: 1 addition & 0 deletions tobler/diagnostics/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .smaup import _smaup
45 changes: 45 additions & 0 deletions tobler/diagnostics/smaup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""
A wrapper for using the S-maup statistical test in tobler interpolation

"""

from esda.smaup import Smaup
from esda.moran import Moran
from libpysal.weights import Rook
from warnings import warn

def _smaup(
source_df,
target_df,
y,
w=None):
"""
A function that wraps esda's smaup function and automates some of the process of calculating smaup.
For more about smaup read here: https://pysal.org/esda/generated/esda.Smaup.html#esda.Smaup

Parameters
----------
source_df : geopandas.GeoDataFrame
source data to be converted. Used to construct spatial weights.
target_df : geopandas.GeoDataFrame
target geometries that will form the new representation of the input data. Used for k.
y : array
data for autocorellation calculation
w : libpysal.weights object
pysal spatial weights object for autocorellation calculation.
Rook recommended for smaup and used by default.

Returns
-------
esda.smaup.Smaup:
statistic that contains information regarding the extent to which the variable is affected by the MAUP.

"""
if w is None:
w = Rook.from_dataframe(source_df)
rho = Moran(y, w).I
n = len(y)
k = len(target_df)
stat = Smaup(n,k,rho)

return stat
28 changes: 28 additions & 0 deletions tobler/model/glm.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
_return_weights_from_regression,
)
from tobler.util import project_gdf
from tobler.diagnostics import _smaup


def glm_pixel_adjusted(
Expand Down Expand Up @@ -100,6 +101,7 @@ def glm(
likelihood="poisson",
force_crs_match=True,
return_model=False,
smaup_weight=None
):
"""Estimate interpolated values using raster data as input to a generalized linear model.

Expand Down Expand Up @@ -130,6 +132,9 @@ def glm(
return model : bool
whether to return the fitted model in addition to the interpolated geodataframe.
If true, this will return (geodataframe, model)
smaup_weight : libpysal.weights
[Optional. Default = None] Argument for tobler's smaup wrapper
w to calculate Moran's I. Will use Rook if nothing is passed.

Returns
--------
Expand All @@ -143,6 +148,29 @@ def glm(
source_df = source_df.copy()
target_df = target_df.copy()
_check_presence_of_crs(source_df)

if smaup_weight is not None:
for var in variable:
stat = _smaup(
source_df=source_df,
target_df=target_df,
y=source_df[var].to_numpy(),
w=smaup_weight)
if stat.summary.find('H0 is rejected'):
warn(f"{var} is affected by the MAUP. Interpolations of this variable may not be accourate!")
else:
print(f"{var} is not affected by the MAUP.")
else:
for var in variable:
stat = _smaup(
source_df=source_df,
target_df=target_df,
y=source_df[var].to_numpy())
if stat.summary.find('H0 is rejected'):
warn(f"{var} is affected by the MAUP. Interpolations of this variable may not be accourate!")
else:
print(f"{var} is not affected by the MAUP.")

liks = {"poisson": Poisson, "gaussian": Gaussian, "neg_binomial": NegativeBinomial}

if likelihood not in liks.keys():
Expand Down
16 changes: 16 additions & 0 deletions tobler/tests/test_diagnostics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
"""test diagnostics funtions"""

import geopandas
from tobler.diagnostics import _smaup

precincts = geopandas.read_file("https://ndownloader.figshare.com/files/20460549")
tracts = geopandas.read_file("https://ndownloader.figshare.com/files/20460645")

def test_smaup():
s = _smaup(
source_df=tracts,
target_df=precincts,
y=tracts["pct Youth"].to_numpy()
)
assert type(s.summary) == str
assert s.summary.find('H0 is rejected')