From cf805ade925ecc9ad7a030ec75201d65a88d1158 Mon Sep 17 00:00:00 2001
From: Paul Zivich <32672909+pzivich@users.noreply.github.com>
Date: Tue, 23 Apr 2024 10:54:08 -0400
Subject: [PATCH] Updating all docs

---
 delicatessen/data.py                          |  28 +-
 delicatessen/estimating_equations/basic.py    |  41 +--
 delicatessen/estimating_equations/causal.py   | 319 ++++++++----------
 .../estimating_equations/dose_response.py     |  48 +--
 .../estimating_equations/measurement.py       |  26 +-
 .../estimating_equations/regression.py        | 307 ++++++++---------
 delicatessen/estimating_equations/survival.py |  63 ++--
 delicatessen/mestimation.py                   | 127 +++----
 delicatessen/sandwich.py                      |  68 ++--
 delicatessen/utilities.py                     |  74 ++--
 docs/Reference/Utilities.rst                  |   2 +-
 ...ssen.derivative.approx_differentiation.rst |   6 +
 12 files changed, 539 insertions(+), 570 deletions(-)
 create mode 100644 docs/Reference/generated/delicatessen.derivative.approx_differentiation.rst

diff --git a/delicatessen/data.py b/delicatessen/data.py
index 085f5fe..854ed25 100644
--- a/delicatessen/data.py
+++ b/delicatessen/data.py
@@ -14,7 +14,13 @@ def load_shaq_free_throws():
 
     Returns
     -------
-    ndarray
+    array :
+        Returns a 23-by-3 NumPy array.
+
+    References
+    ----------
+    Boos DD, & Stefanski LA. (2013). M-estimation (estimating equations). In Essential Statistical Inference
+    (pp. 297-337). Springer, New York, NY.
     """
    d = np.array([[ 1,  4,  5],
                  [ 2,  5, 11],
@@ -43,7 +49,7 @@ def load_inderjit():
 
 
 def load_inderjit():
-    """Load example data from Inderjit et al. (2002) on the dose-response of herbicide on perennial ryegrass growth
+    """Load example data from Inderjit et al. (2002) on the dose-response of herbicide on perennial ryegrass growth.
 
     Notes
     -----
@@ -53,7 +59,13 @@ def load_inderjit():
 
     Returns
     -------
-    ndarray
+    array :
+        Returns a 24-by-2 NumPy array.
+
+    References
+    ----------
+    Inderjit, Streibig JC, & Olofsdotter M. (2002). Joint action of phenolic acid mixtures and its significance in
+    allelopathy research. *Physiologia Plantarum*, 114(3), 422-428.
     """
    d = np.array([[7.5800000, 0.00],
                  [8.0000000, 0.00],
@@ -83,14 +95,22 @@ def load_robust_regress(outlier=True):
 
 
 def load_robust_regress(outlier=True):
-    """Load illustrative example for robust linear regression.
+    """Load illustrative example of robust linear regression published in Zivich et al. (2022).
 
     Parameters
     ----------
+    outlier : bool, optional
+        Whether to induce the outlier (``True``) or not (``False``).
 
     Returns
     -------
+    array :
+        Returns a 15-by-2 NumPy array.
+
+    References
+    ----------
+    Zivich PN, Klose M, Cole SR, Edwards JK, & Shook-Sa BE. (2022). Delicatessen: M-estimation in Python.
+    *arXiv:2203.11300*.
     """
    height = [168.519, 166.944, 164.327, 164.058, 166.212,
              167.358, 165.244, 169.352, 159.386, 166.953, 163.876,
diff --git a/delicatessen/estimating_equations/basic.py b/delicatessen/estimating_equations/basic.py
index 7047c1e..a7e872f 100644
--- a/delicatessen/estimating_equations/basic.py
+++ b/delicatessen/estimating_equations/basic.py
@@ -20,11 +20,6 @@ def ee_mean(theta, y):
 
        \sum_{i=1}^n (Y_i - \theta) = 0
 
-    Note
-    ----
-    All provided estimating equations are meant to be wrapped inside a user-specified function. Throughtout, these
-    user-defined functions are defined as ``psi``.
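+    As a minimal sketch of this wrapping pattern (``y_dat`` here is a stand-in for a hypothetical 1-dimensional
+    array of observed data):
+
+    >>> def psi(theta):
+    >>>     return ee_mean(theta=theta, y=y_dat)
+
+    >>> estr = MEstimator(stacked_equations=psi, init=[0, ])
+    >>> estr.estimate()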
-
     Parameters
     ----------
     theta : ndarray, list, vector
@@ -36,7 +31,7 @@ def ee_mean(theta, y):
     Returns
     -------
     array :
-        Returns a 1-by-n NumPy array evaluated for the input ``theta`` and ``y``
+        Returns a 1-by-`n` NumPy array evaluated for the input ``theta`` and ``y``.
 
     Examples
     --------
@@ -89,11 +84,6 @@ def ee_mean_robust(theta, y, k, loss='huber', lower=None, upper=None):
     Tukey's biweight, Andrew's Sine, and Hampel. See ``robust_loss_function`` for further details on the loss
     functions for the robust mean.
 
-    Note
-    ----
-    All provided estimating equations are meant to be wrapped inside a user-specified function. Throughtout, these
-    user-defined functions are defined as ``psi``.
-
     Parameters
     ----------
     theta : ndarray, list, vector
@@ -105,18 +95,18 @@ def ee_mean_robust(theta, y, k, loss='huber', lower=None, upper=None):
        Tuning or hyperparameter for the chosen loss function. Notice that the choice of hyperparameter depends on the
        loss function.
     loss : str, optional
-        Robust loss function to use. Default is 'huber'. Options include 'andrew', 'hampel', 'huber', 'tukey'.
+        Robust loss function to use. Default is ``'huber'``. Options include ``'andrew'``, ``'hampel'``, ``'huber'``,
+        ``'tukey'``.
     lower : int, float, None, optional
-        Lower parameter for the 'hampel' loss function. This parameter does not impact the other loss functions.
+        Lower parameter for the Hampel loss function. This parameter does not impact the other loss functions.
        Default is ``None``.
     upper : int, float, None, optional
-        Upper parameter for the 'hampel' loss function. This parameter does not impact the other loss functions.
+        Upper parameter for the Hampel loss function. This parameter does not impact the other loss functions.
        Default is ``None``.
 
     Returns
     -------
     array :
-        Returns a 1-by-n NumPy array evaluated for the input theta and y
+        Returns a 1-by-`n` NumPy array evaluated for the input ``theta`` and ``y``.
 
     Examples
     --------
@@ -186,12 +176,6 @@ def ee_mean_variance(theta, y):
     Unlike ``ee_mean``, ``theta`` consists of 2 parameters. The output covariance matrix will also provide estimates
     for each of the ``theta`` values.
 
-    Note
-    ----
-    All provided estimating equations are meant to be wrapped inside a user-specified function. Throughtout, these
-    user-defined functions are defined as ``psi``.
-
-
     Parameters
     ----------
     theta : ndarray, list, vector
@@ -204,7 +188,7 @@ def ee_mean_variance(theta, y):
     Returns
     -------
     array :
-        Returns a 2-by-n NumPy array evaluated for the input theta and y
+        Returns a 2-by-`n` NumPy array evaluated for the input ``theta`` and ``y``.
 
     Examples
     --------
@@ -273,12 +257,12 @@ def ee_percentile(theta, y, q):
        1-dimensional vector of n observed values. No missing data should be included (missing data may cause
        unexpected behavior when attempting to calculate the mean).
     q : float
-        Percentile to calculate. Must be (0, 1)
+        Percentile to calculate. Must be in :math:`(0, 1)`.
 
     Returns
     -------
     array :
-        Returns a 1-by-n NumPy array evaluated for the input theta and y
+        Returns a 1-by-`n` NumPy array evaluated for the input ``theta`` and ``y``.
 
     Examples
     --------
@@ -309,7 +293,7 @@ def ee_percentile(theta, y, q):
     >>> estr.theta
 
     Then displays the estimated percentile / median. In this example, there is a difference between the closed form
-    solution (-0.07978) and M-Estimation (-0.06022).
+    solution (``-0.07978``) and M-estimation (``-0.06022``).
 
     References
     ----------
@@ -352,11 +336,6 @@ def ee_positive_mean_deviation(theta, y):
     sandwich) cannot be used to estimate the variance.
This estimating equation is offered for completeness, but is not generally recommended for applications.
 
-    Note
-    ----
-    All provided estimating equations are meant to be wrapped inside a user-specified function. Throughtout, these
-    user-defined functions are defined as ``psi``.
-
     Parameters
     ----------
     theta : ndarray, list, vector
@@ -369,7 +348,7 @@ def ee_positive_mean_deviation(theta, y):
     Returns
     -------
     array :
-        Returns a 2-by-n NumPy array evaluated for the input theta and y
+        Returns a 2-by-`n` NumPy array evaluated for the input ``theta`` and ``y``.
 
     Examples
     --------
diff --git a/delicatessen/estimating_equations/causal.py b/delicatessen/estimating_equations/causal.py
index 7e2fa63..ceced5e 100644
--- a/delicatessen/estimating_equations/causal.py
+++ b/delicatessen/estimating_equations/causal.py
@@ -13,37 +13,36 @@
 
 
 def ee_gformula(theta, y, X, X1, X0=None, force_continuous=False):
-    r"""Estimating equations for the g-computation. The parameter of interest can either be the mean under a single
-    policy or plan of action, or the mean difference between two policies. This is accomplished by providing the
-    estimating equation the observed data (``X``, ``y``), and the same data under the actions (``X1`` and optionally
-    ``X0``).
+    r"""Estimating equations for the g-formula (or g-computation). The parameter of interest can either be the mean
+    under a single policy or plan of action, or the mean difference between two policies. This is accomplished by
+    providing the estimating equation the observed data (``X``, ``y``), and the same data under the actions (``X1``
+    and optionally ``X0``).
 
-    The outcome regression estimating equation is
+    The stacked estimating equations are
 
     .. math::
 
-        \sum_{i=1}^n \left\{ Y_i - g(X_i^T \beta) \right\} X_i = 0
+        \sum_{i=1}^n
+        \begin{bmatrix}
+            \left\{ g({X_i^*}^T \beta) - \theta_1 \right\} \\
+            \left\{ Y_i - g(X_i^T \beta) \right\} X_i
+        \end{bmatrix}
+        = 0
 
-    where :math:`g` indicates a transformation function. For linear regression, :math:`g` is the identity function.
-    Logistic regression uses the inverse-logit function. By default, `ee_gformula` detects whether `y` is all binary
-    (zero or one), and applies logistic regression if that is evaluated to be true.
+    where the first is the mean under the specified plan, with the plan setting the values of action :math:`A` (e.g.,
+    exposure, treatment, vaccination, etc.), and the second equation is the outcome regression model.
+    Here, :math:`g` indicates a transformation function. For linear regression, :math:`g` is the identity function.
+    Logistic regression uses the inverse-logit function. By default, ``ee_gformula`` detects whether `y` is all binary
+    (zero or one), and applies logistic regression if that is evaluated to be true.
 
-    There are two variations on the parameter of interest. The first could be the mean under a plan, where the plan sets
-    the values of action :math:`A` (e.g., exposure, treatment, vaccination, etc.). The estimating equation for this
-    causal mean is
-
-    .. math::
-
-        \sum_{i=1}^n \left\{ g({X_i^*}^T \beta) - \theta_1 \right\} = 0
-
     Note
     ----
-    This variation includes :math:`1+b` parameters, where the first parameter is the causal mean, and the remainder are
+    This variation includes 1+`b` parameters, where the first parameter is the causal mean, and the remainder are
     the parameters for the regression model.
 
-    The alternative parameter of interest could be the mean difference between two plans. A common example of this would
-    be the average causal effect, where the plans are all-action-one versus all-action-zero.
Therefore, the estimating + Alternatively, a causal mean difference is estimated when ``X0`` is specified. A common example of this would be + the average causal effect, where the plans are all-action-one versus all-action-zero. Therefore, the estimating equations consist of the following three equations .. math:: @@ -52,46 +51,38 @@ def ee_gformula(theta, y, X, X1, X0=None, force_continuous=False): \begin{bmatrix} (\theta_1 - \theta_2) - \theta_0 \\ g({X_i^1}^T \beta) - \theta_1 \\ - g({X_i^0}^T \beta) - \theta_2 + g({X_i^0}^T \beta) - \theta_2 \\ + \left\{ Y_i - g(X_i^T \beta) \right\} X_i \end{bmatrix} = 0 Note ---- - This variation includes :math:`3+b` parameters, where the first parameter is the causal mean difference, the second + This variation includes 3+`b` parameters, where the first parameter is the causal mean difference, the second is the causal mean under plan 1, the third is the causal mean under plan 0, and the remainder are the parameters for the regression model. - - The parameter of interest is designated by the user via whether the optional argument ``X0`` is left as ``None`` - (which estimates the causal mean) or is given an array (which estimates the causal mean difference and the - corresponding causal means). - - Note - ---- - All provided estimating equations are meant to be wrapped inside a user-specified function. Throughtout, these - user-defined functions are defined as ``psi``. - Parameters ---------- theta : ndarray, list, vector - Theta consists of 1+b values if ``X0`` is ``None``, and 3+b values if ``X0`` is not ``None``. + Theta consists of 1+`b` values if ``X0`` is ``None``, and 3+`b` values if ``X0`` is not ``None``. y : ndarray, list, vector - 1-dimensional vector of n observed values. + 1-dimensional vector of `n` observed values. X : ndarray, list, vector - 2-dimensional vector of n observed values for b variables. + 2-dimensional vector of `n` observed values for `b` variables. X1 : ndarray, list, vector - 2-dimensional vector of n observed values for b variables under the action plan. + 2-dimensional vector of `n` observed values for `b` variables under the action plan. X0 : ndarray, list, vector, None, optional - 2-dimensional vector of n observed values for b variables under the separate action plan. This second argument - is optional and should be specified if the causal mean difference between two action plans is of interest. + 2-dimensional vector of `n` observed values for `b` variables under the separate action plan. This second + argument is optional and should be specified if the causal mean difference between two action plans is of + interest. force_continuous : bool, optional Option to force the use of linear regression despite detection of a binary variable. Returns ------- array : - Returns a (1+b)-by-n NumPy array if ``X0=None``, or returns a (3+b)-by-n NumPy array if ``X0!=None`` + Returns a (1+`b`)-by-`n` NumPy array if ``X0=None``, or returns a (3+`b`)-by-`n` NumPy array if ``X0!=None`` Examples -------- @@ -248,18 +239,10 @@ def ee_gformula(theta, y, X, X1, X0=None, force_continuous=False): def ee_ipw(theta, y, A, W, truncate=None, weights=None): - r"""Estimating equation for inverse probability weighting estimator. For estimation of the weights (or propensity - scores), a logistic model is used. The first estimating equations for the logistic regression model are + r"""Estimating equation for inverse probability weighting (IPW) estimator. The average causal effect is estimated by + this implementation of the IPW estimator. 
For estimation of the propensity scores, a logistic model is used.
 
-    .. math::
-
-        \sum_{i=1}^n \left\{ A_i - \text{expit}(W_i^T \alpha) \right\} W_i = 0
-
-    where A is the action and W is the set of confounders.
-
-    For the implementation of the inverse probability weighting estimator, stacked estimating equations are used
-    for the mean had everyone been set to ``A=1``, the mean had everyone been set to ``A=0``, and the mean difference
-    between the two causal means. The estimating equations are
+    The stacked estimating equations are
 
     .. math::
 
@@ -267,52 +250,41 @@ def ee_ipw(theta, y, A, W, truncate=None, weights=None):
        \sum_{i=1}^n
        \begin{bmatrix}
            (\theta_1 - \theta_2) - \theta_0 \\
            \frac{A_i Y_i}{\pi_i} - \theta_1 \\
-            \frac{(1-A_i) Y_i}{1-\pi_i} - \theta_2
+            \frac{(1-A_i) Y_i}{1-\pi_i} - \theta_2 \\
+            \left\{ A_i - \text{expit}(W_i^T \alpha) \right\} W_i
        \end{bmatrix}
        = 0
 
-    where :math:`\pi_i = expit(W_i^T \alpha)`. Due to these 3 extra values, the length of the theta vector is 3+b,
-    where b is the number of parameters in the regression model.
-
-    Note
-    ----
-    Unlike ``ee_gformula``, ``ee_ipw`` always provides the average causal effect, and causal means for ``A=1`` and
-    ``A=0``.
-
-
-    Here, theta corresponds to a variety of different quantities. The *first* value in theta vector is the causal mean
-    difference, the *second* is the mean had everyone been set to ``A=1``, the *third* is the mean had everyone been
-    set to ``A=0``. The remainder of the parameters correspond to the logistic regression model coefficients.
-
-    Note
-    ----
-    All provided estimating equations are meant to be wrapped inside a user-specified function. Throughtout, these
-    user-defined functions are defined as ``psi``.
+    where :math:`A` is the action, :math:`W` is the set of confounders, and :math:`\pi_i = \text{expit}(W_i^T \alpha)`.
+    The first estimating equation is for the average causal effect, the second is for the mean under :math:`A:=1`,
+    the third is for the mean under :math:`A:=0`, and the last is the logistic regression model for the propensity
+    scores. Here, the length of the theta vector is 3+`b`, where `b` is the number of parameters in the regression
+    model.
 
     Parameters
     ----------
     theta : ndarray, list, vector
-        Theta consists of 3+b values.
+        Theta consists of 3+`b` values.
     y : ndarray, list, vector
-        1-dimensional vector of n observed values.
+        1-dimensional vector of `n` observed values.
     A : ndarray, list, vector
-        1-dimensional vector of n observed values. The A values should all be 0 or 1.
+        1-dimensional vector of `n` observed values. The A values should all be 0 or 1.
     W : ndarray, list, vector
-        2-dimensional vector of n observed values for b variables to model the probability of ``A`` with.
+        2-dimensional vector of `n` observed values for `b` variables to model the probability of ``A`` with.
     truncate : None, list, set, ndarray, optional
        Bounds to truncate the estimated probabilities of ``A`` at. For example, estimated probabilities above 0.99
        or below 0.01 can be set to 0.99 or 0.01, respectively. This is done by specifying ``truncate=(0.01, 0.99)``.
        Note this step is done via ``numpy.clip(.., a_min, a_max)``, so order is important. Default is ``None``, which
        applies no truncation.
     weights : ndarray, list, vector, None, optional
-        1-dimensional vector of n weights. Default is None, which assigns a weight of 1 to all observations. This
+        1-dimensional vector of `n` weights. Default is ``None``, which assigns a weight of 1 to all observations.
This
        argument is intended to support the use of missingness weights. The propensity score model is *not* fit using
        these weights.
 
     Returns
     -------
     array :
-        Returns a (3+b)-by-n NumPy array evaluated for the input ``theta``
+        Returns a (3+`b`)-by-`n` NumPy array evaluated for the input ``theta``.
 
     Examples
     --------
     Loading necessary functions and building a generic data set for estimation of the mean
 
     >>> import numpy as np
     >>> import pandas as pd
     >>> from scipy.stats import logistic
     >>> from delicatessen import MEstimator
     >>> from delicatessen.estimating_equations import ee_ipw
 
     >>> n = 200
     >>> d = pd.DataFrame()
     >>> d['W'] = np.random.binomial(1, p=0.5, size=n)
     >>> d['A'] = np.random.binomial(1, p=(0.25 + 0.5*d['W']), size=n)
     >>> d['Ya0'] = np.random.binomial(1, p=(0.75 - 0.5*d['W']), size=n)
     >>> d['Ya1'] = np.random.binomial(1, p=(0.75 - 0.5*d['W'] - 0.1*1), size=n)
     >>> d['Y'] = (1-d['A'])*d['Ya0'] + d['A']*d['Ya1']
     >>> d['C'] = 1
 
-    Defining psi, or the stacked estimating equations. Note that 'A' is the action.
+    Defining psi, or the stacked estimating equations. Note that ``'A'`` is the action.
 
     >>> def psi(theta):
     >>>     return ee_ipw(theta, y=d['Y'], A=d['A'],
     >>>                   W=d[['C', 'W']])
 
@@ -409,39 +381,42 @@ def ee_ipw(theta, y, A, W, truncate=None, weights=None):
 
 def ee_ipw_msm(theta, y, A, W, V, distribution, link, hyperparameter=None, truncate=None, weights=None):
-    r"""Estimating equation for inverse probability weighting estimator of the parameters of a marginal structural
-    model. For estimation of the weights (or propensity scores), a logistic model is used. The first estimating
-    equations for the logistic regression model are
+    r"""Estimating equation for parameters of a marginal structural model estimated using inverse probability
+    weighting. For estimation of the propensity scores, a logistic model is used.
+
+    The stacked estimating equations are
 
     .. math::
 
-        \sum_{i=1}^n \left\{ A_i - \text{expit}(W_i^T \alpha) \right\} W_i = 0
+        \sum_{i=1}^n
+        \begin{bmatrix}
+            \frac{1}{\pi_i} \left\{ Y_i - g^{-1}(X_i^T \beta) \right\} \times \frac{D(\beta)}{v(\beta)} X_i \\
+            \left\{ A_i - \text{expit}(W_i^T \alpha) \right\} W_i
+        \end{bmatrix}
+        = 0
 
-    where A is the action and W is the set of confounders. For the implementation of the inverse probability weighting
-    estimator of the marginal structural model, a weighted generalized linear model is used. See ``ee_glm`` for details
-    on this estimating equation.
+    where :math:`A` is the action, :math:`W` is the set of confounders, and :math:`\pi_i = \text{expit}(W_i^T \alpha)`.
+    Here, :math:`X` is the design matrix for the marginal structural model (it includes :math:`A`, and possibly some
+    covariates from :math:`W`). The first estimating equation is a weighted generalized linear model; see
+    ``ee_glm`` for details on this estimating equation. The second estimating equation is the logistic model for the
+    propensity scores.
 
     Here, ``theta`` corresponds to multiple quantities. The *first* set of values correspond to the parameters of the
     marginal structural model, and the *second* set correspond to the logistic regression model coefficients for the
     propensity scores.
 
-    Note
-    ----
-    All provided estimating equations are meant to be wrapped inside a user-specified function. Throughtout, these
-    user-defined functions are defined as ``psi``.
-
     Parameters
     ----------
     theta : ndarray, list, vector
-        Theta consists of 3+b values.
+        Theta consists of `c`+`b` values.
     y : ndarray, list, vector
-        1-dimensional vector of n observed values.
+        1-dimensional vector of `n` observed values.
     A : ndarray, list, vector
-        1-dimensional vector of n observed values. The A values should all be 0 or 1.
+        1-dimensional vector of `n` observed values. The A values should all be 0 or 1.
     W : ndarray, list, vector
-        2-dimensional vector of n observed values for b variables to model the probability of ``A`` with.
+        2-dimensional vector of `n` observed values for `b` variables to model the probability of ``A`` with.
V : ndarray, list, vector
-        2-dimensional vector of n observed values for c variables in the marginal structural model.
+        2-dimensional vector of `n` observed values for `c` variables in the marginal structural model.
     distribution : str
        Distribution for the generalized linear model. See ``ee_glm`` for options.
     link : str
@@ -459,7 +434,7 @@ def ee_ipw_msm(theta, y, A, W, V, distribution, link, hyperparameter=None, trunc
     Returns
     -------
     array :
-        Returns a (3+b)-by-n NumPy array evaluated for the input ``theta``
+        Returns a (`c`+`b`)-by-`n` NumPy array evaluated for the input ``theta``.
 
     Examples
     --------
@@ -558,26 +533,9 @@ def ee_ipw_msm(theta, y, A, W, V, distribution, link, hyperparameter=None, trunc
 
 def ee_aipw(theta, y, A, W, X, X1, X0, truncate=None, force_continuous=False):
     r"""Estimating equation for augmented inverse probability weighting (AIPW) estimator. AIPW consists of two nuisance
-    models (the propensity score model and the outcome model). For estimation of the propensity scores, the estimating
-    equations are
+    models (the propensity score model and the outcome model).
 
-    .. math::
-
-        \sum_{i=1}^n \left\{ A_i - \text{expit}(W_i^T \alpha) \right\} W_i = 0
-
-    where ``A`` is the treatment and ``W`` is the set of confounders. The estimating equations for the outcome model
-    are
-
-    .. math::
-
-        \sum_{i=1}^n \left\{ Y_i - g(X_i^T \beta) \right\} X_i = 0
-
-    By default, `ee_aipw` detects whether `y` is all binary (zero or one), and applies logistic regression. Notice that
-    ``X`` here should consists of both ``A`` and ``W`` (with possible interaction terms or other differences in
-    functional forms from the propensity score model).
-
-    The AIPW estimating equations include the causal mean difference, mean had everyone been set to ``A=1``, and the
-    mean had everyone been set to ``A=0``
+    The stacked estimating equations are
 
     .. math::
 
        \sum_{i=1}^n
        \begin{bmatrix}
            (\theta_1 - \theta_2) - \theta_0 \\
            \frac{A_i Y_i}{\pi_i} - \frac{\hat{Y^1}(A_i-\pi_i)}{\pi_i} - \theta_1 \\
-            \frac{(1-A_i) Y_i}{1-\pi_i} + \frac{\hat{Y^0}(A_i-\pi_i}{1-\pi_i} - \theta_2
+            \frac{(1-A_i) Y_i}{1-\pi_i} + \frac{\hat{Y^0}(A_i-\pi_i)}{1-\pi_i} - \theta_2 \\
+            \left\{ A_i - \text{expit}(W_i^T \alpha) \right\} W_i \\
+            \left\{ Y_i - g(X_i^T \beta) \right\} X_i
        \end{bmatrix}
        = 0
 
-    where :math:`\hat{Y}^a = g({X_i^*}^T \beta)`.
-
-    Note
-    ----
-    Unlike ``ee_gformula``, ``ee_aipw`` always provides the average causal effect, and causal means for ``A=1`` and
-    ``A=0``.
-
+    where :math:`A` is the action and :math:`W` is the set of confounders, :math:`Y` is the outcome, and
+    :math:`\pi_i = \text{expit}(W_i^T \alpha)`. The first estimating equation is for the average causal effect, the
+    second is for the mean under :math:`A:=1`, the third is for the mean under :math:`A:=0`, the fourth is the logistic
+    regression model for the propensity scores, and the last is for the outcome model. Here, the length of the theta
+    vector is 3+`b`+`c`, where `b` is the number of parameters in the propensity score model and `c` is the number
+    of parameters in the outcome model.
 
-    Due to these 3 extra values and two nuisance models, the length of the parameter vector is 3+b+c, where b is the
-    number of columns in ``W``, and c is the number of columns in ``X``.
The *first* value in theta vector is the
-    causal mean difference (or average causal effect), the *second* is the mean had everyone been given ``A=1``, the
-    *third* is the mean had everyone been given ``A=0``. The remainder of the parameters correspond to the regression
-    model coefficients, in the order input. The first 'chunk' of coefficients correspond to the propensity score model
-    and the last 'chunk' correspond to the outcome model.
-
-    Note
-    ----
-    All provided estimating equations are meant to be wrapped inside a user-specified function. Throughtout, these
-    user-defined functions are defined as ``psi``.
+    By default, `ee_aipw` detects whether `y` is all binary (zero or one), and applies logistic regression. Notice that
+    ``X`` here should consist of both ``A`` and ``W`` (with possible interaction terms or other differences in
+    functional forms from the propensity score model).
 
     Parameters
     ----------
     theta : ndarray, list, vector
-        Theta consists of 3+b+c values.
+        Theta consists of 3+`b`+`c` values.
     y : ndarray, list, vector
-        1-dimensional vector of n observed values.
+        1-dimensional vector of `n` observed values.
     A : ndarray, list, vector
-        1-dimensional vector of n observed values. The A values should all be 0 or 1.
+        1-dimensional vector of `n` observed values. The A values should all be 0 or 1.
     W : ndarray, list, vector
-        2-dimensional vector of n observed values for b variables to model the probability of ``A`` with.
+        2-dimensional vector of `n` observed values for `b` variables to model the probability of ``A`` with.
     X : ndarray, list, vector
-        2-dimensional vector of n observed values for c variables to model the outcome ``y``.
+        2-dimensional vector of `n` observed values for `c` variables to model the outcome ``y``.
     X1 : ndarray, list, vector
-        2-dimensional vector of n observed values for b variables under the action plan where ``A=1`` for all units.
+        2-dimensional vector of `n` observed values for `c` variables under the action plan where ``A=1`` for all units.
     X0 : ndarray, list, vector, None, optional
-        2-dimensional vector of n observed values for b variables under the action plan where ``A=0`` for all units.
+        2-dimensional vector of `n` observed values for `c` variables under the action plan where ``A=0`` for all units.
     truncate : None, list, set, ndarray, optional
        Bounds to truncate the estimated probabilities of ``A`` at. For example, estimated probabilities above 0.99 or
        below 0.01 can be set to 0.99 or 0.01, respectively. This is done by specifying ``truncate=(0.01, 0.99)``. Note
@@ -636,7 +587,7 @@ def ee_aipw(theta, y, A, W, X, X1, X0, truncate=None, force_continuous=False):
     Returns
     -------
     array :
-        Returns a (3+b+c)-by-n NumPy array evaluated for the input ``theta``
+        Returns a (3+`b`+`c`)-by-`n` NumPy array evaluated for the input ``theta``
 
     Examples
     --------
@@ -787,15 +738,15 @@ def ee_aipw(theta, y, A, W, X, X1, X0, truncate=None, force_continuous=False):
 
        E[Y^a - Y^{0} | A=a, V] = \beta_1 a + \beta_2 a V
 
-    This model corresponds to the average causal effect among those with :math:`A=a` within strata of :math:`V`. The
+    This model corresponds to the average causal effect among those with :math:`A=a` by :math:`V`. The
     log-linear SMM is defined as
 
     .. math::
 
        \frac{E[Y^a | A=a, V]}{E[Y^{0} | A=a, V]} = \exp(\beta_1 a + \beta_2 a V)
 
-    This model corresponds to the causal mean ratio among those with :math:`A=a` within strata of :math:`V`. Note that
-    the log-linear SMM is only defined when :math:`Y > 0`.
The parameters of either SMM can be identified under the
+    This model corresponds to the causal mean ratio among those with :math:`A=a` by :math:`V`. Note that
+    the log-linear SMM is only defined when :math:`Y > 0`. The parameters of either SMM are identified under the
     assumptions of causal consistency, and exchangeability with positivity.
 
     Two different estimating equations are available for g-estimation. The first set is referred to as the 'inefficient'
 
     .. math::
 
-        \sum_{i=1}^n \left\{ H(\beta) \times (A - E[A | W]) \right\} \times \mathbb{V}_i = 0
+        \sum_{i=1}^n
+        \begin{bmatrix}
+            \left\{ H(\beta) \times (A - \pi_i) \right\} \times V_i \\
+            \left\{ A_i - \text{expit}(W_i^T \alpha) \right\} W_i
+        \end{bmatrix}
+        = 0
 
-    where :math:`H(\beta) = Y - \beta A \mathbb{V}` for a linear SMM and
-    :math:`H(\beta) = Y \times \exp(-A \beta \mathbb{V})` for a log-linear SMM, where :math:`\mathbb{V}` is a design
-    matrix. Note that :math:`V \subseteq W`, where :math:`W` is the set of confounding variables. This estimating
-    equation requires :math:`E[A|W]`, which must be estimated. This is done via the following estimating equation for
-    binary actions
+    where :math:`\pi_i = \text{expit}(W_i^T \alpha)`, and
+    :math:`H(\beta) = Y - \beta A \mathbb{V}` for a linear SMM and
+    :math:`H(\beta) = Y \times \exp(-A \beta \mathbb{V})` for a log-linear SMM, where :math:`\mathbb{V}` is a design
+    matrix. Note that :math:`V \subseteq W`, where :math:`W` is the set of confounding variables.
+    The length of the parameter vector is `b`+`c`, where `b` is the number of columns in ``V``, and
+    `c` is the number of columns in ``W``.
 
-    .. math::
-
-        \sum_{i=1}^n \left\{ A_i - \text{expit}(W_i^T \alpha) \right\} W_i = 0
+    The second implementation for g-estimation is the 'efficient' g-estimator. For the efficient g-estimator we replace
+    :math:`H(\beta)` with :math:`\{H(\beta) - E[H(\beta) | W]\}` in the prior estimating equation and specify a model
+    for :math:`E[H(\beta) | W]`. The corresponding stacked estimating equations are
 
-    These estimating equations are stacked together. Therefore, the length of the parameter vector is b+c, where b is
-    the number of columns in ``V``, and c is the number of columns in ``W``. The *first* b values in theta
-    vector are the SMM parameters. The *second* set are the parameters corresponding to the :math:`E[A|W]` model.
+    .. math::
 
+        \sum_{i=1}^n
+        \begin{bmatrix}
+            \left\{ (H(\beta) - g^{-1}(W_i^T \gamma)) \times (A - \pi_i) \right\} \times V_i \\
+            \left\{ A_i - \text{expit}(W_i^T \alpha) \right\} W_i \\
+            \left\{ H(\beta) - g^{-1}(W_i^T \gamma) \right\} W_i
+        \end{bmatrix}
+        = 0
 
-    The second implementation for g-estimation is the 'efficient' g-estimator. For the efficient g-estimator we replace
-    :math:`H(\beta)` with :math:`\{H(\beta) - E[H(\beta) | W]\}` in the prior estimating equation. Here, we also need to
-    specify a model for :math:`E[H(\beta) | W]`. Therefore, an additional estimating equation for
-    :math:`E[H(\beta) | W]` is stacked with the others. Therefore, there are b+c+d parameters for the efficient
-    g-estimator, where d is the number of parameters in the model for :math:`E[H(\beta) | W]`.
+    where :math:`g^{-1}` is the inverse transformation for the specified SMM. Therefore, there are `b`+`c`+`d`
+    parameters for the efficient g-estimator, where `d` is the number of parameters in the model for
+    :math:`E[H(\beta) | W]`.
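+
+    As a sketch of how the inefficient g-estimator is specified (``d`` here is a hypothetical data set with an
+    intercept column ``C``, action ``A``, confounder ``W``, and outcome ``Y``):
+
+    >>> def psi(theta):
+    >>>     return ee_gestimation_snmm(theta,
+    >>>                                y=d['Y'], A=d['A'],
+    >>>                                W=d[['C', 'W']], V=d[['C', ]])
+
+    >>> estr = MEstimator(stacked_equations=psi, init=[0., ]*3)
+    >>> estr.estimate(solver='lm')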
     Parameters
     ----------
     theta : ndarray, list, vector
-        Theta consists of 1+b values if ``X0`` is ``None``, and 3+b values if ``X0`` is not ``None``.
+        Theta consists of `b`+`c` values if ``X`` is ``None`` (inefficient g-estimator), and `b`+`c`+`d` values if
+        ``X`` is not ``None`` (efficient g-estimator).
     y : ndarray, list, vector
-        1-dimensional vector of n observed values of the outcome.
+        1-dimensional vector of `n` observed values of the outcome.
     A : ndarray, list, vector
-        1-dimensional vector of n observed values of the action. The A values should all be 0 or 1.
+        1-dimensional vector of `n` observed values of the action. The A values should all be 0 or 1.
     W : ndarray, list, vector
-        2-dimensional vector of n observed values for b columns of a design matrix to model the expected value of ``A``.
+        2-dimensional vector of `n` observed values for `c` columns of a design matrix to model the expected value of
+        ``A``.
     V : ndarray, list, vector
-        2-dimensional vector of n observed values for b columns of a design matrix for the structural mean model. Note
-        that the design matrix here is expected to not include the observed values of ``A``
+        2-dimensional vector of `n` observed values for `b` columns of a design matrix for the structural mean model.
+        Note that the design matrix here is expected to not include the observed values of ``A``
     X : ndarray, list, vector, None, optional
        Default of this argument is ``None``, which implements the estimating equation for the inefficient g-estimator.
-        To use the efficient g-estimator, a 2-dimensional vector of n observed values for b columns of a design matrix
+        To use the efficient g-estimator, a 2-dimensional vector of `n` observed values for `d` columns of a design
+        matrix
        for the :math:`E[H(\beta) | W]` model should be provided here.
     model : str, optional
        Type of structural mean model to fit. Options are currently: ``linear``, ``poisson``. Default is ``linear``.
        The Poisson model specification can be used for positive continuous data, or with binary data in order to
        estimate causal risk ratios.
     weights : ndarray, list, vector, None, optional
-        1-dimensional vector of n weights. Default is None, which assigns a weight of 1 to all observations. This
+        1-dimensional vector of `n` weights. Default is ``None``, which assigns a weight of 1 to all observations. This
        argument is intended to support the use of sampling or missingness weights.
 
     Returns
     -------
     array :
-        Returns a (b+c)-by-n (inefficient) or (b+c+d)-by-n (efficient) NumPy array evaluated for the input ``theta``
+        Returns a (`b`+`c`)-by-`n` (inefficient) or (`b`+`c`+`d`)-by-`n` (efficient) NumPy array evaluated for the
+        input ``theta``.
 
     Examples
     --------
@@ -1040,34 +1002,35 @@ def ee_mean_sensitivity_analysis(theta, y, delta, X, q_eval, H_function):
     probability weighting estimator will result in different (but similar) estimates.
 
-    The length of the parameter vector, :math:`\theta`, is 1+b, where b is the number of columns in ``X``. The *first*
-    value in the theta vector is the corrected mean of :math:`Y`. The remainder of the parameters correspond to the
-    regression model coefficients.
+    The length of the parameter vector, :math:`\theta`, is 1+`b`, where `b` is the number of columns in ``X``.
+    The *first* value in the theta vector is the corrected mean of :math:`Y`. The remainder of the parameters
+    correspond to the regression model coefficients.
 
     Parameters
     ----------
     theta : ndarray, list, vector
-        Theta in this case consists of 1+b values.
Therefore, initial values should consist of one plus the number of
+        Theta in this case consists of 1+`b` values. Therefore, initial values should consist of one plus the number of
        columns present in ``X``. This can easily be accomplished generally by ``[0, ] + [0, ] * X.shape[1]``.
     y : ndarray, list, vector
-        1-dimensional vector of n values. Any values of ``y`` that are missing should be indicated by the ``delta``
+        1-dimensional vector of `n` values. Any values of ``y`` that are missing should be indicated by the ``delta``
        parameter.
     delta : ndarray, list, vector
-        1-dimensional vector of n observed values indicating whether the observation has a value for ``y`` observed,
+        1-dimensional vector of `n` observed values indicating whether the observation has a value for ``y`` observed,
        where 1 indicates yes and 0 indicates no. This vector should not include any ``nan`` values.
     X : ndarray, list, vector
-        2-dimensional vector of n observed values for b variables consider as predictors. At a minimum, a vector of ones
-        (intercept) should be included. This matrix should not include any ``nan`` values.
+        2-dimensional vector of `n` observed values for `b` variables considered as predictors. At a minimum, a vector
+        of ones (intercept) should be included. This matrix cannot include any ``nan`` values.
     q_eval : ndarray, list, vector
-        1-dimensional vector of n values evaluated using the :math:`q(Y; \alpha)` function.
+        1-dimensional vector of `n` values evaluated using the :math:`q(Y; \alpha)` function.
     H_function : callable
-        Function use to bound the observations between 0,1. The function must be monotonic increasing and be bounded by
-        :math:`[0,1]`. For example, the expit (``delicatessen.utilities.inverse_logit``) function meets this criteria.
+        Function used to bound the observations between :math:`[0,1]`. The function must be monotone increasing and be
+        bounded by :math:`[0,1]`. For example, the expit (``delicatessen.utilities.inverse_logit``) function meets
+        this criterion.
 
     Returns
     -------
     array :
-        Returns a (1+b)-by-n NumPy array evaluated for the input ``theta``
+        Returns a (1+`b`)-by-`n` NumPy array evaluated for the input ``theta``.
 
     Examples
     --------
diff --git a/delicatessen/estimating_equations/dose_response.py b/delicatessen/estimating_equations/dose_response.py
index 405efa5..7dc2839 100644
--- a/delicatessen/estimating_equations/dose_response.py
+++ b/delicatessen/estimating_equations/dose_response.py
@@ -26,28 +26,24 @@ def ee_4p_logistic(theta, X, y):
     :math:`\rho = \frac{D_i}{\theta_1}^{\theta_2}`, and :math:`\hat{Y_i} = \theta_0 + \frac{\theta_3 - \theta_0}{1+\rho}`.
 
-    Here, theta is a 1-by-4 array, where 4 are the 4 parameters of the 4PL. The first theta corresponds to lower limit
+    Here, theta is a 1-by-4 array. The first theta corresponds to the lower limit
     (:math:`\theta_0`), the second corresponds to the effective dose (ED50) (:math:`\theta_1`), the third corresponds to
     the steepness of the curve (:math:`\theta_2`), and the fourth corresponds to the upper limit (:math:`\theta_3`).
 
-    Note
-    ----
-    All provided estimating equations are meant to be wrapped inside a user-specified function. Throughtout, these
-    user-defined functions are defined as ``psi``.
-
     Parameters
     ----------
     theta : ndarray, list, vector
-        Theta in this case consists of 4 values. In general, starting values ``>0`` are better choices for the 4PL model
+        Theta in this case consists of 4 values.
In general, starting values :math:`>0` are better choices for the + 4PL model X : ndarray, list, vector - 1-dimensional vector of n dose values. + 1-dimensional vector of `n` dose values. y : ndarray, list, vector - 1-dimensional vector of n response values. + 1-dimensional vector of `n` response values. Returns ------- array : - Returns a 4-by-n NumPy array evaluated for the input ``theta`` + Returns a 4-by-`n` NumPy array evaluated for the input ``theta``. Examples -------- @@ -162,26 +158,22 @@ def ee_3p_logistic(theta, X, y, lower): corresponds to the upper limit (:math:`\theta_3`). The lower limit (:math:`\theta_0`, ``lower``) is pre-specified by the user (and is no longer estimated) - Note - ---- - All provided estimating equations are meant to be wrapped inside a user-specified function. Throughtout, these - user-defined functions are defined as ``psi``. - Parameters ---------- theta : ndarray, list, vector - Theta in this case consists of 3 values. In general, starting values ``>0`` are better choices for the 3PL model + Theta in this case consists of 3 values. In general, starting values :math:`>0` are better choices for the + 3PL model X : ndarray, list, vector - 1-dimensional vector of n dose values. + 1-dimensional vector of `n` dose values. y : ndarray, list, vector - 1-dimensional vector of n response values. + 1-dimensional vector of `n` response values. lower : int, float Set value for the lower limit. Returns ------- array : - Returns a 3-by-n NumPy array evaluated for the input theta, y, X + Returns a 3-by-`n` NumPy array evaluated for the input ``theta``. Examples -------- @@ -260,19 +252,15 @@ def ee_2p_logistic(theta, X, y, lower, upper): (:math:`\theta_0`, ``lower``) and upper limit (:math:`\theta_3`, ``upper``) are pre-specified by the user (and are no longer estimated) - Note - ---- - All provided estimating equations are meant to be wrapped inside a user-specified function. Throughtout, these - user-defined functions are defined as ``psi``. - Parameters ---------- theta : ndarray, list, vector - Theta in this case consists of 2 values. In general, starting values >0 are better choices for the 3PL model + Theta in this case consists of 2 values. In general, starting values :math:`>0` are better choices for the + 2PL model X : ndarray, list, vector - 1-dimensional vector of n dose values. + 1-dimensional vector of `n` dose values. y : ndarray, list, vector - 1-dimensional vector of n response values. + 1-dimensional vector of `n` response values. lower : int, float Set value for the lower limit. upper : int, float @@ -281,7 +269,7 @@ def ee_2p_logistic(theta, X, y, lower, upper): Returns ------- array : - Returns a 2-by-n NumPy array evaluated for the input theta, y, X + Returns a 2-by-`n` NumPy array evaluated for the input ``theta``. Examples -------- @@ -356,7 +344,7 @@ def ee_effective_dose_delta(theta, y, delta, steepness, ed50, lower, upper): theta : int, float Theta value corresponding to the ED(alpha). y : ndarray, list, vector - 1-dimensional vector of n response values, used to construct correct shape for output. + 1-dimensional vector of `n` response values, used to construct correct shape for output. delta : float The effective dose level of interest, ED(alpha). 
steepness : float @@ -373,7 +361,7 @@ def ee_effective_dose_delta(theta, y, delta, steepness, ed50, lower, upper): Returns ------- array : - Returns a 1-by-n NumPy array evaluated for the input theta + Returns a 1-by-`n` NumPy array evaluated for the input theta Examples -------- diff --git a/delicatessen/estimating_equations/measurement.py b/delicatessen/estimating_equations/measurement.py index 51618da..2bf0610 100644 --- a/delicatessen/estimating_equations/measurement.py +++ b/delicatessen/estimating_equations/measurement.py @@ -43,21 +43,21 @@ def ee_rogan_gladen(theta, y, y_star, r, weights=None): theta : ndarray, list, vector Theta consists of 4 values. y : ndarray, list, vector - 1-dimensional vector of n observed values. These are the gold-standard :math:`Y` measurements in the external + 1-dimensional vector of `n` observed values. These are the gold-standard :math:`Y` measurements in the external sample. All values should be either 0 or 1, and be non-missing among those with :math:`R=0`. y_star : ndarray, list, vector - 1-dimensional vector of n observed values. These are the mismeasured :math:`Y` values. All values should be + 1-dimensional vector of `n` observed values. These are the mismeasured :math:`Y` values. All values should be either 0 or 1, and be non-missing among all observations. r : ndarray, list, vector - 1-dimensional vector of n indicators regarding whether an observation was part of the external validation data. - Indicator should designate if observations are the main data. + 1-dimensional vector of `n` indicators regarding whether an observation was part of the external validation + data. Indicator should designate if observations are the main data. weights : ndarray, list, vector, None, optional - 1-dimensional vector of n weights. Default is ``None``, which assigns a weight of 1 to all observations. + 1-dimensional vector of `n` weights. Default is ``None``, which assigns a weight of 1 to all observations. Returns ------- array : - Returns a 4-by-n NumPy array evaluated for the input ``theta`` + Returns a 4-by-`n` NumPy array evaluated for the input ``theta`` Examples -------- @@ -164,7 +164,7 @@ def ee_rogan_gladen_extended(theta, y, y_star, r, X, weights=None): where :math:`Y` is the true value of the outcome, :math:`Y^*` is the mismeasured value of the outcome. The first estimating equation is the corrected proportion, the second is for sensitivity, and the third for specificity. - If :math:`X` is of dimension :math:`p`, then ``theta`` is a 1-by-(1+2p) array. Note that the design matrix is + If :math:`X` is of dimension :math:`p`, then ``theta`` is a 1-by-(1+2`p`) array. Note that the design matrix is shared across the sensitivity and specificity models. Note @@ -177,23 +177,23 @@ def ee_rogan_gladen_extended(theta, y, y_star, r, X, weights=None): theta : ndarray, list, vector Theta consists of 4 values. y : ndarray, list, vector - 1-dimensional vector of n observed values. These are the gold-standard :math:`Y` measurements in the external + 1-dimensional vector of `n` observed values. These are the gold-standard :math:`Y` measurements in the external sample. All values should be either 0 or 1, and be non-missing among those with :math:`R=0`. y_star : ndarray, list, vector - 1-dimensional vector of n observed values. These are the mismeasured :math:`Y` values. All values should be + 1-dimensional vector of `n` observed values. These are the mismeasured :math:`Y` values. All values should be either 0 or 1, and be non-missing among all observations. 
r : ndarray, list, vector - 1-dimensional vector of n indicators regarding whether an observation was part of the external validation data. - Indicator should designate if observations are the main data. + 1-dimensional vector of `n` indicators regarding whether an observation was part of the external validation + data. Indicator should designate if observations are the main data. X : ndarray, list, vector 2-dimensional vector of a design matrix for the sensitivity and specificity models. weights : ndarray, list, vector, None, optional - 1-dimensional vector of n weights. Default is ``None``, which assigns a weight of 1 to all observations. + 1-dimensional vector of `n` weights. Default is ``None``, which assigns a weight of 1 to all observations. Returns ------- array : - Returns a 4-by-n NumPy array evaluated for the input ``theta`` + Returns a 4-by-`n` NumPy array evaluated for the input ``theta`` Examples -------- diff --git a/delicatessen/estimating_equations/regression.py b/delicatessen/estimating_equations/regression.py index 11a4e05..61969ef 100644 --- a/delicatessen/estimating_equations/regression.py +++ b/delicatessen/estimating_equations/regression.py @@ -27,39 +27,31 @@ def ee_regression(theta, X, y, model, weights=None, offset=None): Logistic regression uses the inverse-logit function, :math:`\text{expit}(u) = 1 / (1 + \exp(u))`. Finally, Poisson regression is :math:`\exp(u)`. - Here, :math:`\theta` is a 1-by-b array, where b is the distinct covariates included as part of X. For example, if - X is a 3-by-n matrix, then :math:`\theta` will be a 1-by-3 array. The code is general to allow for an arbitrary - number of X's (as long as there is enough support in the data). - - Note - ---- - All provided estimating equations are meant to be wrapped inside a user-specified function. Throughout, these - user-defined functions are defined as ``psi``. - - - Here, :math:`\theta` corresponds to the coefficients in the corresponding regression model + Here, :math:`\theta` is a 1-by-`b` array, which corresponds to the coefficients in the corresponding regression + model and `b` is the distinct covariates included as part of ``X``. For example, if ``X`` is a 3-by-`n` matrix, then + :math:`\theta` will be a 1-by-3 array. The code is general to allow for an arbitrary number of elements in ``X``. Parameters ---------- theta : ndarray, list, vector - Theta in this case consists of b values. Therefore, initial values should consist of the same number as the + Theta in this case consists of `b` values. Therefore, initial values should consist of the same number as the number of columns present. This can easily be implemented by ``[0, ] * X.shape[1]``. X : ndarray, list, vector - 2-dimensional vector of n observed values for b variables. + 2-dimensional vector of `n` observed values for `b` variables. y : ndarray, list, vector - 1-dimensional vector of n observed values. + 1-dimensional vector of `n` observed values. model : str Type of regression model to estimate. Options are ``'linear'`` (linear regression), ``'logistic'`` (logistic regression), and ``'poisson'`` (Poisson regression). weights : ndarray, list, vector, None, optional - 1-dimensional vector of n weights. Default is None, which assigns a weight of 1 to all observations. + 1-dimensional vector of `n` weights. Default is ``None``, which assigns a weight of 1 to all observations. offset : ndarray, list, vector, None, optional - A 1-dimensional offset to be included in the model. Default is None, which applies no offset term. 
+ A 1-dimensional offset to be included in the model. Default is ``None``, which applies no offset term. Returns ------- array : - Returns a b-by-n NumPy array evaluated for the input ``theta`` + Returns a `b`-by-`n` NumPy array evaluated for the input ``theta``. Examples -------- @@ -145,7 +137,7 @@ def ee_glm(theta, X, y, distribution, link, hyperparameter=None, weights=None, o .. math:: - \sum_{i=1}^n W_i \left\{ Y_i - g^{-1}(X_i^T \theta) \times \frac{D(\theta)}{v(\theta)} \right\} X_i = 0 + \sum_{i=1}^n \left\{ Y_i - g^{-1}(X_i^T \theta) \right\} \times \frac{D(\theta)}{v(\theta)} X_i = 0 where :math:`g` is the link function, :math:`g^{-1}` is the inverse link function, :math:`D(\theta)` is the derivative of the inverse link function by :math:`\theta`, and :math:`v(\theta)` is the variance function for the @@ -157,27 +149,19 @@ def ee_glm(theta, X, y, distribution, link, hyperparameter=None, weights=None, o additional parameter-specific estimating equations. - Here, :math:`\theta` is a 1-by-b array, where b is the distinct covariates included as part of X. For example, if - X is a 3-by-n matrix, then :math:`\theta` will be a 1-by-3 array. The code is general to allow for an arbitrary - number of X's (as long as there is enough support in the data). - - Note - ---- - All provided estimating equations are meant to be wrapped inside a user-specified function. Throughout, these - user-defined functions are defined as ``psi``. - - - Here, :math:`\theta` corresponds to the coefficients in the corresponding regression model + Here, :math:`\theta` is a 1-by-`b` array, which corresponds to the coefficients in the corresponding regression + model and `b` is the distinct covariates included as part of ``X``. For example, if ``X`` is a 3-by-`n` matrix, then + :math:`\theta` will be a 1-by-3 array. The code is general to allow for an arbitrary number of elements in ``X``. Parameters ---------- theta : ndarray, list, vector - Theta in this case consists of b values. Therefore, initial values should consist of the same number as the + Theta in this case consists of `b` values. Therefore, initial values should consist of the same number as the number of columns present. This can easily be implemented by ``[0, ] * X.shape[1]``. X : ndarray, list, vector - 2-dimensional vector of n observed values for b variables. + 2-dimensional vector of `n` observed values for `b` variables. y : ndarray, list, vector - 1-dimensional vector of n observed values. + 1-dimensional vector of `n` observed values. distribution : str Distribution for the generalized linear model. Options are: ``'normal'`` (alias: ``gaussian``), @@ -199,22 +183,22 @@ def ee_glm(theta, X, y, distribution, link, hyperparameter=None, weights=None, o ``inverse``, and ``square_root`` (alias: ``sqrt``). hyperparameter : None, int, float - Hyperparameter specification. Default is None. This option is only used by the tweedie distribution. It is + Hyperparameter specification. Default is ``None``. This option is only used by the tweedie distribution. It is ignored by all other distributions. weights : ndarray, list, vector, None, optional - 1-dimensional vector of n weights. Default is None, which assigns a weight of 1 to all observations. + 1-dimensional vector of `n` weights. Default is ``None``, which assigns a weight of 1 to all observations. offset : ndarray, list, vector, None, optional - A 1-dimensional offset to be included in the model. Default is None, which applies no offset term. + A 1-dimensional offset to be included in the model. 
Default is ``None``, which applies no offset term. Note ---- Link and distribution combinations are not checked for their validity. Some pairings may not converge or may - produce nonsensical results. Please check the combination you are using is valid. + produce nonsensical results. Please check the distribution-link combination you are using is valid. Returns ------- array : - Returns a b-by-n NumPy array evaluated for the input ``theta`` + Returns a `b`-by-`n` NumPy array evaluated for the input ``theta``. Examples -------- @@ -285,19 +269,19 @@ def ee_glm(theta, X, y, distribution, link, hyperparameter=None, weights=None, o >>> estr = MEstimator(stacked_equations=psi, init=[0., 0., 0., 0.]) >>> estr.estimate() - Note that ``delicatessen`` appropriately incorporates the estimation of the additional parameter for the + Note that delicatessen appropriately incorporates the estimation of the additional parameter for the negative-binomial and gamma distributions. This is unlike some statistical software that estimates this parameter but does *not* incorporate the uncertainty in estimation of that parameter. This may explain differences you - encounter across software (and the ``delicatessen`` implementation is preferred, as it is a more honest expression - of the uncertainty). + encounter across software (and the delicatessen implementation is to be preferred, as it is a more honest + expression of the uncertainty). Finally, the tweedie distribution for GLM is a generalization of the Poisson and gamma distributions. Unlike the - negative-binomial and gamma distributions, there is a fixed (i.e., not estimated) hyperparameter bounded to be >0. - When the tweedie distribution hyperparameter is set to 1, it is equivalent to the Poisson distribution. When the - tweedie distribution hyperparameter is set to 2, it is equivalent to the gamma distribution. When the tweedie - distribution hyperparameter is set to 3, it is equivalent to the inverse-normal distribution. However, the tweedie - distribution hyperparameter can be specified for any values. Here, we illustrate the tweedie distribution that is - between a Poisson and gamma distribution. + negative-binomial and gamma distributions, there is a fixed (i.e., not estimated) hyperparameter bounded to be + :math:`>0`. When the tweedie distribution hyperparameter is set to 1, it is equivalent to the Poisson distribution. + When the tweedie distribution hyperparameter is set to 2, it is equivalent to the gamma distribution. When the + tweedie distribution hyperparameter is set to 3, it is equivalent to the inverse-normal distribution. However, the + tweedie distribution hyperparameter can be specified for any values. Here, we illustrate the tweedie distribution + that is between a Poisson and gamma distribution. >>> def psi(theta): >>> return ee_glm(theta, X=X, y=d['Y1'], @@ -364,15 +348,11 @@ def ee_mlogit(theta, X, y, weights=None, offset=None): r"""Estimating equation for multinomial logistic regression. This estimating equation functionality supports unranked categorical outcome data, unlike ``ee_regression`` and ``ee_glm``. - Note - ---- Unlike the other regression estimating equations, ``ee_mlogit`` expects a matrix of indicators for each possible value of ``y``, with the first column being used as the referent category. In other words, the outcome variable is a matrix of dummy variables that includes the reference. 
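+    As a sketch of building that indicator matrix (``d['Y']`` here is a hypothetical categorical outcome column),
+    ``pandas`` can expand the outcome into the expected format:
+
+    >>> import pandas as pd
+    >>> y_ind = pd.get_dummies(d['Y'])    # one indicator column per category; first column is the referent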
-
-
-    The estimating equation for column :math:`r` of the indicator variable :math:`Y_{r}`
-    of a :math:`Y` with :math:`k` unique categories is
+    The estimating equation for column :math:`r` of the indicator variable :math:`Y_{r}` of a :math:`Y` with :math:`k`
+    unique categories is
 
     .. math::
 
        \sum_{i=1}^n \left\{ Y_{r,i} - \frac{\exp(X_i^T \theta_r)}{1 + \sum_{j=2}^{k} \exp(X_i^T \theta_j)} \right\}
        X_i = 0
 
     where :math:`\theta_r` are the coefficients corresponding to the log odds ratio comparing :math:`Y_r` to all other
-    categories of :math:`Y`. Here, :math:`\theta` is a 1-by-(b :math:`\times` (k-1)) array, where b is the distinct
-    covariates included as part of X. So, the stack of estimating equations consists of :math:`(k-1)` estimating
-    equations of the dimension :math:`X_i`. For example, if X is a 3-by-n matrix and :math:`Y` has three unique
+    categories of :math:`Y`. Here, :math:`\theta` is a 1-by-(`b` :math:`\times` (`k`-1)) array, where `b` is the
+    distinct covariates included as part of ``X``. So, the stack of estimating equations consists of (`k`-1) estimating
+    equations of the dimension :math:`X_i`. For example, if X is a 3-by-`n` matrix and :math:`Y` has three unique
     categories, then :math:`\theta` will be a 1-by-6 array.
 
-    Note
-    ----
-    All provided estimating equations are meant to be wrapped inside a user-specified function. Throughout, these
-    user-defined functions are defined as ``psi``.
-
     Parameters
     ----------
     theta : ndarray, list, vector
-        Theta in this case consists of b :math:`\times` (k-1) values. Therefore, initial values should consist of the
-        same number as the number of columns present in the design matrix for each category of the outcome matrix
+        Theta in this case consists of `b` :math:`\times` (`k`-1) values. Therefore, initial values should consist of
+        the same number as the number of columns present in the design matrix for each category of the outcome matrix
        besides the reference.
     X : ndarray, list, vector
-        2-dimensional design matrix of n observed covariates for b variables.
+        2-dimensional design matrix of `n` observed covariates for `b` variables.
     y : ndarray, list, vector
-        2-dimensional indicator matrix of n observed outcomes.
+        2-dimensional indicator matrix of `n` observed outcomes.
     weights : ndarray, list, vector, None, optional
-        1-dimensional vector of n weights. Default is None, which assigns a weight of 1 to all observations.
+        1-dimensional vector of `n` weights. Default is ``None``, which assigns a weight of 1 to all observations.
     offset : ndarray, list, vector, None, optional
-        A 1-dimensional offset to be included in the model. Default is None, which applies no offset term.
+        A 1-dimensional offset to be included in the model. Default is ``None``, which applies no offset term.
 
     Returns
     -------
     array :
-        Returns a (b*(k-1))-by-n NumPy array evaluated for the input ``theta``
+        Returns a (`b` :math:`\times` (`k`-1))-by-`n` NumPy array evaluated for the input ``theta``.
 
     Examples
     --------
@@ -532,46 +507,41 @@ def ee_robust_regression(theta, X, y, model, k, loss='huber', weights=None, uppe
     occurring is zero.
 
-    Here, :math:`\theta` is a 1-by-b array, where b is the distinct covariates included as part of X. For example, if
-    X is a 3-by-n matrix, then :math:`\theta` will be a 1-by-3 array. The code is general to allow for an arbitrary
-    number of X's (as long as there is enough support in the data).
-
-    Note
-    ----
-    All provided estimating equations are meant to be wrapped inside a user-specified function. Throughtout, these
-    user-defined functions are defined as ``psi``.
+    Here, :math:`\theta` is a 1-by-`b` array, which corresponds to the coefficients of the regression model, and `b`
+    is the number of distinct covariates included as part of ``X``. For example, if ``X`` is a 3-by-`n` matrix, then
+    :math:`\theta` will be a 1-by-3 array. The code is general to allow for an arbitrary number of elements in ``X``.

     Parameters
     ----------
     theta : ndarray, list, vector
-        Theta in this case consists of b values. Therefore, initial values should consist of the same number as the
+        Theta in this case consists of `b` values. Therefore, initial values should consist of the same number as the
         number of columns present. This can easily be implemented via ``[0, ] * X.shape[1]``.
     X : ndarray, list, vector
-        2-dimensional vector of n observed values for b variables.
+        2-dimensional vector of `n` observed values for `b` variables.
     y : ndarray, list, vector
-        1-dimensional vector of n observed values.
+        1-dimensional vector of `n` observed values.
     model : str
         Type of regression model to estimate. Options include: ``'linear'`` (linear regression).
     k : int, float
         Tuning or hyperparameter for the chosen loss function. Notice that the choice of hyperparameter should depend
         on the chosen loss function.
     loss : str, optional
-        Robust loss function to use. Default is 'huber'. Options include 'andrew', 'hampel', 'huber', 'tukey'.
+        Robust loss function to use. Default is ``'huber'``. Options include ``'andrew'``, ``'hampel'``, ``'tukey'``.
     weights : ndarray, list, vector, None, optional
-        1-dimensional vector of n weights. Default is None, which assigns a weight of 1 to all observations.
+        1-dimensional vector of `n` weights. Default is ``None``, which assigns a weight of 1 to all observations.
     lower : int, float, None, optional
-        Lower parameter for the 'hampel' loss function. This parameter does not impact the other loss functions.
+        Lower parameter for the Hampel loss function. This parameter does not impact the other loss functions.
         Default is ``None``.
     upper : int, float, None, optional
-        Upper parameter for the 'hampel' loss function. This parameter does not impact the other loss functions.
+        Upper parameter for the Hampel loss function. This parameter does not impact the other loss functions.
         Default is ``None``.
     offset : ndarray, list, vector, None, optional
-        A 1-dimensional offset to be included in the model. Default is None, which applies no offset term.
+        A 1-dimensional offset to be included in the model. Default is ``None``, which applies no offset term.

     Returns
     -------
     array :
-        Returns a b-by-n NumPy array evaluated for the input ``theta``
+        Returns a `b`-by-`n` NumPy array evaluated for the input ``theta``.

     Examples
     --------
@@ -668,9 +638,9 @@ def ee_ridge_regression(theta, X, y, model, penalty, weights=None, center=0., of
     where :math:`\lambda` is the penalty term.

-    Here, :math:`\theta` is a 1-by-b array, where b is the distinct covariates included as part of X. For example, if
-    X is a 3-by-n matrix, then :math:`\theta` will be a 1-by-3 array. The code is general to allow for an arbitrary
-    number of X's (as long as there is enough support in the data).
+    Here, :math:`\theta` is a 1-by-`b` array, which corresponds to the coefficients of the regression model, and `b`
+    is the number of distinct covariates included as part of ``X``. For example, if ``X`` is a 3-by-`n` matrix, then
+    :math:`\theta` will be a 1-by-3 array. The code is general to allow for an arbitrary number of elements in ``X``.
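+
+    As a small sketch of how a vector-valued ``penalty`` can be used (illustrative values only; it assumes ``X`` has
+    three columns with the first being an intercept), the intercept can be left unpenalized:
+
+    >>> def psi(theta):
+    >>>     return ee_ridge_regression(theta, X=X, y=y, model='linear', penalty=[0., 10., 10.])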
     Note
     ----

@@ -680,38 +650,37 @@ def ee_ridge_regression(theta, X, y, model, penalty, weights=None, center=0., of
     Parameters
     ----------
     theta : ndarray, list, vector
-        Theta in this case consists of b values. Therefore, initial values should consist of the same number as the
+        Theta in this case consists of `b` values. Therefore, initial values should consist of the same number as the
         number of columns present. This can easily be implemented via ``[0, ] * X.shape[1]``.
     X : ndarray, list, vector
-        2-dimensional vector of n observed values for b variables.
+        2-dimensional vector of `n` observed values for `b` variables.
     y : ndarray, list, vector
-        1-dimensional vector of n observed values.
+        1-dimensional vector of `n` observed values.
     model : str
         Type of regression model to estimate. Options are ``'linear'`` (linear regression), ``'logistic'`` (logistic
         regression), and ``'poisson'`` (Poisson regression).
     penalty : int, float, ndarray, list, vector
         Penalty term to apply to all coefficients (if only an integer or float is provided) or the corresponding
         coefficient (if a list or vector of integers or floats is provided). Note that the penalty term should either
-        consists of a single value or b values (to match the length of ``theta``). The penalty is scaled by n.
+        consist of a single value or `b` values (to match the length of ``theta``). The penalty is scaled by `n`.
     weights : ndarray, list, vector, None, optional
-        1-dimensional vector of n weights. Default is ``None``, which assigns a weight of 1 to all observations.
+        1-dimensional vector of `n` weights. Default is ``None``, which assigns a weight of 1 to all observations.
     center : int, float, ndarray, list, vector, optional
         Center or reference value to penalize estimated coefficients towards. Default is ``0``, which penalizes
         coefficients towards the null. Other center values can be specified for all coefficients (by providing an
         integer or float) or covariate-specific centering values (by providing a vector of values of the same length
         as X).
     offset : ndarray, list, vector, None, optional
-        A 1-dimensional offset to be included in the model. Default is None, which applies no offset term.
+        A 1-dimensional offset to be included in the model. Default is ``None``, which applies no offset term.

     Returns
     -------
     array :
-        Returns a b-by-n NumPy array evaluated for the input ``theta``
+        Returns a `b`-by-`n` NumPy array evaluated for the input ``theta``.

     Examples
     --------
-    Construction of a estimating equation(s) with ``ee_ridge_regression`` should be done similar to the
-    following
+    Construction of an estimating equation with ``ee_ridge_regression`` should be done similar to the following

     >>> import numpy as np
     >>> import pandas as pd

@@ -796,12 +765,6 @@ def ee_lasso_regression(theta, X, y, model, penalty, epsilon=3.e-3, weights=None
     r"""Estimating equation for an approximate LASSO (least absolute shrinkage and selection operator) regressor.
     LASSO regression applies an L1-regularization through a magnitude penalty.

-    Note
-    ----
-    As the derivative of the estimating equation for LASSO is not defined at :math:`\theta=0`, the bread (and sandwich)
-    cannot be used to estimate the variance in all settings.
-
-
     The estimating equation for the approximate LASSO linear regression is

     .. math::

@@ -811,13 +774,19 @@ def ee_lasso_regression(theta, X, y, model, penalty, epsilon=3.e-3, weights=None
     where :math:`\lambda` is the penalty term.
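+
+    As a sketch of the idea behind the approximation (not necessarily the exact internal implementation), the
+    non-differentiable L1 penalty :math:`| \theta |` is replaced by the smooth bridge penalty
+    :math:`| \theta |^{1 + \epsilon}`, whose contribution to the estimating equation is its derivative
+
+    .. math::
+
+        (1 + \epsilon) | \theta |^{\epsilon} \text{sign}(\theta)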
+    Note
+    ----
+    As the derivative of the estimating equation for LASSO is not defined at :math:`\theta=0`, the bread (and sandwich)
+    cannot be used to estimate the variance in all settings.
+
+
     Here, an approximation based on the bridge penalty for the LASSO is used. For the bridge penalty, LASSO is the
     special case where :math:`\epsilon = 0`. By making :math:`\epsilon > 0`, we can approximate the LASSO. The true
     LASSO may not be possible to implement due to the existence of multiple solutions.

-    Here, :math:`\theta` is a 1-by-b array, where b is the distinct covariates included as part of X. For example, if
-    X is a 3-by-n matrix, then :math:`\theta` will be a 1-by-3 array. The code is general to allow for an arbitrary
-    number of X's (as long as there is enough support in the data).
+    Here, :math:`\theta` is a 1-by-`b` array, which corresponds to the coefficients of the regression model, and `b`
+    is the number of distinct covariates included as part of ``X``. For example, if ``X`` is a 3-by-`n` matrix, then
+    :math:`\theta` will be a 1-by-3 array. The code is general to allow for an arbitrary number of elements in ``X``.

     Note
     ----

@@ -828,36 +797,36 @@ def ee_lasso_regression(theta, X, y, model, penalty, epsilon=3.e-3, weights=None
     Parameters
     ----------
     theta : ndarray, list, vector
-        Theta in this case consists of b values. Therefore, initial values should consist of the same number as the
+        Theta in this case consists of `b` values. Therefore, initial values should consist of the same number as the
         number of columns present. This can easily be implemented via ``[0, ] * X.shape[1]``.
     X : ndarray, list, vector
-        2-dimensional vector of n observed values for b variables.
+        2-dimensional vector of `n` observed values for `b` variables.
     y : ndarray, list, vector
-        1-dimensional vector of n observed values.
+        1-dimensional vector of `n` observed values.
     model : str
         Type of regression model to estimate. Options are ``'linear'`` (linear regression), ``'logistic'`` (logistic
         regression), and ``'poisson'`` (Poisson regression).
     penalty : int, float, ndarray, list, vector
         Penalty term to apply to all coefficients (if only an integer or float is provided) or the corresponding
         coefficient (if a list or vector of integers or floats is provided). Note that the penalty term should either
-        consists of a single value or b values (to match the length of ``theta``). The penalty is scaled by n.
+        consist of a single value or `b` values (to match the length of ``theta``). The penalty is scaled by `n`.
     epsilon : float, optional
         Approximation error to use for the LASSO approximation. Default argument is ``0.003``, which results in a
-        bridge penalty of 1.0003.
+        bridge penalty of ``1.0003``.
     weights : ndarray, list, vector, None, optional
-        1-dimensional vector of n weights. Default is ``None``, which assigns a weight of 1 to all observations.
+        1-dimensional vector of `n` weights. Default is ``None``, which assigns a weight of 1 to all observations.
     center : int, float, ndarray, list, vector, optional
         Center or reference value to penalize estimated coefficients towards. Default is ``0``, which penalizes
         coefficients towards the null. Other center values can be specified for all coefficients (by providing an
         integer or float) or covariate-specific centering values (by providing a vector of values of the same length
         as X).
     offset : ndarray, list, vector, None, optional
-        A 1-dimensional offset to be included in the model. Default is None, which applies no offset term.
+        A 1-dimensional offset to be included in the model. Default is ``None``, which applies no offset term.

     Returns
     -------
     array :
-        Returns a b-by-n NumPy array evaluated for the input ``theta``
+        Returns a `b`-by-`n` NumPy array evaluated for the input ``theta``.

     Examples
     --------
@@ -949,12 +918,6 @@ def ee_elasticnet_regression(theta, X, y, model, penalty, ratio, epsilon=3.e-3,
     pre-specified ratio. Notice that the L1 penalty is based on an approximation. See ``ee_lasso_regression`` for
     further details on the approximation for the L1 penalty.

-    Note
-    ----
-    As the derivative of the estimating equation for LASSO is not defined at :math:`\theta=0`, the bread (and sandwich)
-    cannot be used to estimate the variance in all settings.
-
-
     The estimating equation for Elastic-Net linear regression with the approximate L1 penalty is

     .. math::

@@ -964,9 +927,15 @@ def ee_elasticnet_regression(theta, X, y, model, penalty, ratio, epsilon=3.e-3,
     where :math:`\lambda` is the penalty term and :math:`r` is the ratio for the L1 vs L2 penalty.

-    Here, :math:`\theta` is a 1-by-b array, where b is the distinct covariates included as part of X. For example, if
-    X is a 3-by-n matrix, then :math:`\theta` will be a 1-by-3 array. The code is general to allow for an arbitrary
-    number of X's (as long as there is enough support in the data).
+    Note
+    ----
+    As the derivative of the estimating equation for LASSO is not defined at :math:`\theta=0`, the bread (and sandwich)
+    cannot be used to estimate the variance in all settings.
+
+
+    Here, :math:`\theta` is a 1-by-`b` array, which corresponds to the coefficients of the regression model, and `b`
+    is the number of distinct covariates included as part of ``X``. For example, if ``X`` is a 3-by-`n` matrix, then
+    :math:`\theta` will be a 1-by-3 array. The code is general to allow for an arbitrary number of elements in ``X``.

     Note
     ----

@@ -976,25 +945,25 @@ def ee_elasticnet_regression(theta, X, y, model, penalty, ratio, epsilon=3.e-3,
     Parameters
     ----------
     theta : ndarray, list, vector
-        Theta in this case consists of b values. Therefore, initial values should consist of the same number as the
+        Theta in this case consists of `b` values. Therefore, initial values should consist of the same number as the
         number of columns present. This can easily be implemented via ``[0, ] * X.shape[1]``.
     X : ndarray, list, vector
-        2-dimensional vector of n observed values for b variables.
+        2-dimensional vector of `n` observed values for `b` variables.
     y : ndarray, list, vector
-        1-dimensional vector of n observed values.
+        1-dimensional vector of `n` observed values.
     model : str
         Type of regression model to estimate. Options are ``'linear'`` (linear regression), ``'logistic'`` (logistic
         regression), and ``'poisson'`` (Poisson regression).
     penalty : int, float, ndarray, list, vector
         Penalty term to apply to all coefficients (if only an integer or float is provided) or the corresponding
         coefficient (if a list or vector of integers or floats is provided). Note that the penalty term should either
-        consists of a single value or b values (to match the length of ``theta``). The penalty is scaled by n.
+        consist of a single value or `b` values (to match the length of ``theta``). The penalty is scaled by `n`.
     ratio : float
         Ratio for the L1 vs L2 penalty in Elastic-net. The ratio must be :math:`0 \le r \le 1`. Setting ``ratio=1``
         results in LASSO and ``ratio=0`` results in ridge regression.
     epsilon : float, optional
         Approximation error to use for the LASSO approximation.
        Default argument is ``0.003``, which results in a
-        bridge penalty of 1.0003.
+        bridge penalty of ``1.0003``.
     weights : ndarray, list, vector, None, optional
         1-dimensional vector of n weights. Default is ``None``, which assigns a weight of 1 to all observations.
     center : int, float, ndarray, list, vector, optional
         Center or reference value to penalize estimated coefficients towards. Default is ``0``, which penalizes
         coefficients towards the null. Other center values can be specified for all coefficients (by providing an
         integer or float) or covariate-specific centering values (by providing a vector of values of the same length
         as X).
     offset : ndarray, list, vector, None, optional
-        A 1-dimensional offset to be included in the model. Default is None, which applies no offset term.
+        A 1-dimensional offset to be included in the model. Default is ``None``, which applies no offset term.

     Returns
     -------
     array :
-        Returns a b-by-n NumPy array evaluated for the input theta and y
+        Returns a `b`-by-`n` NumPy array evaluated for the input ``theta``.

     Examples
     --------
@@ -1128,9 +1097,9 @@ def ee_bridge_regression(theta, X, y, model, penalty, gamma, weights=None, cente
     where :math:`\lambda` is the penalty term and :math:`\gamma` is a tuning parameter.

-    Here, :math:`\theta` is a 1-by-b array, where b is the distinct covariates included as part of X. For example, if
-    X is a 3-by-n matrix, then :math:`\theta` will be a 1-by-3 array. The code is general to allow for an arbitrary
-    number of X's (as long as there is enough support in the data).
+    Here, :math:`\theta` is a 1-by-`b` array, which corresponds to the coefficients of the regression model, and `b`
+    is the number of distinct covariates included as part of ``X``. For example, if ``X`` is a 3-by-`n` matrix, then
+    :math:`\theta` will be a 1-by-3 array. The code is general to allow for an arbitrary number of elements in ``X``.

     Note
     ----

@@ -1140,19 +1109,19 @@ def ee_bridge_regression(theta, X, y, model, penalty, gamma, weights=None, cente
     Parameters
     ----------
     theta : ndarray, list, vector
-        Theta in this case consists of b values. Therefore, initial values should consist of the same number as the
+        Theta in this case consists of `b` values. Therefore, initial values should consist of the same number as the
         number of columns present. This can easily be implemented via ``[0, ] * X.shape[1]``.
     X : ndarray, list, vector
-        2-dimensional vector of n observed values for b variables.
+        2-dimensional vector of `n` observed values for `b` variables.
     y : ndarray, list, vector
-        1-dimensional vector of n observed values.
+        1-dimensional vector of `n` observed values.
     model : str
         Type of regression model to estimate. Options are ``'linear'`` (linear regression), ``'logistic'`` (logistic
         regression), and ``'poisson'`` (Poisson regression).
     penalty : int, float, ndarray, list, vector
         Penalty term to apply to all coefficients (if only an integer or float is provided) or the corresponding
         coefficient (if a list or vector of integers or floats is provided). Note that the penalty term should either
-        consists of a single value or b values (to match the length of ``theta``). The penalty is scaled by n.
+        consist of a single value or `b` values (to match the length of ``theta``). The penalty is scaled by `n`.
     gamma : float
         Hyperparameter for the bridge penalty, defined for :math:`\gamma > 0`. However, only :math:`\gamma \ge 1` are
         supported.
     weights : ndarray, list, vector, None, optional
         1-dimensional vector of n weights. Default is ``None``, which assigns a weight of 1 to all observations.
     center : int, float, ndarray, list, vector, optional
         Center or reference value to penalize estimated coefficients towards. Default is ``0``, which penalizes
         coefficients towards the null. Other center values can be specified for all coefficients (by providing an
         integer or float) or covariate-specific centering values (by providing a vector of values of the same length
         as X).
     offset : ndarray, list, vector, None, optional
-        A 1-dimensional offset to be included in the model. Default is None, which applies no offset term.
+        A 1-dimensional offset to be included in the model. Default is ``None``, which applies no offset term.

     Returns
     -------
     array :
-        Returns a b-by-n NumPy array evaluated for the input ``theta``
+        Returns a `b`-by-`n` NumPy array evaluated for the input ``theta``.

     Examples
     --------
@@ -1304,44 +1273,44 @@ def ee_additive_regression(theta, X, y, specifications, model, weights=None, off
     ``ee_additive_regression`` through setting the knot locations.

-    Here, :math:`\theta` is a 1-by-(b+k) array, where b is the distinct covariates included as part of X and the k
-    distinct spline basis functions. For example, if X is a 2-by-n matrix with a 10-knot natural spline for the second
-    column in X, then :math:`\theta` will be a 1-by-(2+9) array. The code is general to allow for an arbitrary
-    number of X variables and spline knots.
+    Here, :math:`\theta` is a 1-by-(`b`+`k`) array, where `b` is the distinct covariates included as part of ``X`` and
+    the `k` distinct spline basis functions. For example, if ``X`` is a 2-by-`n` matrix with a 10-knot natural spline
+    for the second column in X, then :math:`\theta` will be a 1-by-(2+9) array. The code is general to allow for an
+    arbitrary number of X variables and spline knots.

     Parameters
     ----------
     theta : ndarray, list, vector
-        Parameter values. Number of values should match the number of columns in the additive design matrix.
+        Theta in this case consists of `b`+`k` values. The number of values should match the number of columns in the
+        additive design matrix.
     X : ndarray, list, vector
-        2-dimensional vector of n observed values for b variables.
+        2-dimensional vector of `n` observed values for `b` variables.
     y : ndarray, list, vector
-        1-dimensional vector of n observed values.
+        1-dimensional vector of `n` observed values.
     specifications : list, dict, None
         A list of dictionaries that define the hyperparameters for the spline (e.g., number of knots, strength of
         penalty). For terms that should not have splines, ``None`` should be specified instead (see examples below).
-        Each dictionary supports the following parameters:
-        "knots", "natural", "power", "penalty"
-        * knots (list): controls the position of the knots, with knots are placed at given locations. There is no
-        default, so must be specified by the user.
-        * natural (bool): controls whether to generate natural (restricted) or unrestricted splines.
-        Default is ``True``, which corresponds to natural splines.
-        * power (float): controls the power to raise the spline terms to. Default is 3, which corresponds to cubic
-        splines.
-        * penalty (float): penalty term (:math:`\lambda`) applied to each corresponding spline basis term. Default is 0,
-        which applies no penalty to the spline basis terms.
+        Each dictionary supports the following parameters: "knots", "natural", "power", "penalty"
+        knots (list): controls the position of the knots; knots are placed at the given locations. There is no
+        default, so it must be specified by the user.
+        natural (bool): controls whether to generate natural (restricted) or unrestricted splines.
+        Default is ``True``, which corresponds to natural splines.
+        power (float): controls the power to raise the spline terms to. Default is 3, which corresponds to cubic
+        splines.
+        penalty (float): penalty term (:math:`\lambda`) applied to each corresponding spline basis term. Default is 0,
+        which applies no penalty to the spline basis terms.
     model : str
         Type of regression model to estimate. Options are ``'linear'`` (linear regression), ``'logistic'`` (logistic
         regression), and ``'poisson'`` (Poisson regression).
     weights : ndarray, list, vector, None, optional
-        1-dimensional vector of n weights. Default is ``None``, which assigns a weight of 1 to all observations.
+        1-dimensional vector of `n` weights. Default is ``None``, which assigns a weight of 1 to all observations.
     offset : ndarray, list, vector, None, optional
-        A 1-dimensional offset to be included in the model. Default is None, which applies no offset term.
+        A 1-dimensional offset to be included in the model. Default is ``None``, which applies no offset term.

     Returns
     -------
     array :
-        Returns a (b+k)-by-n NumPy array evaluated for the input ``theta``
+        Returns a (`b`+`k`)-by-`n` NumPy array evaluated for the input ``theta``.

     Examples
     --------
@@ -1572,6 +1541,15 @@ def _model_transform_(model, assert_linear_model=False):


 def _inverse_link_(betax, link):
+    """Internal function to return the evaluated inverse link and derivative of the inverse link.
+
+    Parameters
+    ----------
+    betax : ndarray, list, array
+        Parameter values
+    link : str
+        Specified link function
+    """
     # Distributions not implemented: power, inverse power
     if link == 'identity':
         py = identity(betax)                        # Inverse link
@@ -1612,6 +1590,19 @@ def _inverse_link_(betax, link):


 def _distribution_variance_(dist, mu, hyperparameter=None, alpha=None):
+    """Internal function to return the distribution variance for GLM specifications.
+
+    Parameters
+    ----------
+    dist : str
+        Distribution
+    mu :
+        Prediction
+    hyperparameter : int, float, None, optional
+        Hyperparameter for the Tweedie distribution
+    alpha : int, float, None, optional
+        Hyperparameter for the gamma or negative-binomial distribution
+    """
     if dist in ['normal', 'gaussian']:
         v = 1
     elif dist == 'poisson':
diff --git a/delicatessen/estimating_equations/survival.py b/delicatessen/estimating_equations/survival.py
index f853ac3..2283e1b 100644
--- a/delicatessen/estimating_equations/survival.py
+++ b/delicatessen/estimating_equations/survival.py
@@ -27,26 +27,20 @@ def ee_exponential_model(theta, t, delta):

         h(t) = \lambda

-    Note
-    ----
-    All provided estimating equations are meant to be wrapped inside a user-specified function. Throughtout, these
-    user-defined functions are defined as ``psi``.
-
-
     Parameters
     ----------
     theta : ndarray, list, vector
         Theta in the case of the exponential model consists of a single value. Furthermore, the parameter will be
         non-negative. Therefore, an initial value like the ``[1, ]`` should be provided.
     t : ndarray, list, vector
-        1-dimensional vector of n observed times.
+        1-dimensional vector of `n` observed times.
     delta : ndarray, list, vector
-        1-dimensional vector of n event indicators, where 1 indicates an event and 0 indicates right censoring.
+        1-dimensional vector of `n` event indicators, where 1 indicates an event and 0 indicates right censoring.

     Returns
     -------
     array :
-        Returns a 1-by-n NumPy array evaluated for the input ``theta``
+        Returns a 1-by-`n` NumPy array evaluated for the input ``theta``.

     Examples
     --------
@@ -121,28 +115,22 @@ def ee_weibull_model(theta, t, delta):

         h(t) = \lambda \gamma t^{\gamma - 1}

-    Note
-    ----
-    All provided estimating equations are meant to be wrapped inside a user-specified function. Throughtout, these
-    user-defined functions are defined as ``psi``.
-
-
     Parameters
     ----------
     theta : ndarray, list, vector
-        Theta in the case of the exponential model consists of a single value. Furthermore, the parameter will be
-        non-negative. Therefore, an initial value like the ``[1, ]`` is recommended.
+        Theta in the case of the Weibull model consists of two values. Furthermore, the parameters will be
+        non-negative. Therefore, initial values like ``[1., 1.]`` are recommended.
     t : ndarray, list, vector
-        1-dimensional vector of n observed times. No missing data should be included (missing data may cause
+        1-dimensional vector of `n` observed times. No missing data should be included (missing data may cause
         unexpected behavior).
     delta : ndarray, list, vector
-        1-dimensional vector of n event indicators, where 1 indicates an event and 0 indicates right censoring. No
+        1-dimensional vector of `n` event indicators, where 1 indicates an event and 0 indicates right censoring. No
         missing data should be included (missing data may cause unexpected behavior).

     Returns
     -------
     array :
-        Returns a 2-by-n NumPy array evaluated for the input theta.
+        Returns a 2-by-`n` NumPy array evaluated for the input ``theta``.

     Examples
     --------
@@ -254,7 +242,7 @@ def ee_exponential_measure(theta, times, n, measure, scale):

     Returns
     -------
     array :
-        Returns a t-by-n NumPy array evaluated for the input ``theta``
+        Returns a `t`-by-`n` NumPy array evaluated for the input ``theta``.

     Examples
     --------
@@ -376,8 +364,8 @@ def ee_weibull_measure(theta, times, n, measure, scale, shape):
     Parameters
     ----------
     theta : ndarray, list, vector
-        theta consists of t values. The initial values should consist of the same number of elements as provided in the
-        ``times`` argument.
+        theta consists of `t` values. The initial values should consist of the same number of elements as provided in
+        the ``times`` argument.
     times : int, float, ndarray, list, vector
         A single time or 1-dimensional collection of times to calculate the measure at. The number of provided times
         should consist of the same number of elements as provided in the ``theta`` argument.
@@ -396,7 +384,7 @@ def ee_weibull_measure(theta, times, n, measure, scale, shape):
     Returns
     -------
     array :
-        Returns a t-by-n NumPy array evaluated for the input ``theta``
+        Returns a `t`-by-`n` NumPy array evaluated for the input ``theta``.

     Examples
     --------
@@ -546,34 +534,29 @@ def ee_aft_weibull(theta, X, t, delta, weights=None):
     following relation between the coefficients: :math:`\lambda = - \mu \gamma`, :math:`\beta_{PH} = - \beta_{AFT}
     \gamma`, and :math:`\gamma = \exp(\sigma)`.

-    Here, :math:`\theta` is a 1-by-(2+b) array, where b is the distinct covariates included as part of X. For example,
-    if X is a 3-by-n matrix, then theta will be a 1-by-5 array. The code is general to allow for an arbitrary number of
-    X's (as long as there is enough support in the data).
-
-    Note
-    ----
-    All provided estimating equations are meant to be wrapped inside a user-specified function. Throughtout, these
-    user-defined functions are defined as ``psi``.
+    Here, :math:`\theta` is a 1-by-(2+`b`) array, where `b` is the distinct covariates included as part of ``X``. For
+    example, if ``X`` is a 3-by-`n` matrix, then :math:`\theta` will be a 1-by-5 array. The code is general to allow
+    for an arbitrary dimension of ``X``.

     Parameters
     ----------
     theta : ndarray, list, vector
-        theta consists of 1+b+1 values. Therefore, initial values should consist of the same number as the number of
+        theta consists of 1+`b`+1 values. Therefore, initial values should consist of the same number as the number of
         columns present in ``X`` plus 2.
        This can easily be implemented via ``[0, ] + [0, ] * X.shape[1] + [0, ]``.
     X : ndarray, list, vector
-        2-dimensional vector of n observed values for b variables.
+        2-dimensional vector of `n` observed values for `b` variables.
     t : ndarray, list, vector
-        1-dimensional vector of n observed times.
+        1-dimensional vector of `n` observed times.
     delta : ndarray, list, vector
-        1-dimensional vector of n values indicating whether the time was an event or censoring.
+        1-dimensional vector of `n` values indicating whether the time was an event or censoring.
     weights : ndarray, list, vector, None, optional
-        1-dimensional vector of n weights. Default is ``None``, which assigns a weight of 1 to all observations.
+        1-dimensional vector of `n` weights. Default is ``None``, which assigns a weight of 1 to all observations.

     Returns
     -------
     array :
-        Returns a b-by-n NumPy array evaluated for the input ``theta``.
+        Returns a (1+`b`+1)-by-`n` NumPy array evaluated for the input ``theta``.

     Examples
     --------
@@ -711,13 +694,13 @@ def ee_aft_weibull_measure(theta, times, X, measure, mu, beta, sigma):
     Parameters
     ----------
     theta : ndarray, list, vector
-        theta consists of t values. The initial values should consist of the same number of elements as provided in the
+        theta consists of `t` values. The initial values should consist of the same number of elements as provided in the
         ``times`` argument.
     times : int, float, ndarray, list, vector
         A single time or 1-dimensional collection of times to calculate the measure at. The number of provided times
         should consist of the same number of elements as provided in the ``theta`` argument.
     X : ndarray, list, vector
-        2-dimensional vector of n observed values for b variables.
+        2-dimensional vector of `n` observed values for `b` variables.
     measure : str
         Measure to calculate. Options include survival (``'survival'``), density (``'density'``), risk or the
         cumulative density (``'risk'``), hazard (``'hazard'``), or cumulative hazard (``'cumulative_hazard'``).
@@ -731,7 +714,7 @@ def ee_aft_weibull_measure(theta, times, X, measure, mu, beta, sigma):
     Returns
     -------
     array :
-        Returns a t-by-n NumPy array evaluated for the input theta
+        Returns a `t`-by-`n` NumPy array evaluated for the input ``theta``.

     Examples
     --------
diff --git a/delicatessen/mestimation.py b/delicatessen/mestimation.py
index 7b87d2c..f272542 100644
--- a/delicatessen/mestimation.py
+++ b/delicatessen/mestimation.py
@@ -28,18 +28,20 @@ class MEstimator:
     ----
     Estimating equations are advantageous in both theoretical and applied research. They simplify proofs of
     consistency and asymptotic normality of estimators under a large-sample approximation framework. In application,
-    this approach to esitmation simplifies estimation of the variance of parameters and automates the delta-method.
+    this approach simplifies variance estimation and automates the delta-method.

     M-Estimators consist of two broad steps: point estimation and variance estimation. Point estimation is carried out
-    by determining the values of :math:`\theta` where the sum of the estimating equations are zero. For variance
-    estimation, the asymptotic sandwich variance estimator is used, which consists of
+    by determining the values of :math:`\theta` where the sum of the estimating equations is zero. This is done via
+    standard root-finding algorithms.
+
+    For variance estimation, the sandwich variance estimator is used. The asymptotic sandwich variance estimator
+    consists of

     .. math::
-        B_n(O, \hat{\theta})^{-1} F_n(O, \hat{\theta}) \left\{B_n(O, \hat{\theta}^{-1})\right\}^T
+        V_n(O, \hat{\theta}) = B_n(O, \hat{\theta})^{-1} F_n(O, \hat{\theta}) \left\{B_n(O, \hat{\theta})^{-1}\right\}^T

-    where :math:`B` is the 'bread' and :math:`F` is the 'filling'
+    where :math:`B` is the 'bread' and :math:`F` is the 'filling' matrix. These matrices are defined as

     .. math::

@@ -49,31 +51,33 @@ class MEstimator:

         F_n(O, \hat{\theta}) = n^{-1} \sum_{i=1}^{n} \psi(O_i, \hat{\theta}) \psi(O_i, \hat{\theta})^T

-    The partial derivatives for the bread are calculated using either numerical approximation (e.g., forward difference
-    method) or forward-mode automatic differentiation. Inverting the bread is done via NumPy's ``linalg.pinv``. For
-    the filling, the dot product is taken at :math:`\hat{\theta}`.
+    respectively. The partial derivatives for the bread are calculated using either numerical approximation (e.g.,
+    forward difference method) or forward-mode automatic differentiation. Inverting the bread is done via NumPy's
+    ``linalg.pinv``. For the filling, the dot product is taken at :math:`\hat{\theta}`.

     Note
     ----
-    A hard part, that must be done by the user, is to specify the estimating equations. Be sure to check the provided
-    examples for the expected format. Pre-built estimating equations for common problems are also made available.
+    The difficult part (that must be done by the user) is to specify the estimating equations. Be sure to check the
+    provided examples for the expected format. Pre-built estimating equations for common problems are also made
+    available.

     After completion of these steps, point and variance estimates are stored. These can be extracted from
-    ``MEstimator``.
+    ``MEstimator``. Further, confidence intervals, Z-scores, P-values, or S-values can all be generated.

     Note
     ----
     For complex regression problems, the root-finding algorithms are not as robust relative to maximization approaches.
-    A solution for difficult problems is to 'pre-wash' the initial values.
+    A simple solution for difficult problems is to 'pre-wash' the initial values, i.e., first find the solution to the
+    equations by another routine and provide that solution as the starting values.

     Parameters
     ----------
     stacked_equations : function, callable
-        Function that returns a b-by-n NumPy array of the estimating equations. See provided examples in the
+        Function that returns a `v`-by-`n` NumPy array of the estimating equations. See provided examples in the
         documentation for how to construct a set of estimating equations.
     init : list, set, array
-        Initial values for the root-finding algorithm.
+        Initial values for the root-finding algorithm. A total of `v` values should be provided.
     subset : list, set, array, None, optional
         Optional argument to conduct the root-finding procedure on a subset of parameters in the estimating
         equations. The input list is used to index the parameter array via ``np.take()``. The subset list will

@@ -82,9 +86,10 @@ class MEstimator:
     Note
     ----
-    Because the root-finding procedure is NOT ran for parameters outside of the subset, those coefficients must be
-    solved outside of ``MEstimator``. In general, I do NOT recommend using the ``subset`` argument unless a series of
-    complex estimating equations need to be solved.
+    Because the root-finding procedure is NOT run for parameters outside of the subset, those coefficients *must* be
+    solved outside of ``MEstimator``. In general, I do *NOT* recommend using the ``subset`` argument unless a series of
+    complex estimating equations need to be solved.
+    In general, this argument does not massively improve speed until the estimating equations consist of hundreds of
+    parameters.

     Examples
     --------
@@ -115,6 +120,7 @@ class MEstimator:
     >>> estr.variance                                  # Covariance
     >>> estr.asymptotic_variance                       # Asymptotic covariance
     >>> np.sqrt(np.diag(estr.asymptotic_variance))     # Standard deviation
+    >>> estr.variance                                  # Covariance
     >>> np.sqrt(np.diag(estr.variance))                # Standard error
     >>> estr.confidence_intervals()                    # Confidence intervals
     >>> estr.z_scores()                                # Z-scores
     >>> estr.p_values()                                # P-values
     >>> estr.s_values()                                # S-values

     Alternatively, a custom estimating equation can be specified. This is done by constructing a valid estimating
-    equation for the ``MEstimator``. The ``MEstimator`` expects the ``psi`` function to return a b-by-n array, where b
-    is the number of parameters (length of ``theta``) and n is the total number of observations. Below is an example
-    of the mean and variance estimating equation from before
+    equation for the ``MEstimator``. The ``MEstimator`` expects the ``psi`` function to return a `v`-by-`n` array,
+    where `v` is the number of parameters (length of ``theta``) and `n` is the total number of observations. Below is
+    an example of the mean and variance estimating equation from before, but implemented by hand

     >>> def psi(theta):
     >>>     y = np.array(y_dat)
     >>>     mean = theta[0]

     >>> estr = MEstimator(stacked_equations=psi, init=[0, 0, ])
     >>> estr.estimate()

-    Note that ``len(init)`` should be equal to b. So in this case, two initial values are provided.
+    Note that ``len(init)`` should be equal to `v`. So in this case, two initial values are provided.

     ``MEstimator`` can also be run with a user-provided root-finding algorithm. To specify a custom root-finder, a
     function must be created by the user that consists of two keyword arguments (``stacked_equations``, ``init``) and
@@ -164,7 +170,10 @@ class MEstimator:
     Boos DD, & Stefanski LA. (2013). M-estimation (estimating equations). In Essential Statistical Inference
     (pp. 297-337). Springer, New York, NY.

-    Stefanski LA, & Boos DD. (2002). The calculus of M-estimation. The American Statistician, 56(1), 29-38.
+    Ross RK, Zivich PN, Stringer JSA, & Cole SR. (2024). M-estimation for common epidemiological measures: introduction
+    and applied examples. *International Journal of Epidemiology*, 53(2), dyae030.
+
+    Stefanski LA, & Boos DD. (2002). The calculus of M-estimation. *The American Statistician*, 56(1), 29-38.
     """
     def __init__(self, stacked_equations, init=None, subset=None):
         self.stacked_equations = stacked_equations    # User-input stacked estimating equations
@@ -183,36 +192,38 @@ def __init__(self, stacked_equations, init=None, subset=None):
         self.asymptotic_variance = None    # Asymptotic covariance matrix for theta values (calculated later)

     def estimate(self, solver='lm', maxiter=5000, tolerance=1e-9, deriv_method='approx', dx=1e-9, allow_pinv=True):
-        """Function to carry out the point and variance estimation of theta. After this procedure, the point estimates
-        (in ``theta``) and the covariance matrix (in ``variance``) can be extracted.
+        """Run the point and variance estimation procedures for the given estimating equations and starting values.
+        This function carries out the point and variance estimation of ``theta``. After this procedure, the point
+        estimates (in ``theta``) and the covariance matrix (in ``variance``) can be extracted from the ``MEstimator``
+        object.
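+
+        For example, the procedure can be run with a non-default root-finder and derivative method (an illustrative
+        sketch only; ``psi`` here stands in for a previously defined estimating equation with one parameter):
+
+        >>> estr = MEstimator(stacked_equations=psi, init=[0., ])
+        >>> estr.estimate(solver='hybr', deriv_method='exact')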
        Parameters
        ----------
        solver : str, function, callable, optional
-            Method to use for the root-finding procedure. Default is the Levenberg-Marquardt algorithm
-            (``scipy.optimize.root(method='lm')``). Other built-in option is the secant method
-            (``scipy.optimize.newton``), and a modification of the Powell hybrid method
-            (``scipy.optimize.root(method='hybr')``). Finally, any generic root-finding algorithm can be used via a
-            user-provided callable object. The function must consist of two keyword arguments: ``stacked_equations``,
-            and ``init``. Additionally, the function should return only the optimized values. Please review the
-            provided example in the documentation for how to implement a custom root-finding algorithm.
+            Method to use for the root-finding procedure. Default is the Levenberg-Marquardt algorithm
+            (``scipy.optimize.root(method='lm')``, specified by ``solver='lm'``). Other built-in options are the secant
+            method (``scipy.optimize.newton``, specified by ``solver='newton'``) and a modification of the Powell
+            hybrid method (``scipy.optimize.root(method='hybr')``, specified by ``solver='hybr'``). Finally, any generic
+            root-finding algorithm can be used via a user-provided callable object. The function must consist of two
+            keyword arguments: ``stacked_equations``, and ``init``. Additionally, the function should return only the
+            optimized values. Please review the provided example in the documentation for how to implement a custom
+            root-finding algorithm.
        maxiter : int, optional
-            Maximum iterations to consider for the root finding procedure. Default is 5000 iterations. For complex
-            estimating equations (without preceding optimization), this value may need to be increased. This argument
-            is not used for user-specified solvers.
+            Maximum iterations to consider for the root finding procedure. Default is 5000 iterations. For complex
+            estimating equations, this value may need to be increased. This argument is not used when a custom
+            root-finding method (e.g., ``solver``) is provided.
        tolerance : float, optional
-            Maximum tolerance for errors in the root finding. This argument is passed ``scipy.optimize`` via the
-            ``tol`` parameter. This argument is not used for user-specified solvers. Default is 1e-9.
+            Maximum tolerance for errors in the root finding in ``scipy.optimize``. Default is 1e-9. This argument is
+            not used when a custom root-finding method (e.g., ``solver``) is provided.
        deriv_method : str, optional
-            Method to compute the derivative of the estimating equations for the bread matrix. Options include numerical
-            approximation via the forward difference method via SciPy (``'approx'``), forward difference implemented
-            by-hand (`'fapprox'`), backward difference implemented by-hand (`'bapprox'`), central difference
-            implemented by-hand (`'capprox'`), or forward-mode automatic differentiation (``'exact'``).
-            Default is ``'approx'``.
+            Method to compute the derivative of the estimating equations for the bread matrix. Default is ``'approx'``.
+            Options include numerical approximation via SciPy's forward difference method (``'approx'``), forward
+            difference (``'fapprox'``), backward difference (``'bapprox'``), and central difference (``'capprox'``) as
+            implemented in delicatessen, or forward-mode automatic differentiation as implemented in delicatessen
+            (``'exact'``).
        dx : float, optional
-            Spacing to use to numerically approximate the partial derivatives of the bread matrix. Here, a small value
-            for ``dx`` should be used, since some large values can result in poor approximations. This argument is only
-            used with numerical approximation methods. Default is 1e-9.
+            Spacing to use to numerically approximate the partial derivatives of the bread matrix. Default is ``1e-9``.
+            Here, a small value for ``dx`` should be used, since some large values can result in poor approximations.
+            This argument is only used with numerical approximation methods.
        allow_pinv : bool, optional
            Whether to allow for the pseudo-inverse (via ``numpy.linalg.pinv``) if the bread matrix is determined to be
            non-invertible. If you want to disallow the pseudo-inverse (i.e., use ``numpy.linalg.inv``), set this
@@ -322,8 +333,8 @@ def estimate(self, solver='lm', maxiter=5000, tolerance=1e-9, deriv_method='appr
        self.variance = self.asymptotic_variance / self.n_obs

    def confidence_intervals(self, alpha=0.05):
-        r"""Calculate Wald-type :math:`(1 - \alpha) \times` 100% confidence intervals using the point estimates and
-        the sandwich variance. The formula for the confidence intervals are
+        r"""Calculate two-sided Wald-type :math:`(1 - \alpha) \times` 100% confidence intervals using the point
+        and sandwich variance estimates. The formula for the confidence intervals is

        .. math::

@@ -372,7 +383,7 @@ def z_scores(self, null=0):
            \frac{\hat{\theta} - \theta}{\widehat{SE}(\hat{\theta})}

        where :math:`\theta` is the null. The ``.estimate()`` function must be called before the Z-scores can be
-        calculated.
+        calculated. Note that the default value for the null is zero.

        Parameters
        ----------
@@ -418,13 +429,8 @@ def p_values(self, null=0):

    def s_values(self, null=0):
        r"""Calculate two-sided Wald-type S-values using the point estimates and the sandwich variance. The S-value,
        or Shannon Information value, is a transformation of the P-values that has been argued to be more easily
-        interpretable as it can be related back to simple coin-flipping scenarios.
-
-        Suppose the S-value is :math:`s`. Then :math:`s` corresponds to the number of heads in a row with a fair coin
-        (equal chances heads or tails). As :math:`s` increases, one would be more 'surprised' by the result (e.g., it
-        might not be surprising to have two heads in a row, but it would be surprising for 10 in a row).
-
-        The transformation from a P-value into a S-value is.
+        interpretable as it can be related back to simple coin-flipping scenarios. The transformation from a P-value
+        into an S-value is

        .. math::

@@ -433,6 +439,11 @@ def s_values(self, null=0):
        where :math:`P` is the corresponding P-value. The ``.estimate()`` function must be called before the S-values
        can be calculated.

+        The S-value can be contextualized in terms of coin-flips. Suppose the S-value is :math:`s`. Then :math:`s`
+        corresponds to the number of heads in a row with a fair coin (equal chances heads or tails). As :math:`s`
+        increases, one would be more 'surprised' by the result (e.g., it might not be surprising to have 2 heads in a
+        row, but it would be surprising for 20 in a row). For example, :math:`P = 0.05` corresponds to
+        :math:`s = -\log_2(0.05) \approx 4.3`, i.e., about as surprising as four heads in a row.
+
        Parameters
        ----------
        null: int, float, ndarray, optional
@@ -526,8 +537,8 @@ def _mestimator_sum_(stacked_equations, subset):

    @staticmethod
    def _solve_coefficients_(stacked_equations, init, method, maxiter, tolerance):
-        """Quasi-Newton solver for the values of theta, such that the estimating equations are equal to zero. Default
-        uses the secant method from SciPy's `newton` optimizer.
+        """Calls the root-finding procedure for the values of theta, such that the estimating equations are equal to
+        zero. Default uses the Levenberg-Marquardt algorithm from SciPy.
        Parameters
        ----------

@@ -589,11 +600,9 @@ def _solve_coefficients_(stacked_equations, init, method, maxiter, tolerance):
                raise ValueError("The user-specified root-finding `solver` must return the solution to the "
                                 "optimization")
        else:
-            raise ValueError("The solver '" +                  # ... otherwise throw ValueError
-                             str(method) +
-                             "' is not available. Please see the "
-                             "documentation for valid"
-                             "options for the optimizer.")
+            # ... otherwise throw ValueError if no other root-finding steps are triggered.
+            raise ValueError("The solver '" + str(method) + "' is not available. Please see the "
+                             "documentation for valid options for the optimizer.")

        # Return optimized theta array
        return psi
diff --git a/delicatessen/sandwich.py b/delicatessen/sandwich.py
index e337345..1fcba4b 100644
--- a/delicatessen/sandwich.py
+++ b/delicatessen/sandwich.py
@@ -24,23 +24,31 @@ def compute_sandwich(stacked_equations, theta, deriv_method='approx', dx=1e-9, a

        V_n(O_i; \theta) = B_n(O_i; \theta)^{-1} F_n(O_i; \theta) \left[ B_n(O_i; \theta)^{-1} \right]^{T}

-    where :math:`B_n(O_i; \theta) = \sum_{i=1}^n \frac{\partial}{\partial \theta} \psi(O_i; \theta)`,
-    :math:`F_n(O_i; \theta) = \sum_{i=1}^n \psi(O_i; \theta) \psi(O_i; \theta)^T`, and :math:`\psi(O_i; \theta)` is the
-    estimating function.
+    where :math:`\psi(O_i; \theta)` is the estimating function,
+
+    .. math::
+
+        B_n(O_i; \theta) = \sum_{i=1}^n \frac{\partial}{\partial \theta} \psi(O_i; \theta),
+
+    and
+
+    .. math::
+
+        F_n(O_i; \theta) = \sum_{i=1}^n \psi(O_i; \theta) \psi(O_i; \theta)^T .

    To compute the bread matrix, :math:`B_n`, the matrix of partial derivatives is computed by using either finite
    difference methods or automatic differentiation. For finite differences, the default is to use SciPy's
-    ``approx_fprime`` functionality, which uses forward finite differences. However, you can also use homebrew version
-    that allows for forward, backward, and center differences. Automatic differentiation is also supported by a
-    homebrew version.
+    ``approx_fprime`` functionality, which uses forward finite differences. However, you can also use the delicatessen
+    homebrew version that allows for forward, backward, and center differences. Automatic differentiation is also
+    supported by a homebrew version.

    To compute the meat matrix, :math:`F_n`, only linear algebra methods, implemented through NumPy, are necessary.
-    The sandwich is then constructed from these individual pieces.
+    The sandwich is then constructed from these pieces using linear algebra methods from NumPy.

    Parameters
    ----------
    stacked_equations : function, callable
-        Function that returns a b-by-n NumPy array of the estimating equations. See provided examples in the
+        Function that returns a `v`-by-`n` NumPy array of the estimating equations. See provided examples in the
        documentation for how to construct a set of estimating equations.
    theta : list, set, array
        Parameter estimates to compute the empirical sandwich variance estimator at. Note that this function assumes
    dx : float, optional
        Spacing to use to numerically approximate the partial derivatives of the bread matrix. Here, a small value
        for ``dx`` should be used, since some large values can result in poor approximations. This argument is only
-        used with numerical approximation methods. Default is 1e-9.
+        used with numerical approximation methods. Default is ``1e-9``.
    allow_pinv : bool, optional
        Whether to allow for the pseudo-inverse (via ``numpy.linalg.pinv``) if the bread matrix is determined to be
        non-invertible. If you want to disallow the pseudo-inverse (i.e., use ``numpy.linalg.inv``), set this

    Returns
    -------
    array :
-        Returns a p-by-p NumPy array for the input ``theta``, where ``p = len(theta)``
+        Returns a `p`-by-`p` NumPy array for the input ``theta``, where ``p = len(theta)``

    Examples
    --------

    >>> y_dat = [1, 2, 4, 1, 2, 3, 1, 5, 2]

    The following is an illustration of how to compute sandwich covariance using only an estimating equation and the
-    paramter values. The mean and variance (that correspond to ``ee_mean_variance``) can be computed using NumPy by
+    parameter values. The mean and variance (that correspond to ``ee_mean_variance``) can be computed using NumPy by

    >>> mean = np.mean(y_dat)
    >>> var = np.var(y_dat, ddof=0)

    >>> sandwich = sandwich_asymp / len(y_dat)

+    The standard errors are then
+
+    >>> se = np.sqrt(np.diag(sandwich))
+
    References
    ----------
    Boos DD, & Stefanski LA. (2013). M-estimation (estimating equations). In Essential Statistical Inference
    (pp. 297-337). Springer, New York, NY.

+    Ross RK, Zivich PN, Stringer JSA, & Cole SR. (2024). M-estimation for common epidemiological measures: introduction
+    and applied examples. *International Journal of Epidemiology*, 53(2), dyae030.
+
    Stefanski LA, & Boos DD. (2002). The calculus of M-estimation. The American Statistician, 56(1), 29-38.
    """
    # Evaluating at provided theta values
@@ -132,21 +147,23 @@ def compute_sandwich(stacked_equations, theta, deriv_method='approx', dx=1e-9, a

def compute_bread(stacked_equations, theta, deriv_method, dx=1e-9):
-    """Function to compute the bread matrix. The bread matrix is defined as
+    r"""Function to compute the bread matrix. The bread matrix is defined as

    .. math::

-        B_n(O_i; \theta) = \sum_{i=1}^n \frac{\partial \psi(O_i; \theta)}{\partial \theta}
+        B_n(O_i; \theta) = \sum_{i=1}^n \frac{\partial }{\partial \theta} \psi(O_i; \theta)

-    The matrix of partial derivatives is computed by using either finite difference methods or automatic
-    differentiation. For finite differences, the default is to use SciPy's ``approx_fprime`` functionality, which uses
-    forward finite differences. However, you can also use homebrew version that allows for forward, backward, and
-    center differences. Automatic differentiation is also supported by a homebrew version.
+    where :math:`\psi(O_i; \theta)` is the estimating function.
+
+    To compute the bread matrix, :math:`B_n`, the matrix of partial derivatives is computed by using either finite
+    difference methods or automatic differentiation. For finite differences, the default is to use SciPy's
+    ``approx_fprime`` functionality, which uses forward finite differences. However, you can also use the delicatessen
+    homebrew version that allows for forward, backward, and center differences. Automatic differentiation is also
+    supported by a homebrew version.

    Parameters
    ----------
    stacked_equations : function, callable
-        Function that returns a b-by-n NumPy array of the estimating equations. See provided examples in the
+        Function that returns a `v`-by-`n` NumPy array of the estimating equations. See provided examples in the
See provided examples in the documentation for how to construct a set of estimating equations. theta : list, set, array Parameter estimates to compute the empirical sandwich variance estimator at. Note that this function assumes @@ -159,12 +176,12 @@ def compute_bread(stacked_equations, theta, deriv_method, dx=1e-9): dx : float, optional Spacing to use to numerically approximate the partial derivatives of the bread matrix. Here, a small value for ``dx`` should be used, since some large values can result in poor approximations. This argument is only - used when numerical approximation methods. Default is 1e-9. + used when numerical approximation methods. Default is ``1e-9``. Returns ------- array : - Returns a p-by-p NumPy array for the input ``theta``, where ``p = len(theta)`` + Returns a `p`-by-`p` NumPy array for the input ``theta``, where ``p = len(theta)`` """ def estimating_equation(input_theta): if len(input_theta) == 1: @@ -215,19 +232,20 @@ def estimating_equation(input_theta): def compute_meat(stacked_equations, theta): - """Function to compute the meat matrix. The meat matrix is defined as + r"""Function to compute the meat matrix. The meat matrix is defined as .. math:: F_n(O_i; \theta) = \sum_{i=1}^n \psi(O_i; \theta) \psi(O_i; \theta)^T + where :math:`\psi(O_i; \theta)` is the estimating function. Rather than summing over all the individual contributions, this implementation takes a single dot product of the stacked estimating functions. This implementation is much faster than summing over :math:`n` matrices. Parameters ---------- stacked_equations : function, callable - Function that returns a b-by-n NumPy array of the estimating equations. See provided examples in the + Function that returns a `v`-by-`n` NumPy array of the estimating equations. See provided examples in the documentation for how to construct a set of estimating equations. theta : list, set, array Parameter estimates to compute the empirical sandwich variance estimator at. Note that this function assumes @@ -236,14 +254,14 @@ def compute_meat(stacked_equations, theta): Returns ------- array : - Returns a p-by-p NumPy array for the input ``theta``, where ``p = len(theta)`` + Returns a `p`-by-`p` NumPy array for the input ``theta``, where ``p = len(theta)`` """ evald_theta = np.asarray(stacked_equations(theta=theta)) # Evaluating EE at theta-hat return np.dot(evald_theta, evald_theta.T) # Return the fast dot product calculation def build_sandwich(bread, meat, allow_pinv=True): - """Function to combine the sandwich elements together. This function takes the bread and meat matrices, does the + r"""Function to combine the sandwich elements together. This function takes the bread and meat matrices, does the inversions, and then combines them together. This function is separate from ``compute_sandwich`` as it is called by both ``compute_sandwich`` and ``MEstimator``. @@ -261,7 +279,7 @@ def build_sandwich(bread, meat, allow_pinv=True): Returns ------- array : - Returns a p-by-p NumPy array for the input ``theta``, where ``p = len(theta)`` + Returns a `p`-by-`p` NumPy array for the input ``theta``, where ``p = len(theta)`` """ # Check if there is an issue with the bread matrix if np.any(np.isnan(bread)): # If bread contains NaN, breaks diff --git a/delicatessen/utilities.py b/delicatessen/utilities.py index 829cec0..3a3f263 100644 --- a/delicatessen/utilities.py +++ b/delicatessen/utilities.py @@ -6,7 +6,11 @@ def logit(prob): - """Logistic transformation of probabilities. 
Returns log-odds
+    r"""Logistic transformation. Used to transform probabilities into log-odds.
+
+    .. math::
+
+        \log \left( \frac{p}{1-p} \right)

    Parameters
    ----------

    Returns
    -------
-    logit-transformed probabilities
+    array :
+        logit-transformed values
    """
    return np.log(prob / (1 - prob))

def inverse_logit(logodds):
-    """Inverse logistic transformation. Returns probabilities
+    r"""Inverse logistic transformation. Used to transform log-odds into probabilities.
+
+    .. math::
+
+        \frac{1}{1 + \exp(-o)}

    Parameters
    ----------

    Returns
    -------
-    inverse-logit transformed results (i.e. probabilities for log-odds)
+    array :
+        inverse-logit transformed values
    """
    return 1 / (1 + np.exp(-logodds))

def identity(value):
-    """Identity transformation. Returns itself
+    """Identity transformation. Used to transform input into itself (i.e., no transformation is applied).

    Note
    ----

def polygamma(n, x):
-    """Polygamma functions. This is a wrapper function of ``scipy.special.polygamma`` meant to enable automatic
+    """Polygamma function. This is a wrapper function of ``scipy.special.polygamma`` meant to enable automatic
    differentiation with ``delicatessen``. When the input is a ``PrimalTangentPairs`` object, then an internal
    function that implements the polygamma function is called. Otherwise, ``scipy.special.polygamma`` is called for
    the input object.
@@ -162,12 +172,12 @@ def standard_normal_pdf(x):

def robust_loss_functions(residual, loss, k, a=None, b=None):
    r"""Loss functions for robust mean and robust regression estimating equations. This function is called internally
-    for ``ee_mean_robust`` and ``ee_robust_regression``. This function can also be loaded, so user's can easily adapt
+    for ``ee_mean_robust`` and ``ee_robust_regression``. This function can also be accessed, so users can easily adapt
    their own regression models into robust regression models using the pre-defined loss functions.

    Note
    ----
-    The loss functions here are technically the first-order derivatives of the loss functions
+    The loss functions here are technically the first-order derivatives of the loss functions you see in the
    literature.

    The following score of the loss functions, :math:`f_k()`, are available.

    Andrew's Sine

    .. math::

-        f_k(x) = I(k \pi <= x <= k \pi) \times \sin(x/k)
+        f_k(x) = I(-k \pi \le x \le k \pi) \times \sin(x/k)

    Huber

    .. math::

-        f_k(x) = x \times I(-k < x < k) + k \times (1 - I(-k < x < k)) \times \text{sign}(x)
+        f_k(x) = x I(-k < x < k) + \text{sign}(x) k (1 - I(-k < x < k))

    Tukey's biweight

    .. math::

        f_k(x) = x I(-k < x < k) \times \left( 1 - (x/k)^2 \right)^2

    Hampel (Hampel's loss adds two additional parameters, :math:`a` and :math:`b`)

    .. math::

-        f_k(x) =
+        f_{k,a,b}(x) =
        \begin{bmatrix}
        I(-a < x < a) \times x \\
-        + I(a \ge |x| < b) \times a \times \text{sign}(x) \\
-        + I(b \ge x < k) \times a \frac{k - x}{k - b} \\
-        + I(-b \le x > -k) \times -a \frac{-k + x}{-k + b} \\
+        + I(a \le |x| < b) \times a \times \text{sign}(x) \\
+        + I(b \le x < k) \times a \frac{k - x}{k - b} \\
+        + I(-b \ge x > -k) \times -a \frac{-k + x}{-k + b} \\
        + I(|x| \ge k) \times 0
        \end{bmatrix}

@@ -315,7 +325,8 @@ def regression_predictions(X, theta, covariance, offset=None, alpha=0.05):
    values.

     Importantly, this method allows for the variance of :math:`\hat{Y}` to be estimated without having to expand
     the estimating equations. As such, this functionality is meant to be used after ``MEstimator`` has been used to
     estimate the coefficients (i.e., this function is for use after the M-estimator has computed the results for the
-    chosen regression model). Therefore, this function should be viewed as a post-processing functionality.
+    chosen regression model). Therefore, this function should be viewed as a post-processing functionality for
+    generating plots or tables.

     Note
     ----
@@ -329,9 +340,9 @@
         2-dimensional vector of values to generate predicted variances for. The number of columns must match the
         number of coefficients / parameters in ``theta``.
     theta : ndarray
-        Estimated coefficients from ``delicatessen.MEstimator.theta``.
+        Estimated coefficients from ``MEstimator.theta``.
     covariance : ndarray
-        Estimated covariance matrix from ``delicatessen.MEstimator.variance``.
+        Estimated covariance matrix from ``MEstimator.variance``.
     offset : ndarray, None, optional
         A 1-dimensional offset to be included in the model. Default is None, which applies no offset term.
     alpha : float, optional
@@ -448,9 +459,10 @@ def spline(variable, knots, power=3, restricted=True, normalized=False):

         r_k(X) = I(X > k) \left\{ X - k \right\}^a - s_K(X)

-    where :math:`K` is largest knot value. Splines are normalized by the upper knot minus the lower knot to the
-    corresponding power. Normalizing the splines can be helpful for the root-finding procedure, but does change the
-    interpretation of the corresponding coefficients.
+    where :math:`K` is the largest knot value.
+
+    Splines are normalized by the upper knot minus the lower knot to the corresponding power. Normalizing the splines
+    can be helpful for the root-finding procedure.

     Parameters
     ----------
@@ -546,16 +558,16 @@ def additive_design_matrix(X, specifications, return_penalty=False):
         penalty). For terms that should not have splines, ``None`` should be specified instead (see examples below).
-        Each dictionary supports the following parameters: "knots", "natural", "power", "penalty"
+        Each dictionary supports the following parameters: "knots", "natural", "power", "penalty", "normalized"

-        * knots (list): controls the position of the knots, with knots are placed at given locations. There is no
-          default, so must be specified by the user.
-        * natural (bool): controls whether to generate natural (restricted) or unrestricted splines.
-          Default is ``True``, which corresponds to natural splines.
-        * power (float): controls the power to raise the spline terms to. Default is 3, which corresponds to cubic
-          splines.
-        * penalty (float): penalty term (:math:`\lambda`) applied to each corresponding spline basis term. Default is 0,
-          which applies no penalty to the spline basis terms.
-        * normalized (bool): whether to normalize the spline terms. Default is ``False``, with a default change coming
-          with v3.0 release.
+        knots (list): controls the position of the knots; knots are placed at the given locations. There is no
+        default, so it must be specified by the user.
+        natural (bool): controls whether to generate natural (restricted) or unrestricted splines.
+        Default is ``True``, which corresponds to natural splines.
+        power (float): controls the power to raise the spline terms to. Default is 3, which corresponds to cubic
+        splines.
+        penalty (float): penalty term (:math:`\lambda`) applied to each corresponding spline basis term. Default is 0,
+        which applies no penalty to the spline basis terms.
+        normalized (bool): whether to normalize the spline terms. Default is ``False``, with the default changing in
+        the v3.0 release.
     return_penalty : bool, optional
         Whether the list of the corresponding penalty terms should also be returned. This functionality is used
-        internally to create the list of penalty terms to provide the Ridge regression model, where only the spline
+        internally to create the list of penalty terms to provide to the Ridge regression model, where only the spline
diff --git a/docs/Reference/Utilities.rst b/docs/Reference/Utilities.rst
index e2b5cdb..c0fa9f3 100644
--- a/docs/Reference/Utilities.rst
+++ b/docs/Reference/Utilities.rst
@@ -41,5 +41,5 @@ Differentiation
 .. autosummary::
    :toctree: generated/

+   approx_differentiation
    auto_differentiation
-
diff --git a/docs/Reference/generated/delicatessen.derivative.approx_differentiation.rst b/docs/Reference/generated/delicatessen.derivative.approx_differentiation.rst
new file mode 100644
index 0000000..b75e7f2
--- /dev/null
+++ b/docs/Reference/generated/delicatessen.derivative.approx_differentiation.rst
@@ -0,0 +1,6 @@
+delicatessen.derivative.approx\_differentiation
+===============================================
+
+.. currentmodule:: delicatessen.derivative
+
+.. autofunction:: approx_differentiation
\ No newline at end of file
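
As a quick check of the dot-product shortcut described in the ``compute_meat`` docstring above, the following
minimal NumPy sketch (the data and the ``psi`` function here are hypothetical, chosen only for illustration)
confirms that the single dot product of the stacked estimating functions equals the explicit sum of the
:math:`n` outer products:

    import numpy as np

    # Hypothetical data; psi stacks the estimating functions for the mean and
    # variance, so it returns a 2-by-n array (p=2 parameters, n=5 observations)
    y = np.array([1.2, 0.5, 3.1, 2.2, 0.9])
    theta = [np.mean(y), np.var(y)]

    def psi(theta):
        return np.array([y - theta[0],
                         (y - theta[0]) ** 2 - theta[1]])

    evald = np.asarray(psi(theta))                      # Evaluated estimating functions, v-by-n
    meat_fast = np.dot(evald, evald.T)                  # Single dot product, as in compute_meat
    meat_slow = sum(np.outer(evald[:, i], evald[:, i])  # Explicit sum over the n individual
                    for i in range(evald.shape[1]))     # ... contributions
    assert np.allclose(meat_fast, meat_slow)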
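
The corrected inverse-logit expression, :math:`1 / (1 + \exp(-o))`, can likewise be verified with a round trip
through ``logit``; a minimal sketch, assuming only that both helpers are importable from
``delicatessen.utilities`` as documented:

    import numpy as np
    from delicatessen.utilities import logit, inverse_logit

    p = np.array([0.1, 0.5, 0.9])
    o = logit(p)                             # log(p / (1 - p)), the log-odds
    assert np.allclose(inverse_logit(o), p)  # 1 / (1 + exp(-o)) recovers p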
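
The robust loss scores admit the same kind of spot check. Below is a sketch for the Huber score, using the
documented signature ``robust_loss_functions(residual, loss, k)`` and comparing against the piecewise
definition :math:`x I(-k < x < k) + \text{sign}(x) k (1 - I(-k < x < k))` written directly in NumPy (the
residuals and tuning value are hypothetical):

    import numpy as np
    from delicatessen.utilities import robust_loss_functions

    x = np.linspace(-5, 5, 101)                          # Hypothetical residuals
    k = 1.345                                            # Common Huber tuning value
    fx = robust_loss_functions(x, loss='huber', k=k)
    direct = np.where(np.abs(x) < k, x, np.sign(x) * k)  # Piecewise Huber score
    assert np.allclose(fx, direct)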
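
Finally, the spline basis used by ``spline`` and ``additive_design_matrix`` is built from terms of the form
:math:`s_k(X) = I(X > k) \left\{ X - k \right\}^a`. The helper below is hypothetical (it is not part of
delicatessen); it only evaluates a single unrestricted knot term to make the definition concrete:

    import numpy as np

    def spline_term(x, knot, power=3):
        # I(X > k) * (X - k)^power for a single knot k
        return np.where(x > knot, (x - knot) ** power, 0.0)

    x = np.linspace(0, 10, 6)         # [0, 2, 4, 6, 8, 10]
    print(spline_term(x, knot=4.0))   # [0, 0, 0, 8, 64, 216]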