% FiniteSampleCriteria.tex

\documentclass[12pt,letterpaper]{article}
\usepackage{amsmath, amssymb, graphics}
\usepackage{hyperref}
\usepackage{mathtools}
\usepackage{fullpage}

\mathtoolsset{showonlyrefs,showmanualtags}
% \allowdisplaybreaks[1]

\newcommand{\Pochhammer}[2]{\left(#1\right)_{#2}}

\begin{document}

\section*{Finite sample criteria for autoregressive order selection}

This document details autoregressive model selection criteria following
Broersen.\footnote{Broersen, P. M. T. ``Finite sample criteria for
autoregressive order selection.'' IEEE Transactions on Signal Processing 48
(December 2000): 3550-3558.  \url{http://dx.doi.org/10.1109/78.887047}.}
Emphasis is placed on converting the formulas into forms efficiently computable
when evaluating a single model.  When evaluating a hierarchy of models,
computing using intermediate results may be more efficient.

\subsection*{Setting}

An AR(K) process and its AR(p) model are given by
\begin{subequations}
\begin{align}
    x_n+a_1x_{n-1}+\dots+a_Kx_{n-K}&=\epsilon_n
    \\
    {x}_n+\hat{a}_1x_{n-1}+\dots+\hat{a}_px_{n-p}&=\hat{\epsilon}_n
\end{align}
\end{subequations}
in which $\epsilon_n\sim N\left(0,\sigma_{\epsilon}^2\right)$ and
$\hat{\epsilon}_n\sim N\left(0,\hat{\sigma}_{\epsilon}^2\right)$.
Model selection criteria for evaluating which of several candidates most
parsimoniously fits an AR(K) process generally have the form
\begin{align}
    \label{eq:general}
    \text{criterion}\!\left(v_\text{method},N,p,\alpha\right)
    &=
    \ln \text{residual}\!\left(v_\text{method},p\right)
    +
    \text{overfit}\!\left(\text{criterion},v_\text{method},N,p,\alpha\right)
    .
\end{align}
Among all candidates and using a given criterion, the ``best'' model minimizes
the criterion.  Here, $N$ represents the number of samples used to estimate
model parameters, $p$ denotes the order of the estimated model,
$v_\text{method}=v_\text{method}\!\left(N,i\right)$ is the method-specific
estimation variance for model order $i$, and $\alpha$ is an optional factor
with a criterion-dependent meaning.  When estimating
$\hat{a}_1,\dots,\hat{a}_p$ given sample data $x_n$, the residual variance is
\begin{align}
    \text{residual}\!\left(v_\text{method},p\right)
    =
    \text{residual}\!\left(p\right)
    &=
    \hat{\sigma}_\epsilon^2
    .
\end{align}
Therefore the left term in~\eqref{eq:general} penalizes misfitting the data
independently of the estimation method used.  One may therefore distinguish
among criteria using only the overfitting penalty term, namely
$\text{overfit}\!\left(\text{criterion},v_\text{method},N,p,\alpha\right)$.

In Broersen's work, the penalty term depends upon the model estimation
method used through the estimation variance $v$:
\begin{subequations}
\label{eq:v1}
\begin{align}
    v_\text{Yule--Walker}\!\left(N,i\right) &= \frac{N-i}{N\left(N+2\right)}
    &
    &i \neq 0
    \\
    v_\text{Burg}\!\left(N,i\right) &= \frac{1}{N + 1 - i}
    &
    &i \neq 0
    \\
    v_\text{LSFB}\!\left(N,i\right) &= \frac{1}{N + 1.5 - 1.5i}
    &
    &i \neq 0
    \\
    v_\text{LSF}\!\left(N,i\right)  &= \frac{1}{N + 2 - 2i}
    &
    &i \neq 0
\end{align}
\end{subequations}
Here ``LSFB'' and ``LSF'' are shorthand for least squares estimation minimizing
both the forward and backward prediction or only the forward prediction,
respectively.  The estimation variance for $i=0$ depends only on whether or not
the sample mean has been subtracted:
\begin{subequations}
\label{eq:v0}
\begin{align}
    v\!\left(N,0\right) &= \frac{1}{N}
    &
    &\text{sample mean subtracted}
    \\
    v\!\left(N,0\right) &= 0
    &
    &\text{sample mean retained}
\end{align}
\end{subequations}

\subsection*{Infinite sample overfit penalty terms}

The method-independent generalized information criterion (GIC) has overfitting
penalty
\begin{align}
    \text{overfit}\!\left(\text{GIC},N,p,\alpha\right)
    &=
    \alpha \frac{p}{N}
\end{align}
independent of $v_\text{method}$.  The Akaike information criterion
(AIC) has
\begin{align}
    \label{eq:aic}
    \text{overfit}\!\left(\text{AIC},N,p\right)
    &=
    \text{overfit}\!\left(\text{GIC},N,p,2\right)
\end{align}
while the consistent criterion BIC and minimally consistent criterion
(MCC) have
\begin{align}
    \label{eq:bic}
    \text{overfit}\!\left(\text{BIC},N,p\right)
    &=
    \text{overfit}\!\left(\text{GIC},N,p,\ln{} N\right)
    \\
    \label{eq:mcc}
    \text{overfit}\!\left(\text{MCC},N,p\right)
    &=
    \text{overfit}\!\left(\text{GIC},N,p,2\ln\ln{}N\right)
    .
\end{align}
Additionally, Broersen uses $\alpha = 3$ with GIC referring to the result as
GIC(p,3).  The asymptotically-corrected Akaike information criterion
($\text{AIC}_\text{C}$) of Hurvich and Tsai\footnote{Hurvich, Clifford M. and
Chih-Ling Tsai. ``Regression and time series model selection in small samples.''
Biometrika 76 (June 1989): 297-307.  \url{http://dx.doi.org/10.1093/biomet/76.2.297}.}
is
\begin{align}
    \text{overfit}\!\left(\text{AIC}_\text{C},N,p\right)
    &=
    \frac{2p}{N-p-1}
    .
\end{align}

\subsection*{Finite sample overfit penalty terms}

\subsubsection*{Finite information criterion\footnote{FIC is mistakenly called
the ``finite sample information criterion'' on page 3551 of Broersen 2000 but
referred to correctly as the ``finite information criterion'' on page 187 of
Broersen's 2006 book.}}

The finite information criterion (FIC) is an extension of GIC meant to account
for finite sample size and the estimation method employed.  The FIC overfit
penalty term is
\begin{align}
    \text{overfit}\!\left(\text{FIC},v_\text{method},N,p,\alpha\right)
    &=
    \alpha \sum_{i=0}^{p} v_\text{method}\!\left(N,i\right)
    \notag
    \\
    &=
    \alpha\left(
      v\!\left(N,0\right)
    + \sum_{i=1}^{p} v_\text{method}\!\left(N,i\right)
    \right)
\end{align}
where $v\!\left(N,0\right)$ is evaluated using \eqref{eq:v0} and
$v_\text{method}\!\left(N,i\right)$ from \eqref{eq:v1}.  The factor $\alpha$
may be chosen as in~\eqref{eq:aic}, \eqref{eq:bic}, or~\eqref{eq:mcc}.  Again,
Broersen uses $\alpha = 3$ calling the result FIC(p,3).

By direct computation one finds the following:
\begin{subequations}
\begin{align}
    \text{overfit}\!\left(\text{FIC},v_\text{Yule--Walker},N,p,\alpha\right)
    &=
    \alpha\left(
      v\!\left(N,0\right)
    - \frac{p \left(1-2 N+p\right)}{2 N \left(N+2\right)}
    \right)
\\
    \text{overfit}\!\left(\text{FIC},v_\text{Burg},N,p,\alpha\right)
    &=
    \alpha\left(
      v\!\left(N,0\right)
    + \psi\!\left(N+1\right)
    - \psi\!\left(N+1-p\right)
    \right)
\\
    \text{overfit}\!\left(\text{FIC},v_\text{LSFB},N,p,\alpha\right)
    &=
    \alpha\left(
      v\!\left(N,0\right)
    + \frac{2}{3} \left(
            \psi\!\left(\frac{3+2N}{3}\right)
          - \psi\!\left(\frac{3+2N}{3}-p\right)
      \right)
    \right)
\\
    \text{overfit}\!\left(\text{FIC},v_\text{LSF},N,p,\alpha\right)
    &=
    \alpha\left(
      v\!\left(N,0\right)
    + \frac{1}{2} \left(
            \psi\!\left(\frac{2+N}{2}\right)
          - \psi\!\left(\frac{2+N}{2}-p\right)
      \right)
    \right)
\end{align}
\end{subequations}
The simplifications underneath the Burg, LSFB, and LSF results use that
\begin{align}
    \sum_{i=1}^{p} \frac{1}{N+a-ai}
    &=
    \sum_{i=0}^{p-1} \frac{1}{N-ai}
    =
    \frac{1}{a} \sum_{i=0}^{p-1} \frac{1}{\frac{N}{a}-i}
    =
    \frac{1}{a} \left(
          \psi\!\left(\frac{N}{a}+1\right)
        - \psi\!\left(\frac{N}{a}-p+1\right)
    \right)
\end{align}
holds $\forall{}a\in\mathbb{R}\setminus\{0\}$ because the digamma function $\psi$
telescopes according to
\begin{align}
    \psi\!\left(x+1\right)
    =
    \frac{1}{x}
    +
    \psi\!\left(x\right)
    &\implies
    \psi\!\left(x+k\right)
    -
    \psi\!\left(x\right)
    =
    \sum_{i=0}^{k-1} \frac{1}{x + i}
    .
\end{align}
For strictly positive abscissae, $\psi$ may be numerically evaluated following
Bernardo.\footnote{Bernardo, J. M.  ``Algorithm AS 103: Psi (digamma)
function.'' Journal of the Royal Statistical Society.  Series C (Applied
Statistics) 25 (1976).  \url{http://www.jstor.org/stable/2347257}}

\subsubsection*{Finite sample information criterion}

The finite sample information criterion (FSIC) is a finite sample approximation
to the Kullback--Leibler discrepancy\footnote{Presumably FSIC could be related,
through the Kullback symmetric divergence, to the KICc and AKICc criteria
proposed by Seghouane, A. K. and M. Bekara.  ``A Small Sample Model Selection
Criterion Based on Kullback's Symmetric Divergence.'' IEEE Transactions on
Signal Processing 52 (December 2004): 3314-3323.
\url{http://dx.doi.org/10.1109/TSP.2004.837416}.}. FSIC has the overfit penalty
term
\begin{align}
    \text{overfit}\!\left(\text{FSIC},v_\text{method},N,p\right)
    &=
    \prod_{i=0}^{p}
    \frac{
        1 + v_\text{method}\!\left(N,i\right)
    }{
        1 - v_\text{method}\!\left(N,i\right)
    }
    - 1
    \notag
    \\
    &=
    \frac{
        1 + v\!\left(N,0\right)
    }{
        1 - v\!\left(N,0\right)
    }
    \cdot
    \prod_{i=1}^{p}
    \frac{
        1 + v_\text{method}\!\left(N,i\right)
    }{
        1 - v_\text{method}\!\left(N,i\right)
    }
    - 1
    .
    \label{eq:fsicGeneral}
\end{align}

The product in the context of the Yule--Walker estimation may be reexpressed as
\begin{align}
    \prod_{i=1}^{p}
    \frac{
        1 + v_\text{Yule--Walker}\!\left(N,i\right)
    }{
        1 - v_\text{Yule--Walker}\!\left(N,i\right)
    }
    &=
    \prod_{i=1}^{p}
    \frac{
        N^2 + 3N - i
    }{
        N^2 +  N + i
    }
    \notag
    \\
    &=
    \left(-1\right)^p
    \frac{
        \Pochhammer{1-3N-N^2}{p}
    }{
        \Pochhammer{1+N+N^2}{p}
    }
    =
    \frac{
        \Pochhammer{N^2+3N-p}{p}
    }{
        \Pochhammer{1+N+N^2}{p}
    }
    \label{eq:pochYW}
\end{align}
where the ``rising factorial'' is denoted by the Pochhammer symbol
\begin{align}
    \Pochhammer{x}{k} &= \frac{\Gamma\left(x+k\right)}{\Gamma\left(x\right)}
    .
\end{align}
When $x$ is a nonpositive integer and $\Gamma$ is therefore undefined, the
limiting value of the ratio is implied.  The product in the context of the
Burg, LSFB, or LSF estimation methods becomes
\begin{align}
    \prod_{i=1}^{p}
    \frac{
        1 + v_\text{Burg$\vert{}$LSFB$\vert{}$LSF}\!\left(N,i\right)
    }{
        1 - v_\text{Burg$\vert{}$LSFB$\vert{}$LSF}\!\left(N,i\right)
    }
    &=
    \prod_{i=1}^{p}
    \frac{
        N + a\left(1-i\right) + 1
    }{
        N + a\left(1-i\right) - 1
    }
    =
    \frac{
        \Pochhammer{-\frac{1+N}{a}}{p}
    }{
        \Pochhammer{\frac{1-N}{a}}{p}
    }
    \label{eq:pochCommon}
\end{align}
where $a\in\mathbb{R}$ is a placeholder for a method-specific constant
($a=1$ for Burg, $a=3/2$ for LSFB, and $a=2$ for LSF).
Routines for computing the Pochhammer symbol may be found in, for example,
SLATEC\footnote{ Vandevender, W. H. and K. H. Haskell. ``The SLATEC
mathematical subroutine library.'' ACM SIGNUM Newsletter 17 (September 1982):
16-21.  \url{http://dx.doi.org/10.1145/1057594.1057595} } or the GNU Scientific
Library\footnote{ M. Galassi et al, GNU Scientific Library Reference Manual
(3rd Ed.), ISBN 0954612078. \url{http://www.gnu.org/software/gsl/} }.  In
particular, both suggested sources handle negative integer input correctly.

By direct substitution of \eqref{eq:pochYW} or \eqref{eq:pochCommon} into
\eqref{eq:fsicGeneral} one obtains:
\begin{align}
    \text{overfit}\!\left(\text{FSIC},v_\text{Yule--Walker},N,p\right)
    &=
    \frac{
        1 + v\!\left(N,0\right)
    }{
        1 - v\!\left(N,0\right)
    }
    \cdot
    \frac{
        \Pochhammer{N^2+3N-p}{p}
    }{
        \Pochhammer{1+N+N^2}{p}
    }
    - 1
    \\
    \text{overfit}\!\left(\text{FSIC},v_\text{Burg},N,p\right)
    &=
    \frac{
        1 + v\!\left(N,0\right)
    }{
        1 - v\!\left(N,0\right)
    }
    \cdot
    \frac{
        \Pochhammer{-1-N}{p}
    }{
        \Pochhammer{1-N}{p}
    }
    - 1
    \\
    \text{overfit}\!\left(\text{FSIC},v_\text{LSFB},N,p\right)
    &=
    \frac{
        1 + v\!\left(N,0\right)
    }{
        1 - v\!\left(N,0\right)
    }
    \cdot
    \frac{
        \Pochhammer{\frac{-2-2N}{3}}{p}
    }{
        \Pochhammer{\frac{2-2N}{3}}{p}
    }
    - 1
    \\
    \text{overfit}\!\left(\text{FSIC},v_\text{LSF},N,p\right)
    &=
    \frac{
        1 + v\!\left(N,0\right)
    }{
        1 - v\!\left(N,0\right)
    }
    \cdot
    \frac{
        \Pochhammer{\frac{-1-N}{2}}{p}
    }{
        \Pochhammer{\frac{1-N}{2}}{p}
    }
    - 1
\end{align}

\subsubsection*{Combined information criterion}

The combined information criterion (CIC) takes the behavior of FIC(p,3) at low
orders and FSIC at high orders.  For any estimation method CIC has the
overfit penalty term
\begin{align}
    \text{overfit}\!\left(\text{CIC},v_\text{method},N,p\right)
    =
    \max\Bigl\{
        &\text{overfit}\!\left(\text{FSIC},v_\text{method},N,p\right),
\\
        &\text{overfit}\!\left(\text{FIC},v_\text{method},N,p,3\right)
    \Bigr\}
    .
\end{align}

\end{document}