Skip to content

Commit

Permalink
Improved KernelSmoothing
Browse files Browse the repository at this point in the history
Now, a log transformation can be applied to ease the reconstruction of skewed distributions.

Added documentation & tests. Fixed minor errors. Thanks to adutfoy@github.com.
  • Loading branch information
regislebrun committed Jun 24, 2024
1 parent b5fd833 commit 08eef25
Show file tree
Hide file tree
Showing 10 changed files with 448 additions and 77 deletions.
18 changes: 11 additions & 7 deletions lib/etc/openturns.conf.in
Original file line number Diff line number Diff line change
Expand Up @@ -576,13 +576,14 @@
<KernelMixture-SmallSize value_int="50" />

<!-- OT::KernelSmoothing parameters -->
<KernelSmoothing-AbsolutePrecision value_float="0.0" />
<KernelSmoothing-CutOffPlugin value_float="5.0" />
<KernelSmoothing-RelativePrecision value_float="1.0e-5" />
<KernelSmoothing-ResidualPrecision value_float="1.0e-10" />
<KernelSmoothing-BinNumber value_int="1024" />
<KernelSmoothing-MaximumIteration value_int="50" />
<KernelSmoothing-SmallSize value_int="250" />
<KernelSmoothing-AbsolutePrecision value_float="0.0" />
<KernelSmoothing-CutOffPlugin value_float="5.0" />
<KernelSmoothing-RelativePrecision value_float="1.0e-5" />
<KernelSmoothing-ResidualPrecision value_float="1.0e-10" />
<KernelSmoothing-DefaultShiftScale value_float="1.0e-5" />
<KernelSmoothing-BinNumber value_int="1024" />
<KernelSmoothing-MaximumIteration value_int="50" />
<KernelSmoothing-SmallSize value_int="250" />

<!-- OT::LogNormal parameters -->
<LogNormal-CharacteristicFunctionSmallSigmaThreshold value_float="0.2" />
Expand All @@ -601,6 +602,9 @@
<MarginalDistribution-MaximumSubIntervals value_int="128" />
<MarginalDistribution-Rule value_str="G15K31" />

<!-- MarginalUniformOrderStatistics parameters -->
<MarginalUniformOrderStatistics-LargeCaseCDF value_int="1000" />

<!-- Meixner parameters -->
<MeixnerDistribution-MaximumAbsoluteError value_float="1.0e-12" />
<MeixnerDistribution-MaximumConstraintError value_float="1.0e-12" />
Expand Down
4 changes: 4 additions & 0 deletions lib/src/Base/Common/ResourceMap.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -1209,6 +1209,7 @@ void ResourceMap::loadDefaultConfiguration()
addAsScalar("KernelSmoothing-CutOffPlugin", 5.0);
addAsScalar("KernelSmoothing-RelativePrecision", 1.0e-5);
addAsScalar("KernelSmoothing-ResidualPrecision", 1.0e-10);
addAsScalar("KernelSmoothing-DefaultShiftScale", 1.0e-5);
addAsUnsignedInteger("KernelSmoothing-BinNumber", 1024);
addAsUnsignedInteger("KernelSmoothing-MaximumIteration", 50);
addAsUnsignedInteger("KernelSmoothing-SmallSize", 250);
Expand All @@ -1230,6 +1231,9 @@ void ResourceMap::loadDefaultConfiguration()
addAsString("MarginalDistribution-Rule", "G15K31");
addAsUnsignedInteger("MarginalDistribution-MaximumSubIntervals", 128);

// MarginalUniformOrderStatistics //
addAsUnsignedInteger("MarginalUniformOrderStatistics-LargeCaseCDF", 1000);

// Meixner parameters //
addAsScalar("MeixnerDistribution-MaximumAbsoluteError", 1.0e-12);
addAsScalar("MeixnerDistribution-MaximumConstraintError", 1.0e-12);
Expand Down
85 changes: 81 additions & 4 deletions lib/src/Uncertainty/Distribution/KernelSmoothing.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "openturns/PersistentObjectFactory.hxx"
#include "openturns/Brent.hxx"
#include "openturns/MethodBoundEvaluation.hxx"
#include "openturns/SymbolicFunction.hxx"
#include "openturns/Function.hxx"
#include "openturns/HermiteFactory.hxx"
#include "openturns/UniVariatePolynomial.hxx"
Expand All @@ -33,6 +34,7 @@
#include "openturns/SobolSequence.hxx"
#include "openturns/ResourceMap.hxx"
#include "openturns/JointDistribution.hxx"
#include "openturns/CompositeDistribution.hxx"
#include "openturns/BlockIndependentDistribution.hxx"

BEGIN_NAMESPACE_OPENTURNS
Expand Down Expand Up @@ -256,8 +258,41 @@ Distribution KernelSmoothing::build(const Sample & sample) const
{
// For 1D sample, use the rule that give the best tradeoff between speed and precision
if (sample.getDimension() == 1)
return build(sample, computeMixedBandwidth(sample));

{
if (useLogTransform_)
{
const Scalar skewness = sample.computeSkewness()[0];
const Scalar xMin = sample.getMin()[0];
const Scalar xMax = sample.getMax()[0];
const Scalar delta = (xMax - xMin) * std::max(SpecFunc::Precision, ResourceMap::GetAsScalar("KernelSmoothing-DefaultShiftScale"));
SymbolicFunction transform;
SymbolicFunction inverseTransform;
OSS oss(true);
// To get the full double precision
oss.setPrecision(17);
if (skewness >= 0.0)
{
const Scalar shift = delta - xMin;
transform = SymbolicFunction("x", String(oss << "log(x+(" << shift << "))"));
oss.clear();
inverseTransform = SymbolicFunction("y", String(oss << "exp(y)-(" << shift << ")"));
}
else
{
const Scalar shift = xMax + delta;
transform = SymbolicFunction("x", String(oss << "log((" << shift << ")-x)"));
oss.clear();
inverseTransform = SymbolicFunction("y", String(oss << "(" << shift << ") - exp(y)"));
}
const Sample transformedSample(transform(sample));
const Distribution transformedDistribution(build(transformedSample, computeMixedBandwidth(transformedSample)));
CompositeDistribution fitted(inverseTransform, transformedDistribution);
fitted.setDescription(sample.getDescription());
return fitted;
} // useLogTransform
else
return build(sample, computeMixedBandwidth(sample));
} // dimension 1
// For nD sample, use the only available rule
return build(sample, computeSilvermanBandwidth(sample));
}
Expand All @@ -279,8 +314,8 @@ Distribution KernelSmoothing::build(const Sample & sample,
if (xmin == xmax)
{
bandwidth_ = bandwidth;
KernelSmoothing::Implementation result(new Dirac(xmin));
result->setDescription(sample.getDescription());
Dirac result(xmin);
result.setDescription(sample.getDescription());
return result;
}
Indices degenerateIndices;
Expand Down Expand Up @@ -549,6 +584,7 @@ TruncatedDistribution KernelSmoothing::buildAsTruncatedDistribution(const Sample
// Now, work on the extended sample
SampleImplementation newSample(newSampleData.getSize(), 1);
newSample.setData(newSampleData);
newSample.setDescription(sample.getDescription());
size = newSample.getSize();
const Bool mustBin = binned_ && (dimension * std::log(1.0 * binNumber_) < std::log(1.0 * size));
if (binned_ != mustBin) LOGINFO("Will not bin the data because the bin number is greater than the sample size");
Expand Down Expand Up @@ -585,6 +621,11 @@ void KernelSmoothing::setBoundaryCorrection(const Bool boundaryCorrection)
boundingOption_ = (boundaryCorrection ? BOTH : NONE);
}

/* Boundary correction accessor: tells whether any boundary treatment is active */
Bool KernelSmoothing::getBoundaryCorrection() const
{
  // Any option other than NONE (i.e. lower, upper or both bounds) counts as a correction
  return (boundingOption_ != NONE);
}

/* Boundary correction accessor */
void KernelSmoothing::setBoundingOption(const BoundingOption boundingOption)
{
Expand Down Expand Up @@ -615,6 +656,38 @@ void KernelSmoothing::setAutomaticUpperBound(const Bool automaticUpperBound)
automaticUpperBound_ = automaticUpperBound;
}

/* Binning accessors */
/* Binning accessor: enable/disable the binned (histogram-accelerated) version of the estimator */
void KernelSmoothing::setBinning(const Bool binned)
{
  binned_ = binned;
}

/* Binning accessor: tells whether the binned version of the estimator is requested */
Bool KernelSmoothing::getBinning() const
{
  return binned_;
}

/* Bin number accessor */
void KernelSmoothing::setBinNumber(const UnsignedInteger binNumber)
{
if (binNumber_ < 2) throw InvalidArgumentException(HERE) << "Error: The number of bins=" << binNumber << " is less than 2.";
}

/* Bin number accessor: number of bins per dimension used by the binned estimator */
UnsignedInteger KernelSmoothing::getBinNumber() const
{
  return binNumber_;
}

/* Use log transform accessor */
/* Log-transform accessor: enable/disable the log-transform treatment applied
   before the kernel smoothing (eases the reconstruction of skewed distributions) */
void KernelSmoothing::setUseLogTransform(const Bool useLog)
{
  useLogTransform_ = useLog;
}

/* Log-transform accessor: tells whether the log-transform treatment is requested */
Bool KernelSmoothing::getUseLogTransform() const
{
  return useLogTransform_;
}

/* Method save() stores the object through the StorageManager */
void KernelSmoothing::save(Advocate & adv) const
Expand Down Expand Up @@ -646,6 +719,10 @@ void KernelSmoothing::load(Advocate & adv)
adv.loadAttribute("automaticLowerBound_", automaticLowerBound_);
adv.loadAttribute("upperBound_", upperBound_);
adv.loadAttribute("automaticUpperBound_", automaticUpperBound_);
if (adv.hasAttribute("useLogTransform_"))
adv.loadAttribute("useLogTransform_", useLogTransform_);
else
useLogTransform_ = false;
}

END_NAMESPACE_OPENTURNS
29 changes: 22 additions & 7 deletions lib/src/Uncertainty/Distribution/openturns/KernelSmoothing.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -75,20 +75,33 @@ public:
/** Kernel accessor */
Distribution getKernel() const;

/* Boundary correction accessor, shortcut for setBoundingOption(NONE) or setBoundingOption(BOTH) */
/** Boundary correction accessor, shortcut for setBoundingOption(NONE) or setBoundingOption(BOTH) */
void setBoundaryCorrection(const Bool boundaryCorrection);
Bool getBoundaryCorrection() const;

/* Boundary correction accessor */
/** Boundary correction accessor */
void setBoundingOption(const BoundingOption boundingOption);

/* Boundary accessor */
/** Boundary accessor */
void setLowerBound(const Scalar lowerBound);
void setUpperBound(const Scalar upperBound);

/* Automatic boundary accessor */
/** Automatic boundary accessor */
void setAutomaticLowerBound(const Bool automaticLowerBound);
void setAutomaticUpperBound(const Bool automaticUpperBound);

/** Binning accessors */
void setBinning(const Bool binned);
Bool getBinning() const;

/** Bin number accessor */
void setBinNumber(const UnsignedInteger binNumber);
UnsignedInteger getBinNumber() const;

/** Use log transform accessor */
void setUseLogTransform(const Bool useLog);
Bool getUseLogTransform() const;

/** Compute the bandwidth according to Silverman's rule */
Point computeSilvermanBandwidth(const Sample & sample) const;

Expand Down Expand Up @@ -124,20 +137,22 @@ private:
Distribution kernel_;

// Flag to tell if we compute a binned version of the estimator
Bool binned_;
Bool binned_ = false;

// Number of bins in each dimension
UnsignedInteger binNumber_;
UnsignedInteger binNumber_ = ResourceMap::GetAsUnsignedInteger("KernelSmoothing-BinNumber");

// Direction of the boundary treatment
BoundingOption boundingOption_;
BoundingOption boundingOption_ = NONE;

// Known bounds
Scalar lowerBound_;
Bool automaticLowerBound_;
Scalar upperBound_;
Bool automaticUpperBound_;

// Use log transform
Bool useLogTransform_ = false;
}; /* class KernelSmoothing */


Expand Down
10 changes: 9 additions & 1 deletion python/doc/bibliography.rst
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,10 @@ Bibliography
http://ceres-solver.org
.. [chacon2018] Chacón, J. E., & Duong, T. (2018).
*Multivariate kernel smoothing and its applications.* CRC Press.
.. [charpentier2015] Charpentier, A., & Flachaire, E. (2015).
*Log-Transform Kernel Density Estimation of Income Distribution* WP 2015-Nr 6,
AMSE Aix Marseille School of Economics.
`pdf <https://www.amse-aixmarseille.fr/sites/default/files/_dt/2012/wp_2015_-_nr_06.pdf>`__
.. [chihara1978] Chihara, T. S. (1978).
*An introduction to orthogonal polynomials.* Dover publications.
.. [chapelle2002] Chapelle, O., Vapnik, V., & Bengio, Y. (2002).
Expand Down Expand Up @@ -193,7 +197,7 @@ Bibliography
.. [jin2005] R. Jin, W. Chen, and A. Sudjianto. *An efficient algorithm for
constructing optimal design of computer experiments.*
Journal of Statistical Planning and Inference, 134 :268-287, 2005.
`pdf <https://openturns.github.io/openturns/papers/jin2005.pdf>`__
`pdf <https://openturns.github.io/openturns/papers/jin2005.pdf>`__
.. [johnson1990] Johnson M, Moore L and Ylvisaker D (1990).
*Minimax and maximin distance design.*
Journal of Statistical Planning and Inference 26(2): 131-148.
Expand All @@ -204,6 +208,10 @@ Bibliography
*Global optimization of expensive black-box functions*,
Journal of Global Optimization, 13(4), 455-492, 1998.
`pdf <https://openturns.github.io/openturns/papers/jones1998.pdf>`__
.. [jones1993] M.C. Jones,
*Simple boundary correction for kernel density estimation*,
Statistics and Computing. Vol. 3, Issue 3, 1993, pp. 135-146,
https://doi.org/10.1007/BF00147776
.. [Keutelian1991] Hovhannes Keutelian.
*The Kolmogorov-Smirnov test when parameters are estimated from data*,
30 April 1991, Fermilab.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@
# Boundary corrections
# --------------------
#
# We finish this example on an advanced feature of the kernel smoothing, the boundary corrections.
# We detail here an advanced feature of the kernel smoothing, the boundary corrections.
#

# %%
Expand Down Expand Up @@ -257,5 +257,60 @@
# %%
# The boundary correction made has a remarkable impact on the quality of the estimate for the small values.

# %%
# Log-transform treatment
# -----------------------
#
# We finish this example on another advanced feature of the kernel smoothing: the log-transform treatment.
# This treatment is highly suited to skewed distributions, which are all challenging for kernel smoothing.
#

# %%
# We consider several distributions which have significant skewness:
distCollection = [ot.LogNormal(0.0, 2.5), ot.Beta(20000.5, 2.5, 0.0, 1.0), ot.Exponential(),
ot.WeibullMax(1.0, 0.9, 0.0), ot.Mixture([ot.Normal(-1.0, 0.5), ot.Normal(1.0, 1.0)], [0.4, 0.6]),
ot.Mixture([ot.LogNormal(-1.0, 1.0, -1.0), ot.LogNormal(1.0, 1.0, 1.0)], [0.2, 0.8])]

# %%
# For each distribution, we do the following steps:
#
# - we generate a sample of size 5000,
# - we fit a kernel smoothing distribution without the log-transform treatment,
# - we fit a kernel smoothing distribution with the log-transform treatment,
# - we plot the real distribution and both non parametric estimations.
#
# Other transformations could be used, but the log-transform one is quite effective. If the skewness is moderate,
# there is almost no change with respect to simple kernel smoothing. But if the skewness is large, the transformation performs
# very well. Note that, in addition, this transformation performs an automatic boundary correction.
grid = ot.GridLayout(2, 3)
ot.RandomGenerator.SetSeed(0)
for index, distribution in enumerate(distCollection):
    sample = distribution.getSample(5000)

    # Reference curve: the exact PDF of the distribution
    graph = distribution.drawPDF()
    graph.setLegends([distribution.getClassName()])

    # Kernel smoothing with the default kernel and no particular treatment
    kernel = ot.KernelSmoothing()
    fitted = kernel.build(sample)
    curve = fitted.drawPDF()
    curve.setLegends(["Fitted"])
    graph.add(curve)

    # Same estimator with the log-transform treatment activated
    kernel.setUseLogTransform(True)
    fitted = kernel.build(sample)
    curve = fitted.drawPDF()
    curve.setLegends(["Fitted LogTransform"])
    curve = curve.getDrawable(0)
    curve.setLineStyle("dashed")
    graph.add(curve)

    graph.setColors(ot.Drawable.BuildDefaultPalette(3))
    grid.setGraph(index // 3, index % 3, graph)

view = viewer.View(grid)

plt.show()
11 changes: 9 additions & 2 deletions python/doc/theory/data_analysis/data_analysis.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,21 @@ Comparison of two samples
qqplot_graph
smirnov_test

Estimation of a parametric model
--------------------------------
Estimation of a nonparametric model
-----------------------------------

.. toctree::
:maxdepth: 1

empirical_cdf
kernel_smoothing

Estimation of a parametric model
--------------------------------

.. toctree::
:maxdepth: 1

maximum_likelihood
parametric_estimation

Expand Down
Loading

0 comments on commit 08eef25

Please sign in to comment.