diff --git a/src/EmpiricalDistributions.jl b/src/EmpiricalDistributions.jl index 4dd0719..f5ea19e 100644 --- a/src/EmpiricalDistributions.jl +++ b/src/EmpiricalDistributions.jl @@ -13,6 +13,7 @@ using Distributions using StatsBase +include("hist_funcs.jl") include("uv_binned_dist.jl") include("mv_binned_dist.jl") diff --git a/src/hist_funcs.jl b/src/hist_funcs.jl new file mode 100644 index 0000000..988aa75 --- /dev/null +++ b/src/hist_funcs.jl @@ -0,0 +1,76 @@ +# This file is a part of EmpiricalDistributions.jl, licensed under the MIT License (MIT). + + +function _pdf(h::Histogram{T,N}, xs::NTuple{N,Real}) where {T,N} + @assert h.isdensity # Implementation requires normalized histogram + + idx = StatsBase.binindex(h, xs) + r::T = zero(T) + if checkbounds(Bool, h.weights, idx...) + @inbounds r = h.weights[idx...] + end + r +end + + +function _mean(h::StatsBase.Histogram{<:Real, N}; T::DataType = Float64) where {N} + @assert !h.isdensity # Implementation currently assumes non-normalized histogram + + s_inv::T = inv(sum(h.weights)) + m::Vector{T} = zeros(T, N) + mps = StatsBase.midpoints.(h.edges) + cart_inds = CartesianIndices(h.weights) + for i in cart_inds + for idim in 1:N + m[idim] += s_inv * mps[idim][i[idim]] * h.weights[i] + end + end + return m +end + + +_findmaxidx_tuple_or_int(A::AbstractVector{<:Real}) = findmax(A)[2] +_findmaxidx_tuple_or_int(A::AbstractArray{<:Real}) = findmax(A)[2].I + +function _mode(h::StatsBase.Histogram; T::DataType = Float64) + @assert h.isdensity # Implementation requires normalized histogram + + maxidx = _findmaxidx_tuple_or_int(h.weights) + mode_corner1 = map(getindex, h.edges, maxidx) + mode_corner2 = map(getindex, h.edges, maxidx .+ 1) + cov_est = T[(mode_corner1 .+ mode_corner2) ./ 2...] +end + + +function _var(h::StatsBase.Histogram{<:Real, N}; T::DataType = Float64, mean = StatsBase.mean(h, T = T), ) where {N} + @assert !h.isdensity # Implementation currently assumes non-normalized histogram + + s_inv::T = inv(sum(h.weights)) + v::Vector{T} = zeros(T, N) + mps = StatsBase.midpoints.(h.edges) + cart_inds = CartesianIndices(h.weights) + for i in cart_inds + for idim in 1:N + v[idim] += s_inv * (mps[idim][i[idim]] - mean[idim])^2 * h.weights[i] + end + end + return v +end + + +function _cov(h::StatsBase.Histogram{<:Real, N}; T::DataType = Float64, mean = StatsBase.mean(h, T = T)) where {N} + @assert !h.isdensity # Implementation currently assumes non-normalized histogram + + s_inv::T = inv(sum(h.weights)) + c::Matrix{T} = zeros(T, N, N) + mps = StatsBase.midpoints.(h.edges) + cart_inds = CartesianIndices(h.weights) + for i in cart_inds + for idim in 1:N + for jdim in 1:N + c[idim, jdim] += s_inv * (mps[idim][i[idim]] - mean[idim]) * (mps[jdim][i[jdim]] - mean[jdim]) * h.weights[i] + end + end + end + return c +end diff --git a/src/mv_binned_dist.jl b/src/mv_binned_dist.jl index 9d7d9da..a1738ce 100644 --- a/src/mv_binned_dist.jl +++ b/src/mv_binned_dist.jl @@ -2,7 +2,7 @@ """ - UvBinnedDist <: Distribution{Univariate,Continuous} + MvBinnedDist <: Distribution{Multivariate,Continuous} Wraps a multi-dimensional histograms and presents it as a binned multivariate distribution. @@ -11,16 +11,21 @@ Constructor: MvBinnedDist(h::Histogram{<:Real,N}) """ -struct MvBinnedDist{T, N} <: Distributions.Distribution{Multivariate,Continuous} - h::StatsBase.Histogram{<:Real, N} - edges::NTuple{N, <:AbstractVector{T}} - cart_inds::CartesianIndices{N, NTuple{N, Base.OneTo{Int}}} - - probabilty_edges::AbstractVector{T} - - μ::AbstractVector{T} - var::AbstractVector{T} - cov::AbstractMatrix{T} +struct MvBinnedDist{ + T <: Real, + N, + H <: Histogram{<:Real, N}, + VT <: AbstractVector{T}, + MT <: AbstractMatrix{T} +} <: Distributions.Distribution{Multivariate,Continuous} + hist::H + _edges::NTuple{N, <:AbstractVector{T}} + _cart_inds::CartesianIndices{N, NTuple{N, Base.OneTo{Int}}} + _probability_edges::VT + _mean::VT + _mode::VT + _var::VT + _cov::MT end export MvBinnedDist @@ -37,83 +42,45 @@ function MvBinnedDist(h::StatsBase.Histogram{<:Real, N}, T::DataType = Float64) probabilty_edges[i+1] = v > 1 ? 1 : v end - mean = _mean(h) - var = _var(h, mean = mean) - cov = _cov(h, mean = mean) + mean_est = _mean(h) + mode_est = _mode(nh) + var_est = _var(h, mean = mean_est) + cov_est = _cov(h, mean = mean_est) - return MvBinnedDist{T, N}( + return MvBinnedDist( nh, collect.(nh.edges), CartesianIndices(nh.weights), probabilty_edges, - mean, - var, - cov + mean_est, + mode_est, + var_est, + cov_est ) end +Base.convert(::Type{Histogram}, d::MvBinnedDist) = d.hist + + Base.length(d::MvBinnedDist{T, N}) where {T, N} = N Base.size(d::MvBinnedDist{T, N}) where {T, N} = (N,) Base.eltype(d::MvBinnedDist{T, N}) where {T, N} = T -Statistics.mean(d::MvBinnedDist{T, N}) where {T, N} = d.μ -Statistics.var(d::MvBinnedDist{T, N}) where {T, N} = d.var -Statistics.cov(d::MvBinnedDist{T, N}) where {T, N} = d.cov - - -function _mean(h::StatsBase.Histogram{<:Real, N}; T::DataType = Float64) where {N} - s_inv::T = inv(sum(h.weights)) - m::Vector{T} = zeros(T, N) - mps = StatsBase.midpoints.(h.edges) - cart_inds = CartesianIndices(h.weights) - for i in cart_inds - for idim in 1:N - m[idim] += s_inv * mps[idim][i[idim]] * h.weights[i] - end - end - return m -end - - -function _var(h::StatsBase.Histogram{<:Real, N}; T::DataType = Float64, mean = StatsBase.mean(h, T = T), ) where {N} - s_inv::T = inv(sum(h.weights)) - v::Vector{T} = zeros(T, N) - mps = StatsBase.midpoints.(h.edges) - cart_inds = CartesianIndices(h.weights) - for i in cart_inds - for idim in 1:N - v[idim] += s_inv * (mps[idim][i[idim]] - mean[idim])^2 * h.weights[i] - end - end - return v -end - - -function _cov(h::StatsBase.Histogram{<:Real, N}; T::DataType = Float64, mean = StatsBase.mean(h, T = T)) where {N} - s_inv::T = inv(sum(h.weights)) - c::Matrix{T} = zeros(T, N, N) - mps = StatsBase.midpoints.(h.edges) - cart_inds = CartesianIndices(h.weights) - for i in cart_inds - for idim in 1:N - for jdim in 1:N - c[idim, jdim] += s_inv * (mps[idim][i[idim]] - mean[idim]) * (mps[jdim][i[jdim]] - mean[jdim]) * h.weights[i] - end - end - end - return c -end +Statistics.mean(d::MvBinnedDist{T, N}) where {T, N} = d._mean +StatsBase.mode(d::MvBinnedDist{T, N}) where {T, N} = d._mode +Statistics.var(d::MvBinnedDist{T, N}) where {T, N} = d._var +Statistics.cov(d::MvBinnedDist{T, N}) where {T, N} = d._cov function Distributions._rand!(r::AbstractRNG, d::MvBinnedDist{T,N}, A::AbstractVector{<:Real}) where {T, N} rand!(r, A) - next_inds::UnitRange{Int} = searchsorted(d.probabilty_edges::Vector{T}, A[1]::T) + next_inds::UnitRange{Int} = searchsorted(d._probability_edges::Vector{T}, A[1]::T) cell_lin_index::Int = min(next_inds.start, next_inds.stop) - cell_car_index = d.cart_inds[cell_lin_index] + cell_car_index = d._cart_inds[cell_lin_index] for idim in Base.OneTo(N) i = cell_car_index[idim] - sub_int = d.edges[idim][i:i+1] + sub_int = d._edges[idim][i:i+1] sub_int_width::T = sub_int[2] - sub_int[1] A[idim] = sub_int[1] + sub_int_width * A[idim] end @@ -122,11 +89,25 @@ end function Distributions._rand!(r::AbstractRNG, d::MvBinnedDist{T,N}, A::AbstractMatrix{<:Real}) where {T, N} Distributions._rand!.((r,), (d,), nestedview(A)) + return A +end + + +# Similar to unroll_tuple in StaticArrays.jl: +@generated function _unsafe_unroll_tuple(A::AbstractArray, ::Val{L}) where {L} + exprs = [:(A[idx0 + $j]) for j = 0:(L-1)] + quote + idx0 = firstindex(A) + Base.@_inline_meta + @inbounds return $(Expr(:tuple, exprs...)) + end end -function Distributions.pdf(d::MvBinnedDist{T, N}, x::AbstractArray{<:Real, 1}) where {T, N} - return @inbounds d.h.weights[StatsBase.binindex(d.h, Tuple(x))...] +function Distributions.pdf(d::MvBinnedDist{T,N}, x::AbstractVector{<:Real}) where {T,N} + length(eachindex(x)) == N || throw(ArgumentError("Length of variate doesn't match dimensionality of distribution")) + x_tpl = _unsafe_unroll_tuple(x, Val(N)) + _pdf(d.hist, x_tpl) end diff --git a/src/uv_binned_dist.jl b/src/uv_binned_dist.jl index 104cc95..68480f3 100644 --- a/src/uv_binned_dist.jl +++ b/src/uv_binned_dist.jl @@ -11,24 +11,24 @@ Constructor: UvBinnedDist(h::Histogram{<:Real,1}) """ -struct UvBinnedDist{T <: AbstractFloat} <: Distribution{Univariate,Continuous} - h::Histogram{<:Real, 1} - inv_weights::Vector{T} - edges::Vector{T} - volumes::Vector{T} - - probabilty_edges::Vector{T} - probabilty_volumes::Vector{T} - probabilty_inv_volumes::Vector{T} - - acc_prob::Vector{T} - - μ::T - var::T - cov::Matrix{T} - σ::T +struct UvBinnedDist{ + T <: Real, + H <: Histogram{<:Real,1}, + VT <: AbstractVector{T} +} <: Distribution{Univariate,Continuous} + hist::H + _inv_weights::VT + _volumes::VT + _probabilty_edges::VT + _probabilty_volumes::VT + _probabilty_inv_volumes::VT + _acc_prob::VT + _mean::T + _mode::T + _var::T end + export UvBinnedDist @@ -36,94 +36,74 @@ function UvBinnedDist(h::Histogram{<:Real, 1}, T::DataType = Float64) nh = normalize(h) probabilty_widths::Vector{T} = h.weights * inv(sum(h.weights)) probabilty_edges::Vector{T} = Vector{Float64}(undef, length(probabilty_widths) + 1) - probabilty_edges[1] = 0 + probabilty_edges[firstindex(probabilty_edges)] = 0 @inbounds for (i, w) in enumerate(probabilty_widths) probabilty_edges[i+1] = probabilty_edges[i] + probabilty_widths[i] end probabilty_edges[end] = 1 volumes = diff(h.edges[1]) - mean = Statistics.mean(StatsBase.midpoints(nh.edges[1]), ProbabilityWeights(nh.weights)) - var = Statistics.var(StatsBase.midpoints(nh.edges[1]), ProbabilityWeights(nh.weights), mean = mean) acc_prob::Vector{T} = zeros(T, length(nh.weights)) for i in 2:length(acc_prob) acc_prob[i] += acc_prob[i-1] + nh.weights[i-1] * volumes[i-1] end - d::UvBinnedDist{T} = UvBinnedDist{T}( + mean_est = _mean(h) + mode_est = _mode(nh) + var_est = _var(h, mean = mean_est) + + return UvBinnedDist( nh, inv.(nh.weights), - nh.edges[1], volumes, probabilty_edges, probabilty_widths, inv.(probabilty_widths), acc_prob, - mean, - var, - fill(var, 1, 1), - sqrt(var) + first(mean_est), + first(mode_est), + first(var_est) ) end -function Random.rand(rng::AbstractRNG, d::UvBinnedDist{T})::T where {T <: AbstractFloat} - r::T = rand() - next_inds::UnitRange{Int} = searchsorted(d.probabilty_edges, r) - next_ind_l::Int = next_inds.start - next_ind_r::Int = next_inds.stop - if next_ind_l > next_ind_r - next_ind_l = next_inds.stop - next_ind_r = next_inds.start - end - ret::T = d.edges[next_ind_l] - if next_ind_l < next_ind_r - ret += d.volumes[next_ind_l] * (d.probabilty_edges[next_ind_r] - r) * d.probabilty_inv_volumes[next_ind_l] - end - return ret -end +Base.convert(::Type{Histogram}, d::UvBinnedDist) = d.hist -function Distributions.pdf(d::UvBinnedDist{T}, x::Real)::T where {T <: AbstractFloat} - i::Int = StatsBase.binindex(d.h, x) - return @inbounds d.h.weights[i] -end +Base.length(d::UvBinnedDist{T}) where {T} = 1 +Base.size(d::UvBinnedDist{T}) where T = () +Base.eltype(d::UvBinnedDist{T}) where {T} = T -function Distributions.logpdf(d::UvBinnedDist{T}, x::Real)::T where {T <: AbstractFloat} - return log(pdf(d, x)) -end +Statistics.mean(d::UvBinnedDist{T, N}) where {T, N} = d._mean +StatsBase.mode(d::UvBinnedDist{T, N}) where {T, N} = d._mode +Statistics.var(d::UvBinnedDist{T, N}) where {T, N} = d._var +Statistics.cov(d::UvBinnedDist{T, N}) where {T, N} = d._cov +Distributions.minimum(d::UvBinnedDist) = first(d.hist.edges[1]) +Distributions.maximum(d::UvBinnedDist) = last(d.hist.edges[1]) -function Distributions.cdf(d::UvBinnedDist{T}, x::Real)::T where {T <: AbstractFloat} - i::Int = StatsBase.binindex(d.h, x) - p::T = @inbounds sum(d.h.weights[1:i-1] .* d.volumes[1:i-1]) - p += (x - d.edges[i]) * d.h.weights[i] - return p -end +Distributions.pdf(d::UvBinnedDist, x::Real) = _pdf(d.hist, (x,)) +Distributions.logpdf(d::UvBinnedDist, x::Real) = log(pdf(d, x)) -function Distributions.minimum(d::UvBinnedDist{T})::T where {T <: AbstractFloat} - d.edges[1] -end -function Distributions.maximum(d::UvBinnedDist{T})::T where {T <: AbstractFloat} - d.edges[end] -end - - -function Distributions.insupport(d::UvBinnedDist{T}, x::Real)::Bool where {T <: AbstractFloat} - d.edges[1] <= x <= d.edges[end] +# ToDo: Efficient implementation, should cache CDF +function Distributions.cdf(d::UvBinnedDist, x::Real) + i::Int = StatsBase.binindex(d.hist, x) + p::T = @inbounds sum(d.hist.weights[1:i-1] .* d._volumes[1:i-1]) + p += (x - d.hist.edges[1][i]) * d.hist.weights[i] + return p end function Distributions.quantile(d::UvBinnedDist{T}, x::Real)::T where {T <: AbstractFloat} - r::UnitRange{Int} = searchsorted(d.acc_prob, x) + r::UnitRange{Int} = searchsorted(d._acc_prob, x) idx::Int = min(r.start, r.stop) - p::T = d.acc_prob[ idx ] - q::T = d.edges[idx] + p::T = d._acc_prob[ idx ] + q::T = d.hist.edges[1][idx] missing_p::T = x - p - inv_weight::T = d.inv_weights[idx] + inv_weight::T = d._inv_weights[idx] if !isinf(inv_weight) q += missing_p * inv_weight end @@ -131,11 +111,18 @@ function Distributions.quantile(d::UvBinnedDist{T}, x::Real)::T where {T <: Abst end -Base.eltype(d::UvBinnedDist{T}) where {T <: AbstractFloat}= T - - -Statistics.mean(d::UvBinnedDist) = d.μ - -Statistics.var(d::UvBinnedDist) = d.var - -Statistics.cov(d::UvBinnedDist) = d.cov +function Random.rand(rng::AbstractRNG, d::UvBinnedDist{T})::T where {T <: AbstractFloat} + r::T = rand() + next_inds::UnitRange{Int} = searchsorted(d._probabilty_edges, r) + next_ind_l::Int = next_inds.start + next_ind_r::Int = next_inds.stop + if next_ind_l > next_ind_r + next_ind_l = next_inds.stop + next_ind_r = next_inds.start + end + ret::T = d.hist.edges[1][next_ind_l] + if next_ind_l < next_ind_r + ret += d._volumes[next_ind_l] * (d._probabilty_edges[next_ind_r] - r) * d._probabilty_inv_volumes[next_ind_l] + end + return ret +end diff --git a/test/test_mv_binned_dist.jl b/test/test_mv_binned_dist.jl index 6e1ad63..fa3467a 100644 --- a/test/test_mv_binned_dist.jl +++ b/test/test_mv_binned_dist.jl @@ -3,8 +3,8 @@ using EmpiricalDistributions using Test -using Random -using Distributions, StatsBase, LinearAlgebra +using Random, LinearAlgebra +using Distributions, StatsBase, ArraysOfArrays @testset "mv_binned_dist" begin @@ -16,11 +16,39 @@ using Distributions, StatsBase, LinearAlgebra n = 10^6 r = rand(true_dist, n) append!(h, (r[1, :], r[2, :])) + + @test @inferred(MvBinnedDist(h)) isa MvBinnedDist d = MvBinnedDist(h) - @test all(isapprox.(μ, d.μ, atol = 0.01)) - @test all(isapprox.(Σ, d.cov, atol = 0.01)) - rand!(d, r) + + @test @inferred(convert(Histogram, d)) === d.hist + + @test @inferred(length(d)) == 2 + @test @inferred(size(d)) == (2,) + @test @inferred(eltype(d)) == Float64 + + @test all(isapprox.(mean(true_dist), @inferred(mean(d)), atol = 0.01)) + @test all(isapprox.(mode(true_dist), @inferred(mode(d)), atol = 0.05)) + @test all(isapprox.(var(true_dist), @inferred(var(d)), atol = 0.01)) + @test all(isapprox.(cov(true_dist), @inferred(cov(d)), atol = 0.01)) + + @test @inferred(rand!(d, r)) === r + fit_dist = fit(MvNormal, r) @test all(isapprox.(μ, fit_dist.μ, atol = 0.01)) @test all(isapprox.(Σ, fit_dist.Σ.mat, atol = 0.01)) + + @test @inferred(pdf(d, r[:,1])) isa Float64 + @test isapprox(pdf.(Ref(true_dist), nestedview(r)), pdf.(Ref(d), nestedview(r)), rtol = 0.1) + @test @inferred(pdf(d, [1000, 1000])) == 0 + @test @inferred(pdf(d, [-1000, -1000])) == 0 + + @test @inferred(logpdf(d, r[:,1])) == log(pdf(d, r[:,1])) + @test @inferred(logpdf(d, [1000, 1000])) == -Inf + @test @inferred(logpdf(d, [-1000, -1000])) == -Inf + + @test @inferred(rand(d)) isa Vector{Float64} + @test size(rand(d)) == (length(d),) + + @test @inferred(rand(d, 10)) isa Matrix{Float64} + @test size(rand(d, 10)) == (length(d), 10) end diff --git a/test/test_uv_binned_dist.jl b/test/test_uv_binned_dist.jl index 9f2fda4..90b345a 100644 --- a/test/test_uv_binned_dist.jl +++ b/test/test_uv_binned_dist.jl @@ -13,10 +13,40 @@ using Distributions, StatsBase true_dist = Normal(μ, σ) h = Histogram(μ-10σ:σ/10:μ+10σ) append!(h, rand(true_dist, 10^7)) + + + @test @inferred(UvBinnedDist(h)) isa UvBinnedDist d = UvBinnedDist(h) - @test isapprox(μ, d.μ, atol = 0.01) - @test isapprox(σ, d.σ, atol = 0.01) - fit_dist = fit(Normal, rand(d, 10^7)) + + @test @inferred(convert(Histogram, d)) === d.hist + + @test @inferred(length(d)) == 1 + @test @inferred(size(d)) == () + @test @inferred(eltype(d)) == Float64 + + @test all(isapprox.(mean(true_dist), @inferred(mean(d)), atol = 0.01)) + @test all(isapprox.(mode(true_dist), @inferred(mode(d)), atol = 0.05)) + @test all(isapprox.(var(true_dist), @inferred(var(d)), atol = 0.01)) + + @test @inferred(minimum(d)) == μ-10σ + @test @inferred(maximum(d)) == μ+10σ + + @test @inferred(rand(d)) isa Real + @test @inferred(rand(d, 10)) isa Vector{<:Real} + @test size(rand(d, 10)) == (10,) + + r = rand(d, 10^6) + + fit_dist = fit(Normal, r) @test isapprox(μ, fit_dist.μ, atol = 0.01) @test isapprox(σ, fit_dist.σ, atol = 0.01) + + @test @inferred(pdf(d, r[1])) isa Float64 + @test isapprox(pdf.(Ref(true_dist), r), pdf.(Ref(d), r), rtol = 0.1) + @test @inferred(pdf(d, 1000)) == 0 + @test @inferred(pdf(d, -1000)) == 0 + + @test @inferred(logpdf(d, r[1])) == log(pdf(d, r[1])) + @test @inferred(logpdf(d, 1000)) == -Inf + @test @inferred(logpdf(d, -1000)) == -Inf end