Skip to content

Commit

Permalink
Address Gotchas and performance to address #4 and other code clean-up.
Browse files Browse the repository at this point in the history
  • Loading branch information
stefanbringuier committed Apr 14, 2022
1 parent c919f25 commit 4542b2b
Show file tree
Hide file tree
Showing 6 changed files with 92 additions and 28 deletions.
12 changes: 12 additions & 0 deletions docs/src/examples.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,16 @@ using CBFV
data = DataFrame("name"=>["Rb2Te","CdCl2","LaN"],"bandgap_eV"=>[1.88,3.51,1.12])
rename!(data,Dict("name"=>"formula","bandgap_eV"=>"target"))
features = generatefeatures(data)
```

Here is an example with an existing feature combined with the generated features:

```@example
using DataFrames
using CBFV
data = DataFrame(:formula=>["B2O3","Be1I2","Be1F3Li1"],
:temperature=>[1400.00,1200.0,1100.00],
:heat_capacity=>[89.115,134.306,192.464])
rename!(data,Dict(:heat_capacity=>:target))
features = generatefeatures(data,combine=true)
```
1 change: 1 addition & 0 deletions src/CBFV.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ using CSV
using DataFrames

export FileName
export readdatabasefile
export processelementdatabase
export processinputdata
export generatefeatures
Expand Down
12 changes: 10 additions & 2 deletions src/Databases.jl
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,21 @@ Returns DataFrame of an elemental database file in [databases/](databases/)
# Arguments
- `pathtofile::String`: path to the CSV formatted file to read
- `stringtype::Type{Union{String,InlineString}}=String` : `CSV.jl` string storage type
- `pool::Bool=false` : whether `CSV.File` should pool `String` column values.
# Returns
- `data::DataFrame`: the dataframe representation of the csv file.
!!! note
Some of the behaviors of `CSV.jl` will create data types that are inconsistent with
several of the function argument types in `CBFV`. If you use this function to read the
data files, the data frame constructed via CSV will work properly.
"""
function readdatabasefile(pathtofile::AbstractString;
stringtype::Type{T}=String) where T<:Union{String,InlineString}
stringtype::Type{T}=String,
pool=false) where T<:Union{String,InlineString}
# Use CSV and dataframes
data = CSV.File(pathtofile,stringtype=stringtype) |> DataFrame
data = CSV.File(pathtofile,stringtype=stringtype,pool=pool) |> DataFrame
return data
end # function readdatabasefile
83 changes: 59 additions & 24 deletions src/Featurization.jl
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,22 @@ end
This is the primary function that assigns the features based on the CBFV approach. For more
details it's best to see the original Python CBFV and the references in the README file.
# Arguments
- `processeddata::Vector{Dict{Symbol,Any}}` : the formulas processed against elemental database
- `formulae::AbstractArray` : the formula string values, this should be some subtype of `Array{String,1}`
- `sumfeatures::Bool=false` : whether to create a `sum_` feature vector
# Returns
- `featuresarry::Vector{Matrix{Float64}}` : feature vectors for each row in original data set.
- `skippedformula::Vector{String}` : skipped formulas
!!! note
The `generatefeatures` call does not do anything with (i.e. does not return) the skipped formulas.
"""
function assignfeatures(processeddata::Vector{Dict{Symbol,Any}},
formulae::Array{String,1},
sumfeatures::Bool=false)
formulae::AbstractArray,
sumfeatures::Bool=false)

iterformulae = ProgressBar(1:length(formulae))
skippedformula = Array{String,1}()
Expand All @@ -69,11 +80,11 @@ function assignfeatures(processeddata::Vector{Dict{Symbol,Any}},
fmax = maximum(properties, dims=1)
fmin = minimum(properties, dims=1)
_, fraccomp = fractionalcomposition(formula)
favg = sum(fraccomp .* properties, dims=1)
favg = sum(fraccomp .* properties, dims=1) #FIX: Not sure whats going on here
fdev = sum(fraccomp .* abs.(properties .- favg), dims=1)

prominant = isapprox.(fraccomp, maximum(fraccomp))
fmode = minimum(properties[prominant, :], dims=1)

fweight = sumfeatures ? sum(amount .* properties, dims=1) : amount .* properties

if sumfeatures
Expand Down Expand Up @@ -110,9 +121,9 @@ moments from the element features in the formula.
"""
function constructfeaturedataframe(featcolnames::Vector{String},
features::Array{Float64,2},
extrafeatures::Tuple{Bool,DataFrame},
sumfeatures::Bool)
features::Array{Float64,2},
extrafeatures::Tuple{Bool,DataFrame},
sumfeatures::Bool)

if sumfeatures
colprefixes = ["sum_", "avg_", "dev_", "range_", "max_", "min_", "mode_"]
Expand Down Expand Up @@ -143,7 +154,6 @@ end # function constructfeaturedataframe

"""
generatefeatures(data; elementdata,dropduplicate,combine,sumfeatures,returndataframe)
generatefeatures(data, elementdata; kwargs...)
generatefeatures(dataname; kwargs...)
This is the primary function for generating the CBFV features for a dataset of formulas with or without
Expand All @@ -157,7 +167,7 @@ assigning of features is then executed based on the CBFV approach. If the `retur
# Arguments
- `data::DataFrame`: This is the data set that you want to be featurized for example.
- `elementdata::Union{String,FileName}`: The name of the internal database or the file path and
- `elementdata::Union{String,FileName} or Union{String,DataFrame}`: The name of the internal database or the file path and
name to an external database.
- `dropduplicate::Bool=true`: Option to drop duplicate entries.
- `combine::Bool=false`: Option to combine existing features in `data` with the generated feature set.
Expand All @@ -184,18 +194,25 @@ d = DataFrame(:formula=>["Tc1V1","Cu1Dy1","Cd3N2"],:target=>[248.539,66.8444,91.
generatefeatures(d)
```
"""
function generatefeatures(data::DataFrame;
elementdata::String="oliynyk",
dropduplicate=true,
combine=false,
sumfeatures=false,
returndataframe=true)
function generatefeatures(data::DataFrame,
elementdata::Union{String,DataFrame}="oliynyk";
dropduplicate=true,
combine=false,
sumfeatures=false,
returndataframe=true)

# Remove duplicate entries
if dropduplicate
moddata = unique(data)
else
moddata = data

end

# Process input data
checkdataframe(data)
formulae = data[!, :formula]
featcolnames, processeddata = processinputdata(dropduplicate ? unique(data) : data, elementdata)
formulae = moddata[!, :formula]
featcolnames, processeddata = processinputdata(moddata, elementdata)

targets = [row[:target] for row in processeddata]

Expand All @@ -205,8 +222,7 @@ function generatefeatures(data::DataFrame;
sumfeatures)

# Extra features from original data
extra_df = dropduplicate ? unique(data) : data
extrafeatures = extra_df[!, Not([:formula, :target])]
extrafeatures = moddata[!, Not([:formula, :target])]
if combine checkifempty(extrafeatures) end

if returndataframe
Expand All @@ -223,11 +239,30 @@ function generatefeatures(data::DataFrame;

end # function generatefeaturesdata

# Issue #4 TODO: Work in support for custom element data sets. Requires
# working back through `generatefeatures`-> `processinputdata` -> ....
# generatefeatures(data::DataFrame, elementdata::FileName; kwargs...) = begin
# generatefeatures(data, elementdata=elementdata, kwargs...)
# end

# Keyword-only entry point. `elementdata` is either a `String` or a `FileName`.
# A `FileName` is first read into a `DataFrame` via `readdatabasefile` using its
# `fullpath`; a `String` is passed along untouched. In both cases the work is
# delegated to the positional `generatefeatures(data, elementdata; ...)` method,
# with every keyword option forwarded unchanged.
function generatefeatures(data::DataFrame;
                          elementdata::Union{FileName,String}="oliynyk",
                          dropduplicate=true,
                          combine=false,
                          sumfeatures=false,
                          returndataframe=true)
    # Resolve the element database once so the forwarding call is written a single time.
    source = if elementdata isa FileName
        readdatabasefile(elementdata.fullpath)
    else
        elementdata
    end

    return generatefeatures(data, source;
                            dropduplicate=dropduplicate,
                            combine=combine,
                            sumfeatures=sumfeatures,
                            returndataframe=returndataframe)
end


generatefeatures(dataname::String; kwargs...) = begin
# Digest data file before processing
Expand Down
3 changes: 2 additions & 1 deletion src/Types.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# see LICENSE

"""
generatefeatures Datatype for multiple dispatch
generatefeatures Datatype for multiple dispatch. Allows passing an
external database.
""" struct FileName
fullpath::String
end
9 changes: 8 additions & 1 deletion test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -108,12 +108,19 @@ using CSV, DataFrames
tmpfile = tempname()
CSV.write(tmpfile,d)
@test featdb == CBFV.generatefeatures(tmpfile,returndataframe=true)


@test CBFV.generatefeatures(d,combine=true)[!,:property] == d[!,:property]

testdb = CSV.File("pycbfv_test_data.csv") |> DataFrame
@test length(names(featdb[!,Not([:target,:formula])])) == length(names(testdb))
@testset "Column $n" for n in names(testdb)
@test testdb[!,n] featdb[!,n]
end

featdb_ext = CBFV.generatefeatures(d,
elementdata=CBFV.FileName((@__DIR__)*"/../databases/oliynyk.csv"))
@test featdb_ext == featdb


end # Featurization.jl testset
end

0 comments on commit 4542b2b

Please sign in to comment.