Address Gotchas and peformance to address #4 and other code clean-up.

JuliaMatSci · Apr 14, 2022 · 4542b2b · 4542b2b
1 parent c919f25
commit 4542b2b
Show file tree

Hide file tree

Showing 6 changed files with 92 additions and 28 deletions.
diff --git a/docs/src/examples.md b/docs/src/examples.md
@@ -26,4 +26,16 @@ using CBFV
 data = DataFrame("name"=>["Rb2Te","CdCl2","LaN"],"bandgap_eV"=>[1.88,3.51,1.12])
 rename!(data,Dict("name"=>"formula","bandgap_eV"=>"target"))
 features = generatefeatures(data)
+```
+
+Here is an example with an existing feature combined with the generated features:
+
+```@example
+using DataFrames
+using CBFV
+data = DataFrame(:formula=>["B2O3","Be1I2","Be1F3Li1"],
+                 :temperature=>[1400.00,1200.0,1100.00],
+                 :heat_capacity=>[89.115,134.306,192.464])
+rename!(data,Dict(:heat_capacity=>:target))
+features = generatefeatures(data,combine=true)
 ```
diff --git a/src/CBFV.jl b/src/CBFV.jl
@@ -6,6 +6,7 @@ using CSV
 using DataFrames
 
 export FileName
+export readdatabasefile
 export processelementdatabase
 export processinputdata
 export generatefeatures

diff --git a/src/Databases.jl b/src/Databases.jl
@@ -27,13 +27,21 @@ Returns DataFrame of an elemental database file in [databases/](databases/)
 
 # Arguments
 - `pathtofile::String`: path to the CSV formatted file to read
+- `stringtype::Type{Union{String,InlineString}}=String` : `CSV.jl` string storage type
+- `pool::Bool=false` : `CSV.File` will pool `String` column values.
 
 # Returns
 - `data::DataFrame`: the dataframe representation of the csv file.
+
+!!! note
+    Some of the behaviors of `CSV.jl` will create data types that are inconnsistant with
+    the several function argument types in `CBFV`. If you use this function to read the
+    data files the data frame constructed via CSV will work properly.
 """
 function readdatabasefile(pathtofile::AbstractString;
-                          stringtype::Type{T}=String) where T<:Union{String,InlineString}
+                          stringtype::Type{T}=String,
+                          pool=false) where T<:Union{String,InlineString}
     # Use CSV and dataframes
-    data = CSV.File(pathtofile,stringtype=stringtype) |> DataFrame
+    data = CSV.File(pathtofile,stringtype=stringtype,pool=pool) |> DataFrame
     return data
 end  # function readdatabasefile
diff --git a/src/Featurization.jl b/src/Featurization.jl
@@ -47,11 +47,22 @@ end
 This is the primary function that assigns the features based on the CBFV approach. For more
 details its best to see the original python CBFV and references in README file.
 
+# Arguments
+- `processeddata::Vector{Dict{Symbol,Any}}` : the formulas processed against elemental database
+- `formulae::AbstractArray` : the formula string values, this should be some subtype of `Array{String,1}`
+- `sumfeatures::Bool=false` : wheter to create a `sum_` feature vector
+
+# Returns
+- `featuresarry::Vector{Matrix{Float64}}` : feature vectors for each row in original data set.
+- `skippedformula::Vector{String}` : skipped formulas
+
+!!! note
+    The `generatefeatures` call does not do anything (i.e. return) the skippedformulas.
 
 """
 function assignfeatures(processeddata::Vector{Dict{Symbol,Any}},
-    formulae::Array{String,1},
-    sumfeatures::Bool=false)
+                        formulae::AbstractArray,
+                        sumfeatures::Bool=false)
 
     iterformulae = ProgressBar(1:length(formulae))
     skippedformula = Array{String,1}()
@@ -69,11 +80,11 @@ function assignfeatures(processeddata::Vector{Dict{Symbol,Any}},
         fmax = maximum(properties, dims=1)
         fmin = minimum(properties, dims=1)
         _, fraccomp = fractionalcomposition(formula)
-        favg = sum(fraccomp .* properties, dims=1)
+        favg = sum(fraccomp .* properties, dims=1) #FIX: Not sure whats going on here
         fdev = sum(fraccomp .* abs.(properties .- favg), dims=1)
+
         prominant = isapprox.(fraccomp, maximum(fraccomp))
         fmode = minimum(properties[prominant, :], dims=1)
-
         fweight = sumfeatures ? sum(amount .* properties, dims=1) : amount .* properties
 
         if sumfeatures
@@ -110,9 +121,9 @@ moments from the element features in the formula.
 
 """
 function constructfeaturedataframe(featcolnames::Vector{String},
-    features::Array{Float64,2},
-    extrafeatures::Tuple{Bool,DataFrame},
-    sumfeatures::Bool)
+                                   features::Array{Float64,2},
+                                   extrafeatures::Tuple{Bool,DataFrame},
+                                   sumfeatures::Bool)
 
     if sumfeatures
         colprefixes = ["sum_", "avg_", "dev_", "range_", "max_", "min_", "mode_"]
@@ -143,7 +154,6 @@ end  # function constructfeaturedataframe
 
 """
     generatefeatures(data; elementdata,dropduplicate,combine,sumfeatures,returndataframe)
-    generatefeatures(data, elementdata; kwargs...)
     generatefeatures(dataname; kwargs...)
 
 This is the primary function for generating the CBFV features for a dataset of formulas with or without
@@ -157,7 +167,7 @@ assigning of features is then executed based on the CBFV approach. If the `retur
 
 # Arguments
 - `data::DataFrame`: This is the data set that you want to be featurized for example.
-- `elementdata::Union{String,FileName}`: The name of the internal database or the file path and
+- `elementdata::Union{String,FileName} or Union{String,DataFrame}`: The name of the internal database or the file path and
 name to an external database.
 - `dropduplicate::Bool=true`: Option to drop duplicate entries.
 - `combine::Bool=false`: Option to combine existing features in `data` with the generated feature set.
@@ -184,18 +194,25 @@ d = DataFrame(:formula=>["Tc1V1","Cu1Dy1","Cd3N2"],:target=>[248.539,66.8444,91.
 generatefeatures(d)
 ```
 """
-function generatefeatures(data::DataFrame;
-    elementdata::String="oliynyk",
-    dropduplicate=true,
-    combine=false,
-    sumfeatures=false,
-    returndataframe=true)
+function generatefeatures(data::DataFrame,
+                          elementdata::Union{String,DataFrame}="oliynyk";
+                          dropduplicate=true,
+                          combine=false,
+                          sumfeatures=false,
+                          returndataframe=true)
+
+    # Remove duplicate entries
+    if dropduplicate
+        moddata = unique(data)
+    else 
+        moddata = data
 
+    end
 
     # Process input data
     checkdataframe(data)
-    formulae = data[!, :formula]
-    featcolnames, processeddata = processinputdata(dropduplicate ? unique(data) : data, elementdata)
+    formulae = moddata[!, :formula]
+    featcolnames, processeddata = processinputdata(moddata, elementdata)
 
     targets = [row[:target] for row in processeddata]
 
@@ -205,8 +222,7 @@ function generatefeatures(data::DataFrame;
         sumfeatures)
 
     # Extra features from original data
-    extra_df = dropduplicate ? unique(data) : data
-    extrafeatures = extra_df[!, Not([:formula, :target])]
+    extrafeatures = moddata[!, Not([:formula, :target])]
     if combine checkifempty(extrafeatures) end
 
     if returndataframe
@@ -223,11 +239,30 @@ function generatefeatures(data::DataFrame;
 
 end  # function generatefeaturesdata 
 
-# Issue #4 TODO: Work in support for custom element data sets. Requires
-# working back through `generatefeatures`-> `processinputdata` -> ....
-# generatefeatures(data::DataFrame, elementdata::FileName; kwargs...) = begin
-#    generatefeatures(data, elementdata=elementdata, kwargs...)
-# end
+
+generatefeatures(data::DataFrame;
+                 elementdata::Union{FileName,String}="oliynyk",
+                 dropduplicate=true,
+                 combine=false,
+                 sumfeatures=false,
+                returndataframe=true) = begin
+    if typeof(elementdata) == FileName
+        elementdataframe = readdatabasefile(elementdata.fullpath)
+        generatefeatures(data,elementdataframe,
+                        dropduplicate=dropduplicate,
+                        combine=combine,
+                        sumfeatures=sumfeatures,
+                        returndataframe=returndataframe)
+    else
+        generatefeatures(data,elementdata,
+                        dropduplicate=dropduplicate,
+                        combine=combine,
+                        sumfeatures=sumfeatures,
+                        returndataframe=returndataframe)
+    end
+
+end
+
 
 generatefeatures(dataname::String; kwargs...) = begin
     # Digest data file before processing

diff --git a/src/Types.jl b/src/Types.jl
@@ -1,7 +1,8 @@
 # see LICENSE
 
 """
-generatefeatures Datatype for multiple dispatch
+generatefeatures Datatype for multiple dispatch. Allows for passing
+external database.
 """ struct FileName
     fullpath::String
 end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -108,12 +108,19 @@ using CSV, DataFrames
         tmpfile = tempname()
         CSV.write(tmpfile,d)
         @test featdb == CBFV.generatefeatures(tmpfile,returndataframe=true)
-
+
+        @test CBFV.generatefeatures(d,combine=true)[!,:property] == d[!,:property]
+
         testdb = CSV.File("pycbfv_test_data.csv") |> DataFrame
         @test length(names(featdb[!,Not([:target,:formula])])) == length(names(testdb))
         @testset "Column $n" for n in names(testdb)
             @test testdb[!,n] ≈ featdb[!,n]
         end
 
+        featdb_ext = CBFV.generatefeatures(d,
+                                           elementdata=CBFV.FileName((@__DIR__)*"/../databases/oliynyk.csv"))
+        @test featdb_ext == featdb
+
+
     end # Featurization.jl testset
 end