From e3c764d870e29ef9d4afce37bc8f2b81811a6501 Mon Sep 17 00:00:00 2001 From: Eddie Pantridge <5595494+erp12@users.noreply.github.com> Date: Sun, 2 Jan 2022 23:11:53 -0500 Subject: [PATCH] Redesigns API to allow for pre/post generation data generation. (#3) * Demonstrates implementation of epsilon lexicase selection. * Demonstrates implementation of downsampling. * Expands the toolbox. --- README.md | 17 +- build.clj | 12 +- deps.edn | 16 +- docs/Intro.md | 3 + examples/erp12/ga_clj/examples/alphabet.cljc | 28 +- .../ga_clj/examples/alphabet_lexicase.cljc | 47 --- .../ga_clj/examples/symbolic_regression.cljc | 134 ++++++++ src/erp12/ga_clj/generational.cljc | 140 +++++--- src/erp12/ga_clj/individual.cljc | 45 +-- src/erp12/ga_clj/toolbox.cljc | 324 ++++++++++++++---- src/erp12/ga_clj/utils.cljc | 7 + test/erp12/ga_clj/individual_test.cljc | 41 +-- test/erp12/ga_clj/toolbox_test.cljc | 61 +++- test/erp12/ga_clj/utils_test.cljc | 11 +- 14 files changed, 624 insertions(+), 262 deletions(-) create mode 100644 docs/Intro.md delete mode 100644 examples/erp12/ga_clj/examples/alphabet_lexicase.cljc create mode 100644 examples/erp12/ga_clj/examples/symbolic_regression.cljc diff --git a/README.md b/README.md index b70dbb7..ff6b3f3 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ Outline - Not easily extended via interop. - Decoupling from assumptions - Genomes can be any data. - - Phenomes/individuals are open maps that hold genomes, errors, and anything else the user wants to add to them. + - Individuals are open maps that hold genomes, errors, and anything else the user wants to add to them. - "breeding" new genomes is done via a user supplied function. - Solving common issues - Ensure same, well tested, implementations of common algorithms. (aka toolbox) @@ -35,16 +35,17 @@ In the future, we may also publish releases to Clojars. ## Guide -Current ga-clj only supports generational genetic algorithms. +Currently, ga-clj only supports generational genetic algorithms. ### Terminology - `genome-factory`: A nullary function for creating random genomes. -- `genome->phenome`: A function from genome to a "phenome" map containing additional used to drive breeding. - Often this map contains the errors/fitness associated with the genome but could also contain any other values. -- `breed`: A function that takes the current population of individuals as input and returns a new child genome. - Often this function will perform parent selection and variation operators. The `erp12.ga-clj.toolbox` namespace - provides implementations of commonly used algorithms that will likely be useful to call in breed functions. +- `genome->individual`: A function from genome to an "individual" map containing additional data used to drive breeding. + Often this map contains the errors/fitness associated with the genome but could also contain any other values. +- `breed`: A function that takes the current population of individuals and maybe other data about the state of evolution + and returns a new child genome. Often this function will perform parent selection and variation operators. + The `erp12.ga-clj.toolbox` namespace provides implementations of commonly used algorithms that will likely be useful + to call in breed functions. Additional terminology and configuration parameters can be found in the docstring of `evolve` functions found in namespaces that provide a specific kind of genetic algorithm. For example: @@ -71,6 +72,4 @@ See the `CONTRIBUTING.md` for more information, including how to run tests. - Add more examples that test the design/abstraction in a wider range of scenarios. - TSP? - Knapsack problem? -- Figure out how best to handle logging, monitoring, data collection, etc. - - In the library or in user code? - Rationale and Guide diff --git a/build.clj b/build.clj index 6b610a7..171dec5 100644 --- a/build.clj +++ b/build.clj @@ -22,12 +22,18 @@ (process-result (b/process {:command-args ["node" "out/node-tests.js"]})) opts) +(defn example + [opts] + ;; @todo Pass args to example file. + (process-result (b/process {:command-args ["clojure" "-M:examples" "-m" (name (:ns opts))]}))) + (defn examples - [_] - (doseq [example-ns ['erp12.ga-clj.examples.alphabet]] + [opts] + (doseq [example-ns ['erp12.ga-clj.examples.alphabet + 'erp12.ga-clj.examples.symbolic-regression]] (println "\nRunning example" example-ns) ;; @todo Pass smaller population sizes and max generations to examples via command args to keep CI fast. - (process-result (b/process {:command-args ["clojure" "-M:examples" "-m" (name example-ns)]})))) + (example (assoc opts :ns example-ns)))) (defn ci [opts] diff --git a/deps.edn b/deps.edn index a169983..14902e0 100644 --- a/deps.edn +++ b/deps.edn @@ -1,13 +1,21 @@ {:paths ["src"] - :deps {} + :deps {kixi/stats {:mvn/version "0.5.4"}} :aliases {:build {:extra-deps {io.github.seancorfield/build-clj {:git/tag "v0.5.4" :git/sha "bc9c0cc"}} :ns-default build} :test {:extra-paths ["test"] - :extra-deps {com.cognitect/test-runner {:git/url "https://github.com/cognitect-labs/test-runner.git" - :git/tag "v0.5.0" :git/sha "b3fd0d2"}} + :extra-deps {io.github.cognitect-labs/test-runner {:git/tag "v0.5.0" :git/sha "b3fd0d2"}} :main-opts ["-m" "cognitect.test-runner"] :exec-fn cognitect.test-runner.api/test} :test-cljs {:extra-paths ["test"] :extra-deps {thheller/shadow-cljs {:mvn/version "2.16.6"}} :main-opts ["-m" "shadow.cljs.devtools.cli"]} - :examples {:extra-paths ["examples"]}}} + :examples {:extra-paths ["examples"]} + :codox {:extra-deps {codox/codox {:mvn/version "0.10.8"}} + :exec-fn codox.main/generate-docs + :exec-args {:source-paths ["src"] + :doc-paths ["docs"] + :output-path "../ga-clj-DOC" + :source-uri "https://github.com/erp12/ga-clj/blob/{version}/{filepath}#L{line}" + :project {:name "GA CLJ" + :version "0.0.0" + :description "No-assumptions genetic algorithms in Clojure"}}}}} diff --git a/docs/Intro.md b/docs/Intro.md new file mode 100644 index 0000000..fdbd5fc --- /dev/null +++ b/docs/Intro.md @@ -0,0 +1,3 @@ +# Getting Started + +Write me! diff --git a/examples/erp12/ga_clj/examples/alphabet.cljc b/examples/erp12/ga_clj/examples/alphabet.cljc index 4592ecb..e70f931 100644 --- a/examples/erp12/ga_clj/examples/alphabet.cljc +++ b/examples/erp12/ga_clj/examples/alphabet.cljc @@ -14,29 +14,29 @@ [& _] (println (ga/evolve {;; Generates random genomes as a permutation of the target genome. - :genome-factory #(shuffle target) - ;; Phenomes are a map containing a scalar `:error` for the genome. + :genome-factory #(shuffle target) + ;; Individuals are a map containing a scalar `:error` for the genome. ;; In this case, we use the hamming distance. ;; The `:genome` is added implicitly. - :genome->phenome (fn [gn] {:error (tb/hamming-distance gn target)}) + :genome->individual (fn [gn _] {:error (tb/hamming-distance gn target)}) ;; To "breed" a new genome from the population, we: ;; 1. Select 2 parents with tournament selection. ;; 2. Pass their genomes to uniform-crossover. ;; 3. Mutate the resulting genome by swapping the position of 2 genes. - :breed (fn [population] - (->> (repeatedly 2 #(tournament population)) - (map :genome) - tb/uniform-crossover - tb/swap-2-genes)) + :breed (fn [generation] + (->> (repeatedly 2 #(tournament generation)) + (map :genome) + tb/uniform-crossover + tb/swap-2-genes)) ;; We compare individuals on the basis of the error values. Lower is better. - :phenome-cmp (comparator #(< (:error %1) (:error %2))) + :individual-cmp (comparator #(< (:error %1) (:error %2))) ;; We stop evolution when either: ;; 1. We find an individual with zero error or ;; 2. We reach 300 generations. - :stop-fn (fn [{:keys [generation best]}] - (cond - (= (:error best) 0) :solution-found - (= generation 300) :max-generation-reached)) + :stop-fn (fn [{:keys [generation best]}] + (cond + (= (:error best) 0) :solution-found + (= generation 300) :max-generation-reached)) ;; Each generation will contain 1000 individuals. - :population-size 1000})) + :population-size 1000})) (shutdown-agents)) diff --git a/examples/erp12/ga_clj/examples/alphabet_lexicase.cljc b/examples/erp12/ga_clj/examples/alphabet_lexicase.cljc deleted file mode 100644 index 7291433..0000000 --- a/examples/erp12/ga_clj/examples/alphabet_lexicase.cljc +++ /dev/null @@ -1,47 +0,0 @@ -(ns erp12.ga-clj.examples.alphabet-lexicase - (:gen-class) - (:require [erp12.ga-clj.generational :as ga] - [erp12.ga-clj.toolbox :as tb])) - -(def target - (vec "ABCDEFGHIJKLMNOPQRSTUVWXYZ")) - -(def lexicase - (tb/make-lexicase-selection {:epsilon nil})) - -(defn -main - "Evolves vector of letters in alphabetical order." - [& _] - (println - (ga/evolve {;; Generates random genomes as a permutation of the target genome. - :genome-factory #(shuffle target) - ;; Phenomes are a map containing a scalar `:error` for the genome. - ;; Additionally :errors is 0 or 1 for each character. - ;; The `:genome` is added implicitly. - :genome->phenome (fn [gn] - (let [errors (map #(if (= %1 %2) 0 1) - gn - target)] - {:errors errors - :error (apply + errors)})) - ;; To "breed" a new genome from the population, we: - ;; 1. Select 2 parents with lexicase selection. - ;; 2. Pass their genomes to uniform-crossover. - ;; 3. Mutate the resulting genome by swapping the position of 2 genes. - :breed (fn [population] - (->> (repeatedly 2 #(lexicase population)) - (map :genome) - tb/uniform-crossover - tb/swap-2-genes)) - ;; We compare individuals on the basis of the error values. Lower is better. - :phenome-cmp (comparator #(< (:error %1) (:error %2))) - ;; We stop evolution when either: - ;; 1. We find an individual with zero error or - ;; 2. We reach 300 generations. - :stop-fn (fn [{:keys [generation best]}] - (cond - (= (:error best) 0) :solution-found - (= generation 300) :max-generation-reached)) - ;; Each generation will contain 1000 individuals. - :population-size 1000})) - (shutdown-agents)) diff --git a/examples/erp12/ga_clj/examples/symbolic_regression.cljc b/examples/erp12/ga_clj/examples/symbolic_regression.cljc new file mode 100644 index 0000000..ecfb217 --- /dev/null +++ b/examples/erp12/ga_clj/examples/symbolic_regression.cljc @@ -0,0 +1,134 @@ +(ns erp12.ga-clj.examples.symbolic-regression + "Symbolic regression to fit data generated from x^2 + 3x + 2 with a bit of noise added. + Adapted from: Lee Spector (https://github.com/lspector/gp)" + (:require [kixi.stats.math :as math] + [kixi.stats.distribution :as dist] + [erp12.ga-clj.generational :as ga] + [erp12.ga-clj.toolbox :as tb])) + +;; x^2 + 3x + 2 + noise +(def x-train (vec (range -20.0 20.0 0.05))) +(def y-train (mapv (fn [x noise] + (+ (+ (+ (* x x) (* 3 x)) 2) + noise)) + x-train + ;; Add some noise to the data. + (dist/sample (count x-train) (dist/normal {:mu 0 :sd 0.1})))) + +(defn p-div + "Protected division." + [n d] + (if (zero? d) + 0 + (/ n d))) + +(def fn->arity + {+ 2 + - 2 + * 2 + p-div 2 + math/sin 1 + math/cos 1}) + +(defn random-function + [] + (rand-nth (keys fn->arity))) + +(defn random-terminal + [] + (rand-nth (list 'x (- (rand 5) 1)))) + +(defn random-code + "Create a random tree of code." + [depth] + (if (or (zero? depth) + (zero? (rand-int 2))) + (random-terminal) + (let [f (random-function)] + (cons f (repeatedly (get fn->arity f) + #(random-code (dec depth))))))) + +(def mutate + ;; We mutate trees by replacing a random subtree with a random tree. + (tb/make-subtree-mutation {:tree-generator #(random-code 2)})) + +(def select + ;; Creates a parent selection function that uses the lexicase selection + ;; algorithm. We supply a function-live value to the `:epsilon` field + ;; to specify how a value for epsilon will be pulled from the generation. + ;; In this case, we will store a vector of per-case epsilon under the `:epsilon` + ;; key of the generation map. + (tb/make-lexicase-selection {:epsilon :epsilon}) + + ;; To use traditional lexicase selection (no epsilon) we can specify `:epsilon` as + ;; a falsey value, which is also the default. + ; (tb/make-lexicase-selection) + + ;; We can also supply a static, constant, value for epsilon. + ; (tb/make-lexicase-selection {:epsilon 0.1}) + ) + +(defn -main + [& _] + (println + (ga/evolve {;; Creates a random tree, corresponding to an equation, with a max depth of 3. + :genome-factory #(random-code 3) + ;; Before each generation (evaluation), randomly select 5% of the training + ;; cases to use for evaluation. All genomes in the generation will be evaluated + ;; on these same cases. + :pre-generation (fn [] + {:batch-cases (random-sample 0.05 (range (count x-train)))}) + ;; Individuals are a maps containing + ;; 1. A `:model` represented as a callable Clojure funciton. + ;; 2. A vector of predictions, stored under `:y-pred`. + ;; 3. A vector of `:errors`, one for each training case in this generation's batch. + ;; 4. The mean error across all cases, stored under `:mae`. + ;; 5. The `:genome` tree which created the model. This is added implicitly. + :genome->individual (fn [gn {:keys [batch-cases]}] + (let [model (eval `(fn ~(vector 'x) ~gn)) + x-batch (mapv #(nth x-train %) batch-cases) + y-batch (mapv #(nth y-train %) batch-cases) + y-pred (mapv model x-batch) + errors (mapv #(Math/abs (- %1 %2)) y-pred y-batch)] + {:model model + :y-pred y-pred + :errors errors + :mae (tb/mean errors)})) + ;; After each generation is evaluated, compute a vector of `:epsilon` values + ;; to use in parent selection. In this case, we will use the default computation + ;; of epsilon: the median absolute deviation. + :post-generation (fn [population] + {:epsilon (tb/compute-epsilon-per-case population)}) + ;; To "breed" a new genome from the population, we: + ;; 1. Select 2 parents with lexicsae selection. This will look-up + ;; 2. Pass their genomes to subtree crossover. + ;; 3. Mutate the resulting genome by with subtree mutation. + :breed (fn [generation] + (->> (repeatedly 2 #(select generation)) + (map :genome) + (apply tb/subtree-crossover) + mutate)) + ;; We compare individuals on the basis of their mean absolute error. Lower is better. + :individual-cmp (comparator #(and (< (:mae %1) (:mae %2)) + (not (math/infinite? (:mae %1))))) + ;; We stop evolution when either: + ;; 1. We reach 300 generations. + ;; 2. We find an individual with zero MAE on the entire dataset. + :stop-fn (fn [{:keys [generation-number best new-best?]}] + (println "Generation:" generation-number + "Best MAE:" (:mae best) + "Best Tree Size:" (tb/tree-size (:genome best)) + "Best Tree Depth:" (tb/tree-depth (:genome best))) + (cond + ;; Stop evolution after 300 generations. + (= generation-number 300) :max-generation-reached + ;; If a new "best" individual is found (based on MEA of a batch) + ;; Test the new best individual on the full training set. + ;; If the full MAE is below 0.3, report that the solution is found + new-best? (let [y-pred (mapv (:model best) x-train) + mae (tb/mae y-pred y-train)] + (when (<= mae 0.2) + :solution-found)))) + ;; Each generation will contain 1000 individuals. + :population-size 1000})) + (shutdown-agents)) diff --git a/src/erp12/ga_clj/generational.cljc b/src/erp12/ga_clj/generational.cljc index f3ad0fd..88df0df 100644 --- a/src/erp12/ga_clj/generational.cljc +++ b/src/erp12/ga_clj/generational.cljc @@ -1,70 +1,108 @@ (ns erp12.ga-clj.generational "A simple generational genetic algorithm." (:require [erp12.ga-clj.individual :as i] - [erp12.ga-clj.utils :as u])) + [erp12.ga-clj.utils :as u]) + #?(:clj (:require [erp12.ga-clj.utils :refer [with-error-context]]) + :cljs (:require-macros [erp12.ga-clj.utils :refer [with-error-context]]))) -;; @todo Allow extra per-generation data? For example, which cases used for downsampled-lexicase. +(defn- make-generation + [{:keys [pre-generation population-factory post-generation] :as opts}] + (let [m1 (pre-generation) + population (population-factory m1) + m2 (post-generation population)] + (merge m1 {:population population} m2))) -(defn evolve - "Run a generational genetic algorithm. - - The number of individuals in each generation is given by `population-size`. - - The genomes of the initial population are generated via the nullary `genome-factory` function. - - Genomes are translated into individuals by calling the `genome->phenome` function, which must - a map. The genome will be added to the map automatically. - - The `finalization` takes the population of individuals (post `genome->phenome`) and is expected to - return a population of (possibly transformed) individuals. This function can be used to enhance - individuals with attributes that require access to other individuals, filter the population, - compute population statistics, log progress, and perform arbitrary side effects once per generation. +(defn- initial-generation + [{:keys [genome-factory genome->individual population-size mapper] + :or {mapper map} + :as opts}] + (make-generation + (assoc opts + :population-factory + (fn [config] + (mapper (fn [_] (i/make-individual genome->individual (genome-factory) config)) + (range population-size)))))) - The `phenome-cmp` is a comparator function that can be used to compare 2 individuals. It is used to track - the best individual seen throughout evolution. +(defn- next-generation + [{:keys [genomes genome->individual mapper] + :or {mapper map} + :as opts}] + (make-generation + (assoc opts + :population-factory + (fn [config] + (mapper #(i/make-individual genome->individual % config) genomes))))) - The `stop-fn` is used to determine if evolution should be stopped. It is passed a map containing the - generation number (`:generation`), current population (`:population`), and best individual seen during - evolution so far (`:best`). The `stop-fn` should return `nil` if evolution should continue. Otherwise, - it can return a truthy value that will be included in the map returned by `evolve` under the key `:result`. +(defn evolve + "Run a generational genetic algorithm. - The `mapper` can be any function that implements a map-like higher order function. - Common values are `map`, `pmap`, and `mapv`. The default is `pmap` for CLojure and `map` for Clojurescript. + Options: + :genome-factory - A nullary function to produce the (random) genomes of the initial population. + :pre-generation - A nullary function that will be called before each generation. Must return a map. + The resulting data will be available to all calls of `:genome->individual` + throughout the generation. Default is a function which returns an empty map. + :genome->individual - A function which converts a genome into a map, referred to as the \"individual\". + The :genome is added to the resulting map implicitly. + :post-generation - A unary function that will be called before each generation. Must take a collection + of individuals (the population of the previous generation) and return a map. + The resulting data will be available to all calls of `:breed` used to create the + subsequent generation. Default is a function which returns ae empty map. + :breed - A function for creating a new \"child\" genome. Must take a map containing the + data of the previous generation (containing at least a :population key) and + return a genome. This is where selection and variation must occur. + :individual-cmp - A comparator function that can be used to compare 2 individuals. It is used to + track the best individual seen throughout evolution. + :stop-fn - A functions used to determine if evolution should be stopped. Details below. + :population-size - The number of individuals in each generation. + :mapper - A map-like higher order function. Common values are `map`, `pmap`, and `mapv`. + The default is `pmap` for clj and `map` for cljs. The `evolve` function will return a map with the following entries: :generation - The generation number after which evolution stopped. :result - The truthy value returned by the `stop-fn`. :best - The best individual seen throughout evolution. - " - [{:keys [genome-factory genome->phenome finalization breed phenome-cmp stop-fn population-size mapper] - :or {finalization vec - mapper #?(:clj pmap :cljs map)} + The `stop-fn` is used to determine if evolution should be stopped. It is passed a map containing the + following options: + :generation-number - The number of generations that have gone + :generation - The entire generation map, including the population. + :best - The best individual seen throughout evolution so far according to the :individual-cmp. + :new-best? - True if the :best individual is new this generation, false otherwise. Can be used + to skip expensive logic to determine if the solution is good enough. + The `stop-fn` should return `nil` if evolution should continue. Otherwise, it must return a truthy value + that will be included in the map returned by `evolve` under the key `:result`." + [{:keys [genome-factory pre-generation genome->individual post-generation breed individual-cmp stop-fn population-size mapper] + :or {pre-generation hash-map + post-generation (fn [_] {}) + mapper #?(:clj pmap :cljs map)} :as opts}] - ;; @todo Is there a more informative name than `finalization`? {:pre [(some? genome-factory) - (some? genome->phenome) - (some? phenome-cmp) + (some? genome->individual) + (some? individual-cmp) (some? breed) (some? stop-fn) (some? population-size)]} - (loop [generation 0 - population (finalization (i/random-population opts)) - best-seen nil] - (let [new-best (u/min-by-cmp phenome-cmp - (if (nil? best-seen) - population - (conj population best-seen))) - result (stop-fn {:generation generation - :population population - :best new-best})] - (if (some? result) - {:generation generation - :result result - :best new-best} - (recur (inc generation) - ;; @todo Use with-error-context to display the context of any errors thrown during selection and variation. - (->> (repeatedly population-size #(breed population)) - (mapper #(i/genome->individual (:genome->phenome opts) %)) - finalization) - new-best))))) + (let [opts (merge opts + ;; Put the default values into opts if not supplied. + {:pre-generation pre-generation + :post-generation post-generation + :mapper mapper})] + (loop [generation-number 0 + generation (initial-generation opts) + best-seen nil] + (let [population (:population generation) + new-best (u/min-by-cmp individual-cmp + (if (nil? best-seen) + population + (conj population best-seen))) + result (stop-fn {:generation-number generation-number + :generation generation + :best new-best + :new-best? (not= best-seen new-best)})] + (if (some? result) + {:generation generation-number + :result result + :best new-best} + (recur (inc generation-number) + (next-generation (assoc opts :genomes (repeatedly population-size #(breed generation)))) + new-best)))))) diff --git a/src/erp12/ga_clj/individual.cljc b/src/erp12/ga_clj/individual.cljc index e61bf9b..c5f794b 100644 --- a/src/erp12/ga_clj/individual.cljc +++ b/src/erp12/ga_clj/individual.cljc @@ -1,36 +1,23 @@ (ns erp12.ga-clj.individual "Functions for creating and transforming individuals. - An individual is a map with (at least) the keyword `:genome`. - Typically, individuals will contain other attributes such as data structures representing - the fitness/error of the individual." - #?(:clj (:require [erp12.ga-clj.utils :refer [with-error-context]]) + An individual is a map with (at least) the keyword `:genome`. Typically, individuals will + contain other attributes such as the fitness/error of the individual." + #?(:clj (:require [erp12.ga-clj.utils :refer [with-error-context]]) :cljs (:require-macros [erp12.ga-clj.utils :refer [with-error-context]]))) -(defn genome->individual - "Builds an individual (a map containing genome and phenome data) from a genome. - The `genome->phenome` must return a map. +(defn make-individual + "Builds an individual (a map containing a genome and other data). - If an exception is raised while building the individual, it will be wrapped - in an outer exception that includes the genome in the `ex-data` to aid in - debugging." - [genome->phenome genome] - (with-error-context - {:msg "Failed to create phenome from genome." - :genome genome} - (assoc (genome->phenome genome) :genome genome))) - -(defn random-population - "Creates a population of individuals from random genomes. + The `genome->individual` function must return a map. The map does not need to contain + the genome because it will be implicitly added under the `:genome` key. - Genomes are created with the nullary `genome-factory` function. - Each genome is converted into an individual using the `genome->phenome` - function via `(genome->individual)`. - - The `mapper` can be any function that implements a map-like higher order - function. Common values are `map`, `pmap`, and `mapv`. The default is `map`. - " - [{:keys [genome-factory genome->phenome population-size mapper] - :or {mapper map}}] - (let [make-random-individual (comp (partial genome->individual genome->phenome) genome-factory)] - (mapper (fn [_] (make-random-individual)) (range population-size)))) + If an exception is raised while building the individual, it will be wrapped + in an outer exception that includes the genome in the `ex-data` to aid in + debugging." + [genome->individual genome opts] + (with-error-context + {:msg "Failed to create individual from genome." + :genome genome + :opts opts} + (assoc (genome->individual genome opts) :genome genome))) diff --git a/src/erp12/ga_clj/toolbox.cljc b/src/erp12/ga_clj/toolbox.cljc index c708572..5ba6847 100644 --- a/src/erp12/ga_clj/toolbox.cljc +++ b/src/erp12/ga_clj/toolbox.cljc @@ -2,106 +2,288 @@ "Algorithms often used by genetic algorithms. Most functions fall into one of the following families of algorithms: - - Error/loss functions - - Parent selection - - Mutation - - Recombination + - Error/loss functions + - Parent selection + - Mutation + - Recombination Some functions make assumptions about the structure of genomes and/or the attributes of individuals. - See function docstrings for more details.") - -;; @todo Add tests for this whole file! + See function docstrings for more details." + (:require [kixi.stats.math :as math] + [kixi.stats.core :as stat] + [erp12.ga-clj.utils :as u])) ;; Errors / Loss +(defn mean + "Arithmetic mean." + [arr] + (transduce identity stat/mean arr)) + +(defn mae + "Mean absolute error." + [y-true y-pred] + (transduce identity stat/mean (map #(math/abs (- %1 %2)) y-true y-pred))) + +(defn mse + "Mean squared error." + [y-true y-pred] + (transduce identity (stat/mse first second) (map vector y-pred y-true))) + +(defn rmse + "Root mean squared error." + [y-true y-pred] + (transduce identity (stat/rmse first second) (map vector y-pred y-true))) + (defn hamming-distance [actual expected] (apply + (map #(if (= %1 %2) 0 1) actual expected))) +(defn stdev + "The sample standard deviation." + [arr] + (transduce identity stat/standard-deviation arr)) + +(defn median + [coll] + "The median." + ;; Source: https://github.com/clojure-cookbook/clojure-cookbook/blob/master/01_primitive-data/1-20_simple-statistics.asciidoc + ;; We don't use the kixi.stat.math implementation because it is CLJ only. + (let [sorted (sort coll) + cnt (count sorted) + halfway (quot cnt 2)] + (if (odd? cnt) + (nth sorted halfway) + (let [bottom (dec halfway) + bottom-val (nth sorted bottom) + top-val (nth sorted halfway)] + (mean [bottom-val top-val]))))) + +(defn mad + "Median absolute deviation (MAD)" + [arr] + (let [med (median arr) + dev (map #(math/abs (- % med)) arr)] + (median dev))) + ;; Selection (defn make-tournament-selection + "Creates a tournament selection function for selecting individuals from a generation's population. + Randomly samples a subset (aka tournament) of the population and selects the \"best\" individual + from within the subset. + + The argument map requires the following keys: + :size - The number of individuals in the tournament. + :by - A function applied to each individual. The return value should be comparable, and is + typically a number representing the error of the individual. The individual in the tournament + with the lowest value will be selected. + + The selection function returned by `make-tournament-selection` will leverage the `:population` field + of a generation. + + See: Section 2.3 of the field guide to genetic programming. + http://www0.cs.ucl.ac.uk/staff/W.Langdon/ftp/papers/poli08_fieldguide.pdf + " [{:keys [by size]}] - (fn [population] + (fn [{:keys [population]}] (apply min-key by (take size (shuffle population))))) +(defn compute-epsilon-per-case + "Computes a vector of epsilon values for epsilon lexicase selection based on the + error vectors of a collection of individuals. + + Options: + :errors-fn - The function that will extract the error vector from the individual. Default is :errors. + :agg-by - A aggregation function from all the error values of a single training case to a single + value of epsilon. Default is the median absolute deviation (mad)." + ([individuals] + (compute-epsilon-per-case individuals {})) + ([individuals {:keys [errors-fn agg-by] + :or {errors-fn :errors agg-by mad}}] + (->> individuals + (map errors-fn) + (apply (partial map vector)) + (mapv agg-by)))) + +(defn- get-epsilon + [epsilon context] + (cond + (or (number? epsilon) (indexed? epsilon)) epsilon + (ifn? epsilon) (get-epsilon (epsilon context) {}) + (or (false? epsilon) (nil? epsilon)) 0 + :else (throw (ex-info "Failed to compute usable epsilon." + {:epsilon epsilon + :context context})))) (defn lexicase-selection - "Helper for make-lexicase-selection. Takes the population and cases shuffled - in a random order. Behaves deterministically to aid in testing." - [candidates cases errors-key] - ;; Stop when cases is empty or candidates has only 1 left. + "Performs lexicase selection. + + Options: + :candidates - A collection of individuals to select from. + :errors-fn - A function to call on individuals to return their error vectors. + :cases - A sequence of case (or error) indices. Typically, a randomly ordered seq + from 0 to the length of each individual's error vector. + :epsilon - Default is zero. See below for details. + + The :epsilon option controls how to compute the value for epsilon. A scalar number will + be used the epsilon across all cases. An indexed collection of numbers will use each + element as the epsilon for the corresponding index of the error vectors. + + Citations: + https://arxiv.org/abs/2106.06085 + https://arxiv.org/abs/1905.09372 + https://arxiv.org/abs/1905.13266 + https://arxiv.org/abs/1709.05394" + [{:keys [candidates errors-fn cases epsilon] :or {epsilon 0} :as opts}] (if (or (empty? cases) (= 1 (count candidates))) (rand-nth candidates) (let [the-case (first cases) - best (apply min (map #(nth (errors-key %) the-case) - candidates))] - (recur (filter #(= best (nth (errors-key %) the-case)) - candidates) - (rest cases) - errors-key)))) - -;; @todo Implement epsilon lexicase -;; Not sure how where I would calculate epsilons just once per generation. -(defn epsilon-lexicase-selection - "Implements semi-dynamic epsilon lexicase, which seems best experimentally. - semi-dynamic = use local best, but global epsilons that are calculated - only once per generation." - [candidates cases errors-key] - :TODO - ) - - -;; @todo Pre-selection filtering of the population? Maybe only if using lexicase-selection? + get-error #(nth (errors-fn %) the-case) + epsilon-this-case (if (indexed? epsilon) + (nth epsilon the-case) + epsilon) + threshold (+ (apply min (map get-error candidates)) epsilon-this-case)] + (recur (assoc opts + :candidates (filter #(<= (get-error %) threshold) candidates) + :cases (rest cases)))))) + (defn make-lexicase-selection - "Applies lexicase selection to the population, returning a single individual. - errors-key is the key associated with the error vector in the individual; if - missing, defaults to :errors." - [{:keys [use-epsilon-lexicase errors-key] :or {errors-key :errors}}] - (fn [population] - (let [cases (shuffle (range (count (get (first population) errors-key))))] - (if use-epsilon-lexicase - (lexicase-selection population - cases - errors-key) - (epsilon-lexicase-selection population - cases - errors-key))))) + "Creates a selection function for performing lexicase selection. See`lexicase-selection`.\n + Options: + :errors-fn - The function that will extract the error vector from the individual. + Default is :errors. + :epsilon - The value for epsilon, or a strategy for computing epsilon. Default is nil. + See below for details. + + The :epsilon option can be + - A falsey value. This will result in an epsion of 0, aka traditional lexicase selection. + - A scalar number. This will be used the epsilon across all cases + - An indexed collection of numbers. Each element will be the epsilon for the corresponding + index of the individuals' error vectors. + - A function (specifically ifn?). ... + + Citations: + https://arxiv.org/abs/2106.06085 + https://arxiv.org/abs/1905.09372 + https://arxiv.org/abs/1905.13266 + https://arxiv.org/abs/1709.05394" + ([] + (make-lexicase-selection {})) + ([{:keys [errors-fn epsilon] :or {errors-fn :errors}}] + (fn [{:keys [population] :as generation}] + (lexicase-selection {:candidates (if (not epsilon) + ;; @todo If epsilon is pre-computed, why can't we distinct the candidates by their errors? + (u/random-distinct-by errors-fn population) + population) + :errors-fn errors-fn + :cases (shuffle (range (count (errors-fn (first population))))) + :epsilon (get-epsilon epsilon (dissoc generation :population))})))) ;; Mutation (defn swap-2-genes - "Swaps the position of 2 genes in the given genome." + "Swaps the position of 2 genes in the given sequential genome." [genome] (let [gn (vec genome) [idx1 idx2] (repeatedly 2 #(rand-int (count gn)))] (assoc gn idx2 (gn idx1) idx1 (gn idx2)))) +(defn make-uniform-addition + "Creates a mutation function that uniformly randomly adds genes to a sequential genome. + + Options: + :addition-rate - The probability of adding a gene at any particular location. + :genetic-source - A 0-arg function that returns a (random) gene when called." + [{:keys [addition-rate genetic-source]}] + (fn [genome] + (mapcat #(if (< (rand) addition-rate) + (if (< (rand) 0.5) + (list % (genetic-source)) + (list (genetic-source) %)) + (list %)) + genome))) + +(defn make-uniform-deletion + "Creates a mutation function that uniformly randomly removes genes from a sequential genome. + + Options: + :deletion-rate - The probability of removing each gene." + [{:keys [deletion-rate]}] + (fn [genome] (random-sample (- 1.0 deletion-rate) genome))) + +(defn make-umad + "Creates a mutation function that performs \"Uniform Mutation by Addition and Deletion\" + on a sequential, linear genome. + + Options: + :addition-rate - The probability of adding a gene at any particular location. + :deletion-rate - The probability of removing each gene. + :genetic-source - A 0-arg function that returns a (random) gene when called. + + Citation: + https://dl.acm.org/doi/10.1145/3205455.3205603" + [opts] + (comp (make-uniform-deletion opts) + (make-uniform-addition opts))) + +(defn make-size-neutral-umad + "Creates a mutation function that performs \"Uniform Mutation by Addition and Deletion\" + on a sequential, linear genome such that the addition and deletion rates are balanced. + The average change in genome length will be zero. -(defn uniform-addition - [genome addition-rate genetic-source] - (mapcat #(if (< (rand) addition-rate) - (if (< (rand) 0.5) - (list % (rand-nth genetic-source)) - (list (rand-nth genetic-source) %)) - (list %)) - genome)) + Options: + :rate - The probability of adding a gene at any particular location. + :genetic-source - A 0-arg function that returns a (random) gene when called. -(defn uniform-deletion - [genome deletion-rate] - (random-sample (- 1.0 deletion-rate) genome)) + Citation: + https://dl.acm.org/doi/10.1145/3205455.3205603" + [{:keys [rate] :as opts}] + (make-umad (merge opts + {:addition-rate rate + :deletion-rate (/ rate (+ 1 rate))}))) -(defn umad - "Performs uniform mutation by addition and deletion. - First pass adds a new gene before or after each gene in the genome. - Second pass deletes genes with some probability." - [genome addition-rate deletion-rate genetic-source] - (-> genome - (uniform-addition addition-rate genetic-source) - (uniform-deletion deletion-rate))) +(defn tree-size + [tree] + (if (sequential? tree) + (count (flatten tree)) + 1)) +(defn tree-depth + [tree] + (if (sequential? tree) + (if (empty? tree) + 0 + (inc (apply max (map tree-depth tree)))) + 0)) + +(defn replace-subtree + [tree position replacement] + (cond + (zero? position) replacement + + (sequential? tree) + (->> tree + (reduce (fn [[offset acc] sub-tree] + [(+ offset (tree-size sub-tree)) + (conj acc (replace-subtree sub-tree (- position offset) replacement))]) + [0 (empty tree)]) + second + ;; Corrects order for prepend collections. + (into (empty tree))) + + :else tree)) + +(defn replace-random-subtree + [tree replacement] + (replace-subtree tree (rand-int (tree-size tree)) replacement)) + +(defn make-subtree-mutation + [{:keys [tree-generator]}] + (fn [tree] + (replace-random-subtree tree (tree-generator)))) ;; Recombination @@ -114,3 +296,17 @@ child-gn (recur (conj child-gn gene) (inc idx)))))) + +;; @todo n-point-crossover +;; @todo 1-point-crossover +;; @todo 2-point-crossover + +(defn random-subtree + [tree] + (if (zero? (rand-int (tree-size tree))) + tree + (recur (rand-nth (rest tree))))) + +(defn subtree-crossover + [tree-a tree-b] + (replace-random-subtree tree-a (random-subtree tree-b))) diff --git a/src/erp12/ga_clj/utils.cljc b/src/erp12/ga_clj/utils.cljc index 715a393..f124d8a 100644 --- a/src/erp12/ga_clj/utils.cljc +++ b/src/erp12/ga_clj/utils.cljc @@ -15,6 +15,13 @@ el (if (neg? (cmp el mn)) el mn))))))) +(defn random-distinct-by + [by coll] + (->> coll + shuffle + (reduce (fn [acc el] (update acc (by el) #(or % el))) {}) + vals)) + (defn- platform "Given a macro's environment, returns a keyword denoting the platform which the post-expansion code will be running in. For example, `:clj` for Clojure and `:cljs` for Clojurescript. diff --git a/test/erp12/ga_clj/individual_test.cljc b/test/erp12/ga_clj/individual_test.cljc index 7a76382..e71c4c8 100644 --- a/test/erp12/ga_clj/individual_test.cljc +++ b/test/erp12/ga_clj/individual_test.cljc @@ -1,53 +1,40 @@ (ns erp12.ga-clj.individual-test (:require [clojure.test :refer [deftest is testing]] - [erp12.ga-clj.individual :refer [genome->individual random-population]]) + [erp12.ga-clj.individual :refer [make-individual]]) #?(:clj (:import (clojure.lang ExceptionInfo)))) -(deftest genome->individual-test - (is (= (genome->individual (fn [genome] +(deftest make-individual-test + (is (= (make-individual (fn [genome opts] {:error-vector [0.9 1.0 0.0 0.5 0.1] :total-error 2.5 :losses {:mse 1.1 :mae 0.75}}) - "gtattccgcgtcggga") + "gtattccgcgtcggga" + {}) {:genome "gtattccgcgtcggga" :error-vector [0.9 1.0 0.0 0.5 0.1] :total-error 2.5 :losses {:mse 1.1 :mae 0.75}})) - (testing "Error in genome->phenome" + (testing "Exception raised in make-individual" (let [e (try - (genome->individual (fn [genome] {:bad (count 0)}) "gtattccgcgtcggga") + (make-individual (fn [_ _] {:bad (count 0)}) "gtattccgcgtcggga" {}) nil (catch #?(:clj ExceptionInfo :cljs js/Error) e e))] - (is (= (ex-message e) "Failed to create phenome from genome.")) - (is (= (ex-data e) {:genome "gtattccgcgtcggga"})) + (is (= (ex-message e) "Failed to create individual from genome.")) + (is (= (ex-data e) {:genome "gtattccgcgtcggga" + :opts {}})) (is (instance? #?(:clj UnsupportedOperationException :cljs js/Error) (ex-cause e))))) (testing "Non-map phenome" (let [e (try - (genome->individual (fn [_] 0) "gtattccgcgtcggga") + (make-individual (fn [_ _] 0) "gtattccgcgtcggga" {}) nil (catch #?(:clj ExceptionInfo :cljs js/Error) e e))] - (is (= (ex-message e) "Failed to create phenome from genome.")) - (is (= (ex-data e) {:genome "gtattccgcgtcggga"})) + (is (= (ex-message e) "Failed to create individual from genome.")) + (is (= (ex-data e) {:genome "gtattccgcgtcggga" + :opts {}})) (is (instance? #?(:clj ClassCastException :cljs js/Error) (ex-cause e)))))) - -(deftest random-population-test - (let [dummy-genome {:value + :children [{:value 1} {:value 2}]} - pop (random-population {:genome-factory (constantly dummy-genome) - :genome->phenome (fn [genome] - {:depth 2 - :errors [3 7] - :total-error 10}) - :population-size 3})] - (is (= (count pop) 3)) - (doseq [individual pop] - (is (= individual - {:genome dummy-genome - :depth 2 - :errors [3 7] - :total-error 10}))))) diff --git a/test/erp12/ga_clj/toolbox_test.cljc b/test/erp12/ga_clj/toolbox_test.cljc index 4d4c002..d1fb57c 100644 --- a/test/erp12/ga_clj/toolbox_test.cljc +++ b/test/erp12/ga_clj/toolbox_test.cljc @@ -1,6 +1,47 @@ (ns erp12.ga-clj.toolbox-test (:require [clojure.test :refer [deftest is testing]] - [erp12.ga-clj.toolbox :refer [lexicase-selection]])) + [kixi.stats.math :as math] + [erp12.ga-clj.toolbox :as tb])) + +(deftest mean-test + (is (= 2.0 (tb/mean [3 2 1])))) + +(deftest mea-test + (is (= 1.0 (tb/mae [5 5 5] [4 6 4])))) + +(deftest mse-test + (is (= 7.0 (tb/mse [5 5 5] [4 7 9])))) + +(deftest rmse-test + (is (= (math/sqrt 7) + (tb/rmse [5 5 5] [4 7 9])))) + +(deftest hamming-distance-test + (is (= 2 (tb/hamming-distance "000" "101"))) + (is (= 3 (tb/hamming-distance [:a :b :c] [:x :y :z]))) + (is (= 0 (tb/hamming-distance [:x :y :z] [:x :y :z])))) + +(deftest stdev-test + (is (= (math/sqrt 7.5) + (tb/stdev [1 2 4 5 8])))) + +(deftest median-test + (is (= 2 (tb/median [2 1 3]))) + (is (= 2.5 (tb/median [1 2 3 4])))) + +(deftest mad-test + (is (= 2.5 (tb/mad (range 10)))) + (is (= 0 (tb/mad (repeat 5 5))))) + +(deftest compute-epsilon-per-case-test + (let [error-vectors [[1 2 3] + [3 4 5] + [6 7 8] + [9 1 0]]] + (is (= [1 1 0] + (tb/compute-epsilon-per-case error-vectors {:errors-fn identity :agg-by #(apply min %)}))) + (is (= [2.5 1.5 2.5] + (tb/compute-epsilon-per-case error-vectors {:errors-fn identity}))))) (deftest lexicase-selection-test (let [population [{:name :A :errors [10 5 5 15 10]} @@ -9,15 +50,9 @@ {:name :D :errors [15 12 14 15 1]} {:name :E :errors [15 12 0 106 1]}]] (testing "Standard lexicase selection" - (is (= :D (:name (lexicase-selection population '(4 1 0 3 2) :errors)))) - (is (= :E (:name (lexicase-selection population '(4 1 0 2 3) :errors)))) - (is (= :B (:name (lexicase-selection population '(0 3 2 4 1) :errors)))) - (is (= :A (:name (lexicase-selection population '(1 0 3 2 4) :errors)))) - (is (= :C (:name (lexicase-selection population '(2 4 3 0 1) :errors)))) - (is (= :E (:name (lexicase-selection population '(2 0 3 4 1) :errors))))) - (testing "Lexicase selection running out of cases" - (let [parent (:name (lexicase-selection (conj population {:name :F :errors [73 60 0 0 1]}) - '(2 4 3 1 0) - :errors))] - (is (or (= parent :C) - (= parent :F))))))) + (is (= :D (:name (tb/lexicase-selection {:candidates population :cases '(4 1 0 3 2) :errors-fn :errors})))) + (is (= :E (:name (tb/lexicase-selection {:candidates population :cases '(4 1 0 2 3) :errors-fn :errors})))) + (is (= :B (:name (tb/lexicase-selection {:candidates population :cases '(0 3 2 4 1) :errors-fn :errors})))) + (is (= :A (:name (tb/lexicase-selection {:candidates population :cases '(1 0 3 2 4) :errors-fn :errors})))) + (is (= :C (:name (tb/lexicase-selection {:candidates population :cases '(2 4 3 0 1) :errors-fn :errors})))) + (is (= :E (:name (tb/lexicase-selection {:candidates population :cases '(2 0 3 4 1) :errors-fn :errors}))))))) diff --git a/test/erp12/ga_clj/utils_test.cljc b/test/erp12/ga_clj/utils_test.cljc index be552f8..9676137 100644 --- a/test/erp12/ga_clj/utils_test.cljc +++ b/test/erp12/ga_clj/utils_test.cljc @@ -1,6 +1,6 @@ (ns erp12.ga-clj.utils-test (:require [clojure.test :refer [deftest is testing]] - [erp12.ga-clj.utils :refer [min-by-cmp with-error-context]]) + [erp12.ga-clj.utils :refer [min-by-cmp random-distinct-by with-error-context]]) #?(:clj (:import (clojure.lang ExceptionInfo)))) (deftest min-by-cmp-test @@ -10,6 +10,15 @@ (is (= (min-by-cmp (comparator <) [1 1 1]) 1))) +(deftest random-distinct-by-test + (let [results (map sort (repeatedly 100 #(random-distinct-by count ["dog" "cat" "fish" "bird"])))] + (doseq [result results] + (is (contains? #{'("dog" "fish") + '("bird" "cat") + '("bird" "dog") + '("cat" "fish")} + result))))) + (deftest with-error-context-test (let [e (try (with-error-context {:msg "Oops!" :extra-data 10} (count 1))