diff --git a/src/io/mandoline/impl.clj b/src/io/mandoline/impl.clj index 89ce350..a35e06a 100644 --- a/src/io/mandoline/impl.clj +++ b/src/io/mandoline/impl.clj @@ -220,7 +220,7 @@ (blank-slab metadata var-name dtype chunk-slice))))) (defn- update-chunk! - [index parent-index version-id store coordinate slab] + [index parent-index version-id store coordinate slab written-chunks] (loop [my-current-hash (chunk-at index coordinate version-id)] (let [bc (get-base-chunk my-current-hash index parent-index store coordinate) @@ -228,21 +228,26 @@ hash (chunk/generate-id slab) ;; fixme implement ref-counting ref-count -1] - (->> slab - :data - .getDataAsByteBuffer - ;; TODO: The following line fixes a bug in ArrayChar, where - ;; .getDataAsByteBuffer doesn't rewind its position index. We - ;; should make sure that this doesn't accidentally rewind - ;; past the start of a chunk. We should also fix this - ;; upstream. - .rewind - ;; write or re-write chunk (generating new chunk ID) - (proto/write-chunk store hash ref-count)) + ;; Write the chunk only if we need to (as far as we know). + (when-not (or (contains? @written-chunks hash) + (= hash my-current-hash)) + (->> slab + :data + .getDataAsByteBuffer + ;; TODO: The following line fixes a bug in ArrayChar, where + ;; .getDataAsByteBuffer doesn't rewind its position index. We + ;; should make sure that this doesn't accidentally rewind + ;; past the start of a chunk. We should also fix this + ;; upstream. + .rewind + ;; write or re-write chunk (generating new chunk ID) + (proto/write-chunk store hash ref-count)) + (swap! written-chunks conj hash)) - ;; If write-index returned nil, the transaction was aborted - ;; because another writer slipped in ahead of us. Re-merge this chunk. - (when-not (proto/write-index index coordinate my-current-hash hash) + (if (proto/write-index index coordinate my-current-hash hash) + hash + ;; If write-index returned nil, the transaction was aborted + ;; because another writer slipped in ahead of us. Re-merge this chunk. (let [sha1 (chunk-at index coordinate version-id)] (log/tracef (str "Retrying chunk update transaction at coordinate %s, " "new sha1 %s") (pr-str coordinate) sha1) @@ -261,10 +266,11 @@ (let [{:keys [metadata var-name]} (proto/target index) {parent-metadata :metadata} (when parent-index (proto/target parent-index)) + written-chunks (atom #{}) update-fn (fn [slab coordinate] (log/tracef "Updating chunk at %s" (pr-str coordinate)) (update-chunk! index parent-index (:version-id metadata) - store coordinate slab))] + store coordinate slab written-chunks))] (log/debugf "Writing to variable %s. Metadata: %s, parent %s" var-name (pr-str metadata) (pr-str parent-metadata)) (doseq [s slabs] diff --git a/test/io/mandoline/impl_test.clj b/test/io/mandoline/impl_test.clj new file mode 100644 index 0000000..98fd528 --- /dev/null +++ b/test/io/mandoline/impl_test.clj @@ -0,0 +1,35 @@ +(ns io.mandoline.impl-test (:require + [clojure.test :refer :all] + [io.mandoline + [chunk :as chunk] + [impl :as impl] + [slab :as slab] + [slice :as slice]] + [io.mandoline.impl.protocol :as proto])) + + +;; Verify that when writing a variable, we don't repeatedly write an identical +;; chunk. That's wasteful. +(deftest write-variable-dedups-chunks + (let [chunks-written (atom 0) + float-type (slab/as-data-type Float/TYPE) + zeros-chunk (slab/empty float-type (slice/mk-slice [0 0] [10 10])) + full-data (slab/empty + float-type (slice/mk-slice [0 0] [1000 1000]) 1.0)] + (with-redefs [impl/get-base-chunk (constantly zeros-chunk) + proto/write-chunk (fn [& _] (swap! chunks-written inc)) + proto/target (constantly + {:metadata + {:version-id :version-0 + :variables {:var {:shape [:dim1 :dim2]}} + :dimensions {:dim1 1000 + :dim2 1000} + :chunk-dimensions {:dim1 10 + :dim2 10}} + :var-name :var}) + proto/write-index (constantly true) + impl/chunk-at (constantly (chunk/generate-id zeros-chunk))] + (impl/write-variable :store :index nil [full-data]) + ;; Even though we were writing thousands of chunks, they were all the same, + ;; and hopefully we didn't write that one chunk too many times. + (is (< 0 @chunks-written 100)))))