Skip to content

Commit

Permalink
Merge pull request #13 from leon-barrett/dedup-chunks
Browse files Browse the repository at this point in the history
Dedup chunks
  • Loading branch information
stevemkim committed Dec 3, 2014
2 parents 48a0721 + 11be519 commit 7a4fb85
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 16 deletions.
38 changes: 22 additions & 16 deletions src/io/mandoline/impl.clj
Original file line number Diff line number Diff line change
Expand Up @@ -220,29 +220,34 @@
(blank-slab metadata var-name dtype chunk-slice)))))

(defn- update-chunk!
[index parent-index version-id store coordinate slab]
[index parent-index version-id store coordinate slab written-chunks]
(loop [my-current-hash (chunk-at index coordinate version-id)]
(let [bc (get-base-chunk my-current-hash index
parent-index store coordinate)
slab (slab/merge bc slab)
hash (chunk/generate-id slab)
;; fixme implement ref-counting
ref-count -1]
(->> slab
:data
.getDataAsByteBuffer
;; TODO: The following line fixes a bug in ArrayChar, where
;; .getDataAsByteBuffer doesn't rewind its position index. We
;; should make sure that this doesn't accidentally rewind
;; past the start of a chunk. We should also fix this
;; upstream.
.rewind
;; write or re-write chunk (generating new chunk ID)
(proto/write-chunk store hash ref-count))
;; Write the chunk only if we need to (as far as we know).
(when-not (or (contains? @written-chunks hash)
(= hash my-current-hash))
(->> slab
:data
.getDataAsByteBuffer
;; TODO: The following line fixes a bug in ArrayChar, where
;; .getDataAsByteBuffer doesn't rewind its position index. We
;; should make sure that this doesn't accidentally rewind
;; past the start of a chunk. We should also fix this
;; upstream.
.rewind
;; write or re-write chunk (generating new chunk ID)
(proto/write-chunk store hash ref-count))
(swap! written-chunks conj hash))

;; If write-index returned nil, the transaction was aborted
;; because another writer slipped in ahead of us. Re-merge this chunk.
(when-not (proto/write-index index coordinate my-current-hash hash)
(if (proto/write-index index coordinate my-current-hash hash)
hash
;; If write-index returned nil, the transaction was aborted
;; because another writer slipped in ahead of us. Re-merge this chunk.
(let [sha1 (chunk-at index coordinate version-id)]
(log/tracef (str "Retrying chunk update transaction at coordinate %s, "
"new sha1 %s") (pr-str coordinate) sha1)
Expand All @@ -261,10 +266,11 @@
(let [{:keys [metadata var-name]} (proto/target index)
{parent-metadata :metadata} (when parent-index
(proto/target parent-index))
written-chunks (atom #{})
update-fn (fn [slab coordinate]
(log/tracef "Updating chunk at %s" (pr-str coordinate))
(update-chunk! index parent-index (:version-id metadata)
store coordinate slab))]
store coordinate slab written-chunks))]
(log/debugf "Writing to variable %s. Metadata: %s, parent %s"
var-name (pr-str metadata) (pr-str parent-metadata))
(doseq [s slabs]
Expand Down
35 changes: 35 additions & 0 deletions test/io/mandoline/impl_test.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
(ns io.mandoline.impl-test (:require
[clojure.test :refer :all]
[io.mandoline
[chunk :as chunk]
[impl :as impl]
[slab :as slab]
[slice :as slice]]
[io.mandoline.impl.protocol :as proto]))


;; Verify that when writing a variable, we don't repeatedly write an identical
;; chunk. That's wasteful.
(deftest write-variable-dedups-chunks
(let [chunks-written (atom 0)
float-type (slab/as-data-type Float/TYPE)
zeros-chunk (slab/empty float-type (slice/mk-slice [0 0] [10 10]))
full-data (slab/empty
float-type (slice/mk-slice [0 0] [1000 1000]) 1.0)]
(with-redefs [impl/get-base-chunk (constantly zeros-chunk)
proto/write-chunk (fn [& _] (swap! chunks-written inc))
proto/target (constantly
{:metadata
{:version-id :version-0
:variables {:var {:shape [:dim1 :dim2]}}
:dimensions {:dim1 1000
:dim2 1000}
:chunk-dimensions {:dim1 10
:dim2 10}}
:var-name :var})
proto/write-index (constantly true)
impl/chunk-at (constantly (chunk/generate-id zeros-chunk))]
(impl/write-variable :store :index nil [full-data])
;; Even though we were writing thousands of chunks, they were all the same,
;; and hopefully we didn't write that one chunk too many times.
(is (< 0 @chunks-written 100)))))

0 comments on commit 7a4fb85

Please sign in to comment.