From 0f3722d30c5942ef4b0b484bee80bf3833524e02 Mon Sep 17 00:00:00 2001
From: junzhezhang <zhangjunzhetom@gmail.com>
Date: Wed, 31 Oct 2018 14:08:53 +0800
Subject: [PATCH] add documentation

---
 .DS_Store                               |  Bin 8196 -> 0 bytes
 CMakeLists.txt                          |    2 +-
 examples/cifar10/{alexnet.py => cnn.py} |    0
 examples/cifar10/train.py               |    2 +-
 include/singa/core/common.h             |    6 +-
 include/singa/core/device.h             |  215 ++--
 include/singa/core/memory.h             |  209 ++-
 src/.DS_Store                           |  Bin 8196 -> 0 bytes
 src/core/.DS_Store                      |  Bin 6148 -> 0 bytes
 src/core/common/common.cc               |   51 +-
 src/core/device/cuda_gpu.cc             |    9 +-
 src/core/device/device.cc               |   27 +-
 src/core/device/swap_gpu.cc             | 1487 +++++++++-------------
 src/core/memory/memory.cc               | 1543 +++++++++++------------
 14 files changed, 1562 insertions(+), 1989 deletions(-)
 delete mode 100644 .DS_Store
 rename examples/cifar10/{alexnet.py => cnn.py} (100%)
 delete mode 100644 src/.DS_Store
 delete mode 100644 src/core/.DS_Store
diff --git a/.DS_Store b/.DS_Store
deleted file mode 100644
index e74703a65a58eddc3bf6ead31e5ced542081bb7d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 8196
zcmeHMTWl0n82<llfinZ8rwFv`x^$&fxfIya%FTk?n^+2#*e$(qDYH9cJ8?R*?96T{
zrKXZ-)EJGI#6&b2;u|JBkr-c0yhn|f*m#Q?6W@F?z8T}?KXaxnZRvwAYG6(>=R4;=
z|37E{?|d^SXBGgkrJ&aXQ~>~^OCVoP)isJ37uSLk2`r_ANb%t2GM1B}-iMjo0`Jfe
zG7vHlG7vHlG7vKGe_(+2Y*E-E`@S3v>yUwvfg6$m@qS3qB`_7>q(uMKL6v_5AQVRc
zzfhZt24Nz=RDhEbX@m;OP(m4s!4U&xIN1|{Un;;!i8358IDB9*GX^IV^k=90CwzCn
zl*F(O83-9z$bfi!O2CCYWHX~zpWiLh^^(c2Fj7*wWa%=dC`>71C%T87(QZ%k3T}&*
zKj!yL-N*#(wBuMaZ7p-uG)4#c>Q=}0G}AWnSGgbWHErD)ZE*^=?)vvRqIts<>rXVu
zqvPWZ^}Fj5^}8A->JsDi$sNhML}SCwi3!FkYwP!QoEjNBJAQ8B!53so7`zssd|rwd
zX7jV$VypZ<5qUn7llgr;lb^ag)^V^?9ptNu^s~Eip6S@`Q6ujX4)SuB67R@5_L00H
z#(T4lRT#0|L0*y7tZc#3JfqpNy3MnO+vgc)Jkc6(oxIoTJNB6rD{O<D7hK~$g96N=
z(MDk|N9EFrHSzVgZAsp-_pY|7vX$k$VwI{65VN*t-ftTDo>5cxhWj-)YuLJJ5A_yJ
z*UVT(+SW#l3LI0Esj{_|RnePcvBg~b3K{cs!Su#>4P#S$WqFxe9lKTGSJ-i#e#K!&
z7c=SpNUS<m%h)ktZ_OK2OS8H$wprmCOS_bayv0^^yTV;1{Qxbe4BeqNGOjB<mX<Lr
zY1*akVZ2l6ElM_}u}R&}DzHxJ%Q_<?l&34r&9PR-KVT=t+=n#R>&h86O)s)*ua3p~
zg}qtV4V^JoliDa(+oz!>m6Sb&wvs4pqb=(mI1UEnU<{ssC*di$0I$MjxB_p&`|vS*
z3ZKE}@GblZKf|x^JNyBE!e7Wx!R5FDqqrVxa04cB5AMY#+=ngLiHGoB?7}pjz>|0i
zH5|emo<<ivJckeCBls9Tj?dzY_!7R1ui#~T9pA#Y@f~~@Kfy2XJNW}6(_%d=`_nak
zPp#l#B(Vk`(;_FYCDwTR-3R_7v3|2cD)gq+Yu43l+`3~|Q}fIs&Re&uCFH9{%EFh+
zv>g1!o2x-7Wi{SIsxr<jrf9xQ=!Z=S@9IQ6&V_JfcS9|8BcyHF-LyHb@)A<6;yZ8K
zuEzNi5~&<*XjD~RN-CD4yZ6xOQqn7%idI&wRrjkYMLc30{9vUvt}^AHYwsdlf;Zqj
z_=q(3CHw$C!7uO|>1_#?V<oBW7L4OMyd5{;7OcaqxD$7i?(QVzwc`Qoz=Nc|G#<r1
z?8gDjqCwglMicX-y&{g{8GH!O`x<-{pT=iMh0l=+FXE+nn(LTWn?!tVR&&Mfylpx5
z5Lu=bI@dYuT~`d0$ZV>_`M-Vf@Bi0T&S5ek10e%*8NkxE?zR@PE%w$IXYB-Cr|1$z
z_)SXmL#XnP<AnZkoN(q3Lux0;Rm7wMoRmlwD*yV2fU0P?|NG2K`osNymW&&z^*1B&
B*USI_

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7acb5feea4..b630497b95 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -89,7 +89,7 @@ SET(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
 IF (USE_CUDA)
     include(ExternalProject)
     ExternalProject_Add(cnmem
-        GIT_REPOSITORY "https://github.com/junzhezhang/cnmem.git"
+        GIT_REPOSITORY "https://github.com/nusdbsystem/cnmem.git"
         GIT_TAG "master"
         SOURCE_DIR "cnmem/"
         CONFIGURE_COMMAND "${CMAKE_COMMAND}"
diff --git a/examples/cifar10/alexnet.py b/examples/cifar10/cnn.py
similarity index 100%
rename from examples/cifar10/alexnet.py
rename to examples/cifar10/cnn.py
diff --git a/examples/cifar10/train.py b/examples/cifar10/train.py
index 861bd65ac0..9b07991fb4 100644
--- a/examples/cifar10/train.py
+++ b/examples/cifar10/train.py
@@ -39,7 +39,7 @@
 from singa.proto import core_pb2
 from caffe import caffe_net
 
-import alexnet
+import cnn
 import vgg
 import resnet
 
diff --git a/include/singa/core/common.h b/include/singa/core/common.h
index 7ea45259bb..47c1068db0 100644
--- a/include/singa/core/common.h
+++ b/include/singa/core/common.h
@@ -56,8 +56,8 @@ class Device;
 /// Block represent a chunk of memory (on device or host).
 class Block {
  public:
-  Block(void* ptr, size_t size, size_t offset = 0, Device* ptrDevice = nullptr)
-      : data_(ptr), size_(size), offset_(offset), ptrDevice_(ptrDevice) {
+  Block(void* ptr, size_t size, size_t offset = 0, Device* ptr_device = nullptr)
+      : data_(ptr), size_(size), offset_(offset), ptr_device_(ptr_device) {
     ref_count_ = 1;  // std::make_shared<std::atomic<int>>(1);
   }
   // Disabled as it is not used currently.
@@ -90,7 +90,7 @@ class Block {
   void* data_ = nullptr;
   size_t size_ = 0;
   size_t offset_ = 0;
-  Device* ptrDevice_;
+  Device* ptr_device_;
   bool initialized_ = false;
   // Disabled as it is not used currently.
   // std::shared_ptr<std::atomic<int>> ref_count_ = nullptr;
diff --git a/include/singa/core/device.h b/include/singa/core/device.h
index 96bc8e9c41..e9dcc1402d 100644
--- a/include/singa/core/device.h
+++ b/include/singa/core/device.h
@@ -66,10 +66,8 @@ class Device {
   /// Called by Tensor.
   void FreeBlock(Block* block);
   
-  void AppendInfo(string blockInfo);
-  void* GetRealGpuPtrInfo(const Block* block_);
-  void SwapOutInfo(const Block* block_);
-  void SwapInInfo(const Block* block_);
+  void AppendInfo(string block_info);
+  void* UpdateGpuPtrInfo(const Block* block_ptr);
 
   /// Return the size (bytes) of memory in use
   /// TODO(wangwei) override this function for all devices.
@@ -108,7 +106,7 @@ class Device {
 
   int id() const { return id_; }
 
-  virtual void* GetRealGpuPtr(const Block* block_) = 0;
+  virtual void* UpdateGpuPtr(const Block* block_ptr) = 0;
 
  private:
   Device() {};
@@ -125,11 +123,8 @@ class Device {
 
   /// Free device memory.
   virtual void Free(void* ptr) = 0;
-  virtual void MakeMetaTable(Block* block,void* data_,int size) = 0;
-  virtual void Append(string blockInfo) = 0;
-  
-  virtual void SwapOut(const Block* block_) = 0;
-  virtual void SwapIn(const Block* block_) = 0;
+  virtual void AppendAfterMalloc(Block* block,void* data_ptr,int size) = 0;
+  virtual void Append(string block_info) = 0;
 
  protected:
   int id_ = 0;
@@ -171,11 +166,10 @@ class CppCPU : public Device {
 
   /// Free cpu memory.
   void Free(void* ptr) override;
-  void MakeMetaTable(Block* block,void* data_,int size) override {}
-  void Append(string blockInfo) override {}
-  void* GetRealGpuPtr(const Block* block_) override {}
-  void SwapOut(const Block* block_) override {}
-  void SwapIn(const Block* block_) override {}
+  void AppendAfterMalloc(Block* block,void* data_ptr,int size) override {}
+  void Append(string block_info) override {}
+  void* UpdateGpuPtr(const Block* block_ptr) override { return nullptr; } //no-op stub; returns nullptr instead of flowing off the end of a non-void function
+
 };
 
 
@@ -206,11 +200,9 @@ class CudaGPU : public Device {
 
   /// Free cpu memory.
   void Free(void* ptr) override;
-  void MakeMetaTable(Block* block,void* data_,int size) override {}
-  void Append(string blockInfo) override;
-  void* GetRealGpuPtr(const Block* block_) override;
-  void SwapOut(const Block* block_) override;
-  void SwapIn(const Block* block_) override;
+  void AppendAfterMalloc(Block* block,void* data_ptr,int size) override {}
+  void Append(string block_info) override;
+  void* UpdateGpuPtr(const Block* block_ptr) override;
 
  private:
   void Setup();
@@ -222,21 +214,21 @@ class CudaGPU : public Device {
 /// CudaCPU which uses cudaMallocHost to allocate pinned memory for host.
 
 ///SwapGPU
-struct onePieceMsg{
+struct DeviceOptInfo{
     /*
-     members: [ptr, size, MallocFree, idx]
+     members: [ptr, size, operation_type, idx]
      */
     string ptr;
     size_t size;
-    int MallocFree;
+    int operation_type;
     int idx;
     double t;
-    onePieceMsg(string p, size_t s, int M, int i):ptr(p),size(s),MallocFree(M),idx(i){}
+    DeviceOptInfo(string p, size_t s, int M, int i):ptr(p),size(s),operation_type(M),idx(i){}
 };
 
 struct BlockMeta{
     /*
-     block Meta.
+     meta of swapping memory blocks
      */
     Block* block_ = nullptr;
     void* data_ = nullptr;
@@ -249,34 +241,39 @@ struct BlockMeta{
 };
 
 struct SwapBlock{
-
+    /*
+    meta of candidate blocks
+    */
     string ptr;
-    string cat;  //A1, A2, A3...
+    string cat; //sub category of the candidate blocks, read-read, write-read, etc.
     int name;
     size_t size;
+    //index of last read/write before swap out, and first read/write after swap in
     int r_idx; //out idx
     int d_idx; //in idx
+    //time of last read/write before swap out, and first read/write after swap in
     double r_time; // out time
     double d_time; //in time
-    double dt; //delta t: t2'-t1'
-    double pri;  //look at here if big enough TODO(junzhe)
-    double dto; //t2-t1
-    double wdto = 0; //t2-t1 weighted by swap_load
-    double r_idx_ready; //r_idx + buffer, could be set during selection.
-    //int free = -1; //when it is freed 
-    //below as per planned.
-    int i1 = 0;
-    int i1p = 0;
-    int i2 = 0;
-    int i2p = 0;
-    double t1 = 0;
-    double t2 = 0;
-    double t1p = 0;
-    double t2p = 0;
-    SwapBlock(string p, size_t s, int i1, int i2, double t1, double t2): 
-    ptr(p), size(s), r_idx(i1),d_idx(i2),r_time(t1), d_time(t2) {}
+    double DOA; //Duration of Absence
+    double AOA;  //Area of Absence
+    double DOA_origin; //t2-t1, DOA without taking out time spent
+    double WDOA = 0; //weighted DOA
+    double majority_voting = 0;
+    int r_idx_ready; //r_idx + buffer
+
+    //below are index and time for scheduling
+    int idx_out_start  = 0;
+    int idx_out_end = 0;
+    int idx_in_end = 0;
+    int idx_in_start = 0;
+    double t_out_start = 0;
+    double t_out_end = 0;
+    double t_in_end  = 0;
+    double t_in_start = 0;
+    SwapBlock(string p, size_t s, int idx_out_start, int idx_in_end, double t_out_start, double t_in_end): 
+    ptr(p), size(s), r_idx(idx_out_start),d_idx(idx_in_end),r_time(t_out_start), d_time(t_in_end) {}
 };
-/// Device able to Swap memory between Nvidia GPU and Swap
+/// Device able to Swap memory between Nvidia GPU and CPU
 class SwapGPU : public Device {
  public:
   ~SwapGPU();
@@ -300,98 +297,92 @@ class SwapGPU : public Device {
   /// Free cpu memory.
   void Free(void* ptr) override;
 
-  //Append at every index: malloc, free, read, mutable
-  void Append(string blockInfo) override;
+  //Append at every index: free, read, mutable
+  void Append(string block_info) override;
 
-  //append info after Malloc, pair.
-  void MakeMetaTable(Block* block,void* data_,int size) override; 
+  //append info after Malloc, as Block* is not available till Malloc() done.
+  void AppendAfterMalloc(Block* block,void* data_ptr,int size) override; 
 
-  //all the testing, without swap, during Append()
-  void Test_sched_switch_swap();
+  //Detection and Plan
+  void DetectionPlan();
 
   //test iteration, return GC
-  int swap_test(vector<string>vec_block,int &maxLen, int &location);
+  int Detection(vector<string>vec_block,int &iteration_length, int &location_of_2nd_iteration);
 
-  //entire plan, from swap_select() to swap_sched(), swap_deploy_tables()
-  void swap_plan();
+  //entire plan, from SelectBlock() to Scheduling(), BuildMetaTables()
+  void Plan();
 
-  //selection algo
-  vector<SwapBlock> swap_select(vector<SwapBlock>vec_swap,vector<double> tempLoad,double memLimit,string mode);
+  //block selection algo
+  vector<SwapBlock> SelectBlock(vector<SwapBlock>vec_swap,vector<double> temp_load,double mem_limit,string mode);
 
   //schedule algo
-  void swap_sched(vector<SwapBlock>&vec_swap_selct, vector<double>&vec_load_temp,double &overhead,double memLimit,string mode);
+  void Scheduling(vector<SwapBlock>&vec_swap_selct, vector<double>&vec_load_temp,double &overhead,double mem_limit,string mode);
   
-  //make tables Table_sched and Table_meta
-  void swap_construct_tables(vector<SwapBlock>vec_swap_selct);
+  //make tables table_sched and table_meta
+  void BuildMetaTables(vector<SwapBlock>vec_swap_selct);
 
-  //update Table_meta, during Append()
-  void swap_update_tables(Block* tempBlock_);
+  //update table_meta, during Append()
+  void UpdateMetaTables(Block* block_ptr);
 
   //swap/sync during Append()
   void DeploySwap();
 
   //exec DelpoySwap
-  void DeploySwap_exec(int r_gc);
-
-
+  void DeploySwapExec(int relative_counter);
 
   //load profile as per synchronous swap.
-  vector<double> swap_load_ideal(vector<double>vec_load,vector<SwapBlock> vec_swap_selct);
+  vector<double> GetIdealLoad(vector<double>vec_load,vector<SwapBlock> vec_swap_selct);
   
-  //in case gpu ptr wrong. TODO(junzhe) to verify if needed.
-  void* GetRealGpuPtr(const Block* block_) override;
+  //in case the gpu ptr is wrong, update it ad hoc after swap in
+  void* UpdateGpuPtr(const Block* block_ptr) override;
 
-  void SwapOut(const Block* block_) override;
-  void SwapIn(const Block* block_) override;
+  //Swap Synchronous, for early iterations
+  void SwapOutSynchronous(const Block* block_ptr);
+  void SwapInSynchronous(const Block* block_ptr);
 
-  //changed to intake data_ instead
-  void SwapOut_idx(const int r_idx);
-  void SwapIn_idx(const int r_idx);
+  //Swap asynchronous, for middle iterations
+  void SwapOut(const int idx);
+  void SwapIn(const int idx);
 
  private:
   void Setup();
-  ///Tables needed
-  //r_idx->BlockMeta
-  map<int,BlockMeta>Table_meta;
-  map<const Block*,BlockMeta>Table_block_meta; //TODO(junzhe) for measure speed only.
-  map<const Block*, int>Table_not_at_device;  //int refers to its r_idx of the block/meta
-  //map<const Block*, size_t>Table_block_size;  //Table block_ -> size TODO(junzhe) no need, can call block_->size()
-
-  //schedule: idx--> r_idx, dir; sync_r_idx,dir. int 0 means D2H, 1 means H2D.
-  map<int,std::tuple<int,int,int,int>>Table_sched; // changed to with sync_r_idx
 
-  // vector<SwapBlock>vec_swap_selct_global;
+  map<int,BlockMeta>table_meta;
+  map<const Block*,BlockMeta>table_block_meta; //for measure speed only.
+  map<const Block*, int>table_not_at_device;  //int refers to its r_idx of the block/meta
+  map<int,std::tuple<int,int,int,int>>table_sched; // changed to with sync_r_idx
 
   //vec_block
-  vector<string>vec_block; //iteration 0-3
-  vector<string>vec_block_fresh; //iteration 4 5 6
-  vector<string>vec_block_mf; //itr 8 9 10
-  vector<double>global_load; // from begining
-  vector<double>origin_load; //vec_load 3 itr. TODO(junzhe) to delete vec_load, global_load after use.
-  vector<onePieceMsg>vec_run;
-  vector<int>opsSequence; //sequence of operations of one middle iteration
-  vector<size_t>sizeSequence; //size of all operations of one middle iteration
-  int asyncSwapFlag = 0; //0 for sync, 1 for async.
-  int testFlag = 0; //0 means open for test, 1 means no need test anymore.
-  int gc = 0; //global counter, index, add 1 after each Malloc/Free/read/write.
-  int globeCounter = -1;
-  int maxLen = 0;
-  int location = 0;
-  int three_more_location = 0; //location at 3 more iterations later.
-  int three_more_globeCounter = -1; //
-  //design requirement TODO(junzhe)
-  float memLimit_ratio = 0.70; 
+  vector<string>vec_block; //iterations for Detection, i.e. detect iterations.
+  vector<string>vec_block_fresh; //iterations that are used for Planning,
+  vector<string>vec_block_mf; //iterations used to construct pool
+  vector<double>global_load; // load from beginning
+  vector<double>origin_load; //3 iteration load, for planning.
+  vector<DeviceOptInfo>vec_run;
+  vector<int>operation_sequence; //sequence of operations of one middle iteration
+  vector<size_t>size_sequence; //size of all operations of one middle iteration
+
+  int async_swap_flag = 0; //0 for sync, 1 for async.
+  int past_test_flag = 0; //0 means need to test, 1 means no need to test anymore.
+  int global_index = 0; //global counter, index, add 1 after each Malloc/Free/read/write.
+  int global_index_threshold = -1;
+  int iteration_length = 0;
+  int location_of_2nd_iteration = 0; //index of start of 2nd iteration
+  int location_of_5th_iteration = 0; //index of start of 5th iteration
+  int three_more_iteration_global_index_threshold = -1;
+
+  //design specs
+  float mem_limit_ratio = 0.70; 
   size_t smallest_block = 1<<20; //1 MB
   int data_buffer = 4; // used to control readyIdx
   int mutable_data_buffer = 6;
-  double maxLoad;
-  int maxIdx;
-  double total_swapInTime = 0;
-  double total_swapOutTime = 0;
-  double tempTime = 0;
-  double tempTime2 = 0;
-  double tempTime_baseline; //vec_run[0] time
-  int maxLen_threshold = 1000;
+  double max_load;
+  int max_idx;
+  double total_swap_in_time = 0;
+  double total_swap_out_time = 0;
+  double temp_time = 0;
+  double temp_time_baseline; //vec_run[0] time
+  int iteration_length_threshold = 1000;
 
  private:
   shared_ptr<DeviceMemPool> pool_;
@@ -447,11 +438,9 @@ class OpenclDevice : public singa::Device {
   /// Converts the void pointer into a Buffer object, then deletes the object.
   /// This has the effect of freeing up device memory.
   void Free(void* ptr) override;
-  void MakeMetaTable(Block* block,void* data_,int size) override {}
-  void Append(string blockInfo) override {}
-  void* GetRealGpuPtr(const Block* block_) override {}
-  void SwapOut(const Block* block_) override {}
-  void SwapIn(const Block* block_) override {}
+  void AppendAfterMalloc(Block* block,void* data_ptr,int size) override {}
+  void Append(string block_info) override {}
+  void* UpdateGpuPtr(const Block* block_ptr) override { return nullptr; } //no-op stub; returns nullptr instead of flowing off the end of a non-void function
 
 
 private:
diff --git a/include/singa/core/memory.h b/include/singa/core/memory.h
index 343b4449de..b3dfd672ec 100644
--- a/include/singa/core/memory.h
+++ b/include/singa/core/memory.h
@@ -53,8 +53,6 @@ class DeviceMemPool {
 
   virtual void PoolOpt(vector<string> &vec_mf) = 0;
   
-  virtual void SwapOut(void* data_) = 0;
-  virtual void SwapIn(void* data_) = 0;
   /// Return a pair for free and total memory managed by this pool.
   virtual std::pair<size_t, size_t> GetMemUsage() {
     return std::make_pair(0u, 0u);
@@ -80,8 +78,6 @@ class CnMemPool : public DeviceMemPool {
 
   void PoolOpt(vector<string> &vec_mf) override {}
     
-  void SwapOut(void* data_) override {}
-  void SwapIn(void* data_) override {}
   std::pair<size_t, size_t> GetMemUsage() override;
 
   // release all memory and set cnmem manager to unintialized
@@ -110,135 +106,116 @@ class CudaMemPool : public DeviceMemPool {
 
   void PoolOpt(vector<string> &vec_mf) override {}
 
-  void SwapOut(void* data_) override {}
-  void SwapIn(void* data_) override {}
 };
 
-//for SmartMemPool
-struct lookUpElement{
-    /*
-     for memory pool Malloc look-up table.
-     */
-    int r_idx;
-    int d_idx;
-    size_t size;
-    size_t offset;
-    void* ptr;
-    int Occupied; //0 is free, 1 is occupied.
-    int crossItr; 
-    int Occupied_backup; 
+//for SmartMemPool and SwapPool
+struct PoolBlockMeta{
+  /*
+   for memory pool Malloc look-up table.
+   */
+  int r_idx;
+  int d_idx;
+  size_t size;
+  size_t offset;
+  void* ptr;
+  int occupied; //0 is free, 1 is occupied.
+  int cross_iteration; 
+  int occupied_backup; 
+};
+
+///struct Vertex
+struct Vertex{
+  int name;
+  size_t size;
+  int r; //arrive
+  int d; //depart
+  int cross_iteration =0;
+  pair<size_t, size_t> color_range;
+  vector<pair<size_t, size_t>> vec_color_preoccupied;
+  Vertex(int n, size_t s, int r1, int d1):name(n),size(s),r(r1),d(d1){}
+
 };
 
-///class mem-pool SmartMemPool
+
+///SmartMemPool
 class SmartMemPool: public DeviceMemPool {
 public:
-    SmartMemPool(const MemPoolConf &conf); //constructor
-    //TODO(junzhe) in Singa, void Malloc( void**, size_t); change to cudaMalloc and cudaFree.
-    void Malloc(void** ptr, const size_t size);
-    void Free(void* ptr);
-    ~SmartMemPool();
-    void getMaxLoad(void);
-    std::pair<size_t, size_t> GetMemUsage() override;
-    void Append(string blockInfo);
-
-    void PoolOpt(vector<string> &vec_mf) override {}
-    
-    void SwapOut(void* data_) override {}
-    void SwapIn(void* data_) override {}
-protected:
-    void Init();
-private:
-    MemPoolConf conf_;
-    // whether the (global) memory pool has been initialized
-    bool initialized_ = false;
-    // lock on the initialized variable
-    std::mutex mtx_;
-
-    string colorMethod;
-    int mallocFlag =0; //0 for cudaMalloc, 1 for coloringMalloc
-    int gc =0; //global counter each time Malloc/Free, add 1.
-    int globeCounter=-1;
-    int loadLogFlag =1; //record when its 1.
-    void* ptrPool = NULL;
-    int idxRange = 0;
-    size_t offset = 0;
-    size_t offsetCrossItr=0; //cross iteration offset.
-    int maxLen =0;
-    int location=0;
-    vector<string> vec;
-    vector<string> vec_block_RW;
-    vector<string> vec_block_RWMF;
-    map<int,int>Table_r2d; //full duration info, cross-iteration duration.
-    map<int,int>Table_d2r;
-    //map<int,lookUpElement>Table_r2Ver;
-    vector<pair<int,lookUpElement>>Vec_r2Ver; //b. replace Table_r2Ver
-    map<int, pair<size_t,size_t>>Table_load; //gc, <cudaLoad, colorLoad>
-    map<void*,size_t>Table_p2s; //For tracking load in Free. add when allocate, delete when deallocate.
-    map<void*,int>Table_p2r; //ptr for arrival idx, for look up Table during free
-    int checkPoint=300; //for reduce number of test.
-    size_t maxTotalLoad;
-    size_t maxMemUsage;
-    float memRatio;
-};
+  SmartMemPool(const MemPoolConf &conf); //constructor
+  void Malloc(void** ptr, const size_t size);
+  void Free(void* ptr);
+  ~SmartMemPool();
+  std::pair<size_t, size_t> GetMemUsage() override;
+  void GetMaxLoad(void);
+  void Append(string blockInfo);
+  vector<Vertex> Plan(vector<string>vec, int &idx_range, size_t &offset, size_t &offset_cross_iteration,string color_method);
+  int Detection(vector<string>vec_string_test, int &iteration_length, int &location_2nd_iteration);
 
+  void PoolOpt(vector<string> &vec_mf) override {}
 
-//for Swap
-struct swapLookUpElement{
-    /*
-    book keep the block info and status
-     */
-    void* data_ = nullptr;
-    void* realGpuPtr = nullptr;
-    void* realCpuPtr = nullptr;
+protected:
+  void Init();
+private:
+  MemPoolConf conf_;
+  // whether the (global) memory pool has been initialized
+  bool initialized_ = false;
+  // lock on the initialized variable
+  std::mutex mtx_;
 
-    int location; //1 is at GPU, 2 is at CPU. 3 on the way C2G, 4 on the way G2C.
-    size_t size; //size may used as of now.
+  string color_method;
+  int malloc_flag = 0; //0 for cudaMalloc, 1 for coloringMalloc
+  int global_index = 0; //global counter each time Malloc/Free, add 1.
+  int global_index_threshold = -1;
+  int load_flag = 1; //record load at 1
+  void* ptr_pool = NULL;
+  int idx_range = 0;
+  size_t offset = 0;
+  size_t offset_cross_iteration = 0; //cross iteration offset.
+  int iteration_length = 0;
+  int location_2nd_iteration = 0;
+  vector<string> vec;
+  vector<string> vec_block_rw; //read write only opt info
+  vector<string> vec_block_rw_mf; //read write, malloc, free opt info
+  map<int,int>table_ridx_to_didx; //table match from r_idx to d_idx
+  map<int,int>table_didx_to_ridx; //table match from d_idx to r_idx
+
+  vector<pair<int,PoolBlockMeta>>vec_block_meta; //vec of block meta, index in the vector referring to the r_idx
+  map<int, pair<size_t,size_t>>table_load; //global_index, <cudaLoad, colorLoad>
+  map<void*,size_t>table_ptr_to_size; //for tracking load in Free. add when allocate, delete when deallocate.
+  map<void*,int>table_ptr_to_ridx; //ptr for arrival idx, for look up Table during free
+  int check_point = 300; //to reduce the number of tests.
+  size_t max_total_load;
+  size_t max_mem_usage;
 };
 
-struct SwapMeta{
-    /*
-     for copy between block and info.
-     */
-    size_t swapSize;
-    void* ptr;
-    void* d_ptr; //not used for
-};
 
+///SwapPool 
 class SwapPool : public DeviceMemPool {
 public:
-    SwapPool(const MemPoolConf &conf); //constructor
-    //TODO(junzhe) in Singa, void Malloc( void**, size_t); change to cudaMalloc and cudaFree.
-    void Malloc(void** ptr, const size_t size);
-    void Free(void* ptr);
-    ~SwapPool();
-    void getMaxLoad(void);
-    std::pair<size_t, size_t> GetMemUsage() override;
-    void Append(string blockInfo);
-    
-    void SwapOut(void* data_);
-    void SwapIn(void* data_);
+  SwapPool(const MemPoolConf &conf); //constructor
+  void Malloc(void** ptr, const size_t size);
+  void Free(void* ptr);
+  ~SwapPool();
+  std::pair<size_t, size_t> GetMemUsage() override;
+  void Append(string blockInfo);
 
-    //PoolOpt() construct pool based on MF info after Swap constructed.
-    void PoolOpt(vector<string> &vec_mf);
+  //PoolOpt() construct pool based on MF info after Swap constructed.
+  void PoolOpt(vector<string> &vec_mf);
 protected:
-    void Init();
+  void Init();
 private:
-    MemPoolConf conf_;
-    // whether the (global) memory pool has been initialized
-    bool initialized_ = false;
-    // lock on the initialized variable
-    std::mutex mtx_; 
-    vector<string> vec_block;
-    size_t swapLimit = 1<<23; //8MB
-    int poolFlag = 0;
-    int pc = 0;
-    int maxLen_mf = 0;
-    void* ptrPool = nullptr;
-    map<void*,int>Table_p2r; //ptr for arrival idx, for look up Table during free
-    map<int,lookUpElement>Table_r2v; //r-> vertex
-    vector<pair<int,lookUpElement>>Vec_r2Ver; //Table_r2Ver No need anymore, replaced by Table_r2v TODO(junzhe)
-    // map<void*,swapLookUpElement>Table_id2LookUpElement; //old TODO(junzhe) remove
-    // map<void*,pair<SwapMeta,SwapMeta>>Table_Meta;
+  MemPoolConf conf_;
+  // whether the (global) memory pool has been initialized
+  bool initialized_ = false;
+  // lock on the initialized variable
+  std::mutex mtx_; 
+
+  vector<string> vec_block;
+  int pool_flag = 0;
+  int pool_index = 0; //like global counter in device class
+  int iteration_length_mf = 0; //max length of malloc free operation sequences.
+  void* ptr_pool = nullptr;
+  map<void*,int>table_ptr_to_ridx; //map ptr to arrival idx, for look up Table during free
+  map<int,PoolBlockMeta>table_pool_meta; //table of pool block meta, key with r_idx
 };
 
 #endif
diff --git a/src/.DS_Store b/src/.DS_Store
deleted file mode 100644
index a87953d857c6cf9354df29d312f03c027a6411d3..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 8196
zcmeHMTWl0n82<mZz?p&4Qv}+&E?p^AE-h?Z1d4#$n^+2#*rhE6y3Fp3?ZoNKvNOAF
zDK(WuqsC~wBqpNK5Z`LT6N&M~#Cz0uiH*0YG4ahO<C`&F{xfH`DU?3=q6X$9bG~!_
z^Z#@9|IRmaa%KSl+w*!8Ks5j`x&-o-R9&Npad9mwC4r@k5Gfwg<8C%>Ice&DxX3#+
zgbaiXgbaiXgbaiX{2v&gJzEsE#J(>_!#ZRjWZ;HmK)fGfbO}rZI3v-2bx`FW0SLtr
zz%SJ1qCpr7FcIL4L>i%jGL%q;VsOMj8BX?C;Fkz+MxqP{3=SU{%#6Va1^wCS{xRPj
zFd;FlLk2<y7Be6opEAgR1354jp5N11%XGbX{3|ReEnBvH1ydBJl(SQPqt1Arr+Im|
zUCW*Dd!}xrgLcw!togQ<?l+C`6kprn*q&zEMs9)o$pO>Wo$+=jZ|kmqpQD;LO0oV#
zDPA)*)zY-5A=b3JWx63Y)f8`zH^g?f?3$ittg60gfA{IJ$+J`ErXPGkCWXO^0V=La
z^1{LlEw|7izfhDsU(E39`3%*Sk?uo1YKpHZ&=2p;dZuH${YK6uOz{erQt!?<_R*Xn
z#s@Nvl^?U+6tB!^Rwi$0p3!Dmedbxi9rTO|Pqc<yC+Btej)Nw}3R{ZvyldQNP=Hx5
zIw{QMsLZZf7u|T<w)h=;@9LZ@UtPf~*Qn|cacg_#{icySHg4+P=yA=>7`ATOBLfA~
zHPe=nw6!s#5+@a9u6%t}b<NF@$WpF@`LuZ^Z+erwj<Gqux}sdIjohm6%j~31zv76a
zi<xwPBvKoxXY7QqcjOGJrCHq^*{X1jC0$BH-l9?6p>S78K0pg9Lz~r|jO)rVOG_J;
zH0@Sf8ShaB3X)A}+@~I3mDr#RW}LAx%F~tRwnzu#AFyFD_hHTTdb5U2(~Io7Y9o>3
z!rrFqhRztPOKg_w?bA?~h|8Wr+er;<r)}#VI0*)1VG^EzC*di$0I$L&xD0Q?`|vS*
z3ZKE}@GblZKf|x^JNyBE!e7Wx!IiiQYj7jh;U<h@EAGX8xF6fG2M^=D*o#R#g~NCn
zH5|b#o<SEqJckeCBls9Tj?dzY_!7R1uizzo9pA#Y@f~~@Kfy2XJNW}kigG<7`^6f+
zr`G%fNpcN7sFe&~ORmwbyAS?Ha{XqN)aXrX*KMfV+}ON(U)%gLUbS)yMda&73d0vn
zQ4s#}U8zDTW;J?@bY)B|rD?uGD2UAo@7h>2%7t`fcT+ueOGw?ayJc%s<)x%u#dqGm
zLyhufBvU!svQt%g8R=M#?rEjb<)m0PS5sBJUOk{D6!DO8@Pn25sLGUouD**f3va-C
z@DZu*OZWkPf?wb_Qrt4Ez$(()Ef~cOcsp*vZP<W~xC{4?^6n(<b>Ttm#zUmPB=+MV
z9>*cfph4;zMH6$Rz5<Tp1U`i4eHA{6PvbMB!{<nc7jgC~)pZy3CJ|p-sJcR5&bAzT
zglt8XUg;k8t_ucAWfs-q{NJ_o_y6l6*D#5Yfslb~Gk|5CeVy$zw$$5RoV8<gou*3^
x;Ws1E524CGjuZOFal-jO45=L>R}qs4a7H3osQl|60>b^DSP%DqUwk)k^*1Ld+b{qC

diff --git a/src/core/.DS_Store b/src/core/.DS_Store
deleted file mode 100644
index 018d123c47540aef56047ebb82a09e1df9135f84..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 6148
zcmeHKK~BUl476c|7IE1l$9!Rb5Ut7!_5(<*1P4m@z~1``zQ!B)1v9omDnwk6(1PqL
z87H>WWU3~yi0JM(Y(+LAvPK~)m4>jpX*vt%6;N%ANgf~Ww)=h>4~>CNv5RxRkX|Mk
z@wDf^u#Z#M4acWm<mLJG(w6keK74CG-2W+lPVt-NoRWcLAQ?yoe#QXKY|8q|G3#U?
z8At{`7~uX;P>3~fcC@Vnm8AecUZYK*GnZgY39Ny$BP<a0P@souq!{Yq=r7?{17}AM
z7megY<IV5Mi`v^Ueld5^>X>yhkPIvtIQQm;=l>0UnP!r|49Qk9kPQ4Y26WZ!yB!u4
zXX}^c@vJQ<w<r|)%~T-J7ncAuxQ|?$(d~sg{A%Fr$WhpD;lR8I7$I3D1HZt)8>^2g
AjsO4v

diff --git a/src/core/common/common.cc b/src/core/common/common.cc
index f3d144a9c8..d6e9c5a301 100644
--- a/src/core/common/common.cc
+++ b/src/core/common/common.cc
@@ -22,34 +22,28 @@
 #include <iostream>
 #include <fstream>
 #include <string>
-//TODO(junzhe) ifdef to counter verify
-///only include mutable_data() and data()
 
 namespace singa {
 
 void* Block::mutable_data() {
-    //TODO(junzhe) go back to enable it after device done
-    //std::cout<<"mutable_data() "<<this<<' '<<data_<<std::endl;
     initialized_ = true;
-    if (ptrDevice_!=nullptr){
-      //Append info.
+
+    //Append block info: opt_type, ptr, time_stamp
+    if (ptr_device_!=nullptr){
       stringstream strm2;
       strm2<<this;
-      string tempStr2 = strm2.str();
+      string temp_str2 = strm2.str();
       stringstream strm4;
       auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
       strm4<<t2;
-      string tempStr4 = strm4.str();
-      string temp = "Mutable "+tempStr2+" "+tempStr4;   
-      ptrDevice_->AppendInfo(temp);
+      string temp_str4 = strm4.str();
+      string temp = "Mutable "+temp_str2+" "+temp_str4;   
+      ptr_device_->AppendInfo(temp);
     }
-    //TODO(junzhe) this should not happen, can verify and remove
+
+    //update ptr after swap in done, if variable is not swapped back yet as expected.
     if (data_ == nullptr) {
-      //cout<<"to sleep"<<endl;
-      cout<<"before GetRealGpuPtr, block_ and data_: "<<this<<' '<<data_<<endl;
-      auto tempData_ = ptrDevice_->GetRealGpuPtrInfo(this);
-      cout<<"print returned tempData_ "<<tempData_<<endl;
-      // cout<<"slept to get data_ updated: (mutable_data) "<<this<<' '<<data_<<endl;
+      auto tempData_ = ptr_device_->UpdateGpuPtrInfo(this);
       return static_cast<char*>(tempData_) + offset_;
     }
     
@@ -59,41 +53,38 @@ void* Block::mutable_data() {
 
 const void* Block::data() const {
     CHECK(initialized_) << "Must initialize data before reading it";
-    //TODO(junzhe) go back to enable it after device done
-    if (ptrDevice_!=nullptr){
+
+    //Append block info: opt_type, ptr, time_stamp
+    if (ptr_device_!=nullptr){
       //Append info.
       stringstream strm2;
       strm2<<this;
-      string tempStr2 = strm2.str();
+      string temp_str2 = strm2.str();
       stringstream strm4;
       auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
       strm4<<t2;
-      string tempStr4 = strm4.str();
-      string temp = "Read "+tempStr2+" "+tempStr4;
-      ptrDevice_->AppendInfo(temp);
+      string temp_str4 = strm4.str();
+      string temp = "Read "+temp_str2+" "+temp_str4;
+      ptr_device_->AppendInfo(temp);
     }
 
-    //TODO(junzhe) this should not happen, can verify and remove
+    //update ptr after swap in done, if variable is not swapped back yet as expected.
     if (data_ == nullptr) {
-      //cout<<"to sleep"<<endl;
-      cout<<"before GetRealGpuPtr, block_ and data_: "<<this<<' '<<data_<<endl;
-      auto tempData_ = ptrDevice_->GetRealGpuPtrInfo(this);
-      cout<<"print returned tempData_ "<<tempData_<<endl;
-      // cout<<"slept to get data_ updated: (data) "<<this<<' '<<data_<<endl;
+      auto tempData_ = ptr_device_->UpdateGpuPtrInfo(this);
       return static_cast<char*>(tempData_) + offset_;
     }
 
-
     return static_cast<char*>(data_) + offset_;
   }
 
 void* Block::get_data() {
+  //get data without calling data(), to avoid append block info.
   return data_;
 }
 
 void Block::update_data(void* data_new) {
+  //update data_, after the swap in completes.
   data_ = data_new;
-  std::cout<<"results update_data:: "<<this<<' '<<data_<<std::endl;
 }
 
 
diff --git a/src/core/device/cuda_gpu.cc b/src/core/device/cuda_gpu.cc
index 52d4b4fb02..7ec8a9deb5 100644
--- a/src/core/device/cuda_gpu.cc
+++ b/src/core/device/cuda_gpu.cc
@@ -127,17 +127,10 @@ void CudaGPU::Append(string blockInfo){
     pool_->Append(blockInfo);
 }
 
-void* CudaGPU::GetRealGpuPtr(const Block* block_){
+void* CudaGPU::UpdateGpuPtr(const Block* block_){
   return nullptr;
 }
 
-void CudaGPU::SwapOut(const Block* block_){
-  
-}
-
-void CudaGPU::SwapIn(const Block* block_){
-  
-}
 
 }  // namespace singa
 #endif  // USE_CUDA
\ No newline at end of file
diff --git a/src/core/device/device.cc b/src/core/device/device.cc
index b2988f3615..59faddc5c6 100644
--- a/src/core/device/device.cc
+++ b/src/core/device/device.cc
@@ -41,9 +41,7 @@ Block* Device::NewBlock(int size) {
   if (size > 0) {
     void* ptr = Malloc(size);
     Block* block_ = new Block(ptr, size,0,this);
-    //std::cout<<"(reference) from device.cc after, data_, block_ device: "<<ptr<<" "<<block_<<' '<<this<<std::endl;
-    MakeMetaTable(block_,ptr,size); // make table and append vec_block.
-    //cout<<"NewBlock: "<<block_<<' '<<ptr<<endl;
+    AppendAfterMalloc(block_,ptr,size); // make table and append vec_block.
     return block_;
   } else {
     return nullptr;
@@ -53,22 +51,18 @@ Block* Device::NewBlock(int size) {
 // TODO(wangwei) return Block to the memory manager
 void Device::FreeBlock(Block* block) {
   if (block != nullptr) {
-    //TODO(junzhe) to merge it
     auto tempPtr = block->mutable_data();
-    //cout<<"FreeBlock: "<<block<<' '<<tempPtr<<endl;
     Free(tempPtr);
-    //cout<<"SwapGPU::Free() returned"<<endl;
-    //Free(block->mutable_data());
     
-    //Add Append for free here.
+    //append block info for free operation.
     stringstream strm1;
     strm1<<block;
-    string tempStr1 = strm1.str();
+    string temp_str1 = strm1.str();
     stringstream strm4;
     auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
     strm4<<t2;
-    string tempStr4 = strm4.str();
-    string blockInfo ="Free "+tempStr1+" "+tempStr4;
+    string temp_str4 = strm4.str();
+    string blockInfo ="Free "+temp_str1+" "+temp_str4;
     Append(blockInfo);
 
     delete block;
@@ -79,17 +73,10 @@ void Device::AppendInfo(string blockInfo){
   Append(blockInfo);
 }
 
-void* Device::GetRealGpuPtrInfo(const Block* block_){
-  return GetRealGpuPtr(block_);
+void* Device::UpdateGpuPtrInfo(const Block* block_){
+  return UpdateGpuPtr(block_);
 }
 
-void Device::SwapOutInfo(const Block* block_){
-  SwapOut(block_);
-}
-
-void Device::SwapInInfo(const Block* block_){
-  SwapIn(block_);
-}
 
 void Device::CopyDataToFrom(Block* dst, Block* src, size_t nBytes,
                             CopyDirection direct, int dst_offset,
diff --git a/src/core/device/swap_gpu.cc b/src/core/device/swap_gpu.cc
index cd4f1fdbd9..85a4061f30 100644
--- a/src/core/device/swap_gpu.cc
+++ b/src/core/device/swap_gpu.cc
@@ -40,175 +40,162 @@ const cudaMemcpyKind copyKind[] = {cudaMemcpyHostToHost, cudaMemcpyHostToDevice,
                                    cudaMemcpyDeviceToHost,
                                    cudaMemcpyDeviceToDevice};
 
-///functions to be used
-///Section for structs and respective sorting function:
-
-
-
-struct less_than_ptrIdx{
+struct sort_by_ptr_idx_ascending{
     /*
-     sort onePieceMsg by ptr and then idx.
+     sort DeviceOptInfo by ptr and then idx.
      */
-    inline bool operator() (const onePieceMsg& struct1, const onePieceMsg& struct2)
+    inline bool operator() (const DeviceOptInfo& struct1, const DeviceOptInfo& struct2)
     {
         return ((struct1.ptr<struct2.ptr)||((struct1.ptr==struct2.ptr)&&(struct1.idx<struct2.idx)));
     }
 };
 
 
-struct oneIterMsg{
+struct DeviceOptSimplifiedInfo{
     /*
-     members: [idx, MallocFree, size_delta]
+     members: [idx, operation_type, size_delta]
      */
-    size_t size_delta;// type as size_t in case size if large.
-    int MallocFree;
+    size_t size_delta; //size if Malloc, else: delta to last index
+    int operation_type;
     int idx;
-    oneIterMsg(size_t s, int M, int i):size_delta(s),MallocFree(M),idx(i){}
+    DeviceOptSimplifiedInfo(size_t s, int M, int i):size_delta(s),operation_type(M),idx(i){}
 };
 
 
-struct less_than_iterIdx{
+struct sort_by_DeviceOptSimplifiedInfo_idx_ascending{
     /*
-     sort oneIterMsg by Idx.
+     sort DeviceOptSimplifiedInfo by Idx.
      */
-    inline bool operator() (const oneIterMsg& struct1, const oneIterMsg& struct2)
+    inline bool operator() (const DeviceOptSimplifiedInfo& struct1, const DeviceOptSimplifiedInfo& struct2)
     {
         return (struct1.idx<struct2.idx);
     }
 };
 
-struct less_than_lookupIdx{
-    /*
-     sort lookUpElement by idx.
-     */
-    inline bool operator() (const lookUpElement& struct1, const lookUpElement& struct2)
-    {
-        return (struct1.r_idx<struct2.r_idx);
-    }
-};
 
+vector<string> SplitOptString(string s, string delimiter) {
+  // string delimiter
+  size_t pos_start = 0, pos_end, delim_len = delimiter.length();
+  string token;
+  vector<string> res;
+  while ((pos_end = s.find(delimiter, pos_start)) != string::npos) {
+    token = s.substr(pos_start, pos_end - pos_start);
+    pos_start = pos_end + delim_len;
+    res.push_back(token);
+  }
+  res.push_back(s.substr(pos_start));
 
-/// string delimiter
-vector<string> swap_split(string s, string delimiter) {
-    size_t pos_start = 0, pos_end, delim_len = delimiter.length();
-    string token;
-    vector<string> res;
-    while ((pos_end = s.find(delimiter, pos_start)) != string::npos) {
-        token = s.substr(pos_start, pos_end - pos_start);
-        pos_start = pos_end + delim_len;
-        res.push_back(token);
-    }
-    res.push_back(s.substr(pos_start));
-    return res;
+  return res;
 }
 
-///Section of converting text file -->vector of Sring --> pieceMsg -->pairMsg -->iterMsg
-//vector of pairMsg is used in run.
-//vector of iterMsg is used in test.
 
-vector<onePieceMsg> swap_strVec_2_pieceMsgVec(vector<string> vec, int &idxRange){
+vector<DeviceOptInfo> DeviceOptSeqStrToStruct(vector<string> vec, int &idx_range){
     /*
-     convert vector of string into vector of onePieceMsg, sorted by ptr 
-     and then idx, and update idxRange to pieceMsgVec size.
-     format of onePieceMsg [ptr, size/-1, flag, idx, timestamp]
+     convert vector of string into vector of DeviceOptInfo, sorted by ptr 
+     and then idx, and update idx_range to the size of vec_opt_info.
+     format of DeviceOptInfo [ptr, size/-1, flag, idx, timestamp]
      flag: 1 for malloc, -1 for free, 2 for read, 3 for layer,4 for mutable
-     version on 5/29, with equval blockInfo length: flag, block_, size, t
-     */
-    vector<onePieceMsg>onePieceMsgVec_;
+    */
+    vector<DeviceOptInfo>vec_opt_info;
 
     for (int i=0;i<vec.size();i++) {
-      vector<string> v = swap_split(vec[i], " ");
-      int MallocFree;
+      vector<string> v = SplitOptString(vec[i], " ");
+      int operation_type;
       if (v[0]=="Malloc"){
-        MallocFree = 1;
+        operation_type = 1;
       }else if (v[0]=="Free"){
-        MallocFree = -1;
+        operation_type = -1;
       }else if (v[0]=="Mutable"){
-        MallocFree = 4;
+        operation_type = 4;
       }else if (v[0]=="Read"){ 
-        MallocFree = 2;
+        operation_type = 2;
       }else if (v[0]=="Layer"){
-        MallocFree = 3;
+        operation_type = 3;
       }
-      //onePieceMsg(string p, size_t s, int M, int i):ptr(p),size(s),MallocFree(M),idx(i){}
+      //DeviceOptInfo(string p, size_t s, int M, int i):ptr(p),size(s),operation_type(M),idx(i){}
       size_t result;
       stringstream convert(v[2]);
       if (!(convert>>result)){
-          result =-1;
-          cout<<"error for converting size from str to int."<<endl;
+        result =-1;
+        cout<<"error for converting size from str to int."<<endl;
       }
-      onePieceMsg tempMsg(v[1],result, MallocFree, i);
-      double tempTime;
+      DeviceOptInfo itm(v[1],result, operation_type, i);
+      double temp_time;
       stringstream convert2(v[3]);
-      convert2>>tempTime;
-      tempMsg.t =tempTime;
-      onePieceMsgVec_.push_back(tempMsg);
+      convert2>>temp_time;
+      itm.t =temp_time;
+      vec_opt_info.push_back(itm);
     }
  
-    sort(onePieceMsgVec_.begin(),onePieceMsgVec_.end(),less_than_ptrIdx());
-    idxRange = static_cast<int>(onePieceMsgVec_.size());
-
-    return onePieceMsgVec_;
-}// end of strVec_2_pieceMsgVec function
-
-
-vector<size_t> Swap_piece2rep (vector<onePieceMsg>onePieceMsgVec_){
-    vector<oneIterMsg>oneIterMsgVec_;
-    string tempStr;
-    int tempIdx=0;
-    for (int i=0;i<onePieceMsgVec_.size();i++){
-        if (onePieceMsgVec_[i].MallocFree==1){
-            //update tempStr and idx.
-            tempStr = onePieceMsgVec_[i].ptr;
-            tempIdx = onePieceMsgVec_[i].idx;
-            oneIterMsg tempMsg(onePieceMsgVec_[i].size,1,onePieceMsgVec_[i].idx);
-            oneIterMsgVec_.push_back(tempMsg);
-        } else {
-            oneIterMsg tempMsg(onePieceMsgVec_[i].idx-tempIdx,onePieceMsgVec_[i].MallocFree,onePieceMsgVec_[i].idx);
-            tempIdx = onePieceMsgVec_[i].idx;
-            oneIterMsgVec_.push_back(tempMsg);
-        }
-        //cout<<oneIterMsgVec_[i].size_delta<<' '<<oneIterMsgVec_[i].MallocFree<<' '<<oneIterMsgVec_[i].idx<<endl;
+    sort(vec_opt_info.begin(),vec_opt_info.end(),sort_by_ptr_idx_ascending());
+    idx_range = static_cast<int>(vec_opt_info.size());
+
+    return vec_opt_info;
+}
+
+
+vector<size_t> DeviceOptSeqRepeatableTestPreProcess(vector<DeviceOptInfo>vec_opt_info){
+  /*
+  pre process Device Operation Sequence Struct info for repeatable test,
+  return a vector of size_t for fast detection.
+  */
+  vector<DeviceOptSimplifiedInfo>vec_opt_simplified_info;
+  string temp_str;
+  int temp_idx=0;
+  for (int i=0;i<vec_opt_info.size();i++){
+    if (vec_opt_info[i].operation_type==1){
+      //update temp_str and idx.
+      temp_str = vec_opt_info[i].ptr;
+      temp_idx = vec_opt_info[i].idx;
+      DeviceOptSimplifiedInfo itm(vec_opt_info[i].size,1,vec_opt_info[i].idx);
+      vec_opt_simplified_info.push_back(itm);
+    } else {
+      DeviceOptSimplifiedInfo itm(vec_opt_info[i].idx-temp_idx,vec_opt_info[i].operation_type,vec_opt_info[i].idx);
+      temp_idx = vec_opt_info[i].idx;
+      vec_opt_simplified_info.push_back(itm);
     }
+  }
     
-    sort(oneIterMsgVec_.begin(),oneIterMsgVec_.end(),less_than_iterIdx());
-    //only after sort then can create rep.
-    vector<size_t>rep; // vector of size_delta, name it as rep for simlisity.
-    for (int i =0; i<oneIterMsgVec_.size(); i++){
-        rep.push_back(oneIterMsgVec_[i].size_delta);
-        //cout<<rep[i]<<endl;
-    }
-    cout<<"rep size: "<<rep.size()<<endl;
-    return rep;
+  sort(vec_opt_simplified_info.begin(),vec_opt_simplified_info.end(),sort_by_DeviceOptSimplifiedInfo_idx_ascending());
+  //vec_rep can only be created after the sort.
+  vector<size_t>vec_rep; // vector of size_delta, named vec_rep for simplicity.
+  for (int i =0; i<vec_opt_simplified_info.size(); i++){
+    vec_rep.push_back(vec_opt_simplified_info[i].size_delta);
+  }
+  return vec_rep;
 }
-void repPatternDetector(vector<size_t>rep, int &maxLen, int &location, int maxLen_threshold, int gc ){
-    int idxRange = (int)rep.size();
-    int threshold = maxLen_threshold;
-    vector<pair<int,int>>maxLen_location;
-    
-    for (int i=0; i<idxRange;i++){
-        if (maxLen>threshold){
-            break;
-        }
-        for (int len=1; len<(idxRange-i);len++){
-            if (maxLen>threshold){
-                break;
-            }
-            if((equal(rep.begin()+i,rep.begin()+i-1+len,rep.begin()+i+len))&&(maxLen<len)) {
-                maxLen = len;
-                location = i;
-                maxLen_location.push_back(make_pair(maxLen,location));
-                // cout<<"maxLen increased, lcoation and maxLen: ("<<location<<","<<maxLen<<")"<<endl;
-            }
-        }
+void RepeatableTest(vector<size_t>rep, int &iteration_length, int &location_of_2nd_iteration, int iteration_length_threshold, int global_index ){
+  /*
+  repeatable test, input vector of size_t,
+  in-place update iteration_length (length of iteration)
+  and location_of_2nd_iteration (where 2nd iteration starts)
+  */
+  int idx_range = (int)rep.size();
+  int threshold = iteration_length_threshold;
+  vector<pair<int,int>>iteration_length_location_of_2nd_iteration;
+  
+  for (int i=0; i<idx_range;i++){
+    if (iteration_length>threshold){
+      break;
+    }
+    for (int len=1; len<(idx_range-i);len++){
+      if (iteration_length>threshold){
+        break;
+      }
+      if((equal(rep.begin()+i,rep.begin()+i-1+len,rep.begin()+i+len))&&(iteration_length<len)) {
+        iteration_length = len;
+        location_of_2nd_iteration = i;
+        iteration_length_location_of_2nd_iteration.push_back(make_pair(iteration_length,location_of_2nd_iteration));
+      }
     }
-}// end of repPatternDetector
+  }
+}
 
-struct less_than_Idx{
+struct sort_by_idx_ascending{
     /*
-     sort onePieceMsg by ptr and then idx.
+     sort DeviceOptInfo by idx.
      */
-    inline bool operator() (const onePieceMsg& struct1, const onePieceMsg& struct2)
+    inline bool operator() (const DeviceOptInfo& struct1, const DeviceOptInfo& struct2)
     {
         return (struct1.idx<struct2.idx);
     }
@@ -216,331 +203,328 @@ struct less_than_Idx{
 
 
 int SwapOutTime(size_t size){
-    int ans = 0; //TODO(junzhe) used to be 0.29; new param as per vgg
-    if (size==0) {ans = 47200;} else {ans = 0.0756 * size + 47200;}
-    return ans;
+  int ans = 0; 
+  //measured in 16 PCIe, pinned memory.
+  if (size==0) {ans = 47200;} else {ans = 0.0756 * size + 47200;}
+  return ans;
 }
 
 int SwapInTime(size_t size){
-    //yet to get the formula
-    int ans = 0; //TODO(junzhe) used to be 0.13; new param as per vgg
-    if (size==0) {ans = 9700;} else {ans = 0.0823 * size + 9700;}
-    return ans;
+  int ans = 0; 
+  //measured as per ncra ~ ncrd, 16 PCIe, pinned memory.
+  if (size==0) {ans = 9700;} else {ans = 0.0823 * size + 9700;}
+  return ans;
 }
 
-struct less_than_dto{
+struct sort_by_DOA_origin_descending{
   /*
-   sort SwapBlock by dto, descending
-   */
+  sort SwapBlock by DOA_origin, descending
+  */
   inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2)
   {
-    return (struct1.dto>struct2.dto);
+    return (struct1.DOA_origin>struct2.DOA_origin);
   }
 };
 
-struct less_than_wdto{
+struct sort_by_WDOA_descending{
   /*
-   sort SwapBlock by weighted dto, descending
+  sort SwapBlock by weighted DOA_origin, descending
+  */
+  inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2)
+  {
+    return (struct1.WDOA>struct2.WDOA);
+  }
+};
+
+struct sort_by_AOA_descending{
+  /*
+   sort SwapBlock by AOA, descending
    */
   inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2)
   {
-    return (struct1.wdto>struct2.wdto);
+    return (struct1.AOA>struct2.AOA);
   }
 };
 
-// struct less_than_r_idx_ready{
-//   /*
-//    sort SwapBlock by r_idx_ready, ascending
-//    */
-//   inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2)
-//   {
-//     return (struct1.r_idx_ready<struct2.r_idx_ready);
-//   }
-// };
-
-struct less_than_pri{
-    /*
-     sort SwapBlock by pri, descending
-     */
-    inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2)
-    {
-        return (struct1.pri>struct2.pri);
-    }
+struct sort_by_idx_ascending_swap{
+  /*
+  sort SwapBlock by r_idx, ascending.
+  */
+  inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2)
+  {
+    return (struct1.r_idx<struct2.r_idx);
+  }
 };
 
-struct less_than_Idx_Swap{
-    /*
-     sort onePieceMsg_Swap by idx.
-     */
-    inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2)
-    {
-        return (struct1.r_idx<struct2.r_idx);
-    }
+struct sort_by_idx_descending_swap{
+  /*
+  sort SwapBlock by d_idx, descending.
+  */
+  inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2)
+  {
+    return (struct1.d_idx>struct2.d_idx);
+  }
 };
 
-struct less_than_Idx_Swap_rvs{
-    /*
-     sort onePieceMsg_Swap by idx. reverse
-     */
-    inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2)
-    {
-        return (struct1.d_idx>struct2.d_idx);
-    }
+struct sort_by_majority_voting_ascending{
+  /*
+  sort SwapBlock by majority_voting, ascending
+  */
+  inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2)
+  {
+    return (struct1.majority_voting<struct2.majority_voting);
+  }
 };
 
 
-pair<int,int> load_over_limit(vector<double>vec_load, size_t memLimit, int start_idx, int end_idx,int maxLen){
-  //input: vec_load, memLimit, range [start_idx, end_idx)
-  //return range overlimit [first_over_limit, first_below_limit)
+pair<int,int> GetOptIdxAboveLoadLimit(vector<double>vec_load, size_t mem_limit, int start_idx, int end_idx,int iteration_length){
+  /*
+  get the operation index range in which the load is above the limit.
+  input: vec_load, mem_limit, range [start_idx, end_idx)
+  return range overlimit [first_over_limit, first_below_limit)
+  */
   int first_over_limit = start_idx;
   int first_below_limit = end_idx;
 
-  for (int i = start_idx+maxLen; i < end_idx+maxLen; i++){
-    if (vec_load[i] > memLimit){
-      first_over_limit = i-maxLen;
+  for (int i = start_idx+iteration_length; i < end_idx+iteration_length; i++){
+    if (vec_load[i] > mem_limit){
+      first_over_limit = i-iteration_length;
       break;
     }
   }
 
-  for (int i = end_idx+maxLen; i > first_over_limit+maxLen; i--){
-    if (vec_load[i] > memLimit){
-      first_below_limit = i-1-maxLen;
+  for (int i = end_idx+iteration_length; i > first_over_limit+iteration_length; i--){
+    if (vec_load[i] > mem_limit){
+      first_below_limit = i-1-iteration_length;
       break;
     }
   }
+
   if (first_over_limit == start_idx) first_over_limit = -1;
+  
   if (first_below_limit == end_idx) first_below_limit = -1;
 
   return std::make_pair(first_over_limit, first_below_limit);
 }
 
-// pair<int,int> load_below_limit(vector<double>vec_load, size_t memLimit, int start_idx, int end_idx, int maxIdx,int maxLen){
-//   //input: vec_load, memLimit, range [start_idx, end_idx]
-//   //return range overlimit [first_over_limit, first_below_limit)
-//   int first_below_limit = maxIdx;
-//   int last_below_limit = maxIdx;
-
-//   for (int i = first_below_limit+maxLen; i > start_idx+maxLen; i--){
-//     if (vec_load[i] > memLimit){
-//       first_below_limit = i+1-maxLen;
-//       break;
-//     }
-//   }
-
-//   for (int i = last_below_limit+maxLen; i < end_idx+maxLen; i++){
-//     if (vec_load[i] > memLimit){
-//       last_below_limit = i-1-maxLen;
-//       break;
-//     }
-//   }
-
-//   return std::make_pair(first_below_limit, last_below_limit);
-// }
-
-pair<double,int> load_peak(vector<double>vec_load_test,int maxLen){
-  double maxLoad_test = 0;
-  int maxIdx_test = 0;
-  for (int i = maxLen; i < maxLen*2; i++){
-    if (maxLoad_test < vec_load_test[i]){
-      maxLoad_test = vec_load_test[i];
-      maxIdx_test = i - maxLen;
+
+pair<double,int> GetLoadPeak(vector<double>vec_load_test,int iteration_length){
+  /*
+  return value and index of load peak
+  */
+  double max_load_test = 0;
+  int max_idx_test = 0;
+  for (int i = iteration_length; i < iteration_length*2; i++){
+    if (max_load_test < vec_load_test[i]){
+      max_load_test = vec_load_test[i];
+      max_idx_test = i - iteration_length;
     } 
   }
-  return std::make_pair(maxLoad_test,maxIdx_test);
+  return std::make_pair(max_load_test,max_idx_test);
 }
 
-void load_update(vector<double>& vec_load,int start_idx, int end_idx, int plusMinus, size_t size,int maxLen){
-  //update load [start_idx, end_idx) by plusMinus*size
-  for (int i = start_idx+maxLen; i<end_idx+maxLen; i++){
-    vec_load[i] = vec_load[i] + static_cast<double>(size) * plusMinus;
+void UpdateLoad(vector<double>& vec_load,int start_idx, int end_idx, int plus_minus, size_t size,int iteration_length){
+  /*
+  update load [start_idx, end_idx) by plus_minus*size
+  */
+  for (int i = start_idx+iteration_length; i<end_idx+iteration_length; i++){
+    vec_load[i] = vec_load[i] + static_cast<double>(size) * plus_minus;
   }
 }
 
-vector<SwapBlock> SwapGPU::swap_select(vector<SwapBlock>vec_swap,vector<double> tempLoad,double memLimit,string mode){
+
+///define SwapGPU member functions
+vector<SwapBlock> SwapGPU::SelectBlock(vector<SwapBlock>vec_swap,vector<double> temp_load,double mem_limit,string mode){
   vector<SwapBlock>vec_swap_selct;
-  //vector<SwapBlock>vec_swap_reject;
-  if (mode == "dto"){
-    sort(vec_swap.begin(),vec_swap.end(),less_than_dto());  
+  /*
+  select swapping blocks based on a certain priority score or BO score;
+  with load updated
+  */
+  if (mode == "DOA_origin"){
+    sort(vec_swap.begin(),vec_swap.end(),sort_by_DOA_origin_descending());  
   }
-  if (mode == "pri"){
-    sort(vec_swap.begin(),vec_swap.end(),less_than_pri());  
+
+  if (mode == "AOA"){
+    sort(vec_swap.begin(),vec_swap.end(),sort_by_AOA_descending());  
   }
-  if (mode == "wdto"){
-    //TODO(junzhe) time complexity
+
+  if (mode == "WDOA"){
     for (int i = 0; i < vec_swap.size(); i++){
       auto itm = vec_swap[i];
       for (int j = itm.r_idx; j < itm.d_idx; j++){
-        itm.wdto += origin_load[i+maxLen] - memLimit;
+        itm.WDOA += origin_load[i+iteration_length] - mem_limit;
       }
     }
-    sort(vec_swap.begin(),vec_swap.end(),less_than_wdto()); 
+    sort(vec_swap.begin(),vec_swap.end(),sort_by_WDOA_descending()); 
   }
-  cout<<"===============select block one by one================="<<endl;
+
+  if (mode == "majority_voting"){
+    //add order for DOA
+    sort(vec_swap.begin(),vec_swap.end(),sort_by_DOA_origin_descending()); 
+    for (int i = 0; i < vec_swap.size();i++){
+      vec_swap[i].majority_voting+=i;
+    }
+    //add order for AOA
+    sort(vec_swap.begin(),vec_swap.end(),sort_by_AOA_descending()); 
+    for (int i = 0; i < vec_swap.size();i++){
+      vec_swap[i].majority_voting+=i;
+    }
+    //add order for WDOA
+    for (int i = 0; i < vec_swap.size(); i++){
+      auto itm = vec_swap[i];
+      for (int j = itm.r_idx; j < itm.d_idx; j++){
+        itm.WDOA += origin_load[i+iteration_length] - mem_limit;
+      }
+    }
+    sort(vec_swap.begin(),vec_swap.end(),sort_by_WDOA_descending()); 
+    for (int i = 0; i < vec_swap.size();i++){
+      vec_swap[i].majority_voting+=i;
+    }
+    sort(vec_swap.begin(),vec_swap.end(),sort_by_majority_voting_ascending()); 
+  }
+
+
+
+  //select block one by one till updated peak load is no larger than limit.
   for (int i=0; i<vec_swap.size(); i++){
-    load_update(tempLoad,vec_swap[i].r_idx_ready,vec_swap[i].d_idx,-1,vec_swap[i].size,maxLen);
+    UpdateLoad(temp_load,vec_swap[i].r_idx_ready,vec_swap[i].d_idx,-1,vec_swap[i].size,iteration_length);
     vec_swap_selct.push_back(vec_swap[i]);
-    auto tempOverLimit_ = load_over_limit(tempLoad,memLimit,0,maxLen,maxLen);
-    cout<<vec_swap[i].r_idx_ready<<","<<vec_swap[i].d_idx<<" ((("<<tempOverLimit_.first<<","<<tempOverLimit_.second<<")))"<<endl;
-    auto max_current = load_peak(tempLoad,maxLen);
-    auto newMaxLoad = max_current.first;
-    if (newMaxLoad < memLimit){
+    auto temp_over_limit_ = GetOptIdxAboveLoadLimit(temp_load,mem_limit,0,iteration_length,iteration_length);
+    auto max_current = GetLoadPeak(temp_load,iteration_length);
+    auto newmax_load = max_current.first;
+    if (newmax_load < mem_limit){
       break;
     }
   }
-  cout<<"=============selection done================"<<endl;
   
   return vec_swap_selct;
 }
 
-vector<double> SwapGPU::swap_load_ideal(vector<double>vec_load,vector<SwapBlock> vec_swap_selct){
+vector<double> SwapGPU::GetIdealLoad(vector<double>vec_load,vector<SwapBlock> vec_swap_selct){
+  /*
+  get load_ideal, which is equivalent to load by synchronous swapping.
+  */
   auto vec_load_return = vec_load;
   for (int i =0; i<vec_swap_selct.size(); i++){
     int auto_buffer = 0;
     auto itm = vec_swap_selct[i];
     if (itm.cat == "A2") auto_buffer = data_buffer;
     if (itm.cat == "A3") auto_buffer = mutable_data_buffer;
-    load_update(vec_load_return, itm.r_idx+auto_buffer, itm.d_idx, -1, itm.size, maxLen);
+    UpdateLoad(vec_load_return, itm.r_idx+auto_buffer, itm.d_idx, -1, itm.size, iteration_length);
   }
   return vec_load_return;
 }
 
-void SwapGPU::swap_sched(vector<SwapBlock>&vec_swap_selct, vector<double>&vec_load_temp,double &overhead,double memLimit,string mode){
+void SwapGPU::Scheduling(vector<SwapBlock>&vec_swap_selct, vector<double>&vec_load_temp,double &overhead,double mem_limit,string mode){
   /*
-    update i1p, i2p and overhead time based on mode, such as no overhead or stick to limit.
+  Swap Scheduling algo
+  update idx_out_end, idx_in_start 
+  compute overhead time 
+  mode selection: no overhead or stick to limit.
   */ 
-  //TODO(junzhe) wordy, can merge in common part.
+
   overhead = 0;
-  cout<<"----------------swap_sched----------------"<<endl;
+
+  /// mode that stick to the mem_limit
   if (mode == "stick-to-limit"){
-    sort(vec_swap_selct.begin(),vec_swap_selct.end(),less_than_Idx_Swap()); 
+    sort(vec_swap_selct.begin(),vec_swap_selct.end(),sort_by_idx_ascending_swap()); 
     for (int i = 0; i<vec_swap_selct.size(); i++){
       auto itm = vec_swap_selct[i];
-      int readyIdx = itm.r_idx_ready;
-      cout<<itm.r_idx<<" ["<<itm.size<<"] ";
-      cout<<readyIdx;
+      int ready_idx = itm.r_idx_ready;
+
       if (i > 0){
-        readyIdx = std::max(readyIdx,vec_swap_selct[i-1].i1p);
+        ready_idx = std::max(ready_idx,vec_swap_selct[i-1].idx_out_end);
       }
-      cout<<" -> "<<readyIdx;
-      itm.i1 = readyIdx;
-      // cout<<"check t1((("<<itm.t1;
-      itm.t1 = vec_run[readyIdx+maxLen].t;
-      // cout<<" "<<readyIdx<<" "<<maxLen<<" || "<<vec_run[readyIdx+maxLen].t<<" "<<itm.t1<<")))";
-      itm.t1p = itm.t1 + SwapOutTime(itm.size);
-      total_swapOutTime+=SwapOutTime(itm.size);
-      while (itm.t1p > vec_run[readyIdx+maxLen].t){ //TODO(junzhe) reduce time complexity.
-        readyIdx++; //ready means when able to finish swapOut, w/ or w/o overhead.
+
+      itm.idx_out_start = ready_idx;
+      itm.t_out_start = vec_run[ready_idx+iteration_length].t;
+      itm.t_out_end = itm.t_out_start + SwapOutTime(itm.size);
+      total_swap_out_time+=SwapOutTime(itm.size);
+      while (itm.t_out_end > vec_run[ready_idx+iteration_length].t){ 
+        //ready means when able to finish swapOut, w/ or w/o overhead.
+        ready_idx++; 
       }
-      //get min compare with maxIdx and readyIdx.
-      readyIdx = std::min(maxIdx,readyIdx);
-      cout<<" || "<<readyIdx;
-      //TODO(junzhe) 1st should be desired.
-      load_update(vec_load_temp,readyIdx+1,itm.d_idx,-1,itm.size,maxLen);
-      // load_update(vec_load_temp,itm.r_idx_ready,itm.d_idx,-1,itm.size,maxLen);
-      auto tempOverLimit_ = load_over_limit(vec_load_temp,memLimit,0,maxLen,maxLen);
-      cout<<" ((("<<tempOverLimit_.first<<','<<tempOverLimit_.second<<"))) ";
-      if ((tempOverLimit_.first != -1) && (tempOverLimit_.first <= readyIdx)) { 
-        load_update(vec_load_temp,tempOverLimit_.first-1,readyIdx+1,-1,itm.size,maxLen);
-        // cout<<" ((("<<itm.r_idx<<' '<<itm.d_idx<<"||"<<itm.i1<<' '<<readyIdx<<' '<<tempOverLimit_.first<<")))";
-        readyIdx = tempOverLimit_.first - 1; //TODO(junzhe) boundary
-        overhead+=(itm.t1p-vec_run[readyIdx+maxLen].t);
-        cout<<"==== overhead added "<<itm.t1p-vec_run[readyIdx+maxLen].t<<"... ";
-        // cout<<"time spent "<<SwapOutTime(itm.size)<<endl;
-        // cout<<"so time "<<itm.t1<<endl;
-        // cout<<"eo 1 time "<<itm.t1p<<endl;
-        // cout<<"eo 2 time "<<vec_run[readyIdx+maxLen].t<<endl;
 
+      //clamp ready_idx so that it does not exceed max_idx.
+      ready_idx = std::min(max_idx,ready_idx);
+      UpdateLoad(vec_load_temp,ready_idx+1,itm.d_idx,-1,itm.size,iteration_length);
+      auto temp_over_limit_ = GetOptIdxAboveLoadLimit(vec_load_temp,mem_limit,0,iteration_length,iteration_length);
+      if ((temp_over_limit_.first != -1) && (temp_over_limit_.first <= ready_idx)) { 
+        UpdateLoad(vec_load_temp,temp_over_limit_.first-1,ready_idx+1,-1,itm.size,iteration_length);
+        ready_idx = temp_over_limit_.first - 1; 
+        overhead+=(itm.t_out_end-vec_run[ready_idx+iteration_length].t);
       }
-      cout<<" -> "<<readyIdx<<endl;   
-      itm.i1p = readyIdx;
-      vec_swap_selct[i] = itm;
-      // auto tempOverLimit_1 = load_over_limit(vec_load_temp,memLimit,0,maxLen,maxLen);
-      // cout<<"end: overlimit first and i1p "<<tempOverLimit_1.first<<' '<<itm.i1p<<endl;
-      
+      itm.idx_out_end = ready_idx;
+      vec_swap_selct[i] = itm; 
     }
-    cout<<"----------------sched part II-------------"<<endl;
-    sort(vec_swap_selct.begin(),vec_swap_selct.end(),less_than_Idx_Swap_rvs());
+
+    sort(vec_swap_selct.begin(),vec_swap_selct.end(),sort_by_idx_descending_swap());
     for (int i =0; i<vec_swap_selct.size(); i++){
       auto itm = vec_swap_selct[i];
-      cout<<itm.r_idx<<" ["<<itm.size<<"] ";
-      int needIdx = itm.d_idx;
-      cout<<needIdx;
-      if (i > 0){ needIdx = std::min(needIdx,vec_swap_selct[i-1].i2p); }
-      cout<<" -> "<<needIdx;
-      itm.i2 = needIdx;
-      double prepareTime = vec_run[needIdx+maxLen].t - SwapInTime(itm.size);
-      total_swapInTime+=SwapInTime(itm.size);
-      while (prepareTime < vec_run[needIdx+maxLen].t){
-        needIdx--;
+      int need_idx = itm.d_idx;
+      if (i > 0){ need_idx = std::min(need_idx,vec_swap_selct[i-1].idx_in_start); }
+      itm.idx_in_end = need_idx;
+      double prepareTime = vec_run[need_idx+iteration_length].t - SwapInTime(itm.size);
+      total_swap_in_time+=SwapInTime(itm.size);
+      while (prepareTime < vec_run[need_idx+iteration_length].t){
+        need_idx--;
       }
-      needIdx = std::max(needIdx,maxIdx+1);
-      cout<<" || "<<needIdx;
-      itm.i2p = needIdx;
-      itm.t2p = prepareTime;
-      // auto tempOverLimit_2 = load_over_limit(vec_load_temp,memLimit,0,maxLen,maxLen);
-      // cout<<"(((before come back (right over limit): "<<tempOverLimit_2.second<<endl;
-      load_update(vec_load_temp,itm.i2p,itm.d_idx,1,itm.size,maxLen); //TODO(junzhe) range, right boundary
-      auto tempOverLimit_3 = load_over_limit(vec_load_temp,memLimit,0,maxLen,maxLen);
-      cout<<" ((("<<tempOverLimit_3.first<<","<<tempOverLimit_3.second<<")))";
-      // cout<<"|||after come back (right over limit): "<<tempOverLimit_3.second<<endl;
-      // if (tempOverLimit_3.second > 0){
-      //   cout<<itm.r_idx<<' '<<itm.d_idx<<"||"<<itm.i1<<' '<<itm.i1p<<' '<<itm.i2p<<' '<<itm.i2<<")))"<<endl;
-      // }
-
-      if ((tempOverLimit_3.second != -1) && (vec_run[tempOverLimit_3.second+maxLen].t > itm.t2p)) {
-        overhead+=(vec_run[tempOverLimit_3.second+maxLen].t - itm.t2p);
-        cout<<"==== overhead added "<<vec_run[tempOverLimit_3.second+maxLen].t - itm.t2p<<"... ";
-        load_update(vec_load_temp,itm.i2p,tempOverLimit_3.second+1,-1,itm.size,maxLen); //TODO(junzhe) range, right boundary
-        itm.i2p = tempOverLimit_3.second+1;
-        auto tempOverLimit_4 = load_over_limit(vec_load_temp,memLimit,0,maxLen,maxLen);
+      need_idx = std::max(need_idx,max_idx+1);
+      itm.idx_in_start = need_idx;
+      itm.t_in_start = prepareTime;
+     UpdateLoad(vec_load_temp,itm.idx_in_start,itm.d_idx,1,itm.size,iteration_length); 
+      auto temp_over_limit_3 = GetOptIdxAboveLoadLimit(vec_load_temp,mem_limit,0,iteration_length,iteration_length);
+
+      if ((temp_over_limit_3.second != -1) && (vec_run[temp_over_limit_3.second+iteration_length].t > itm.t_in_start)) {
+        overhead+=(vec_run[temp_over_limit_3.second+iteration_length].t - itm.t_in_start);
+        UpdateLoad(vec_load_temp,itm.idx_in_start,temp_over_limit_3.second+1,-1,itm.size,iteration_length);
+        itm.idx_in_start = temp_over_limit_3.second+1;
+        auto temp_over_limit_4 = GetOptIdxAboveLoadLimit(vec_load_temp,mem_limit,0,iteration_length,iteration_length);
       }
-      cout<<" -> "<<itm.i2p<<endl;
-      //cout<<"after consider overlimit (right over limit): "<<tempOverLimit_4.second<<endl;
       vec_swap_selct[i] = itm;
     }
-    cout<<":::::END OF SCHED, overhead is "<<overhead<<endl;
   }///end of first mode.
 
 
-  ///this mode not really in use, for test purpose only.
+  ///mode that incurs zero overhead
   if (mode == "no-overhead"){
-    //update i1p
-    //sort by r_idx for i1p update
-    sort(vec_swap_selct.begin(),vec_swap_selct.end(),less_than_Idx_Swap()); 
+    //update idx_out_end
+    //sort by r_idx for idx_out_end update
+    sort(vec_swap_selct.begin(),vec_swap_selct.end(),sort_by_idx_ascending_swap()); 
     for (int i = 0; i<vec_swap_selct.size(); i++){
       auto itm = vec_swap_selct[i];
-      int readyIdx = 0;
-      if (itm.cat == "A1") { readyIdx = itm.r_idx; }
-      if (itm.cat == "A2") { readyIdx = itm.r_idx + data_buffer; }
-      if (itm.cat == "A3") { readyIdx = itm.r_idx + mutable_data_buffer; }
+      int ready_idx = 0;
+      if (itm.cat == "A1") { ready_idx = itm.r_idx; }
+      if (itm.cat == "A2") { ready_idx = itm.r_idx + data_buffer; }
+      if (itm.cat == "A3") { ready_idx = itm.r_idx + mutable_data_buffer; }
 
       if (i > 0){
-        readyIdx = std::max(readyIdx,vec_swap_selct[i-1].i1p);
+        ready_idx = std::max(ready_idx,vec_swap_selct[i-1].idx_out_end);
       }
-      itm.i1 = readyIdx;
-      itm.t1 = vec_run[readyIdx].t;
-      itm.t1p = itm.t1 + SwapOutTime(itm.size);
-      while (itm.t1p > vec_run[readyIdx].t){
-        readyIdx++;
+      itm.idx_out_start = ready_idx;
+      itm.t_out_start = vec_run[ready_idx].t;
+      itm.t_out_end = itm.t_out_start + SwapOutTime(itm.size);
+      while (itm.t_out_end > vec_run[ready_idx].t){
+        ready_idx++;
       }
-      itm.i1p = readyIdx;
+      itm.idx_out_end = ready_idx;
       vec_swap_selct[i] = itm;
     }
-    //update i2p
-    sort(vec_swap_selct.begin(),vec_swap_selct.end(),less_than_Idx_Swap_rvs());
+    //update idx_in_start
+    sort(vec_swap_selct.begin(),vec_swap_selct.end(),sort_by_idx_descending_swap());
     for (int i =0; i<vec_swap_selct.size(); i++){
       auto itm = vec_swap_selct[i];
-      int needIdx = itm.d_idx;
-      if (i > 0){ needIdx = std::min(needIdx,vec_swap_selct[i-1].i2p); }
-      itm.i2 = needIdx;
-      double prepareTime = vec_run[needIdx].t - SwapInTime(itm.size);
-      while (prepareTime < vec_run[needIdx].t){
-        needIdx--;
+      int need_idx = itm.d_idx;
+      if (i > 0){ need_idx = std::min(need_idx,vec_swap_selct[i-1].idx_in_start); }
+      itm.idx_in_end = need_idx;
+      double prepareTime = vec_run[need_idx].t - SwapInTime(itm.size);
+      while (prepareTime < vec_run[need_idx].t){
+        need_idx--;
       }
-      itm.i2p = needIdx;
-      itm.t2p = prepareTime;
+      itm.idx_in_start = need_idx;
+      itm.t_in_start = prepareTime;
       vec_swap_selct[i] = itm;
-      load_update(vec_load_temp,itm.i1p,itm.i2p+1,-1,itm.size,maxLen); //TODO(junzhe) range, right boundary
+      UpdateLoad(vec_load_temp,itm.idx_out_end,itm.idx_in_start+1,-1,itm.size,iteration_length);
     }
 
   }
@@ -548,365 +532,219 @@ void SwapGPU::swap_sched(vector<SwapBlock>&vec_swap_selct, vector<double>&vec_lo
 }
 
 
-void SwapGPU::swap_construct_tables(vector<SwapBlock>vec_swap_selct){
+void SwapGPU::BuildMetaTables(vector<SwapBlock>vec_swap_selct){
+  /*
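+  Build table_sched (index -> tuple of swap r_idx, swap direction, sync r_idx, sync direction; 0 = out, 1 = in, -1 = unset) and table_meta (r_idx -> BlockMeta with a pinned host buffer) from the selected swap blocks.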
+  */
   cudaStream_t stream1;
   cudaStream_t stream2;
-  cout<<"---------------print all 1, 1', 2', 2-----------"<<endl;
-  sort(vec_swap_selct.begin(),vec_swap_selct.end(),less_than_Idx_Swap()); 
-  //for each swap select, make Table_sched and Table_meta
+  sort(vec_swap_selct.begin(),vec_swap_selct.end(),sort_by_idx_ascending_swap()); 
+  //for each swap select, make table_sched and table_meta
   // for (int i = static_cast<int>(vec_swap_selct.size()-1);i>=0; i--){
   for (int i =0; i<vec_swap_selct.size(); i++){
     auto itm = vec_swap_selct[i];
-    // if (itm.r_idx >= 0){
-    //TODO(junzhe) for time being, remove negative r_idx itms.
-      cout<<itm.r_idx<<" || "<<itm.i1<<" "<<itm.i1p<<" "<<itm.i2p<<" "<<itm.i2<<endl;
-      //i1 swap
-      if (Table_sched.find(itm.i1) == Table_sched.end()){
-        Table_sched[itm.i1] = std::make_tuple(itm.r_idx,0,-1,-1);
-      } else {
-        std::get<0>(Table_sched.find(itm.i1)->second) = itm.r_idx;
-        std::get<1>(Table_sched.find(itm.i1)->second) = 0;
-      }
-      //i2p swap
-      if (Table_sched.find(itm.i2p) == Table_sched.end()){
-        Table_sched[itm.i2p] = std::make_tuple(itm.r_idx,1,-1,-1);      
-      } else {
-        std::get<0>(Table_sched.find(itm.i2p)->second) = itm.r_idx;
-        std::get<1>(Table_sched.find(itm.i2p)->second) = 1;
-      }
-      // i1p sync
-      if (Table_sched.find(itm.i1p) == Table_sched.end()){
-        Table_sched[itm.i1p] = std::make_tuple(-1,-1,itm.r_idx,0);
-      } else {
-        std::get<2>(Table_sched.find(itm.i1p)->second) = itm.r_idx;
-        std::get<3>(Table_sched.find(itm.i1p)->second) = 0; 
-      }
-      //i2 sync
-      if (Table_sched.find(itm.i2) == Table_sched.end()){
-        Table_sched[itm.i2] = std::make_tuple(-1,-1,itm.r_idx,1);
-      } else {
-        std::get<2>(Table_sched.find(itm.i2)->second) = itm.r_idx;
-        std::get<3>(Table_sched.find(itm.i2)->second) = 1;
-      }
 
-      ///Make Table_meta
-      void* tempPtr = nullptr;
-      cudaMallocHost(&tempPtr,itm.size); //pinned memory.
-      BlockMeta meta;
-      meta.size = itm.size;
-      meta.cpu_ptr = tempPtr;
-      meta.out_stream = stream1;
-      meta.in_stream = stream2;
-      //meta.last_out_idx = vec_swap_selct[i].last_out_idx;
-      //meta.last_in_idx = vec_swap_selct[i].last_in_idx;
-      //meta.i2 = vec_swap_selct[i].i2;
-      Table_meta[itm.r_idx] = meta;
-    // }
-
-  }
-  cout<<"---------------print all 1, 1', 2', 2-----------DONE"<<endl;
-  cout<<"size of Table_meta: "<<Table_meta.size()<<endl;
-  cout<<"size of Table_sched =================="<<Table_sched.size()<<endl;
-  cout<<"print Table_sched, idx, r_idx, sync, direction"<<endl;
-  for (int i = -500; i<maxLen; i++){
-    if (!(Table_sched.find(i) == Table_sched.end())){
-      cout<<i<<"-->";
-      cout<<std::get<0>(Table_sched.find(i)->second)<<" ";
-      cout<<std::get<1>(Table_sched.find(i)->second)<<" ";
-      cout<<std::get<2>(Table_sched.find(i)->second)<<" ";
-      cout<<std::get<3>(Table_sched.find(i)->second)<<endl;
+    if (table_sched.find(itm.idx_out_start) == table_sched.end()){
+      table_sched[itm.idx_out_start] = std::make_tuple(itm.r_idx,0,-1,-1);
+    } else {
+      std::get<0>(table_sched.find(itm.idx_out_start)->second) = itm.r_idx;
+      std::get<1>(table_sched.find(itm.idx_out_start)->second) = 0;
     }
+    //idx_in_start swap
+    if (table_sched.find(itm.idx_in_start) == table_sched.end()){
+      table_sched[itm.idx_in_start] = std::make_tuple(itm.r_idx,1,-1,-1);      
+    } else {
+      std::get<0>(table_sched.find(itm.idx_in_start)->second) = itm.r_idx;
+      std::get<1>(table_sched.find(itm.idx_in_start)->second) = 1;
+    }
+    // idx_out_end sync
+    if (table_sched.find(itm.idx_out_end) == table_sched.end()){
+      table_sched[itm.idx_out_end] = std::make_tuple(-1,-1,itm.r_idx,0);
+    } else {
+      std::get<2>(table_sched.find(itm.idx_out_end)->second) = itm.r_idx;
+      std::get<3>(table_sched.find(itm.idx_out_end)->second) = 0; 
+    }
+    //idx_in_end sync
+    if (table_sched.find(itm.idx_in_end) == table_sched.end()){
+      table_sched[itm.idx_in_end] = std::make_tuple(-1,-1,itm.r_idx,1);
+    } else {
+      std::get<2>(table_sched.find(itm.idx_in_end)->second) = itm.r_idx;
+      std::get<3>(table_sched.find(itm.idx_in_end)->second) = 1;
+    }
+
+    ///Make table_meta
+    void* temp_ptr = nullptr;
+    cudaMallocHost(&temp_ptr,itm.size); //pinned memory.
+    BlockMeta meta;
+    meta.size = itm.size;
+    meta.cpu_ptr = temp_ptr;
+    meta.out_stream = stream1;
+    meta.in_stream = stream2;
+    table_meta[itm.r_idx] = meta;
   }
 
 }
 
-void SwapGPU::swap_update_tables(Block* tempBlock_){
-  // update Table_meta's block_ and data_; update once atfer swap test is passed.
-  // enable to update negative r_idx. 
-  // it's safe in below procedure, as r_gc and r_gc_n should never be the same.
-  if (testFlag == 1) {
+void SwapGPU::UpdateMetaTables(Block* block_ptr){
+  /*
+  Update block_ and data_ of the matching table_meta entries; this only happens after the swap test has passed (past_test_flag == 1).
+  Entries keyed by a negative r_idx are updated as well.
+  Updating both is safe because r_global_index and relative_counter (r_global_index - iteration_length) are never equal.
+  */
+
+  if (past_test_flag == 1) {
     //update positive r_idx
-    int r_gc = (gc-location)%maxLen;
-    if (!(Table_meta.find(r_gc)==Table_meta.end())){
-      //cout<<"r_gc, gc and size ot Table_meta "<<r_gc<<' '<<gc<<" "<<Table_meta.size()<<endl;
-      //TODO(junzhe) verify the length change, if go in, value update
-      // cout<<"To update Block_ at "<<r_gc<<' '<<gc<<' '<<tempBlock_<<' '<<tempBlock_->get_data()<<endl;
-      Table_meta.find(r_gc)->second.block_ = tempBlock_;
-      Table_meta.find(r_gc)->second.data_ = tempBlock_->get_data();
+    int r_global_index = (global_index-location_of_2nd_iteration)%iteration_length;
+    if (!(table_meta.find(r_global_index)==table_meta.end())){
+     table_meta.find(r_global_index)->second.block_ = block_ptr;
+      table_meta.find(r_global_index)->second.data_ = block_ptr->get_data();
     }
 
     //update negative r_idx
-    int r_gc_n = r_gc - maxLen;
-    if (!(Table_meta.find(r_gc_n)==Table_meta.end())){
-      //cout<<"r_gc, gc and size ot Table_meta "<<r_gc<<' '<<gc<<" "<<Table_meta.size()<<endl;
-      //TODO(junzhe) verify the length change, if go in, value update
-      // cout<<"To update Block_ at "<<r_gc<<' '<<gc<<' '<<tempBlock_<<' '<<tempBlock_->get_data()<<endl;
-      Table_meta.find(r_gc_n)->second.block_ = tempBlock_;
-      Table_meta.find(r_gc_n)->second.data_ = tempBlock_->get_data();
+    int relative_counter = r_global_index - iteration_length;
+    if (!(table_meta.find(relative_counter)==table_meta.end())){
+      table_meta.find(relative_counter)->second.block_ = block_ptr;
+      table_meta.find(relative_counter)->second.data_ = block_ptr->get_data();
     }
   }
 
 }
 
-int SwapGPU::swap_test(vector<string>vec_block,int &maxLen, int &location){
+int SwapGPU::Detection(vector<string>vec_block,int &iteration_length, int &location_of_2nd_iteration){
+  /*
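+  Test whether the recorded operation sequence repeats, detect the iteration length and location, and return global_index_threshold (or -1 if iteration_length is below iteration_length_threshold).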
+  */
+
+  ///vec_str (vec_block) to vec_opt_info, sort by ptr and idx.
+  int idx_range = 0; 
+  vector<DeviceOptInfo> vec_opt_info = DeviceOptSeqStrToStruct(vec_block,idx_range);
 
-  ///vec_str (vec_block) to vec_pieceMsg, sort by ptr and idx.
-  int idxRange = 0;
-  vector<onePieceMsg> vec_pieceMsg = swap_strVec_2_pieceMsgVec(vec_block,idxRange);
-  cout<<"size of vec_pieceMsg & vec_block: "<<vec_pieceMsg.size()<<' '<<vec_block.size()<<endl;
   ///rep test
-  vector<size_t> vec_rep = Swap_piece2rep(vec_pieceMsg);
-  //int idxRange3=0; //rename TODO(junzhe)
-  //int maxLen=0, location =0;
-  repPatternDetector(vec_rep,maxLen,location,maxLen_threshold,gc);
-  cout<<"maxLen and location are: "<<maxLen<<' '<<location<<endl;
-  cout<<"test rep"<<endl;
-  //Note here location not exactly start of one iteration, 
+  vector<size_t> vec_rep = DeviceOptSeqRepeatableTestPreProcess(vec_opt_info);
+  RepeatableTest(vec_rep,iteration_length,location_of_2nd_iteration,iteration_length_threshold,global_index);
+
+  //Note: location_of_2nd_iteration is not exactly the start of an iteration;
   //adjust to nearly start of one by restricting "Malloc"
   int shift_counter = 0;
-  for (int i=0;i<maxLen;i++){
-    vector<string> v = swap_split(vec_block[location+i], " ");
+  for (int i=0;i<iteration_length;i++){
+    vector<string> v = SplitOptString(vec_block[location_of_2nd_iteration+i], " ");
     if (v[0]=="Malloc"){
       shift_counter = i; 
       break;
     }
   }
-  location =location+shift_counter;
-  cout<<"shift_counter is "<<shift_counter<<endl;
-  cout<<"location changed to "<<location<<endl;
+  location_of_2nd_iteration =location_of_2nd_iteration+shift_counter;
 
-  if (maxLen<maxLen_threshold) {return -1;}
+  if (iteration_length<iteration_length_threshold) {return -1;}
 
-  return gc+maxLen-(gc-location)%maxLen;
+  return global_index+iteration_length-(global_index-location_of_2nd_iteration)%iteration_length;
 } 
 
-void SwapGPU::swap_plan(){
-  cout<<":::::::::::::::::::::::::start swap_plan()"<<endl;
+void SwapGPU::Plan(){
+  /*
+  Main planning pipeline: build the candidate swap blocks, select the blocks to swap, schedule the swaps, and build the meta tables.
+  */
 
-  int idxRange = 0;
-  vector<onePieceMsg> vec_pieceMsg = swap_strVec_2_pieceMsgVec(vec_block,idxRange);
-  cout<<"size of vec_pieceMsg & vec_block: "<<vec_pieceMsg.size()<<' '<<vec_block.size()<<endl;
-  sort(vec_pieceMsg.begin(),vec_pieceMsg.end(),less_than_Idx());
+  int idx_range = 0;
+  vector<DeviceOptInfo> vec_opt_info = DeviceOptSeqStrToStruct(vec_block,idx_range);
+  sort(vec_opt_info.begin(),vec_opt_info.end(),sort_by_idx_ascending());
+  
   // scale down idx, to middle iteration.
-  tempTime_baseline = vec_pieceMsg[three_more_location].t;
-  for (int i=0; i<vec_pieceMsg.size();i++){
-    vec_pieceMsg[i].idx = vec_pieceMsg[i].idx - three_more_location - maxLen;
-    vec_pieceMsg[i].t = vec_pieceMsg[i].t - tempTime_baseline;
+  temp_time_baseline = vec_opt_info[location_of_5th_iteration].t;
+  for (int i=0; i<vec_opt_info.size();i++){
+    vec_opt_info[i].idx = vec_opt_info[i].idx - location_of_5th_iteration - iteration_length;
+    vec_opt_info[i].t = vec_opt_info[i].t - temp_time_baseline;
   }
 
   // build opsSqn, and sizeSqn
-  // cout<<"------printing sequenc--------"<<endl;
-  vector<onePieceMsg>one_itr(&vec_pieceMsg[location+4*maxLen],&vec_pieceMsg[location+5*maxLen]);
+  vector<DeviceOptInfo>one_itr(&vec_opt_info[location_of_2nd_iteration+4*iteration_length],&vec_opt_info[location_of_2nd_iteration+5*iteration_length]);
   for (int i =0; i<one_itr.size();i++){
-    opsSequence.push_back(one_itr[i].MallocFree);
-    sizeSequence.push_back(one_itr[i].size);
-    // cout<<one_itr[i].MallocFree<<' '<<one_itr[i].size<<endl;
+    operation_sequence.push_back(one_itr[i].operation_type);
+    size_sequence.push_back(one_itr[i].size);
   }
-  //3 iterations of vec_run and vec_load, maxIdx and maxLoad
-  vector<onePieceMsg>temp_vec_run(&vec_pieceMsg[location+3*maxLen],&vec_pieceMsg[location+6*maxLen]);
+  
+  //3 iterations of vec_run and vec_load, max_idx and max_load
+  vector<DeviceOptInfo>temp_vec_run(&vec_opt_info[location_of_2nd_iteration+3*iteration_length],&vec_opt_info[location_of_2nd_iteration+6*iteration_length]);
   vec_run = temp_vec_run;
-  fstream file_vec_run("vec_run36.csv", ios::in|ios::out|ios::app);
-  for (int i =0; i<vec_run.size();i++){
-    //file_vec_run<<vec_run[i].idx<<' '<<vec_run[i].MallocFree<<' '<<vec_run[i].t<<' '<<vec_run[i].t-tempTime2<<endl;
-    // file_vec_run<<i<<' '<<vec_run[i].t<<' '<<vec_run[i].t-tempTime2<<endl;
-    // tempTime2 = vec_run[i].t;
-    file_vec_run<<i-maxLen<<' '<<vec_run[i].MallocFree<<' '<<vec_run[i].size<<' '<<vec_run[i].t<<endl;
-  }
 
-  vector<onePieceMsg>temp_vec_run2(&vec_pieceMsg[location],&vec_pieceMsg[location+3*maxLen]);
+  vector<DeviceOptInfo>temp_vec_run2(&vec_opt_info[location_of_2nd_iteration],&vec_opt_info[location_of_2nd_iteration+3*iteration_length]);
   auto vec_run2 = temp_vec_run2;
-  fstream file_vec_run2("vec_run03.csv", ios::in|ios::out|ios::app);
-  for (int i =0; i<vec_run2.size();i++){
-    //file_vec_run<<vec_run[i].idx<<' '<<vec_run[i].MallocFree<<' '<<vec_run[i].t<<' '<<vec_run[i].t-tempTime2<<endl;
-    // file_vec_run2<<i<<' '<<vec_run2[i].t<<' '<<vec_run2[i].t-tempTime2<<endl;
-    // tempTime2 = vec_run[i].t;
-    file_vec_run2<<i<<' '<<vec_run2[i].MallocFree<<' '<<vec_run2[i].size<<endl;
-  }
 
-  vector<double>vec_load(&global_load[location],&global_load[location+3*maxLen]);
+
+  vector<double>vec_load(&global_load[location_of_2nd_iteration],&global_load[location_of_2nd_iteration+3*iteration_length]);
   origin_load = vec_load;
-  //3 iterations
-  fstream file_load_origin("load_origin03.csv", ios::in|ios::out|ios::app);
-  for (int i=0; i<origin_load.size(); i++){
-    file_load_origin<<i<<' '<<origin_load[i]<<endl;
-  }
-  
-  // vector<double>vec_load2(&global_load[location+3*maxLen],&global_load[location+6*maxLen]);
-  // auto origin_load2 = vec_load2;
-  // //3 iterations
-  // fstream file_load_origin2("load_origin36.csv", ios::in|ios::out|ios::app);
-  // for (int i=0; i<origin_load2.size(); i++){
-  //   file_load_origin2<<i<<" "<<origin_load2[i]<<endl;
-  // }
-
-  // //one iteration.
-  // fstream file_load_current("load_current.csv", ios::in|ios::out|ios::app);
-  // for (int i=0; i<maxLen; i++){
-  //   file_load_current<<vec_load[i]<<endl;
-  // }
-  cout<<"build from scratch:::::::::::::::;"<<endl;
-  auto max_current = load_peak(vec_load,maxLen);
-  maxLoad = max_current.first;
-  maxIdx = max_current.second;
-  cout<<"------------------print max_load: (current) "<<maxLoad<<" "<<maxIdx<<endl;
 
+  auto max_current = GetLoadPeak(vec_load,iteration_length);
+  max_load = max_current.first;
+  max_idx = max_current.second;
 
   //sort by ptr & idx, sorting the duplicate
   auto vec_run_dup = vec_run;
-  sort(vec_run_dup.begin(),vec_run_dup.end(),less_than_ptrIdx());
+  sort(vec_run_dup.begin(),vec_run_dup.end(),sort_by_ptr_idx_ascending());
+  
   ///formulate swappable items.
-  cout<<"==============================print swappable items, with maxIdx "<<maxIdx<<endl;
   vector<SwapBlock>vec_swap;
-  // size_t load_swap = 0;
+
   for (int i =1; i<vec_run_dup.size(); i++){
-    //SwapBlock(string p, size_t s, int i1, int i2, double t1, double t2): 
-    //ptr(p), size(s), r_idx(i1),d_idx(i2),r_time(t1), d_time(t2) {}
-    if ((vec_run_dup[i].size >= smallest_block) && (vec_run_dup[i-1].idx<maxIdx) && (vec_run_dup[i].idx>maxIdx) 
+    //SwapBlock(ptr, size, r_idx, d_idx, r_time, d_time): r_idx and r_time come from the previous operation
+    //on the same ptr (vec_run_dup[i-1]); d_idx and d_time come from the current operation (vec_run_dup[i]).
+    if ((vec_run_dup[i].size >= smallest_block) && (vec_run_dup[i-1].idx<max_idx) && (vec_run_dup[i].idx>max_idx) 
       && (vec_run_dup[i-1].ptr ==vec_run_dup[i].ptr) 
-      && ((vec_run_dup[i-1].MallocFree==3) or (vec_run_dup[i-1].MallocFree==2) or (vec_run_dup[i-1].MallocFree==4)))
+      && ((vec_run_dup[i-1].operation_type==3) or (vec_run_dup[i-1].operation_type==2) or (vec_run_dup[i-1].operation_type==4)))
     {
       SwapBlock itm(vec_run_dup[i].ptr, vec_run_dup[i].size, vec_run_dup[i-1].idx, vec_run_dup[i].idx, vec_run_dup[i-1].t, vec_run_dup[i].t);
-      itm.dto = itm.d_time-itm.r_time;
-      itm.dt = itm.d_time-itm.r_time-SwapOutTime(itm.size)-SwapOutTime(itm.size);
-      if (itm.dt>=0){
-        itm.pri = itm.dt * itm.size;
+      itm.DOA_origin = itm.d_time-itm.r_time;
+      itm.DOA = itm.d_time-itm.r_time-SwapOutTime(itm.size)-SwapOutTime(itm.size);
+      if (itm.DOA>=0){
+        itm.AOA = itm.DOA * itm.size;
       } else {
-        itm.pri = itm.dt * 1/itm.size;
+        itm.AOA = itm.DOA * 1/itm.size;
       }
       //cat A
-      if (vec_run_dup[i-1].MallocFree == 3){ itm.cat = "A1"; itm.r_idx_ready = itm.r_idx; } 
-      if (vec_run_dup[i-1].MallocFree == 2){ itm.cat = "A2"; itm.r_idx_ready = itm.r_idx + data_buffer;} 
-      if (vec_run_dup[i-1].MallocFree == 4){ itm.cat = "A3"; itm.r_idx_ready = itm.r_idx + mutable_data_buffer;} 
+      if (vec_run_dup[i-1].operation_type == 3){ itm.cat = "A1"; itm.r_idx_ready = itm.r_idx; } 
+      if (vec_run_dup[i-1].operation_type == 2){ itm.cat = "A2"; itm.r_idx_ready = itm.r_idx + data_buffer;} 
+      if (vec_run_dup[i-1].operation_type == 4){ itm.cat = "A3"; itm.r_idx_ready = itm.r_idx + mutable_data_buffer;} 
 
       vec_swap.push_back(itm);
-      // load_swap+=itm.size;
-      cout<<itm.size<<" ";
-      cout<<"Items Swappable: (r_idx, d_idx, cat, MB, dt/us, PS) || "<<itm.r_idx<<' '<<itm.d_idx;
-      cout<<" ||  "<<itm.cat<<"  || "<<(float)(itm.size)/(float)(1024*1024);
-      cout<<' || '<<itm.dt/1000<<' '<<itm.pri<<endl;
     } 
   }
-  cout<<"size vec_swap: "<<vec_swap.size()<<endl;
 
-  ///load ideal, swap all vec_swap, lest possible memory by one-swap
-  auto vec_load_ideal = swap_load_ideal(vec_load,vec_swap);
+  ///ideal load: swap every block in vec_swap, giving the least possible memory with one swap per block; for data collection only.
+  auto vec_load_ideal = GetIdealLoad(vec_load,vec_swap);
   fstream file_load_ideal("load_ideal.csv", ios::in|ios::out|ios::app);
-  for (int i=maxLen; i<maxLen*2; i++){
+  for (int i=iteration_length; i<iteration_length*2; i++){
     file_load_ideal<<vec_load_ideal[i]<<endl;
   }
 
-  auto max_ideal = load_peak(vec_load_ideal,maxLen);
-  size_t maxLoad_ideal = max_ideal.first;
-  int maxIdx_ideal = max_ideal.second;
-  cout<<"------------------print max_load: (ideal) "<<maxLoad_ideal<<" "<<maxIdx_ideal<<endl;
-  //maxLoad_ideal = 400;
-  /// select till maxLoad_ideal, dto
-  // auto vec_swap_dto = swap_select(vec_swap,maxLoad,maxLoad_ideal,"dto");
-  // cout<<"size of vec_swap_dto: "<<vec_swap_dto.size()<<endl;
-  // auto vec_load_dto_ideal = swap_load_ideal(vec_load,vec_swap_dto);
-  //   fstream file_load_dto_ideal("load_dto_ideal.csv", ios::in|ios::out|ios::app);
-  // for (int i=maxLen; i<maxLen*2; i++){
-  //   file_load_dto_ideal<<vec_load_dto_ideal[i]<<endl;
-  // }
-  // auto tempMax_ = load_peak(vec_load_dto_ideal,maxLen);
-  // cout<<"------------------print max_load: (dto ideal) "<<tempMax_.first<<" "<<tempMax_.second<<endl;
-
-  // /// select till maxLoad_ideal, pri
-  // auto vec_swap_pri = swap_select(vec_swap,maxLoad,maxLoad_ideal,"pri");
-  // cout<<"size of vec_swap_pri: "<<vec_swap_dto.size()<<endl;
-  // auto vec_load_pri_ideal = swap_load_ideal(vec_load,vec_swap_pri);
-  //   fstream file_load_pri_ideal("load_pri_ideal.csv", ios::in|ios::out|ios::app);
-  // for (int i=maxLen; i<maxLen*2; i++){
-  //   file_load_pri_ideal<<vec_load_pri_ideal[i]<<endl;
-  // }
-  // tempMax_ = load_peak(vec_load_pri_ideal,maxLen);
-  // cout<<"------------------print max_load: (pri ideal) "<<tempMax_.first<<" "<<tempMax_.second<<endl;
-
-  /// select till maxLoad_ideal, wdto
-  auto tempLoad = origin_load;
-  auto memLimit_wdto = 550<<20;
-  //TODO(junzhe) memLimit = maxLoad_ideal*1.4
-  auto vec_swap_wdto = swap_select(vec_swap,tempLoad,memLimit_wdto,"wdto");
-  // vec_swap_selct_global = vec_swap_wdto;
-  cout<<"size of vec_swap_wdto: "<<vec_swap_wdto.size()<<endl;
-  // auto vec_load_wdto_ideal = swap_load_ideal(vec_load,vec_swap_wdto);
-    // fstream file_load_wdto_ideal("load_wdto_ideal.csv", ios::in|ios::out|ios::app);
-  // for (int i=maxLen; i<maxLen*2; i++){
-    // file_load_wdto_ideal<<vec_load_wdto_ideal[i]<<endl;
-  // }
-  // auto tempMax_ = load_peak(vec_load_wdto_ideal,maxLen);
-  // cout<<"------------------print max_load: (wdto ideal) "<<tempMax_.first<<" "<<tempMax_.second<<endl;
-
-  /// load_1 no overhead, based on pri
-  //auto vec_swap_pri = vec_swap_pri;
-  // auto vec_load_pri = origin_load;
-  // auto vec_load_dto = origin_load;
-  auto vec_load_wdto = origin_load;
+  auto max_ideal = GetLoadPeak(vec_load_ideal,iteration_length);
+  size_t max_load_ideal = max_ideal.first;
+  int max_idx_ideal = max_ideal.second;
+
+  /// select blocks by majority voting; the selection mode and the memory limit can be specified here
+  auto temp_load = origin_load;
+  auto mem_limit_majority_voting = 550<<20;
+  auto vec_swap_majority_voting = SelectBlock(vec_swap,temp_load,mem_limit_majority_voting,"majority_voting");
+  // vec_swap_selct_global = vec_swap_majority_voting;
+
+  auto vec_load_WDOA = origin_load;
   string mode = "stick-to-limit";
-  // double overhead_pri = 0;
-  // swap_sched(vec_swap_pri, vec_load_pri,overhead_pri,450<<20, mode);
-  
-  // double overhead_dto = 0;
-  // swap_sched(vec_swap_dto, vec_load_dto,overhead_dto,450<<20,mode);
-  
-  double overhead_wdto = 0;
-  swap_sched(vec_swap_wdto, vec_load_wdto,overhead_wdto,memLimit_wdto,mode);
-
-  swap_construct_tables(vec_swap_wdto);
-
-
-  // fstream file_block10("load_1_pri.csv", ios::in|ios::out|ios::app);
-  // for (int i=maxLen; i<maxLen*2; i++){
-  //   file_block10<<vec_load_pri[i]<<endl;
-  // }
-  // fstream file_block11("load_1_dto.csv", ios::in|ios::out|ios::app);
-  // for (int i=maxLen; i<maxLen*2; i++){
-  //   file_block11<<vec_load_dto[i]<<endl;
-  // }
-  // fstream file_block12("load_1_wdto.csv", ios::in|ios::out|ios::app);
-  // for (int i=maxLen; i<maxLen*2; i++){
-  //   file_block12<<vec_load_wdto[i]<<endl;
-  // }
-  //TODO(junzhe) below verification to be done later.
-  // auto max_1 = load_peak(vec_load_pri,maxLen);
-  // size_t maxLoad_1 = max_1.first;
-  // int maxIdx_1 = max_1.second;
-  // cout<<"------------------print max_load: (1) "<<maxLoad_1<<" "<<maxIdx_1<<endl;
-  // //change back order by Idx.
-  // sort(vec_run.begin(),vec_run.end(),less_than_Idx());
-  // cout<<"done with swap_plan..."<<endl;
-  // cout<<"load 2 overhead pri: "<<(float)(overhead_pri)/(float)(1000000)<<endl;
-  // cout<<"load 2 overhead dto: "<<(float)(overhead_dto)/(float)(1000000)<<endl;
-  // cout<<"load 2 overhead wdto: "<<(float)(overhead_wdto)/(float)(1000000)<<endl;
-  // cout<<"total_swapOutTime: "<<(float)(total_swapOutTime)/(float)(3000000)<<endl;
-  // cout<<"total_swapInTime: "<<(float)(total_swapInTime)/(float)(3000000)<<endl;
-  // auto t1 = vec_run[0].t;
-  // auto t2 = vec_run[maxLen].t;
-  // auto t3 = vec_run[maxLen*2].t;
-  // auto t4 = vec_run[maxLen*3-1].t;
-  // cout<<"iteration time spent: "<<(float)(t2-t1)/(float)(1000000)<<" "<<(float)(t3-t2)/(float)(1000000)<<" "<<(float)(t4-t3)/(float)(1000000)<<endl;
-  // fstream file_time("itr_time.csv", ios::in|ios::out|ios::app);
-  // file_time<<"iteration time spent: "<<(float)(t2-t1)/(float)(1000000)<<" "<<(float)(t3-t2)/(float)(1000000)<<" "<<(float)(t4-t3)/(float)(1000000)<<endl;
-  // file_time<<"iteration time spent: "<<t2-t1<<" "<<t3-t2<<" "<<t4-t3<<endl;
-  // file_time<<"idx "<<0<<" "<<maxLen<<" "<<maxLen*2<<" "<<maxLen*3-1<<endl;
-  // file_time<<"value "<<t1<<" "<<t2<<" "<<t3<<" "<<t4<<endl;
-}
 
+  double overhead_WDOA = 0;
+  Scheduling(vec_swap_majority_voting, vec_load_WDOA,overhead_WDOA,mem_limit_majority_voting,mode);
 
+  BuildMetaTables(vec_swap_majority_voting);
 
+}
 
 
 
 SwapGPU::~SwapGPU() {
-  //print out push-info TODO(junzhe) can remove
+  //print out push-info
   fstream file_block_full("vec_block_full.csv", ios::in|ios::out|ios::app);
   for (int i =0; i<vec_block.size();i++){
     file_block_full<<vec_block[i]<<endl;
   }
 
-  fstream file_time("itr_time.csv", ios::in|ios::out|ios::app);
-  file_time<<"=============================="<<endl;
-  //main body
   if (ctx_.cublas_handle) CUBLAS_CHECK(cublasDestroy(ctx_.cublas_handle));
   if (ctx_.curand_generator)
     CURAND_CHECK(curandDestroyGenerator(ctx_.curand_generator));
@@ -923,7 +761,6 @@ SwapGPU::SwapGPU(int id) : Device(id, kNumCudaStream) {
 
   MemPoolConf conf;
   conf.add_device(id);
-  //TODO(junzhe) note that it has been <Swap> for building SwapGPU, which doesnt matter.
   pool_ = std::make_shared<SwapPool>(conf); 
   Setup();
 
@@ -988,21 +825,21 @@ void* SwapGPU::Malloc(int size) {
     CUDA_CHECK(cudaSetDevice(id_));
     pool_->Malloc((void**)&ptr, size);
 
-    ///append vec_block_mf
-    if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter)
-      && ((gc - maxLen) >= three_more_globeCounter)){
-      string tempStr1 ="Malloc ";
+    ///append vec_block_mf: for swap & pool
+    if ((async_swap_flag == 1) && ((global_index - 4*iteration_length) < three_more_iteration_global_index_threshold)
+      && ((global_index - iteration_length) >= three_more_iteration_global_index_threshold)){
+      string temp_str1 ="Malloc ";
       stringstream strm2;
       strm2<<ptr;
-      string tempStr2 = strm2.str();
+      string temp_str2 = strm2.str();
       stringstream strm3;
       strm3<<size;
-      string tempStr3 = strm3.str();
-      string temp = tempStr1+tempStr2+" "+tempStr3;
+      string temp_str3 = strm3.str();
+      string temp = temp_str1+temp_str2+" "+temp_str3;
       vec_block_mf.push_back(temp);
     }
     //record mf semantics after swap plan done
-    if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter)){
+    if ((async_swap_flag == 1) && ((global_index - 4*iteration_length) < three_more_iteration_global_index_threshold)){
       fstream file_mf_one_itr("mf_one_itr.csv", ios::in|ios::out|ios::app);
       file_mf_one_itr<<"Malloc "<<ptr<<" "<<size;
       file_mf_one_itr<<endl;
@@ -1010,7 +847,6 @@ void* SwapGPU::Malloc(int size) {
     // TODO(wangwei) remove the memset.
     CUDA_CHECK(cudaMemset(ptr, 0, size));
   }
-  //cout<<"malloc done"<<endl;
   return ptr;
 }
 
@@ -1020,413 +856,318 @@ void SwapGPU::Free(void* ptr) {
   if (ptr != nullptr) {
     CUDA_CHECK(cudaSetDevice(id_));
     pool_->Free(ptr);
-    ///append vec_block_mf
-    if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter)
-      && ((gc - maxLen) >= three_more_globeCounter)){
-      string tempStr1 ="Free ";
+    ///append vec_block_mf: for swap & pool
+    if ((async_swap_flag == 1) && ((global_index - 4*iteration_length) < three_more_iteration_global_index_threshold)
+      && ((global_index - iteration_length) >= three_more_iteration_global_index_threshold)){
+      string temp_str1 ="Free ";
       stringstream strm2;
       strm2<<ptr;
-      string tempStr2 = strm2.str();
-      string temp = tempStr1+tempStr2;
+      string temp_str2 = strm2.str();
+      string temp = temp_str1+temp_str2;
       vec_block_mf.push_back(temp);
     }
 
-    if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter)){
+    if ((async_swap_flag == 1) && ((global_index - 4*iteration_length) < three_more_iteration_global_index_threshold)){
       fstream file_mf_one_itr("mf_one_itr.csv", ios::in|ios::out|ios::app);
       file_mf_one_itr<<"Free "<<ptr<<endl;
     }
   }
 
-  //cout<<"free done"<<endl; 
 }
 
-void SwapGPU::Test_sched_switch_swap(){
+void SwapGPU::DetectionPlan(){
   /*
-    v1: do Test_sched_switch_swap during (before) Malloc and Free.
-    swap removed to DeploySwap
-    v2: test after every index, at Append. order and index changed.
+    Run the iteration test after every index, called from Append(); the order and index changed (the test used to run before Malloc and Free).
   */
   ///test iteration
-  if (((gc+1)%(maxLen_threshold) == 0) && (asyncSwapFlag == 0) && (testFlag == 0)){
-    //TODO(junzhe) not lean, chances are globeCounter found more than 300 idx ago: redudant test.
-    cout<<"gc, GC and vec_len before test: "<<gc<<' '<<globeCounter<<' '<<vec_block.size()<<endl;
-    globeCounter = swap_test(vec_block,maxLen,location);
-    maxLen_threshold = std::max(maxLen_threshold,gc/10);
-    maxLen_threshold = std::min(2000,maxLen_threshold);
-    if (maxLen > maxLen_threshold) {
-      testFlag = 1;
-      three_more_globeCounter = globeCounter + 3*maxLen;
-      three_more_location = location + 3*maxLen;
-      cout<<"compele test-swap:::::::::::::::::::::::::::::::::::::::::::::::::"<<endl;
-      cout<<"impt numbers (maxLen, location, GC) "<<maxLen<<' '<<location<<' '<<globeCounter<<endl;
-      
+  if (((global_index+1)%(iteration_length_threshold) == 0) && (async_swap_flag == 0) && (past_test_flag == 0)){
+    global_index_threshold = Detection(vec_block,iteration_length,location_of_2nd_iteration);
+    iteration_length_threshold = std::max(iteration_length_threshold,global_index/10);
+    iteration_length_threshold = std::min(2000,iteration_length_threshold);
+    if (iteration_length > iteration_length_threshold) {
+      past_test_flag = 1;
+      three_more_iteration_global_index_threshold = global_index_threshold + 3*iteration_length;
+      location_of_5th_iteration = location_of_2nd_iteration + 3*iteration_length;      
    }
  }
  ///switch flag; next idx
- if ((gc+1) == three_more_globeCounter){
-    swap_plan();
-    asyncSwapFlag = 1;
-    // vector<double>vec_load2(&global_load[three_more_location],&global_load[three_more_location+3*maxLen]);
-    // origin_load = vec_load2;
-    // //load before swap, write in
-    // fstream file_load_origin("load_origin.csv", ios::in|ios::out|ios::app);
-    // for (int i=0; i<origin_load.size(); i++){
-    //   file_load_origin<<origin_load[i]<<endl;
-    // }
-    cout<<"switched flag for at "<<three_more_globeCounter<<endl;
-    cout<<"maxLen is "<<maxLen<<endl;
-    cout<<"globeCounter "<<globeCounter<<endl;
+ if ((global_index+1) == three_more_iteration_global_index_threshold){
+    Plan();
+    async_swap_flag = 1;
  }
 }
 
-void SwapGPU::MakeMetaTable(Block* block_,void* data_,int size){
+void SwapGPU::AppendAfterMalloc(Block* block_ptr,void* data_ptr,int size){
   /*
-  Append info right after Malloc; make block_ - data_ pair wise table.
+  Append block info right after Malloc and build the pairwise block_ptr - data_ptr table,
+  because the Block* is not available until Malloc() is done.
   */
 
   //append info
   stringstream strm1;
   strm1<<size;
-  string tempStr1 = strm1.str();
+  string temp_str1 = strm1.str();
   stringstream strm3;
-  strm3<<block_;
-  string tempStr3 = strm3.str();
+  strm3<<block_ptr;
+  string temp_str3 = strm3.str();
   stringstream strm4;
   auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
   strm4<<t2;
-  string tempStr4 = strm4.str();
-  string blockInfo ="Malloc "+tempStr3+" "+tempStr1+" "+tempStr4;
-  Append(blockInfo);
-
+  string temp_str4 = strm4.str();
+  string block_info ="Malloc "+temp_str3+" "+temp_str1+" "+temp_str4;
+  Append(block_info);
   
 }
 
 void SwapGPU::DeploySwap(){
-   ///swap and sync as per schedule.
-  int r_gc = (gc-location)%maxLen; 
-  int r_gc_n = r_gc - maxLen;
-
-  if (asyncSwapFlag == 1){
-    if ((gc < three_more_globeCounter + maxLen) && (!(Table_sched.find(r_gc_n) == Table_sched.end()))) {
-      cout<<"condition A"<<endl;
-      DeploySwap_exec(r_gc_n);
+  /*
+  swap and sync as per schedule, at every index, by calling DeploySwapExec()
+  */
+
+  int r_global_index = (global_index-location_of_2nd_iteration)%iteration_length; 
+  int r_global_index_n = r_global_index - iteration_length;
+
+  if (async_swap_flag == 1){
+    if ((global_index < three_more_iteration_global_index_threshold + iteration_length) && (!(table_sched.find(r_global_index_n) == table_sched.end()))) {
+      DeploySwapExec(r_global_index_n);
     }
-    if ((gc >= three_more_globeCounter + maxLen) && (!(Table_sched.find(r_gc_n) == Table_sched.end()))) {
-      cout<<"condition B"<<endl;
-      DeploySwap_exec(r_gc_n);
+    if ((global_index >= three_more_iteration_global_index_threshold + iteration_length) && (!(table_sched.find(r_global_index_n) == table_sched.end()))) {
+      DeploySwapExec(r_global_index_n);
     }
-    if ((gc >= three_more_globeCounter + maxLen) && (!(Table_sched.find(r_gc) == Table_sched.end()))) {
-      cout<<"condition C"<<endl;
-      DeploySwap_exec(r_gc);
+    if ((global_index >= three_more_iteration_global_index_threshold + iteration_length) && (!(table_sched.find(r_global_index) == table_sched.end()))) {
+      DeploySwapExec(r_global_index);
     }
   }
 }
 
 
-void SwapGPU::DeploySwap_exec(int r_gc){
-  cout<<"--------sched action at "<<r_gc<<endl;
-  auto swap_idx = std::get<0>(Table_sched.find(r_gc)->second);
-  auto swap_dir = std::get<1>(Table_sched.find(r_gc)->second);
-  auto sync_idx = std::get<2>(Table_sched.find(r_gc)->second);
-  auto sync_dir = std::get<3>(Table_sched.find(r_gc)->second);
+void SwapGPU::DeploySwapExec(int r_global_index){
+  // execute the swap and sync actions that table_sched holds for r_global_index
+  auto swap_idx = std::get<0>(table_sched.find(r_global_index)->second);
+  auto swap_dir = std::get<1>(table_sched.find(r_global_index)->second);
+  auto sync_idx = std::get<2>(table_sched.find(r_global_index)->second);
+  auto sync_dir = std::get<3>(table_sched.find(r_global_index)->second);
   if (swap_dir == 0){ 
-    SwapOut_idx(swap_idx); 
-    cout<<"----Swap Out "<<swap_idx<<endl;
+    SwapOut(swap_idx); 
   }
   if (swap_dir == 1){ 
-    SwapIn_idx(swap_idx); 
-    cout<<"----Swap In "<<swap_idx<<endl;
+    SwapIn(swap_idx); 
   }
-  //TODO(junzhe) verify sync what else to be done
   if (sync_dir == 0){
     ///sync swap-out, including sync, update block's data_ to nullptr, free data_, update meta.
-    auto last_meta = Table_meta.find(sync_idx)->second;
+    auto last_meta = table_meta.find(sync_idx)->second;
     auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
     cudaEventSynchronize(last_meta.in_event);
     auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
 
-    Table_not_at_device[last_meta.block_] = sync_idx; //TODO(junzhe) double check if needed.
+    table_not_at_device[last_meta.block_] = sync_idx;
 
     last_meta.block_->update_data(nullptr);
-    // cout<<"to free data_"<<last_meta.data_<<endl;
     pool_->Free(last_meta.data_);
     ///append vec_block_mf
-    if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter)
-      && ((gc - maxLen) >= three_more_globeCounter)){
-      string tempStr1 ="Free ";
+    if ((async_swap_flag == 1) && ((global_index - 4*iteration_length) < three_more_iteration_global_index_threshold)
+      && ((global_index - iteration_length) >= three_more_iteration_global_index_threshold)){
+      string temp_str1 ="Free ";
       stringstream strm2;
       strm2<<last_meta.data_;
-      string tempStr2 = strm2.str();
-      string temp = tempStr1+tempStr2;
+      string temp_str2 = strm2.str();
+      string temp = temp_str1+temp_str2;
       vec_block_mf.push_back(temp);
     }
 
-    if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter)){
+    if ((async_swap_flag == 1) && ((global_index - 4*iteration_length) < three_more_iteration_global_index_threshold)){
       fstream file_mf_one_itr("mf_one_itr.csv", ios::in|ios::out|ios::app);
       file_mf_one_itr<<"Free "<<last_meta.data_<<" SwapOut(Sync)"<<endl;
     }
-    last_meta.data_ = nullptr; //not really needed TODO(junzhe)
-    cout<<"----sync out "<<sync_idx<<endl;
-    Table_meta.find(sync_idx)->second = last_meta;
+    last_meta.data_ = nullptr;
+    table_meta.find(sync_idx)->second = last_meta;
   }
   if (sync_dir == 1){
     ///sync swap-in, including sync, update block's data_ to new gpu address, update meta.
-    //if (!(Table_not_at_device.find(last_meta.block_)==Table_not_at_device.end())){ TODO(junzhe)
-    auto last_meta = Table_meta.find(sync_idx)->second;
+    auto last_meta = table_meta.find(sync_idx)->second;
     auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
     cudaEventSynchronize(last_meta.out_event);
     auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
-    Table_not_at_device.erase(last_meta.block_);
+    table_not_at_device.erase(last_meta.block_);
     last_meta.block_->update_data(last_meta.data_);
-    cout<<"----sync in "<<sync_idx<<endl;
-    Table_meta.find(sync_idx)->second = last_meta;
+    table_meta.find(sync_idx)->second = last_meta;
   }
 }
 
-void SwapGPU::Append(string blockInfo){
+void SwapGPU::Append(string block_info){
+  /*
+  Append the block info of an operation after each operation.
+  In the meantime, execute the following steps:
+    insert the size for non-Malloc operations
+    update the global memory load
+    switch the swap flag on and off
+    update table_meta and table_sched
+    deploy swap at every index
+    run the detection test (moved from the start of Malloc/Free to the end of Append; only global_index+1 changed)
+    call PoolOpt to Construct Pool
+  */
 
-  vector<string> v = swap_split(blockInfo, " ");
-  void* tempPtr;
+  vector<string> v = SplitOptString(block_info, " ");
+  void* temp_ptr;
   stringstream convert(v[1]);
-  convert>>tempPtr;
-  auto tempBlock_ = static_cast<Block*>(tempPtr);
+  convert>>temp_ptr;
+  auto block_ptr = static_cast<Block*>(temp_ptr);
   
   // insert size, malloc : flag, block_, size, t; others: insert size t.
   if (v.size() != 4) {
     stringstream strm1;
-    strm1<<tempBlock_->size();
-    string tempStr1 = strm1.str();
-    blockInfo = v[0] + ' ' + v[1] + ' ' + tempStr1 + ' ' + v[2];
+    strm1<<block_ptr->size();
+    string temp_str1 = strm1.str();
+    block_info = v[0] + ' ' + v[1] + ' ' + temp_str1 + ' ' + v[2];
   }
 
   // update global load
-  if (maxLen < maxLen_threshold){
+  if (iteration_length < iteration_length_threshold){
     if (v[0] == "Malloc"){
       if (global_load.size()>0){
-        global_load.push_back(global_load[global_load.size()-1]+tempBlock_->size());
+        global_load.push_back(global_load[global_load.size()-1]+block_ptr->size());
       } else {
-        global_load.push_back(tempBlock_->size());
+        global_load.push_back(block_ptr->size());
       }
     } else if (v[0] == "Free"){
-      global_load.push_back(global_load[global_load.size()-1]-tempBlock_->size());
+      global_load.push_back(global_load[global_load.size()-1]-block_ptr->size());
     } else {
       global_load.push_back(global_load[global_load.size()-1]);
     }
   }
 
   //append into vec_block
-  vec_block.push_back(blockInfo);
-
-
-  //cout<<blockInfo<<endl;
-  //cout<<tempBlock_->size()<<endl;
-  //cout<<"load: "<<global_load[global_load.size()-1]<<" len of blockInfo and global_load "<<vec_block.size()<<' '<<global_load.size()<<endl;
-  //std::this_thread::sleep_for(std::chrono::milliseconds(2000));
-  
-  // if (asyncSwapFlag == 1){
-  //   vec_block_fresh.push_back(blockInfo);
-  // }
-  // if ((maxLen>maxLen_threshold)&&((gc-globeCounter+1)==3*maxLen)){
-  //   fstream file_block_fresh("vec_block_fresh.csv", ios::in|ios::out|ios::app);
-  //   for (int i =0; i<vec_block_fresh.size();i++){
-  //     file_block_fresh<<vec_block_fresh[i]<<endl;
-  //   }
-  // }
-  // fstream file_block5("append.csv", ios::in|ios::out|ios::app);
-  // file_block5<<gc<<' '<<blockInfo<<' '<<(gc-1247)%612<<endl;
-
-  //print time duration per iteration
-  if ((maxLen>maxLen_threshold) && ((gc-location)%(maxLen) == 0)){
-    if (tempTime != 0){
-      fstream file_time("itr_time.csv", ios::in|ios::out|ios::app);
-      auto t_now = (std::chrono::system_clock::now()).time_since_epoch().count();
-      file_time<<(float)(t_now - tempTime)/(float)(1000000)<<endl;
-      
+  vec_block.push_back(block_info);
+
+  //change swap flag on and off
+  if (async_swap_flag == 1){
+    int r_global_index = (global_index-location_of_2nd_iteration)%iteration_length;
+    if (block_ptr->size() != size_sequence[r_global_index]){
+      async_swap_flag = 0;
+      cout<<"!!!! async_swap_flag changed back to 0"<<endl;
     }
-    tempTime = (std::chrono::system_clock::now()).time_since_epoch().count();
   }
 
-  //check if last iteration, TODO(junzhe) further verify with MallocFree.
-  if (asyncSwapFlag == 1){
-    int r_gc = (gc-location)%maxLen;
-    if (tempBlock_->size() != sizeSequence[r_gc]){
-      asyncSwapFlag = 0;
-      cout<<"!!!! asyncSwapFlag changed back to 0"<<endl;
-    }
-  }
-
-  //update Table_meta
-  swap_update_tables(tempBlock_);
+  //update table_meta and table_sched
+  UpdateMetaTables(block_ptr);
 
   //deploy swap at every index.
   DeploySwap();
 
-  //test moved from start of malloc/free to end of append, only gc+1 changed
-  Test_sched_switch_swap();
-  //NOTE: this gc includes read/write and AppendLayer as well, in addition to malloc/free.
-  gc++;
-  if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) == three_more_globeCounter)){
-    cout<<"==================to call PoolOpt"<<endl;
-    fstream file_mf_8910("mf_8910.csv", ios::in|ios::out|ios::app);
-    for (int i = 0; i< vec_block_mf.size();i++){
-      file_mf_8910<<vec_block_mf[i]<<endl;
-    }
-    cout<<"len of vec_block_mf: "<<vec_block_mf.size()<<endl;
-    pool_->PoolOpt(vec_block_mf);
-    cout<<"==================to call PoolOpt done"<<endl;
-  }
+  //test moved from start of malloc/free to end of append, only global_index+1 changed
+  DetectionPlan();
 
-  if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter) 
-    && ((gc - three_more_globeCounter)%maxLen == 0)){
-      fstream file_mf_one_itr("mf_one_itr.csv", ios::in|ios::out|ios::app);
-      file_mf_one_itr<<"-----new itr------"<<endl;
+  //NOTE: this global_index includes read/write and AppendLayer as well, in addition to malloc/free.
+  global_index++;
+
+  //call PoolOpt to Construct Pool
+  if ((async_swap_flag == 1) && ((global_index - 4 * iteration_length) == three_more_iteration_global_index_threshold)){
+    pool_->PoolOpt(vec_block_mf);
   }
 
 }
 
-void* SwapGPU::GetRealGpuPtr(const Block* block_){
-  // in case that block is at host memory, swapIn ad hoc.
-  auto r_idx = Table_not_at_device.find(block_)->second;
-
-  // auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
+void* SwapGPU::UpdateGpuPtr(const Block* block_ptr){
+  /*
+  If the block is not in device memory, swap it in ad hoc.
+  Used in the Block class to update the pointer once swap-in is done, when the variable has not been swapped back yet as expected.
+  */ 
+  auto r_idx = table_not_at_device.find(block_ptr)->second;
   cudaError_t err;
-  BlockMeta meta = Table_meta.find(r_idx)->second;
+  BlockMeta meta = table_meta.find(r_idx)->second;
   cudaEventCreate (&meta.in_event);
-  //cout<<"update block and data of r_idx: "<<r_idx<<' '<<meta.block_<<' '<<meta.data_<<endl;
   void* ptr = nullptr;
   pool_->Malloc((void**)&ptr, meta.size);
-  //cout<<"expected results update_data:: "<<meta.block_<<" "<<ptr<<endl;
-  //cout<<"malloc due to swapIn ("<<r_idx<<") "<<ptr<<endl;
-  //void* to_rm_ptr = meta.data_;
   meta.data_ = ptr;
-  // cout<<"right before cudaMemcpyAsync In"<<endl;
   err = cudaMemcpyAsync(meta.data_,meta.cpu_ptr,meta.size,cudaMemcpyHostToDevice,meta.in_stream);
   cudaEventRecord(meta.in_event,meta.in_stream);
-  // cout<<"right after cudaMemcpyAsync"<<endl;
-  // cout<<"To update_data swap for (In) "<<r_idx<<" "<<meta.block_<<" "<<meta.data_<<' '<<ptr<<endl;
-  //upadte meta's new gpu addr, in_event
-  
-
-  // auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
   cudaEventSynchronize(meta.out_event);
-  // auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
-  // Table_not_at_device.erase(block_);
-  // last_meta.block_->update_data(last_meta.data_);
-  // cout<<"----sync in "<<sync_idx<<endl;
-  // Table_meta.find(sync_idx)->second = last_meta;
-  Table_meta.find(r_idx)->second = meta;
-
-  // //here should be not update_data()
-  // auto reading_meta = Table_meta.find(Table_not_at_device.find(block_)->second)->second;
-  // auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
-  // cudaEventSynchronize(reading_meta.in_event);
-  // auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
-  // //cout<<"GetRealGpuPtr, overhead is: "<<t2-t1<<endl;
-  // //cout<<"To update_data swap for (In) "<<Table_not_at_device.find(block_)->second<<" "<<reading_meta.data_<<" 0"<<endl;
-  // //reading_meta.block_->update_data(reading_meta.data_);
-  // //cout<<"last_meta r_idx::::::malloc due to swapIn ( "<<Table_not_at_device.find(block_)->second<<endl;
-  // Table_not_at_device.erase(reading_meta.block_);
-  // block_->update_data(static_cast<Block*>(ptr));
-  
-  cout<<"print ptr from function GetRealGpuPtr() "<<ptr<<endl;
+  table_meta.find(r_idx)->second = meta;
 
-  return ptr; //TODO(junzhe) attention, based on no change here.
+  return ptr;
 }
 
-void SwapGPU::SwapOut_idx(const int r_idx){
-  //cout<<"doing asynchrous swapOut of r_idx: "<<r_idx<<' '<<endl;
-  auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();  
+void SwapGPU::SwapOut(const int idx){
+  /*
+  Asynchronously copy memory from GPU to CPU, and update meta.
+  */
   cudaError_t err;
-  BlockMeta meta = Table_meta.find(r_idx)->second;
+  BlockMeta meta = table_meta.find(idx)->second;
   cudaEventCreate (&meta.out_event);
-  //cout<<"right before cudaMemcpyAsync Out"<<endl;
   err = cudaMemcpyAsync(meta.cpu_ptr,meta.data_,meta.size,cudaMemcpyDeviceToHost,meta.out_stream);
   cudaEventRecord(meta.out_event,meta.out_stream);
-  //cout<<"right after cudaMemcpyAsync"<<endl;
-  auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
-  // cout<<"To update_data swap for (Out) "<<r_idx<<" "<<meta.block_<<" 0"<<endl;
-  //update meta's out_event
-  Table_meta.find(r_idx)->second = meta;
-  //cout<<"time for asynchrous: "<<t2-t1<<endl;
-  //cudaEventSynchronize(event1);
-  //auto t4 = (std::chrono::system_clock::now()).time_since_epoch().count();
-  //cout<<"time for asynchrous to complete: "<<t4-t1<<endl;
+  table_meta.find(idx)->second = meta;
 }
 
-void SwapGPU::SwapIn_idx(const int r_idx){
-  //logic: extra meta, swap, update meta in Table
-  //TODO(junzhe) to clean up free(), make it in somewhere else.
-  auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
+void SwapGPU::SwapIn(const int idx){
+  /*
+  Asynchronously copy memory from CPU to GPU, and update meta.
+  */
+
   cudaError_t err;
-  BlockMeta meta = Table_meta.find(r_idx)->second;
+  BlockMeta meta = table_meta.find(idx)->second;
   cudaEventCreate (&meta.in_event);
-  //cout<<"update block and data of r_idx: "<<r_idx<<' '<<meta.block_<<' '<<meta.data_<<endl;
   void* ptr = nullptr;
   pool_->Malloc((void**)&ptr, meta.size);
+
   ///append vec_block_mf
-  if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter)
-    && ((gc - maxLen) >= three_more_globeCounter)){
-    string tempStr1 ="Malloc ";
+  if ((async_swap_flag == 1) && ((global_index - 4*iteration_length) < three_more_iteration_global_index_threshold)
+    && ((global_index - iteration_length) >= three_more_iteration_global_index_threshold)){
+    string temp_str1 ="Malloc ";
     stringstream strm2;
     strm2<<ptr;
-    string tempStr2 = strm2.str();
+    string temp_str2 = strm2.str();
     stringstream strm3;
     strm3<<meta.size;
-    string tempStr3 = strm3.str();
-    string temp = tempStr1+tempStr2+" "+tempStr3;
+    string temp_str3 = strm3.str();
+    string temp = temp_str1+temp_str2+" "+temp_str3;
     vec_block_mf.push_back(temp);
   }
-  if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter)){
+  if ((async_swap_flag == 1) && ((global_index - 4*iteration_length) < three_more_iteration_global_index_threshold)){
     fstream file_mf_one_itr("mf_one_itr.csv", ios::in|ios::out|ios::app);
     file_mf_one_itr<<"Malloc "<<ptr<<" "<<meta.size<<" swapIn"<<endl;
   }
-  //cout<<"expected results update_data:: "<<meta.block_<<" "<<ptr<<endl;
-  //cout<<"malloc due to swapIn ("<<r_idx<<") "<<ptr<<endl;
-  //void* to_rm_ptr = meta.data_;
+
   meta.data_ = ptr;
-  // cout<<"right before cudaMemcpyAsync In"<<endl;
   err = cudaMemcpyAsync(meta.data_,meta.cpu_ptr,meta.size,cudaMemcpyHostToDevice,meta.in_stream);
   cudaEventRecord(meta.in_event,meta.in_stream);
-  // cout<<"right after cudaMemcpyAsync"<<endl;
-  // cout<<"To update_data swap for (In) "<<r_idx<<" "<<meta.block_<<" "<<meta.data_<<' '<<ptr<<endl;
-  //upadte meta's new gpu addr, in_event
-  Table_meta.find(r_idx)->second = meta;
-  //meta.block_->update_data(meta.data_); //TODO(junzhe) debug only, not the right place to update.
-  //auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
-  //cout<<"time for asynchrous: "<<t2-t1<<endl;
+  table_meta.find(idx)->second = meta;
 }
 
-void SwapGPU::SwapOut(const Block* block_){
-  if (gc < 1000 && block_->size() > 1<<20) {
+void SwapGPU::SwapOutSynchronous(const Block* block_ptr){
+  /*
+  for synchronous swap, collect speed info
+  */
+  if (global_index < 1000 && block_ptr->size() > 1<<20) {
     fstream file_block5("speed.csv", ios::in|ios::out|ios::app);
     BlockMeta meta;
     meta.data_ = meta.block_->get_data();
-    void* tempPtr = nullptr;
-    cudaMallocHost(&tempPtr,block_->size()); //pinned memory.
-    meta.cpu_ptr = tempPtr;
-    Table_block_meta[block_] = meta;
+    void* temp_ptr = nullptr;
+    cudaMallocHost(&temp_ptr,block_ptr->size()); //pinned memory.
+    meta.cpu_ptr = temp_ptr;
+    table_block_meta[block_ptr] = meta;
     auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
     cudaError_t err;
-    err = cudaMemcpy(meta.cpu_ptr, meta.data_,block_->size(),cudaMemcpyDeviceToHost);
+    err = cudaMemcpy(meta.cpu_ptr, meta.data_,block_ptr->size(),cudaMemcpyDeviceToHost);
     auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
-    file_block5<<"Out "<<block_->size()<<' '<<t2-t1<<endl;
-    cout<<"swap out done at gc: "<<gc<<endl;
+    file_block5<<"Out "<<block_ptr->size()<<' '<<t2-t1<<endl;
   }
 }
 
-void SwapGPU::SwapIn(const Block* block_){
-  if (gc < 1000 && block_->size() > 1<<20) {
+void SwapGPU::SwapInSynchronous(const Block* block_ptr){
+  /*
+  for synchronous swap, collect speed info
+  */
+  if (global_index < 1000 && block_ptr->size() > 1<<20) {
     fstream file_block5("speed.csv", ios::in|ios::out|ios::app);
-    BlockMeta meta = Table_block_meta.find(block_)->second;
+    BlockMeta meta = table_block_meta.find(block_ptr)->second;
     auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count();
     cudaError_t err;
-    err = cudaMemcpy(meta.data_, meta.cpu_ptr,block_->size(),cudaMemcpyHostToDevice);
+    err = cudaMemcpy(meta.data_, meta.cpu_ptr,block_ptr->size(),cudaMemcpyHostToDevice);
     auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count();
-    file_block5<<"In "<<block_->size()<<' '<<t2-t1<<endl;
-    cout<<"swap in done at gc: "<<gc<<endl;
+    file_block5<<"In "<<block_ptr->size()<<' '<<t2-t1<<endl;
   }
 }
 
-
 }  // namespace singa
 #endif  // USE_CUDA
\ No newline at end of file
diff --git a/src/core/memory/memory.cc b/src/core/memory/memory.cc
index 8c7bc1c057..1bdb1190a5 100644
--- a/src/core/memory/memory.cc
+++ b/src/core/memory/memory.cc
@@ -118,75 +118,55 @@ void CudaMemPool::Free(void *ptr) {
 }
 
 //for SmartMemPool
-///vertex of the graph.
-class Vertex {
-public:
-    int name;
-    size_t size;
-    int r; //arrive
-    int d; //depart
-    int crossItr =0;
-    Vertex(int,size_t,int,int);
-    pair<size_t, size_t> colorRange;
-    vector<pair<size_t, size_t>> colorOccupied;
-};
-Vertex::Vertex(int n, size_t s, int r1, int d1){
-    name =n;
-    size = s;
-    r = r1;
-    d = d1;
-}//end of class Vertex
-
 
 ///Section for structs and respective sorting function:
-// onePieceMsg_pool, onePairMsg, oneIterMsg, version 11/30 3pm
-struct onePieceMsg_pool{
+// PoolOptInfo, PoolBlockLifeTime, PoolOptSimplifiedInfo
+struct PoolOptInfo{
     /*
-     members: [ptr, size, MallocFree, idx]
+     members: [ptr, size, operation_type, idx]
      */
     string ptr;
     size_t size;
-    int MallocFree;
+    int operation_type;
     int idx;
-    onePieceMsg_pool(string p, size_t s, int M, int i):ptr(p),size(s),MallocFree(M),idx(i){}
+    PoolOptInfo(string p, size_t s, int M, int i):ptr(p),size(s),operation_type(M),idx(i){}
 };
 
 
-struct less_than_ptrIdx{
-    /*
-     sort onePieceMsg_pool by ptr and then idx.
-     */
-    inline bool operator() (const onePieceMsg_pool& struct1, const onePieceMsg_pool& struct2)
-    {
-        return ((struct1.ptr<struct2.ptr)||((struct1.ptr==struct2.ptr)&&(struct1.idx<struct2.idx)));
-    }
+struct sort_by_ptr_idx_ascending{
+  /*
+   sort PoolOptInfo by ptr and then idx.
+   */
+  inline bool operator() (const PoolOptInfo& struct1, const PoolOptInfo& struct2)
+  {
+    return ((struct1.ptr<struct2.ptr)||((struct1.ptr==struct2.ptr)&&(struct1.idx<struct2.idx)));
+  }
 };
 
 
-struct oneIterMsg{
+struct PoolOptSimplifiedInfo{
     /*
-     members: [idx, MallocFree, size_delta]
+     members: [idx, operation_type, size_delta]
      */
     size_t size_delta;// type as size_t in case size if large.
-    int MallocFree;
+    int operation_type;
     int idx;
-    oneIterMsg(size_t s, int M, int i):size_delta(s),MallocFree(M),idx(i){}
+    PoolOptSimplifiedInfo(size_t s, int M, int i):size_delta(s),operation_type(M),idx(i){}
 };
 
 
-struct less_than_iterIdx{
-    /*
-     sort oneIterMsg by Idx.
-     */
-    inline bool operator() (const oneIterMsg& struct1, const oneIterMsg& struct2)
-    {
-        return (struct1.idx<struct2.idx);
-    }
+struct sort_by_itr_idx_ascending{
+  /*
+   sort PoolOptSimplifiedInfo by Idx.
+   */
+  inline bool operator() (const PoolOptSimplifiedInfo& struct1, const PoolOptSimplifiedInfo& struct2)
+  {
+    return (struct1.idx<struct2.idx);
+  }
 };
 
 
-//TODO(junzhe) to replace vertex with onePairMsg, try combine other structs as well.
-struct onePairMsg{
+struct PoolBlockLifeTime{
     /*
      members: [name (r_idx), size, r_idx, d_idx]
      */
@@ -194,66 +174,54 @@ struct onePairMsg{
     size_t size;
     int r_idx;
     int d_idx;
-    onePairMsg(int n,size_t s, int r,int d):name(n),size(s),r_idx(r),d_idx(d){}
+    PoolBlockLifeTime(int n,size_t s, int r,int d):name(n),size(s),r_idx(r),d_idx(d){}
 };
 
 
-struct less_than_size{
-    /*
-     sort onePairMsg by descending size.
-     */
-    inline bool operator() (const onePairMsg& struct1, const onePairMsg& struct2)
-    {
-        return (struct1.size>struct2.size);
-    }
-};
-
-struct less_than_size_rIdx{
-    /*
-     sort onePairMsg by descending size and r_idx
-     */
-    inline bool operator() (const onePairMsg& struct1, const onePairMsg& struct2)
-    {
-        return ((struct1.size>struct2.size)||((struct1.size==struct2.size)&&(struct1.r_idx<struct2.r_idx)));
-    }
+struct sort_by_size_descending{
+  /*
+  sort PoolBlockLifeTime by descending size.
+  */
+  inline bool operator() (const PoolBlockLifeTime& struct1, const PoolBlockLifeTime& struct2)
+  {
+    return (struct1.size>struct2.size);
+  }
 };
 
-struct less_than_lookupIdx{
-    /*
-     sort lookUpElement by idx.
-     */
-    inline bool operator() (const lookUpElement& struct1, const lookUpElement& struct2)
-    {
-        return (struct1.r_idx<struct2.r_idx);
-    }
+struct sort_by_size_r_idx_descending{
+  /*
+  sort PoolBlockLifeTime by descending size, then by ascending r_idx
+  */
+  inline bool operator() (const PoolBlockLifeTime& struct1, const PoolBlockLifeTime& struct2)
+  {
+    return ((struct1.size>struct2.size)||((struct1.size==struct2.size)&&(struct1.r_idx<struct2.r_idx)));
+  }
 };
 
 
-/// string delimiter
-vector<string> split(string s, string delimiter) {
-    size_t pos_start = 0, pos_end, delim_len = delimiter.length();
-    string token;
-    vector<string> res;
-    while ((pos_end = s.find(delimiter, pos_start)) != string::npos) {
-        token = s.substr(pos_start, pos_end - pos_start);
-        pos_start = pos_end + delim_len;
-        res.push_back(token);
-    }
-    res.push_back(s.substr(pos_start));
-    return res;
+vector<string> SplitString(string s, string delimiter) {
+  /// split s into tokens separated by the delimiter string
+  size_t pos_start = 0, pos_end, delim_len = delimiter.length();
+  string token;
+  vector<string> res;
+  while ((pos_end = s.find(delimiter, pos_start)) != string::npos) {
+    token = s.substr(pos_start, pos_end - pos_start);
+    pos_start = pos_end + delim_len;
+    res.push_back(token);
+  }
+  res.push_back(s.substr(pos_start));
+  return res;
 }
 
-///Section of converting text file -->vector of Sring --> pieceMsg -->pairMsg -->iterMsg
-//vector of pairMsg is used in run.
-//vector of iterMsg is used in test.
 
-vector<onePieceMsg_pool> strVec_2_pieceMsgVec(vector<string> vec, int &idxRange){
+vector<PoolOptInfo> PoolOptSeqStrToStruct(vector<string> vec, int &idx_range){
     /*
-     convert vector of string into vector of onePieceMsg_pool, sorted by ptr and then idx, and update idxRange to pieceMsgVec size.
+     convert a vector of strings into a vector of PoolOptInfo,
+     sorted by ptr and then idx, and update idx_range to the size of vec_pool_opt_info.
      */
-    vector<onePieceMsg_pool>onePieceMsg_poolVec_;
+    vector<PoolOptInfo>vec_pool_opt_info;
     for (int i=0;i<vec.size();i++) {
-        vector<string> v = split(vec[i], " ");
+        vector<string> v = SplitString(vec[i], " ");
         if (v[0]=="Malloc"){
             //convert v[2] from str to size_t
             size_t result;
@@ -262,410 +230,346 @@ vector<onePieceMsg_pool> strVec_2_pieceMsgVec(vector<string> vec, int &idxRange)
                 result =-1;
                 cout<<"error for converting size from str to int."<<endl;
             }
-            onePieceMsg_pool tempMsg(v[1],result, 1, i);
-            onePieceMsg_poolVec_.push_back(tempMsg);
+            PoolOptInfo tempMsg(v[1],result, 1, i);
+            vec_pool_opt_info.push_back(tempMsg);
         }else if (v[0]=="Free"){
-            onePieceMsg_pool tempMsg(v[1],-1, -1, i);
-            onePieceMsg_poolVec_.push_back(tempMsg);
+            PoolOptInfo tempMsg(v[1],-1, -1, i);
+            vec_pool_opt_info.push_back(tempMsg);
         }else {
             cout<<"error for process the onePriceMsg."<<endl;
         }
     }
     
-    sort(onePieceMsg_poolVec_.begin(),onePieceMsg_poolVec_.end(),less_than_ptrIdx());
-    idxRange = static_cast<int>(onePieceMsg_poolVec_.size());
+    sort(vec_pool_opt_info.begin(),vec_pool_opt_info.end(),sort_by_ptr_idx_ascending());
+    idx_range = static_cast<int>(vec_pool_opt_info.size());
 
-    return onePieceMsg_poolVec_;
-}// end of strVec_2_pieceMsgVec function
+    return vec_pool_opt_info;
+}
 
 
-pair<vector<onePairMsg>,vector<onePairMsg>> pieceMsgVec_2_pairOfPairMsgVec(vector<onePieceMsg_pool>onePieceMsg_poolVec_, int idxRange){
-    /*
-     pairMsg is grouped into 1. normal blocks 2. cross-iteration blocks.
-     */
-    vector<onePairMsg>onePairMsgVec_1;
-    vector<onePairMsg>onePairMsgVec_2;
-    int i=0;
-    
-    //while loop processes a pair at each time, if got a pair.
-    while (i<(onePieceMsg_poolVec_.size()-1)){
-        //condition A: start with free. do nothing.
-        if (onePieceMsg_poolVec_[i].MallocFree==-1){
-            i+=1;
-        }
-        //condition B: start with Malloc, next item same ptr and is free.
-        if ((onePieceMsg_poolVec_[i].MallocFree==1)&& (onePieceMsg_poolVec_[i+1].MallocFree==-1)&&((onePieceMsg_poolVec_[i].ptr==onePieceMsg_poolVec_[i+1].ptr))){
-            onePairMsg tempPair(onePieceMsg_poolVec_[i].idx,onePieceMsg_poolVec_[i].size,onePieceMsg_poolVec_[i].idx,onePieceMsg_poolVec_[i+1].idx);
-            onePairMsgVec_1.push_back(tempPair);
-            i+=2;
-        }
-        // condition C: start with Malloc, no free.
-        if ((onePieceMsg_poolVec_[i].MallocFree==1)&&(onePieceMsg_poolVec_[i].ptr!=onePieceMsg_poolVec_[i+1].ptr)){
-            onePairMsg tempPair(onePieceMsg_poolVec_[i].idx,onePieceMsg_poolVec_[i].size,onePieceMsg_poolVec_[i].idx,idxRange);
-            onePairMsgVec_2.push_back(tempPair);
-            i+=1;
-        }
-    }//end of while
-    //condition D: if still left with the last item
-    if ((i<onePieceMsg_poolVec_.size())&&(onePieceMsg_poolVec_[i+1].MallocFree==1)){
-        onePairMsg tempPair(onePieceMsg_poolVec_[i].idx,onePieceMsg_poolVec_[i].size,onePieceMsg_poolVec_[i].idx,idxRange);
-        onePairMsgVec_2.push_back(tempPair);
+pair<vector<PoolBlockLifeTime>,vector<PoolBlockLifeTime>> PoolOptInfoToBlockLifeTime(vector<PoolOptInfo>vec_pool_opt_info, int idx_range){
+  /*
+  convert vector of opt info into vector of block life time
+  return a pair of vectors: 1. normal blocks 2. cross-iteration blocks.
+  */
+  vector<PoolBlockLifeTime>vec_block_life_time1;
+  vector<PoolBlockLifeTime>vec_block_life_time2;
+  int i=0;
+  
+  //while loop processes a pair at each time, if got a pair.
+  while (i<(vec_pool_opt_info.size()-1)){
+    //condition: start with free. do nothing.
+    if (vec_pool_opt_info[i].operation_type==-1){
         i+=1;
     }
-
-    //sort both pairVec
-    sort(onePairMsgVec_1.begin(),onePairMsgVec_1.end(),less_than_size_rIdx());
-    sort(onePairMsgVec_2.begin(),onePairMsgVec_2.end(),less_than_size_rIdx());
-    pair<vector<onePairMsg>,vector<onePairMsg>>pairOfPairMsgVec_(onePairMsgVec_1,onePairMsgVec_2);
-    
-    return pairOfPairMsgVec_;
-}//end of pieceMsgVec_2_pairOfPairMsgVec function
-
-///Section of coloring algorithm. mergeSeg and then FFallocation when building edges of the graph.
-vector<pair<size_t, size_t>>  mergeSeg(vector<pair<size_t,size_t>> colorOccupied){
-    /*
-     version 12/9 11am -- modify to accomodate unsigned int/size_t
-     input:the collection of color ranges that is once occupied by some block during a block's life time.
-     function: merge consecutive/overlapping segments of colorOccupied
-     output: merged segments in ascending order.
-     time complexity: O(n) for run, O(n^2) for verify section(optional), where n is size of colorOccupied.
-     */
-    sort(colorOccupied.begin(), colorOccupied.end());
-    
-    if(colorOccupied.size()<=1){
-        return colorOccupied;
-    }
-    
-    int m = 0;
-    while (m<(colorOccupied.size()-1)){
-        
-        if ((colorOccupied[m].second +2)> colorOccupied[m+1].first){
-            pair<int,int>tempItem(colorOccupied[m].first,max(colorOccupied[m].second,colorOccupied[m+1].second));
-            //remove m+1 and m
-            colorOccupied.erase(colorOccupied.begin()+m+1);
-            colorOccupied.erase(colorOccupied.begin()+m);
-            //insert the combined range
-            colorOccupied.insert(colorOccupied.begin()+m,tempItem);
-        }else{
-            m+=1;
-        }
-    }//end of while loop
-    
-    //verify if mergeSeg is completed. O(n^2) optional
-//    if(colorOccupied.size()>1){
-//        for (int i=0;i<(colorOccupied.size()-1);i++){
-//            if(colorOccupied[i].second>=colorOccupied[i+1].first){
-//                cout<<"error in mergeSeg"<<endl;
-//            }
-//        }
-//    }
-    
-    return colorOccupied;
-}//end of mergeSeg function
-
-
-pair<size_t,size_t> FFallocation(vector<pair<size_t,size_t>> colorMerged,size_t size, size_t local_offset){
-    /*
-     version 12/2 4pm
-     First Fit weighted coloring
-     return a pair standing for colorRange.
-     local_offset shifts the returned colorRange, allowing multiple run().
-     local_offset not changable, whereas offset is changable.
-     */
-    // condition A: if no occupied, put after the local_offset
-    if (colorMerged.size()==0){
-        return pair<size_t,size_t>(0+local_offset,size-1+local_offset);
-    }
-    
-    // condition B: able to fit before first block, after the local_offset
-    if ((size+local_offset)<(colorMerged[0].first+1)){
-        return pair<size_t,size_t>(0+local_offset,size-1+local_offset);
+    //condition: start with Malloc, next item same ptr and is free.
+    if ((vec_pool_opt_info[i].operation_type==1)&& (vec_pool_opt_info[i+1].operation_type==-1)&&((vec_pool_opt_info[i].ptr==vec_pool_opt_info[i+1].ptr))){
+      PoolBlockLifeTime temp_block_life_time(vec_pool_opt_info[i].idx,vec_pool_opt_info[i].size,vec_pool_opt_info[i].idx,vec_pool_opt_info[i+1].idx);
+      vec_block_life_time1.push_back(temp_block_life_time);
+      i+=2;
     }
-    
-    size_t yLocation= -1;
-    if (colorMerged.size()>1) {
-        int n = 0;
-        while (n<(colorMerged.size()-1)){
-            // condition C: able to fit in between middle blocks.
-            if ((colorMerged[n+1].first-colorMerged[n].second-1)>=size){
-                yLocation = colorMerged[n].second+1;
-                break;
-            }
-            n+=1;
-        }//end of while loop.
-        // condition D: allocate after the last block.
-        if (yLocation == -1){
-            yLocation = colorMerged[colorMerged.size()-1].second+1;
-        }
-    }// end of if loop, conditon C and D.
-    
-    // condition E: colorMeger len =1, allocate after the last block.
-    if (colorMerged.size()==1){
-        yLocation = colorMerged[0].second+1;
-    }
-    
-    if (yLocation==-1){
-        cout<<"error in FFallocation!!!"<<endl;
+    // condition: start with Malloc, no free.
+    if ((vec_pool_opt_info[i].operation_type==1)&&(vec_pool_opt_info[i].ptr!=vec_pool_opt_info[i+1].ptr)){
+      PoolBlockLifeTime temp_block_life_time(vec_pool_opt_info[i].idx,vec_pool_opt_info[i].size,vec_pool_opt_info[i].idx,idx_range);
+      vec_block_life_time2.push_back(temp_block_life_time);
+      i+=1;
     }
-    
-    return pair<size_t,size_t>(yLocation,yLocation+size-1);
-}//end of FFallocation function
+  }//end of while
+  //condition: if still left with the last item
+  if ((i<vec_pool_opt_info.size())&&(vec_pool_opt_info[i+1].operation_type==1)){
+    PoolBlockLifeTime temp_block_life_time(vec_pool_opt_info[i].idx,vec_pool_opt_info[i].size,vec_pool_opt_info[i].idx,idx_range);
+    vec_block_life_time2.push_back(temp_block_life_time);
+    i+=1;
+  }
 
+  //sort both vectors
+  sort(vec_block_life_time1.begin(),vec_block_life_time1.end(),sort_by_size_r_idx_descending());
+  sort(vec_block_life_time2.begin(),vec_block_life_time2.end(),sort_by_size_r_idx_descending());
+  pair<vector<PoolBlockLifeTime>,vector<PoolBlockLifeTime>>pair_vec_block_life_time(vec_block_life_time1,vec_block_life_time2);
+  
+  return pair_vec_block_life_time;
+}
 
-pair<size_t,size_t> BFallocation(vector<pair<size_t,size_t>> colorMerged,size_t size, size_t local_offset){
-    /*
-     version 12/11 1pm
-     Best Fit allocation, input and output same as FFallocation
-     */
-    // condition A: if no occupied, put after the local_offset
-    if (colorMerged.size()==0){
-        return pair<size_t,size_t>(0+local_offset,size-1+local_offset);
-    }
-    //condition B: if size=1, able to fit before the first block
-    if ((colorMerged.size()==1)&&((size+local_offset)<(colorMerged[0].first+1))){
-        return pair<size_t,size_t>(0+local_offset,size-1+local_offset);
-    }
-    
-    //condition C: else of B
-    if ((colorMerged.size()==1)&&((size+local_offset)>=(colorMerged[0].first+1))){
-        return pair<size_t,size_t>(colorMerged[0].second+1,colorMerged[0].second+size);
-    }
-    
-    //condition D and E:
-    size_t yLocation=-1;
-    pair<int, size_t>tempHole(-1,-1); // n, hole size between n and n+1
-    if (colorMerged.size()>1) {
-        int n = 0;
-        while (n<(colorMerged.size()-1)){
-            // condition C: able to fit in between middle blocks. select smallest.
-            if (((colorMerged[n+1].first-colorMerged[n].second-1)>=size)&&((colorMerged[n+1].first-colorMerged[n].second-1)<tempHole.second)){
-                tempHole.first=n;
-                tempHole.second=colorMerged[n+1].first-colorMerged[n].second-1;
-            }
-            n+=1;
-        }//end of while loop.
-        
-        if(tempHole.first==-1){
-            // condition D: allocate after the last block.
-            yLocation = colorMerged[colorMerged.size()-1].second+1;
-        }else{
-            //condition E: best fit in the smallest hole.
-            yLocation = colorMerged[tempHole.first].second+1;
-            
-        }
-    }// end of if loop, conditon D and E.
-    
-    if (yLocation==-1){
-        cout<<"error in BFallocation!"<<endl;
+///Section implementing coloring algorithm.
+vector<pair<size_t, size_t>>  MergeColoredSegments(vector<pair<size_t,size_t>> vec_color_preoccupied){
+  /*
+  merge consecutive/overlapping segments of vec_color_preoccupied
+  input:the collection of color ranges that is once occupied by some block during a block's life time.
+  output: merged segments in ascending order.
+  cost: one sort of the n input segments, then a single merge pass using vector erase/insert (n is size of vec_color_preoccupied).
+  */
+  sort(vec_color_preoccupied.begin(), vec_color_preoccupied.end());
+  
+  if(vec_color_preoccupied.size()<=1){
+    return vec_color_preoccupied;
+  }
+  
+  int m = 0;
+  while (m<(vec_color_preoccupied.size()-1)){
+    if ((vec_color_preoccupied[m].second +2)> vec_color_preoccupied[m+1].first){
+      pair<int,int>tempItem(vec_color_preoccupied[m].first,max(vec_color_preoccupied[m].second,vec_color_preoccupied[m+1].second));
+      //remove m+1 and m
+      vec_color_preoccupied.erase(vec_color_preoccupied.begin()+m+1);
+      vec_color_preoccupied.erase(vec_color_preoccupied.begin()+m);
+      //insert the combined range
+      vec_color_preoccupied.insert(vec_color_preoccupied.begin()+m,tempItem);
+    }else{
+        m+=1;
     }
-    
-    return pair<size_t,size_t>(yLocation,yLocation+size-1);
+  }//end of while loop
+   
+  return vec_color_preoccupied;
 }
 
-vector<Vertex> colorSomeVertices(vector<onePairMsg> pairMsgVec_, size_t &offset,string colorMethod){
-    /*
-     color all or 1/2 vertices using mergeSeg() and FFallocation(), with update offset.
-     time complexity: O(n^2).
-     */
-    size_t local_offset = offset; //feed into FFallocation, shall never change.
-    int m = static_cast<int>(pairMsgVec_.size());
-    //init all vertices
-    vector<Vertex>vertices;
-    for (int i=0; i<m;i++){
-        Vertex tempVertex(pairMsgVec_[i].name,pairMsgVec_[i].size,pairMsgVec_[i].r_idx,pairMsgVec_[i].d_idx);
-        vertices.push_back(tempVertex);
 
+pair<size_t,size_t> FirstFitAllocation(vector<pair<size_t,size_t>> vec_color_merged,size_t size, size_t local_offset){
+  /*
+   First Fit weighted coloring
+   return a pair standing for color_range.
+   local_offset shifts the returned color_range, allowing multiple Plan().
+   local_offset is not changeable, whereas offset is changeable.
+   */
+  // condition: if no occupied, put after the local_offset
+  if (vec_color_merged.size()==0){
+    return pair<size_t,size_t>(0+local_offset,size-1+local_offset);
+  }
+  
+  // condition: able to fit before first block, after the local_offset
+  if ((size+local_offset)<(vec_color_merged[0].first+1)){
+    return pair<size_t,size_t>(0+local_offset,size-1+local_offset);
+  }
+  
+  size_t y_location= -1;
+  if (vec_color_merged.size()>1) {
+    int n = 0;
+    while (n<(vec_color_merged.size()-1)){
+      // condition: able to fit in between middle blocks.
+      if ((vec_color_merged[n+1].first-vec_color_merged[n].second-1)>=size){
+        y_location = vec_color_merged[n].second+1;
+        break;
+      }
+      n+=1;
+    }//end of while loop.
+    // condition: allocate after the last block.
+    if (y_location == -1){
+      y_location = vec_color_merged[vec_color_merged.size()-1].second+1;
     }
+  }// end of if block: fit between middle blocks, else after the last block.
+  
+  // condition: vec_color_merged has length 1, allocate after the last block.
+  if (vec_color_merged.size()==1){
+    y_location = vec_color_merged[0].second+1;
+  }
+  
+  if (y_location==-1){
+    cout<<"error in FirstFitAllocation!!!"<<endl;
+  }
+  
+  return pair<size_t,size_t>(y_location,y_location+size-1);
+}
 
-    int **adj;
-    adj = new int*[m]; //TODO(junzhe) should be deleted somewhere.
-    // build edges with values 1 and 0; combine with mergeSeg and FFallocation in the loop.
-    for (int i=0; i<m;i++){
-        adj[i] = new int[m];
-        for (int j=0; j<m;j++){
-            if ((max(vertices[i].r,vertices[j].r))<(min(vertices[i].d,vertices[j].d))){
-                adj[i][j]=1;
-                if (vertices[j].colorRange.second){ //as second never be 0, if not empty.
-                    vertices[i].colorOccupied.push_back(vertices[j].colorRange);
-                }
-            }
-            else { adj[i][j]=0; }
-        }
-        
-        vector<pair<size_t,size_t>>colorMerged = mergeSeg(vertices[i].colorOccupied);
-       
-        if(colorMethod=="FF"){
-            vertices[i].colorRange = FFallocation(colorMerged,vertices[i].size, local_offset);
-            
-        }else{ //BF
-            vertices[i].colorRange = BFallocation(colorMerged,vertices[i].size, local_offset);
-        }
 
-        //update of offset, largest memory footprint as well.
-        if (vertices[i].colorRange.second >=offset){
-            offset = vertices[i].colorRange.second+1;
-        }
-    }//end of for loop.
+pair<size_t,size_t> BestFitAllocation(vector<pair<size_t,size_t>> vec_color_merged,size_t size, size_t local_offset){
+  /*
+   Best Fit allocation, input and output same as FirstFitAllocation
+  */
+  // condition: if no occupied, put after the local_offset
+  if (vec_color_merged.size()==0){
+    return pair<size_t,size_t>(0+local_offset,size-1+local_offset);
+  }
+  //condition: if size=1, able to fit before the first block
+  if ((vec_color_merged.size()==1)&&((size+local_offset)<(vec_color_merged[0].first+1))){
+    return pair<size_t,size_t>(0+local_offset,size-1+local_offset);
+  }
+  
+  //condition: else of the second condition (only one occupied segment, cannot fit before it)
+  if ((vec_color_merged.size()==1)&&((size+local_offset)>=(vec_color_merged[0].first+1))){
+    return pair<size_t,size_t>(vec_color_merged[0].second+1,vec_color_merged[0].second+size);
+  }
+  
+  size_t y_location=-1;
+  pair<int, size_t>temp_hole(-1,-1); // n, hole size between n and n+1
+  if (vec_color_merged.size()>1) {
+    int n = 0;
+    while (n<(vec_color_merged.size()-1)){
+      // condition: able to fit in between middle blocks. select smallest.
+      if (((vec_color_merged[n+1].first-vec_color_merged[n].second-1)>=size)&&((vec_color_merged[n+1].first-vec_color_merged[n].second-1)<temp_hole.second)){
+        temp_hole.first=n;
+        temp_hole.second=vec_color_merged[n+1].first-vec_color_merged[n].second-1;
+      }
+      n+=1;
+    }//end of while loop.
     
-    return vertices;
+    if(temp_hole.first==-1){
+      // condition: allocate after the last block.
+      y_location = vec_color_merged[vec_color_merged.size()-1].second+1;
+    }else{
+      //condition: best fit in the smallest hole.
+      y_location = vec_color_merged[temp_hole.first].second+1;       
+    }
+  }// end of if block: best fit in the smallest hole, else after the last block.
+  
+  if (y_location==-1){
+    cout<<"error in BestFitAllocation!"<<endl;
+  }
+  
+  return pair<size_t,size_t>(y_location,y_location+size-1);
 }
 
+vector<Vertex> AssignColorToVertices(vector<PoolBlockLifeTime> vec_block_life_time, size_t &offset,string color_method){
+  /*
+   color all or 1/2 vertices using MergeColoredSegments() and FirstFitAllocation(), with updated offset.
+   time complexity: O(n^2).
+  */
+  size_t local_offset = offset; //feed into FirstFitAllocation, shall never change.
+  int m = static_cast<int>(vec_block_life_time.size());
+  //init all vertices
+  vector<Vertex>vertices;
+  for (int i=0; i<m;i++){
+    Vertex temp_vertex(vec_block_life_time[i].name,vec_block_life_time[i].size,vec_block_life_time[i].r_idx,vec_block_life_time[i].d_idx);
+    vertices.push_back(temp_vertex);
+  }
 
-///get cross-iteration duration pairs
-pair<map<int,int>,map<int,int>> cross_itr_durations(vector<string>vec_double, int location, int maxLen, int &doubleRange){
-    
-    vector<onePieceMsg_pool>onePieceMsg_poolVec_2 = strVec_2_pieceMsgVec(vec_double,doubleRange);
-    pair<vector<onePairMsg>,vector<onePairMsg>>pairOfPairMsgVec_2=pieceMsgVec_2_pairOfPairMsgVec(onePieceMsg_poolVec_2,doubleRange);
-    
-    map<int,int>Table_r2d; //full duration info, cross-iteration duration.
-    map<int,int>Table_d2r;
-    for (int i=0;i<pairOfPairMsgVec_2.first.size();i++){
-        if(pairOfPairMsgVec_2.first[i].r_idx<maxLen){
-            Table_r2d[pairOfPairMsgVec_2.first[i].r_idx] =pairOfPairMsgVec_2.first[i].d_idx%maxLen;
-            Table_d2r[pairOfPairMsgVec_2.first[i].d_idx%maxLen]=pairOfPairMsgVec_2.first[i].r_idx;
+  int **adj;
+  adj = new int*[m];
+  // build edges with values 1 and 0; combine with MergeColoredSegments and FirstFitAllocation/BestFitAllocation in the loop.
+  for (int i=0; i<m;i++){
+    adj[i] = new int[m];
+    for (int j=0; j<m;j++){
+      if ((max(vertices[i].r,vertices[j].r))<(min(vertices[i].d,vertices[j].d))){
+        adj[i][j]=1;
+        if (vertices[j].color_range.second){ //as second never be 0, if not empty.
+          vertices[i].vec_color_preoccupied.push_back(vertices[j].color_range);
         }
+      }
+      else { adj[i][j]=0; }
     }
     
-    return pair<map<int,int>,map<int,int>>(Table_r2d,Table_d2r);
-}
-
-/// main run funtion
-vector<Vertex> run(vector<string>vec, int &idxRange, size_t &offset, size_t &offsetCrossItr,string colorMethod){
-    /*
-     run function, input vector of strings, return colored vertices,
-     update idxRange, offset.
-     time complexity: O(n^2) where n is maxLen.
-     */
-    vector<onePieceMsg_pool>onePieceMsg_poolVec_ = strVec_2_pieceMsgVec(vec,idxRange);
-    pair<vector<onePairMsg>,vector<onePairMsg>>pairOfPairMsgVec_=pieceMsgVec_2_pairOfPairMsgVec(onePieceMsg_poolVec_,idxRange);
-    //1. normal blocks 2. cross-iteration blocks.
-    vector<onePairMsg>pairMsgVec_1 = pairOfPairMsgVec_.first;
-    vector<onePairMsg>pairMsgVec_2 = pairOfPairMsgVec_.second;
-  
-    vector<Vertex>vertices_2 = colorSomeVertices(pairMsgVec_2,offset,colorMethod);
-    for (int i=0; i<vertices_2.size();i++){
-      vertices_2[i].crossItr = 1;
+    vector<pair<size_t,size_t>>vec_color_merged = MergeColoredSegments(vertices[i].vec_color_preoccupied);
+   
+    if(color_method=="FF"){
+      vertices[i].color_range = FirstFitAllocation(vec_color_merged,vertices[i].size, local_offset);
+        
+    }else{ //BF
+      vertices[i].color_range = BestFitAllocation(vec_color_merged,vertices[i].size, local_offset);
     }
-    offsetCrossItr = offset;
-    offset = offsetCrossItr*2;
-    vector<Vertex>vertices = colorSomeVertices(pairMsgVec_1,offset,colorMethod);
-    //merge
-    vertices.insert(vertices.end(),vertices_2.begin(),vertices_2.end());
 
-    return vertices;
+    //update of offset, largest memory footprint as well.
+    if (vertices[i].color_range.second >=offset){
+      offset = vertices[i].color_range.second+1;
+    }
+  }//end of for loop.
+  
+  return vertices;
 }
 
 
-///Section of test functions.
-vector<size_t> pairOfPairMsgVec_2_repSeq(pair<vector<onePairMsg>,vector<onePairMsg>>pairOfPairMsgVec_){
-    int counter_1M=0; int counter_1F=0; int counter_2=0;
-    vector<onePairMsg>onePairMsgVec_1 = pairOfPairMsgVec_.first;
-    vector<onePairMsg>onePairMsgVec_2 = pairOfPairMsgVec_.second;
-    vector<oneIterMsg>oneIterMsgVec_;
-    for (int i =0; i<onePairMsgVec_1.size(); i++){
-        oneIterMsg tempIterM(onePairMsgVec_1[i].size,1,onePairMsgVec_1[i].r_idx);
-        oneIterMsgVec_.push_back(tempIterM);
-        counter_1M++;
-        
-        size_t temp_s_d = static_cast<size_t>(onePairMsgVec_1[i].d_idx-onePairMsgVec_1[i].r_idx);
-        oneIterMsg tempIterF(temp_s_d,-1,onePairMsgVec_1[i].d_idx);
-        oneIterMsgVec_.push_back(tempIterF);
-        counter_1F++;
-    }
-    
-    for (int i =0; i<onePairMsgVec_2.size(); i++){
-        oneIterMsg tempIterM(onePairMsgVec_2[i].size,1,onePairMsgVec_2[i].r_idx);
-        oneIterMsgVec_.push_back(tempIterM);
-        counter_2++;
-    }
-    
-    sort(oneIterMsgVec_.begin(),oneIterMsgVec_.end(),less_than_iterIdx());
-    //only after sort then can create rep.
-    vector<size_t>rep; // vector of size_delta, name it as rep for simlisity.
-    for (int i =0; i<oneIterMsgVec_.size(); i++){
-        rep.push_back(oneIterMsgVec_[i].size_delta);
+pair<map<int,int>,map<int,int>> GetCrossIterationBlocks(vector<string>vec_double, int location_2nd_iteration, int iteration_length, int &double_range){
+  ///get cross-iteration duration blocks
+  vector<PoolOptInfo>vec_pool_opt_info2 = PoolOptSeqStrToStruct(vec_double,double_range);
+  pair<vector<PoolBlockLifeTime>,vector<PoolBlockLifeTime>>pair_vec_block_life_time2=PoolOptInfoToBlockLifeTime(vec_pool_opt_info2,double_range);
+  
+  map<int,int>table_ridx_to_didx; //full duration info, cross-iteration duration.
+  map<int,int>table_didx_to_ridx;
+  for (int i=0;i<pair_vec_block_life_time2.first.size();i++){
+    if(pair_vec_block_life_time2.first[i].r_idx<iteration_length){
+      table_ridx_to_didx[pair_vec_block_life_time2.first[i].r_idx] =pair_vec_block_life_time2.first[i].d_idx%iteration_length;
+      table_didx_to_ridx[pair_vec_block_life_time2.first[i].d_idx%iteration_length]=pair_vec_block_life_time2.first[i].r_idx;
     }
+  }
+  
+  return pair<map<int,int>,map<int,int>>(table_ridx_to_didx,table_didx_to_ridx);
+}
 
-    return rep;
-}//end of pairOfPairMsgVec_2_repSeq function
 
+///Section of test functions.
+vector<size_t> PoolOptSeqRepeatableTestPreProcess(pair<vector<PoolBlockLifeTime>,vector<PoolBlockLifeTime>>pair_vec_block_life_time){
+  /*
+  pre process pair of vector of block life time info, for ease of repeatable test.
+  */
+  vector<PoolBlockLifeTime>vec_block_life_time1 = pair_vec_block_life_time.first;
+  vector<PoolBlockLifeTime>vec_block_life_time2 = pair_vec_block_life_time.second;
+  vector<PoolOptSimplifiedInfo>vec_pool_opt_simplified_info;
+
+  //process Malloc and Free pair, i.e. normal blocks
+  for (int i =0; i<vec_block_life_time1.size(); i++){
+    PoolOptSimplifiedInfo tempIterM(vec_block_life_time1[i].size,1,vec_block_life_time1[i].r_idx);
+    vec_pool_opt_simplified_info.push_back(tempIterM);
+    size_t temp_s_d = static_cast<size_t>(vec_block_life_time1[i].d_idx-vec_block_life_time1[i].r_idx);
+    PoolOptSimplifiedInfo tempIterF(temp_s_d,-1,vec_block_life_time1[i].d_idx);
+    vec_pool_opt_simplified_info.push_back(tempIterF);
+  }
+  
+  //process Malloc-only blocks, i.e. cross-iteration blocks
+  for (int i =0; i<vec_block_life_time2.size(); i++){
+    PoolOptSimplifiedInfo tempIterM(vec_block_life_time2[i].size,1,vec_block_life_time2[i].r_idx);
+    vec_pool_opt_simplified_info.push_back(tempIterM);
+  }
+  
+  //sort then can create vec_rep.
+  sort(vec_pool_opt_simplified_info.begin(),vec_pool_opt_simplified_info.end(),sort_by_itr_idx_ascending());
+  vector<size_t>vec_rep; // vector of size_delta, name it as vec_rep for simplicity.
+  for (int i =0; i<vec_pool_opt_simplified_info.size(); i++){
+    vec_rep.push_back(vec_pool_opt_simplified_info[i].size_delta);
+  }
 
-vector<size_t> maxRepeatedSeg(vector<size_t>rep, int idxRange, int &maxLen, int &location){
-    /*
-     get max repeated non-overlapping Seg of a vector, return the repeated segment,
-     update maxLen, and location of where Seg starts to repeat.
-     brtue force method using equal()
-     time complexity O(n^2)
-     */
-    for (int i=0; i<idxRange;i++){
-        for (int len=1; len<(idxRange-i);len++){
-            if((equal(rep.begin()+i,rep.begin()+i-1+len,rep.begin()+i+len))&&(maxLen<len)) {
-                maxLen = len;
-                location = i;
-                cout<<"maxLen increased, lcoation and maxLen: ("<<location<<","<<maxLen<<")"<<endl;
-            }
-        }
-    }
-    //TODO(junzhe) verify the subSeq returned, below poped up error in vgg.
-    vector<size_t>subSeq(&rep[location],&rep[location+maxLen]);
-    if(!(equal(rep.begin()+location,rep.begin()+maxLen-1+location,subSeq.begin()) && equal(rep.begin()+location+maxLen,rep.begin()+2*maxLen-1+location,subSeq.begin()))){
-        cout<<"error in get the maxRep"<<endl;
-    }
-    return subSeq;
+  return vec_rep;
 }
 
 
-void verifyAndCut (vector<size_t>subSeq, int &maxLen, int &location){
-    /*
-     to cut, in case the repeated Seg contains multiple iterations.
-     */
-    int tempMaxLen=0;
-    int tempLocation =0;
-    int tempIdxRange = maxLen;
-    
-    vector<size_t>tempSubSeq = maxRepeatedSeg(subSeq,tempIdxRange,tempMaxLen, tempLocation);
-    //TODO(junzhe), tunable threshold.
-    int threshold =50;
-    if (tempMaxLen>threshold){
-        maxLen = tempMaxLen;
-        location += tempLocation;
-        cout<<"max length get cut"<<endl;
+vector<size_t> PoolRepeatableTest(vector<size_t>rep, int idx_range, int &iteration_length, int &location_2nd_iteration){
+  /*
+  get max repeated non-overlapping Seg of a vector, return the repeated segment,
+  update iteration_length, and location_2nd_iteration of where Seg starts to repeat.
+  brute force method using equal()
+  time complexity O(n^2)
+  */
+  for (int i=0; i<idx_range;i++){
+    for (int len=1; len<(idx_range-i);len++){
+      if((equal(rep.begin()+i,rep.begin()+i-1+len,rep.begin()+i+len))&&(iteration_length<len)) {
+        iteration_length = len;
+        location_2nd_iteration = i;
+      }
     }
-    cout<<tempMaxLen<<endl;
+  }
+  //obtain sub_sequence based on iteration_length and location_2nd_iteration
+  vector<size_t>sub_sequence(&rep[location_2nd_iteration],&rep[location_2nd_iteration+iteration_length]);
+  if(!(equal(rep.begin()+location_2nd_iteration,rep.begin()+iteration_length-1+location_2nd_iteration,sub_sequence.begin()) && equal(rep.begin()+location_2nd_iteration+iteration_length,rep.begin()+2*iteration_length-1+location_2nd_iteration,sub_sequence.begin()))){
+    cout<<"error in get the maxRep"<<endl;
+  }
+
+  return sub_sequence;
 }
 
 
-//main function of test
-int test(vector<string>vec3, int &maxLen, int &location){
+void VerifyRepeatableTest(vector<size_t>sub_sequence, int &iteration_length, int &location_2nd_iteration){
     /*
-     main function of test, returns globeCounter, which is when flag shall be switched,
-     update maxLen and location of where the repeated Seg starts.
-     */
-    cout<<"====================== test ========================="<<endl;
-    int idxRange3=0;
-    vector<onePieceMsg_pool>onePieceMsg_poolVec_3 =strVec_2_pieceMsgVec(vec3,idxRange3);
-    pair<vector<onePairMsg>,vector<onePairMsg>>pairOfPairMsgVec_=pieceMsgVec_2_pairOfPairMsgVec(onePieceMsg_poolVec_3,idxRange3);
-    vector<size_t>rep=pairOfPairMsgVec_2_repSeq(pairOfPairMsgVec_);
+     to cut, in case the repeated Segment returned by PoolRepeatableTest contains multiple iterations.
+    */
+    int temp_iteration_length = 0;
+    int temp_location_2nd_iteration = 0;
+    int temp_idx_range = iteration_length;
+    
+    //verify by testing its subsequence again
+    vector<size_t>tempsub_sequence = PoolRepeatableTest(sub_sequence,temp_idx_range,temp_iteration_length, temp_location_2nd_iteration);
     
-    //get repeated sub vector.
-    vector<size_t>subSeq = maxRepeatedSeg(rep,idxRange3,maxLen,location);
-    //cout<<subSeq.size()<<endl;
-    verifyAndCut(subSeq, maxLen, location);
-    int globeCounter=-1;
-    if (maxLen>100){ //TODO(junzhe) tunable threshold.
-        cout<<"new location and maxLen: "<<location<<' '<<maxLen<<endl;
-        globeCounter = idxRange3+maxLen-(idxRange3-location)%maxLen;
+    //tunable threshold.
+    int threshold = 50;
+    
+    if (temp_iteration_length>threshold){
+        iteration_length = temp_iteration_length;
+        location_2nd_iteration += temp_location_2nd_iteration;
     }
-    return globeCounter;
 }
 
+
 ///verify if coloring got overlapping
-void overlap_test(vector<Vertex> vertices){
+void OverlapVerification(vector<Vertex> vertices){
     size_t s = vertices.size();
     int i,j;
     for (i=0; i<s; i++){
         for (j=i+1; j<s; j++){
-            if (((max(vertices[i].r,vertices[j].r))<(min(vertices[i].d,vertices[j].d)))&& ((max(vertices[i].colorRange.first,vertices[j].colorRange.first))<(1+min(vertices[i].colorRange.second,vertices[j].colorRange.second)))){
+            if (((max(vertices[i].r,vertices[j].r))<(min(vertices[i].d,vertices[j].d)))&& ((max(vertices[i].color_range.first,vertices[j].color_range.first))<(1+min(vertices[i].color_range.second,vertices[j].color_range.second)))){
                 cout<<"error overlapping"<<endl;
             }
         }
@@ -674,13 +578,11 @@ void overlap_test(vector<Vertex> vertices){
 
 
 SmartMemPool::SmartMemPool(const MemPoolConf &conf){
-    //TODO(junzhe) to figure out what to do here.
-    colorMethod = "BF";
+    color_method = "BF";
     conf_ = conf;
 }
 
 void SmartMemPool::Init(){
-  //TODO(junzhe) Note, this is dummy here, not catter multiple GPU.
   mtx_.lock();
   if(!initialized_){
     initialized_ =true;
@@ -689,291 +591,332 @@ void SmartMemPool::Init(){
 }
 
 
+
+int SmartMemPool::Detection(vector<string>vec_string_test, int &iteration_length, int &location_2nd_iteration){
+  /*
+  Tests the raw operation sequence for repeatability.
+  Returns global_index_threshold, the global index at which malloc_flag shall be switched,
+  and updates iteration_length and location_2nd_iteration (the start of the repeated segment).
+  */
+  int idx_range_test=0;
+  vector<PoolOptInfo>vec_pool_opt_info3 = PoolOptSeqStrToStruct(vec_string_test,idx_range_test);
+  pair<vector<PoolBlockLifeTime>,vector<PoolBlockLifeTime>>pair_vec_block_life_time = PoolOptInfoToBlockLifeTime(vec_pool_opt_info3,idx_range_test);
+  vector<size_t>vec_rep = PoolOptSeqRepeatableTestPreProcess(pair_vec_block_life_time);
+  
+  //repeatable test with verification
+  vector<size_t>sub_sequence = PoolRepeatableTest(vec_rep,idx_range_test,iteration_length,location_2nd_iteration);
+  VerifyRepeatableTest(sub_sequence, iteration_length, location_2nd_iteration);
+  
+  //update global_index_threshold if the test passed, i.e. iteration_length exceeds a certain threshold
+  if (iteration_length>100){ //tunable threshold.
+    global_index_threshold = idx_range_test+iteration_length-(idx_range_test-location_2nd_iteration)%iteration_length;
+  }
+  return global_index_threshold;
+}
+
+
+/// main run function
+vector<Vertex> SmartMemPool::Plan(vector<string>vec, int &idx_range, size_t &offset, size_t &offset_cross_iteration,string color_method){
+  /*
+  Planning, i.e. assigns colors to vertices from the raw operation sequence info.
+  Takes a vector of strings and returns the colored vertices;
+  updates idx_range, offset and offset_cross_iteration.
+  time complexity: O(n^2) where n is iteration_length.
+  */
+
+  vector<PoolOptInfo>vec_pool_opt_info = PoolOptSeqStrToStruct(vec,idx_range);
+  pair<vector<PoolBlockLifeTime>,vector<PoolBlockLifeTime>>pair_vec_block_life_time=PoolOptInfoToBlockLifeTime(vec_pool_opt_info,idx_range);
+  
+  //color normal blocks and cross-iteration blocks separately; the two groups must not be mismatched.
+  vector<PoolBlockLifeTime>vec_block_life_time1 = pair_vec_block_life_time.first;
+  vector<PoolBlockLifeTime>vec_block_life_time2 = pair_vec_block_life_time.second;
+
+  //color cross-iteration blocks
+  vector<Vertex>vertices_2 = AssignColorToVertices(vec_block_life_time2,offset,color_method);
+
+  for (int i=0; i<vertices_2.size();i++){
+    vertices_2[i].cross_iteration = 1;
+  }
+  //update offset
+  offset_cross_iteration = offset;
+  offset = offset_cross_iteration*2;
+  //color normal blocks
+  vector<Vertex>vertices = AssignColorToVertices(vec_block_life_time1,offset,color_method);
+  
+  //merge after coloring
+  vertices.insert(vertices.end(),vertices_2.begin(),vertices_2.end());
+
+  return vertices;
+}
+
+
 ///Malloc
 void SmartMemPool::Malloc(void** ptr, const size_t size){
-    /*
-     1. switch flag when gc == globeCounter, construct lookup table and malloc the whole pool.
-     2. if flag=0, malloc/cudaMalloc, collect vec string
-     3. if flag=1, look up table, malloc/cudaMalloc if not in the Table
-     4. test repeated sequence every 100 blocks, update globeCounter.
-     */
-
-    //TODO(junzhe) Note, this is dummy here, not catter multiple GPU.
-    //fstream file("memInfo.text", ios::in|ios::out|ios::app); //a.
-    //file<<gc<<' '<<"Malloc"; //a.
-    if (!initialized_){
+  /*
+   1. switch flag when global_index == global_index_threshold, construct lookup table and malloc the whole pool.
+   2. if flag=0, malloc/cudaMalloc, collect vec string
+   3. if flag=1, look up table, malloc/cudaMalloc if not in the Table
+   4. test for a repeated sequence every 300 indices, update global_index_threshold.
+   */
+
+  if (!initialized_){
     Init();
   }
 
-    void* allocatedPtr = NULL; //ptr to be returned
+  void* allocated_ptr = NULL; //ptr to be returned
+
+  /// 1. switch flag when global_index == global_index_threshold, construct lookup table and malloc the whole pool.    
+  if (global_index == global_index_threshold){
+
+    malloc_flag = 1;
+    vector<string>vec_raw_opt_info(&vec[location_2nd_iteration],&vec[location_2nd_iteration+iteration_length]);
     
-    if (gc == globeCounter){
-        /// 1. switch flag when gc == globeCounter, construct lookup table and malloc the whole pool.
-        
-        mallocFlag=1;
-        cout<<"switched to color-malloc"<<endl;
-        vector<string>vec_run(&vec[location],&vec[location+maxLen]);
-        
-        vector<Vertex>vertices = run(vec_run, idxRange,offset,offsetCrossItr, colorMethod);
+    //color vertices
+    vector<Vertex>vertices = Plan(vec_raw_opt_info,idx_range,offset,offset_cross_iteration,color_method);
 
-        //here to verify if the coloring got overlapping. TODO(junzhe) optional
-        //overlap_test(vertices);
-        
-        //obtain the cross-iteration duration info
-        int doubleRange=0;
-        vector<string>vec_double(&vec[location],&vec[location+2*maxLen]);
-        pair<map<int,int>,map<int,int>>pairs =cross_itr_durations(vec_double, location,maxLen,doubleRange);
-        Table_r2d = pairs.first;
-        Table_d2r = pairs.second;
-        
-        //update ptrPool
-        cudaMalloc(&ptrPool,offset); //poolSize or memory foot print  offset.
-        cout<<"ptrPool is: "<<ptrPool<<endl;
-
-        //b.  below 2 loops: vec_r2Ver to replace Table_r2Ver
-        for (int i=0; i<idxRange; i++){
-            lookUpElement tempElement;
-            Vec_r2Ver.push_back(make_pair(i,tempElement));
-        }
-        for (int i=0; i<vertices.size(); i++){
-            lookUpElement temp;
-            temp.r_idx =vertices[i].r;
-            temp.d_idx =Table_r2d.find(vertices[i].r)->second;
-            temp.size =vertices[i].size;
-            temp.offset=vertices[i].colorRange.first;
-            temp.ptr = (void*)((char*)ptrPool+temp.offset*sizeof(char));
-            temp.Occupied =0;
-            temp.crossItr = vertices[i].crossItr;
-            temp.Occupied_backup =0; 
-            //build tables for lookup.
-            Vec_r2Ver[vertices[i].r].second= temp;
-        }
-    }
+    //verify that the coloring has no overlaps; for verification purposes only.
+    //OverlapVerification(vertices);
     
-    if(mallocFlag==0){
-        ///  2. if flag=0, malloc/cudaMalloc
-        cudaMalloc(ptr, size);
-        allocatedPtr = *ptr;
-        //update load
-        if(loadLogFlag==1){
-            if (gc>0){
-                Table_load[gc]=make_pair(Table_load.find(gc-1)->second.first+size,Table_load.find(gc-1)->second.second);
-            }else{ //very first block
-                Table_load[gc]=make_pair(size,0);
-            }
-        }
-        //push_back the string for later test and run.
-        string tempStr1 ="Malloc ";
-        stringstream strm2;
-        strm2<<allocatedPtr;
-        string tempStr2 = strm2.str();
-        stringstream strm3;
-        strm3<<size;
-        string tempStr3 = strm3.str();
-        string temp = tempStr1+tempStr2+" "+tempStr3;
-        vec.push_back(temp);
-        //file<<" Condition M1, addr: "<<*ptr<<endl;  //a.
-    }else{
-        /// 3. if flag=1, look up table.
-        int lookupIdx = (gc-location)%maxLen;
-        if ((Vec_r2Ver[lookupIdx].second.size ==size)&&(Vec_r2Ver[lookupIdx].second.Occupied*Vec_r2Ver[lookupIdx].second.Occupied_backup==0)){
-             if (Vec_r2Ver[lookupIdx].second.Occupied==0){
-                //condition M2, normal and crossItr's primary.
-                //assign ptr and mark as occupied, and add in ptr2rIdx
-                allocatedPtr = Vec_r2Ver[lookupIdx].second.ptr;
-                Vec_r2Ver[lookupIdx].second.Occupied= 1;
-                Table_p2r[allocatedPtr]=lookupIdx;                
-                //update load
-                if(loadLogFlag==1){
-                  Table_load[gc]=make_pair(Table_load.find(gc-1)->second.first,Table_load.find(gc-1)->second.second+size);
-                }
-                //file<<" Condition M2, addr: "<<*ptr<<endl;  //a.
-              }else if ((Vec_r2Ver[lookupIdx].second.crossItr==1) && (Vec_r2Ver[lookupIdx].second.Occupied==1) && (Vec_r2Ver[lookupIdx].second.Occupied_backup ==0)) {
-                //condition M4, crossItr's backup
-                allocatedPtr = (void*)((char*)Vec_r2Ver[lookupIdx].second.ptr+offsetCrossItr*sizeof(char));
-                Vec_r2Ver[lookupIdx].second.Occupied_backup=1;
-                Table_p2r[allocatedPtr]=lookupIdx;
-                //update load
-                if(loadLogFlag==1){
-                  Table_load[gc]=make_pair(Table_load.find(gc-1)->second.first,Table_load.find(gc-1)->second.second+size);
-                }
-                //file<<" Condition M4, addr: "<<*ptr<<endl;  //a.
-              }
-        }else{  //condition M3, size not proper or both occupied.
-                cudaMalloc(ptr, size);
-                allocatedPtr = *ptr;       
-                //update load
-                if(loadLogFlag==1){
-                  Table_load[gc]=make_pair(Table_load.find(gc-1)->second.first+size,Table_load.find(gc-1)->second.second);
-                }
-                //file<<" Condition M3, addr: "<<*ptr<<endl;  //a.
-        } 
-    } //end of loop for flag=1
+    //obtain the cross-iteration duration info
+    int double_range=0;
+    vector<string>vec_double(&vec[location_2nd_iteration],&vec[location_2nd_iteration+2*iteration_length]);
+    pair<map<int,int>,map<int,int>>pairs =GetCrossIterationBlocks(vec_double,location_2nd_iteration,iteration_length,double_range);
+    table_ridx_to_didx = pairs.first;
+    table_didx_to_ridx = pairs.second;
     
-    ///4. test repeated sequence every 100 blocks, update globeCounter.
-    if (((gc+1)%300==0) && (mallocFlag==0) && (globeCounter==-1)&&(gc+2>checkPoint)){
-        cout<<"gc and GC before test: "<<gc<<' '<<globeCounter<<endl;
-        globeCounter = test(vec,maxLen,location);
-        checkPoint=checkPoint*2;
+    //make pool
+    cudaMalloc(&ptr_pool,offset); //poolSize or memory foot print  offset.
+
+    //make vec_block_meta for lookup purpose after pool is constructed
+    for (int i=0; i<idx_range; i++){
+        PoolBlockMeta tempElement;
+        vec_block_meta.push_back(make_pair(i,tempElement));
     }
-    
-    ///get load info, when gc == GC+2maxLen
-    if (gc==(globeCounter+2*maxLen)&& (globeCounter>0)){
-        getMaxLoad();
-        loadLogFlag=0;
+    for (int i=0; i<vertices.size(); i++){
+        PoolBlockMeta temp;
+        temp.r_idx =vertices[i].r;
+        temp.d_idx =table_ridx_to_didx.find(vertices[i].r)->second;
+        temp.size =vertices[i].size;
+        temp.offset=vertices[i].color_range.first;
+        temp.ptr = (void*)((char*)ptr_pool+temp.offset*sizeof(char));
+        temp.occupied =0;
+        temp.cross_iteration = vertices[i].cross_iteration;
+        temp.occupied_backup =0; 
+        //build tables for lookup.
+        vec_block_meta[vertices[i].r].second= temp;
     }
-    
-    gc++;
-    Table_p2s[allocatedPtr]=size; //update it for load tracking purpose.
-    *ptr = allocatedPtr;
-    ///update block_RWMF
-    string tempStr1 ="Malloc ";
+  }
+  ///  2. if flag=0, malloc/cudaMalloc, accumulate vec_info at the beginning iterations.
+  if(malloc_flag ==0){    
+    cudaMalloc(ptr, size);
+    allocated_ptr = *ptr;
+    //update load
+    if(load_flag==1){
+      if (global_index>0){
+        table_load[global_index]=make_pair(table_load.find(global_index-1)->second.first+size,table_load.find(global_index-1)->second.second);
+      }else{ //very first block
+        table_load[global_index]=make_pair(size,0);
+      }
+    }
+    //push_back the string for later test and run.
+    string temp_str1 ="Malloc ";
     stringstream strm2;
-    strm2<<allocatedPtr;
-    string tempStr2 = strm2.str();
+    strm2<<allocated_ptr;
+    string temp_str2 = strm2.str();
     stringstream strm3;
     strm3<<size;
-    string tempStr3 = strm3.str();
-    string temp = tempStr1+tempStr2+" "+tempStr3;
-    ///below time can be removed TODO(junzhe)
-    int64_t now = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
-    stringstream strm4;
-    strm4<<now;
-    string tempStr4 = strm4.str();
-    temp = temp+" "+tempStr4;
-    vec_block_RWMF.push_back(temp);
+    string temp_str3 = strm3.str();
+    string temp = temp_str1+temp_str2+" "+temp_str3;
+    vec.push_back(temp);
+  }else{
+
+    /// 3. if flag=1, look up table.
+    int lookup_idx = (global_index-location_2nd_iteration)%iteration_length;
+    if ((vec_block_meta[lookup_idx].second.size ==size)&&(vec_block_meta[lookup_idx].second.occupied*vec_block_meta[lookup_idx].second.occupied_backup==0)){
+     if (vec_block_meta[lookup_idx].second.occupied==0){
+        //condition: normal and cross_iteration's primary.
+        //assign ptr, mark as occupied, and record it in table_ptr_to_ridx
+        allocated_ptr = vec_block_meta[lookup_idx].second.ptr;
+        vec_block_meta[lookup_idx].second.occupied= 1;
+        table_ptr_to_ridx[allocated_ptr]=lookup_idx;                
+        //update load
+        if(load_flag==1){
+          table_load[global_index]=make_pair(table_load.find(global_index-1)->second.first,table_load.find(global_index-1)->second.second+size);
+        }
+      }else if ((vec_block_meta[lookup_idx].second.cross_iteration==1) && (vec_block_meta[lookup_idx].second.occupied==1) && (vec_block_meta[lookup_idx].second.occupied_backup ==0)) {
+        //condition: cross_iteration's backup
+        allocated_ptr = (void*)((char*)vec_block_meta[lookup_idx].second.ptr+offset_cross_iteration*sizeof(char));
+        vec_block_meta[lookup_idx].second.occupied_backup=1;
+        table_ptr_to_ridx[allocated_ptr]=lookup_idx;
+        //update load
+        if(load_flag==1){
+          table_load[global_index]=make_pair(table_load.find(global_index-1)->second.first,table_load.find(global_index-1)->second.second+size);
+        }
+      }
+    }else{  
+      //condition: size not proper or both occupied.
+      cudaMalloc(ptr, size);
+      allocated_ptr = *ptr;       
+      //update load
+      if(load_flag==1){
+        table_load[global_index]=make_pair(table_load.find(global_index-1)->second.first+size,table_load.find(global_index-1)->second.second);
+      }
+    } 
+  } //end of loop for flag=1
+    
+  ///4. test for a repeated sequence every 300 indices, update global_index_threshold.
+  if (((global_index+1)%300==0) && (malloc_flag ==0) && (global_index_threshold==-1)&&(global_index+2>check_point)){
+    global_index_threshold = Detection(vec,iteration_length,location_2nd_iteration);
+    check_point=check_point*2;
+  }
+    
+  ///get load info when global_index == global_index_threshold+2*iteration_length
+  if (global_index==(global_index_threshold+2*iteration_length)&& (global_index_threshold>0)){
+    GetMaxLoad();
+    load_flag=0;
+  }
+    
+  global_index++;
+  //update it for load tracking purpose.
+  table_ptr_to_size[allocated_ptr]=size; 
+ 
+  //update *ptr
+  *ptr = allocated_ptr;
+  
+  ///update vec_block_rw_mf
+  string temp_str1 ="Malloc ";
+  stringstream strm2;
+  strm2<<allocated_ptr;
+  string temp_str2 = strm2.str();
+  stringstream strm3;
+  strm3<<size;
+  string temp_str3 = strm3.str();
+  string temp = temp_str1+temp_str2+" "+temp_str3;
+  int64_t now = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
+  stringstream strm4;
+  strm4<<now;
+  string temp_str4 = strm4.str();
+  temp = temp+" "+temp_str4;
+  vec_block_rw_mf.push_back(temp);
 }
 
 ///Free
 void SmartMemPool::Free(void* ptr){
     
-    //fstream file("memInfo.text", ios::in|ios::out|ios::app); //a.
-    //file<<gc<<' '<<"Free"; //a.
-    
-    size_t deallocatedSize = Table_p2s.find(ptr)->second;
+  size_t deallocatedSize = table_ptr_to_size.find(ptr)->second;
+  
+  /// during the beginning iterations: free via cudaFree and accumulate opt info.
+  if ((global_index_threshold==-1)||(global_index<global_index_threshold)){
+    //push_back the string for later test and run.
+    string temp_str1 ="Free ";
+    stringstream strm2;
+    strm2<<ptr;
+    string temp_str2 = strm2.str();
+    string temp = temp_str1+temp_str2;
+    vec.push_back(temp);
     
-    if ((globeCounter==-1)||(gc<globeCounter)){
-        //push_back the string for later test and run.
-        string tempStr1 ="Free ";
-        stringstream strm2;
-        strm2<<ptr;
-        string tempStr2 = strm2.str();
-        string temp = tempStr1+tempStr2;
-        vec.push_back(temp);
-        
-        //file<<" Condition F1, addr: "<<ptr<<endl;  //a.
-        //update load before free
-        if(loadLogFlag==1){
-            Table_load[gc]=make_pair(Table_load.find(gc-1)->second.first-deallocatedSize,Table_load.find(gc-1)->second.second);
-        }
-        /// before flag switch, for sure all free shall be done by free()
-        cudaFree(ptr);
-    }else{
-        if (!(Table_p2r.find(ptr)==Table_p2r.end())){
-            int resp_rIdx = Table_p2r.find(ptr)->second;
-            Table_p2r.erase(ptr);
-            
-            if (ptr == Vec_r2Ver[resp_rIdx].second.ptr){
-              //Condition F2, from M2
-              Vec_r2Ver[resp_rIdx].second.Occupied =0; //freed, able to allocate again.
-              //file<<" Condition F2, addr: "<<ptr<<endl;  //a.
-            }else if (ptr == (void*)((char*)Vec_r2Ver[resp_rIdx].second.ptr+offsetCrossItr*sizeof(char))){
-              //Condition F4, from M4
-              Vec_r2Ver[resp_rIdx].second.Occupied_backup =0;
-              //file<<" Condition F4, addr: "<<ptr<<endl;  //a.
-            } else{
-              //Condition F5, from M2, M4 but idx switched.
-              if (((float)((char*)ptr-((char*)ptrPool+offsetCrossItr*sizeof(char)))>0) && ((float)((char*)ptr-((char*)ptrPool+2*offsetCrossItr*sizeof(char)))<0)){
-                     Vec_r2Ver[resp_rIdx].second.Occupied_backup =0;
-              }else{
-                     Vec_r2Ver[resp_rIdx].second.Occupied =0;
-              }
-            }
-            //update load
-             if(loadLogFlag==1){
-                 Table_load[gc]=make_pair(Table_load.find(gc-1)->second.first,Table_load.find(gc-1)->second.second-deallocatedSize);
-             }
+    //update load before free
+    if(load_flag==1){
+      table_load[global_index]=make_pair(table_load.find(global_index-1)->second.first-deallocatedSize,table_load.find(global_index-1)->second.second);
+    }
+    // before the flag switch, every free is done via cudaFree()
+    cudaFree(ptr);
+  }else{
+    /// cases that do not need to accumulate opt info
+
+    /// free a ptr that is in the memory pool
+    if (!(table_ptr_to_ridx.find(ptr)==table_ptr_to_ridx.end())){
+      int resp_rIdx = table_ptr_to_ridx.find(ptr)->second;
+      table_ptr_to_ridx.erase(ptr);
+      
+      if (ptr == vec_block_meta[resp_rIdx].second.ptr){
+        vec_block_meta[resp_rIdx].second.occupied =0; //freed, able to allocate again.
+      }else if (ptr == (void*)((char*)vec_block_meta[resp_rIdx].second.ptr+offset_cross_iteration*sizeof(char))){
+        vec_block_meta[resp_rIdx].second.occupied_backup =0;
+      } else{
+        if (((float)((char*)ptr-((char*)ptr_pool+offset_cross_iteration*sizeof(char)))>0) && ((float)((char*)ptr-((char*)ptr_pool+2*offset_cross_iteration*sizeof(char)))<0)){
+          vec_block_meta[resp_rIdx].second.occupied_backup =0;
         }else{
-          //update load
-          if(loadLogFlag==1){
-              Table_load[gc]=make_pair(Table_load.find(gc-1)->second.first-deallocatedSize,Table_load.find(gc-1)->second.second);
-          }
-          //file<<" Condition F3, addr: "<<ptr<<endl;  //a.
-          cudaFree(ptr);
+          vec_block_meta[resp_rIdx].second.occupied =0;
         }
-            
+      }
+      //update load
+       if(load_flag==1){
+           table_load[global_index]=make_pair(table_load.find(global_index-1)->second.first,table_load.find(global_index-1)->second.second-deallocatedSize);
+       }
+    }else{
+      /// free a ptr that is NOT in the memory pool
+      
+      //update load
+      if(load_flag==1){
+          table_load[global_index]=make_pair(table_load.find(global_index-1)->second.first-deallocatedSize,table_load.find(global_index-1)->second.second);
+      }
+      cudaFree(ptr);
     }
-    gc++;
-    ///update block_RWMF
-    string tempStr1 ="Free ";
-    stringstream strm2;
-    strm2<<ptr;
-    string tempStr2 = strm2.str();
-    string temp = tempStr1+tempStr2;
-    ///below time can be removed TODO(junzhe)
-    int64_t now = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
-    stringstream strm4;
-    strm4<<now;
-    string tempStr4 = strm4.str();
-    temp = temp+" "+tempStr4;
-    vec_block_RWMF.push_back(temp);
+            
+  }
+
+  global_index++;
+
+  ///update vec_block_rw_mf
+  string temp_str1 ="Free ";
+  stringstream strm2;
+  strm2<<ptr;
+  string temp_str2 = strm2.str();
+  string temp = temp_str1+temp_str2;
+  int64_t now = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
+  stringstream strm4;
+  strm4<<now;
+  string temp_str4 = strm4.str();
+  temp = temp+" "+temp_str4;
+  vec_block_rw_mf.push_back(temp);
 }//end of Free.
 
 
 SmartMemPool::~SmartMemPool(){
-    fstream file_block1("blockInfo_RW.text", ios::in|ios::out|ios::app);
-    fstream file_block2("blockInfo_RWMF.text", ios::in|ios::out|ios::app);
-    for (int i=0; i< vec_block_RW.size();i++){
-        file_block1<<vec_block_RW[i]<<endl;
-    }
-    for (int i=0; i< vec_block_RWMF.size();i++){
-        file_block2<<vec_block_RWMF[i]<<endl;
-    }
-    cudaFree(ptrPool);
-    //TODO(junzhe) verify what else shall be cleaned up.
+
+  fstream file_block1("blockInfo_RW.text", ios::in|ios::out|ios::app);
+  fstream file_block2("blockInfo_RWMF.text", ios::in|ios::out|ios::app);
+  for (int i=0; i< vec_block_rw.size();i++){
+    file_block1<<vec_block_rw[i]<<endl;
+  }
+  for (int i=0; i< vec_block_rw_mf.size();i++){
+    file_block2<<vec_block_rw_mf[i]<<endl;
+  }
+  cudaFree(ptr_pool);
+
 }
 
-void SmartMemPool::getMaxLoad(){
+void SmartMemPool::GetMaxLoad(){
     
-    vector<size_t>cudaLoadLog;
-    for (int i=0; i<Table_load.size();i++){
-        cudaLoadLog.push_back(Table_load.find(i)->second.first);
-    }
-    size_t maxCudaLoad = *max_element(cudaLoadLog.begin(),cudaLoadLog.end());
-    int idxMaxCudaLoad = static_cast<int>(distance(cudaLoadLog.begin(),max_element(cudaLoadLog.begin(),cudaLoadLog.end())));
-    
-    vector<size_t>colorLoadLog;
-    for (int i=0; i<Table_load.size();i++){
-        colorLoadLog.push_back(Table_load.find(i)->second.second);
-    }
-    size_t maxColorLoad = *max_element(colorLoadLog.begin(),colorLoadLog.end());
-    int idxMaxColorLoad = static_cast<int>(distance(colorLoadLog.begin(),max_element(colorLoadLog.begin(),colorLoadLog.end())));
-    size_t offsetCudaLoad = Table_load.find(idxMaxColorLoad)->second.first;
-    
-    maxTotalLoad = max(maxCudaLoad,maxColorLoad+offsetCudaLoad);
-    maxMemUsage = max(maxCudaLoad,offset+offsetCudaLoad);
-    memRatio = (float)maxMemUsage/(float)maxTotalLoad;
-    
-    cout<<"=============================memory usage stats print: ================================"<<endl;
-    cout<<"maxColorLoad vs memPoolSize: (at idx "<<idxMaxColorLoad<<")"<<endl;
-    cout<<maxColorLoad<<endl;
-    cout<<offset<<endl;
-    cout<<"maxTotalLoad vs maxCudaLoad(at idx "<<idxMaxCudaLoad<<") maxMemUsage"<<endl;
-    cout<<maxTotalLoad<<endl;
-    cout<<maxCudaLoad<<endl;
-    cout<<maxMemUsage<<endl;
-    cout<<"memRatio: "<<memRatio<<endl;   
+  vector<size_t>vec_load_log;
+  for (int i=0; i<table_load.size();i++){
+      vec_load_log.push_back(table_load.find(i)->second.first);
+  }
+  size_t max_cuda_load = *max_element(vec_load_log.begin(),vec_load_log.end());
+  int idx_max_cuda_load = static_cast<int>(distance(vec_load_log.begin(),max_element(vec_load_log.begin(),vec_load_log.end())));
+  
+  vector<size_t>vec_color_load;
+  for (int i=0; i<table_load.size();i++){
+      vec_color_load.push_back(table_load.find(i)->second.second);
+  }
+  size_t max_color_load = *max_element(vec_color_load.begin(),vec_color_load.end());
+  int idx_max_color_load = static_cast<int>(distance(vec_color_load.begin(),max_element(vec_color_load.begin(),vec_color_load.end())));
+  size_t offset_color_load = table_load.find(idx_max_color_load)->second.first;
+  
+  max_total_load = max(max_cuda_load,max_color_load+offset_color_load);
+  max_mem_usage = max(max_cuda_load,offset+offset_color_load);
+  
 }
 
 std::pair<size_t, size_t> SmartMemPool::GetMemUsage() {
-  //TODO(junzhe) note here the pair is different from that of CnMemPool.
-  return std::make_pair(maxMemUsage, maxTotalLoad);
+  //note here the pair is different from that of CnMemPool.
+  return std::make_pair(max_mem_usage, max_total_load);
 }
     
 void SmartMemPool::Append(string blockInfo) {
-     //TODO(junzhe) add idx later
-    vec_block_RW.push_back(blockInfo);
-    vec_block_RWMF.push_back(blockInfo);
+  vec_block_rw.push_back(blockInfo);
+  vec_block_rw_mf.push_back(blockInfo);
 }
 
 ///SwapPool
@@ -992,121 +935,86 @@ void SwapPool::Init(){
 
 
 void SwapPool::PoolOpt(vector<string> &vec_mf) {
-  //TODO(junzhe) redo 9/17
-
-  ///process vec_mf of 3itr into blocks,maxLen
-  //assume format of string: MF ptr size;
-  //onePieceMsg_pool verified
-  // for (int i = 0; i< vec_mf.size();i++){
-  //   cout<<"print mf "<<i<<' '<<vec_mf[i]<<endl;
-  // }
-  // cout<<"===================print done"<<endl;
-  vector<onePieceMsg_pool>onePieceMsg_poolVec_;
-  maxLen_mf = vec_mf.size()/3;
-  cout<<"maxLen_mf "<<maxLen_mf<<endl;
-  cout<<"len of vec_mf "<<vec_mf.size()<<endl;
+
+  vector<PoolOptInfo>vec_pool_opt_info;
+  iteration_length_mf = vec_mf.size()/3; //cos input vec_mf is of 3 iteration 
+
+  //convert raw opt info into struct: PoolOptInfo
   for (int i = 0;i < vec_mf.size();i++){
-    vector<string> v = split(vec_mf[i], " ");
-    // cout<<"print mf "<<i<<' '<<vec_mf[i]<<endl;
-    // cout<<"||"<<v[0]<<"||"<<v[1]<<"||"<<endl;
+    vector<string> v = SplitString(vec_mf[i], " ");
 
     if (v[0]=="Malloc"){
-      //convert v[2] from str to size_t
       size_t result;
       stringstream convert(v[2]);
-      // cout<<"1"<<endl;
       if (!(convert>>result)){
-        result =-1;
+        result = -1;
         cout<<"error for converting size from str to int."<<endl;
       }
-      // cout<<"2"<<endl;
-      onePieceMsg_pool tempMsg(v[1],result, 1, i-maxLen_mf);
-      // cout<<"3"<<endl;
-      onePieceMsg_poolVec_.push_back(tempMsg);
-      // cout<<"4"<<endl;
+      PoolOptInfo tempMsg(v[1],result, 1, i-iteration_length_mf);
+      vec_pool_opt_info.push_back(tempMsg);
     }else if (v[0]=="Free"){
-      // cout<<"1"<<endl;
-      onePieceMsg_pool tempMsg(v[1],-1, -1, i-maxLen_mf);
-      // cout<<"2"<<endl;
-      onePieceMsg_poolVec_.push_back(tempMsg);
-      // cout<<"3"<<endl;
+      PoolOptInfo tempMsg(v[1],-1, -1, i-iteration_length_mf);
+      vec_pool_opt_info.push_back(tempMsg);
     }else {
       cout<<"error for process the onePriceMsg."<<endl;
-      // cout<<i<<" "<<v[0]<<"||"<<vec_mf[i]<<endl;
     }
   }
-  sort(onePieceMsg_poolVec_.begin(),onePieceMsg_poolVec_.end(),less_than_ptrIdx());
-  // cout<<"===================print done(2nd loop)"<<endl;
-  // for (int i = 0;i < vec_mf.size();i++){
-
+  //sort by ptr and then idx
+  sort(vec_pool_opt_info.begin(),vec_pool_opt_info.end(),sort_by_ptr_idx_ascending());
   
-  //pair
-  vector<onePairMsg>pairMsgVec_;
+  //convert into block lifetime
+  vector<PoolBlockLifeTime>vec_block_life_time;
   int i = 0;
-  // cout<<"before while loop"<<endl;
 
-  while (i<(onePieceMsg_poolVec_.size()-1)){
+  while (i<(vec_pool_opt_info.size()-1)){
     
-    // cout<<i<<" before 1st if"<<endl;
-    if (onePieceMsg_poolVec_[i].MallocFree==-1){
-      //condition A: start with free. do nothing.
+    if (vec_pool_opt_info[i].operation_type==-1){
+      //condition: start with free. do nothing.
       i+=1;
-      // cout<<i<<" condition A"<<endl;
     } else {
-        if ((onePieceMsg_poolVec_[i].MallocFree==1)&& (onePieceMsg_poolVec_[i+1].MallocFree==-1)
-          &&((onePieceMsg_poolVec_[i].ptr==onePieceMsg_poolVec_[i+1].ptr))){
-          //condition B: start with Malloc, next item same ptr and is free.
-          if ((onePieceMsg_poolVec_[i].idx >=0 && onePieceMsg_poolVec_[i].idx <maxLen_mf)
-            ||(onePieceMsg_poolVec_[i+1].idx >=0 && onePieceMsg_poolVec_[i+1].idx <maxLen_mf)){
-            //condition B1: at least one of the index in range [0,maxLen_mf]
-            onePairMsg tempPair(onePieceMsg_poolVec_[i].idx,onePieceMsg_poolVec_[i].size,onePieceMsg_poolVec_[i].idx,onePieceMsg_poolVec_[i+1].idx);
-            pairMsgVec_.push_back(tempPair); 
-            // cout<<i<<" condition B"<<endl;
-          }
-          i+=2; //no matter in the middle iteration or not, plus 2.
-        } else {
-          //condiction C: not one pair, single one.
-          i+=1;
-          // cout<<" condiction C"<<endl;
-          //       if ((onePieceMsg_poolVec_[i].MallocFree==1)&&(onePieceMsg_poolVec_[i].ptr!=onePieceMsg_poolVec_[i+1].ptr)){
-          //     // onePairMsg tempPair(onePieceMsg_poolVec_[i].idx,onePieceMsg_poolVec_[i].size,onePieceMsg_poolVec_[i].idx,idxRange);
-          //     // onePairMsgVec_2.push_back(tempPair);
-          //     i+=1;
-          //     cout<<i<<" condition C"<<endl;
-          // }
+      if ((vec_pool_opt_info[i].operation_type==1)&& (vec_pool_opt_info[i+1].operation_type==-1)
+        &&((vec_pool_opt_info[i].ptr==vec_pool_opt_info[i+1].ptr))){
+        //condition: start with Malloc, next item same ptr and is free.
+        if ((vec_pool_opt_info[i].idx >=0 && vec_pool_opt_info[i].idx <iteration_length_mf)
+          ||(vec_pool_opt_info[i+1].idx >=0 && vec_pool_opt_info[i+1].idx <iteration_length_mf)){
+          //condition: at least one of the indices is in range [0,iteration_length_mf)
+          PoolBlockLifeTime temp_block_life_time(vec_pool_opt_info[i].idx,vec_pool_opt_info[i].size,vec_pool_opt_info[i].idx,vec_pool_opt_info[i+1].idx);
+          vec_block_life_time.push_back(temp_block_life_time); 
         }
+        i+=2; //no matter in the middle iteration or not, plus 2.
+      } else {
+        //condition: not a pair; Malloc-only block, no Free.
+        i+=1;
+      }
     } 
   }
-  sort(pairMsgVec_.begin(),pairMsgVec_.end(),less_than_size_rIdx());
-  // cout<<"after while loop"<<endl;
-  cout<<"size of pairs "<<pairMsgVec_.size()<<endl;
+  sort(vec_block_life_time.begin(),vec_block_life_time.end(),sort_by_size_r_idx_descending());
 
 
   ///get E, V of the blocks， coloring
   //V
-  int m = static_cast<int>(pairMsgVec_.size());
+  int m = static_cast<int>(vec_block_life_time.size());
   vector<Vertex>vertices;
   for (int i=0; i<m;i++){
-    Vertex tempVertex(pairMsgVec_[i].name,pairMsgVec_[i].size,pairMsgVec_[i].r_idx,pairMsgVec_[i].d_idx);
-    vertices.push_back(tempVertex);
+    Vertex temp_vertex(vec_block_life_time[i].name,vec_block_life_time[i].size,vec_block_life_time[i].r_idx,vec_block_life_time[i].d_idx);
+    vertices.push_back(temp_vertex);
   }
 
-  //E and coloring
-  
+  //E and coloring  
   int offset = 0;
   int **adj;
-  adj = new int*[m]; //TODO(junzhe) should be deleted somewhere.
-  // build edges with values 1 and 0; combine with mergeSeg and FFallocation in the loop.
+  adj = new int*[m];
+
+  // build edges with values 1 and 0; combine with MergeColoredSegments and BestFitAllocation in the loop.
   for (int i=0; i<m;i++){
     adj[i] = new int[m];
     for (int j=0; j<m;j++){
       if ((max(vertices[i].r,vertices[j].r))<(min(vertices[i].d,vertices[j].d))
         || (min(vertices[i].d,vertices[j].d)<0 && 
-        min(vertices[i].d,vertices[j].d)+2*maxLen_mf< max(vertices[i].r,vertices[j].r))){
-        //TODO(junzhe) verify after ||
+        min(vertices[i].d,vertices[j].d)+2*iteration_length_mf< max(vertices[i].r,vertices[j].r))){
         adj[i][j]=1;
-        if (vertices[j].colorRange.second){ //as second never be 0, if not empty.
-          vertices[i].colorOccupied.push_back(vertices[j].colorRange);
+        if (vertices[j].color_range.second){ //as second never be 0, if not empty.
+          vertices[i].vec_color_preoccupied.push_back(vertices[j].color_range);
         }
       }
       else { 
@@ -1114,98 +1022,93 @@ void SwapPool::PoolOpt(vector<string> &vec_mf) {
       }
     }
     
-    vector<pair<size_t,size_t>>colorMerged = mergeSeg(vertices[i].colorOccupied);
+    vector<pair<size_t,size_t>>vec_color_merged = MergeColoredSegments(vertices[i].vec_color_preoccupied);
 
-    // vertices[i].colorRange = FFallocation(colorMerged,vertices[i].size, local_offset);
-    vertices[i].colorRange = BFallocation(colorMerged,vertices[i].size, offset);
+    // vertices[i].color_range = FirstFitAllocation(vec_color_merged,vertices[i].size, local_offset);
+    vertices[i].color_range = BestFitAllocation(vec_color_merged,vertices[i].size, offset);
 
     //update of offset, largest memory footprint as well.
-    if (vertices[i].colorRange.second >=offset){
-      offset = vertices[i].colorRange.second+1;
+    if (vertices[i].color_range.second >=offset){
+      offset = vertices[i].color_range.second+1;
     }
   }//end of for loop.
-  cout<<"offset is "<<offset<<endl;
-  cout<<"===================to print vertices"<<endl;
-  cout<<"a, d, size, color["<<endl;
+
+  //delete adj, the edges
   for (int i=0; i<m;i++){
-    cout<<"("<<vertices[i].r<<","<<vertices[i].d<<") "<<vertices[i].size<<" "<<vertices[i].colorRange.first<<endl;
+    delete[] adj[i]; 
   }
-  cout<<"===================to print vertices done"<<endl;
+  delete[] adj;
 
-  ///Make pool, and make table
-  cudaMalloc(&ptrPool,offset); //poolSize or memory foot print  offset.
-  cout<<"ptrPool is: "<<ptrPool<<endl;
+  //make pool
+  cudaMalloc(&ptr_pool,offset); //poolSize or memory foot print  offset.
 
+  //make table
   for (int i=0; i<vertices.size();i++){
-    lookUpElement temp;
-    temp.r_idx = vertices[i].r;
-    temp.d_idx = vertices[i].d;
-    temp.size = vertices[i].size;
-    temp.offset = vertices[i].colorRange.first;
-    temp.ptr = (void*)((char*)ptrPool+temp.offset*sizeof(char));
-    temp.Occupied = 0;
-    Table_r2v[vertices[i].r] = temp;
+    PoolBlockMeta itm;
+    itm.r_idx = vertices[i].r;
+    itm.d_idx = vertices[i].d;
+    itm.size = vertices[i].size;
+    itm.offset = vertices[i].color_range.first;
+    itm.ptr = (void*)((char*)ptr_pool+itm.offset*sizeof(char));
+    itm.occupied = 0;
+    table_pool_meta[vertices[i].r] = itm;
   }
-  cout<<"===================Table_r2v done"<<endl;
-  poolFlag = 1;
+  pool_flag = 1;
     
 }
 
 void SwapPool::Malloc(void** ptr, const size_t size){
   
-  void* allocatedPtr =nullptr;
+  void* allocated_ptr =nullptr;
   
-  if (poolFlag == 0) {
+  if (pool_flag == 0) {
     cudaError_t status = cudaMalloc(ptr, size);
     CHECK_EQ(status, cudaError_t::cudaSuccess);
   } else {
-    //POOLFLAG = 1 
-    if (pc < maxLen_mf){
-      if ((Table_r2v.find(pc-maxLen_mf) == Table_r2v.end()) || (!(size == Table_r2v.find(pc-maxLen_mf)->second.size))){
+    //pool_flag = 1 
+    if (pool_index < iteration_length_mf){
+      if ((table_pool_meta.find(pool_index - iteration_length_mf) == table_pool_meta.end()) || (!(size == table_pool_meta.find(pool_index - iteration_length_mf)->second.size))){
         //not in table of negative r_idx
         cudaError_t status = cudaMalloc(ptr, size);
         CHECK_EQ(status, cudaError_t::cudaSuccess);
       } else{
         //in the table of negative r_idx
-        auto tempMeta = Table_r2v.find(pc-maxLen_mf)->second;
-        allocatedPtr = tempMeta.ptr;
-        *ptr = allocatedPtr;
-        Table_p2r[allocatedPtr]=pc-maxLen_mf; 
+        auto temp_meta = table_pool_meta.find(pool_index - iteration_length_mf)->second;
+        allocated_ptr = temp_meta.ptr;
+        *ptr = allocated_ptr;
+        table_ptr_to_ridx[allocated_ptr]=pool_index - iteration_length_mf; 
 
       }
     } else{
-      //8 9 10
-      int r_pc = pc%maxLen_mf;
-      if ((Table_r2v.find(r_pc) == Table_r2v.end()) || (!(size == Table_r2v.find(r_pc)->second.size))){
+      //later iterations (e.g. 8th, 9th, 10th): wrap pool_index into a single iteration's range
+      int r_pool_index = pool_index%iteration_length_mf;
+      if ((table_pool_meta.find(r_pool_index) == table_pool_meta.end()) || (!(size == table_pool_meta.find(r_pool_index)->second.size))){
         //not here, should be abnormal
         cudaError_t status = cudaMalloc(ptr, size);
         CHECK_EQ(status, cudaError_t::cudaSuccess);
       } else{
         //in the table
-        auto tempMeta = Table_r2v.find(r_pc)->second;
-        allocatedPtr = tempMeta.ptr;
-        *ptr = allocatedPtr;
-        Table_p2r[allocatedPtr]=r_pc; 
-
+        auto temp_meta = table_pool_meta.find(r_pool_index)->second;
+        allocated_ptr = temp_meta.ptr;
+        *ptr = allocated_ptr;
+        table_ptr_to_ridx[allocated_ptr]=r_pool_index; 
       }
     }
   }
 
-
-    pc++;     
+    pool_index++;     
   }
 
 
 void SwapPool::Free(void *ptr) {
-  if (poolFlag == 0){
+  if (pool_flag == 0){
     cudaError_t status = cudaFree(ptr);
     CHECK_EQ(status, cudaError_t::cudaSuccess);
   } else{
-    if (Table_p2r.find(ptr)==Table_p2r.end()){
+    if (table_ptr_to_ridx.find(ptr)==table_ptr_to_ridx.end()){
       cudaError_t status = cudaFree(ptr);
       CHECK_EQ(status, cudaError_t::cudaSuccess);
     }
-    
   }
 
 }
@@ -1215,15 +1118,7 @@ void SwapPool::Append(string blockInfo) {
 }
 
 
-void SwapPool::SwapOut(void* data_){
-  //NA
-}
-
-void SwapPool::SwapIn(void* data_){
-  //NA 
-}
-
-void getMaxLoad (){
+void GetMaxLoad (){
   //empty
 }