From 0f3722d30c5942ef4b0b484bee80bf3833524e02 Mon Sep 17 00:00:00 2001 From: junzhezhang Date: Wed, 31 Oct 2018 14:08:53 +0800 Subject: [PATCH] add documentation --- .DS_Store | Bin 8196 -> 0 bytes CMakeLists.txt | 2 +- examples/cifar10/{alexnet.py => cnn.py} | 0 examples/cifar10/train.py | 2 +- include/singa/core/common.h | 6 +- include/singa/core/device.h | 215 ++-- include/singa/core/memory.h | 209 ++- src/.DS_Store | Bin 8196 -> 0 bytes src/core/.DS_Store | Bin 6148 -> 0 bytes src/core/common/common.cc | 51 +- src/core/device/cuda_gpu.cc | 9 +- src/core/device/device.cc | 27 +- src/core/device/swap_gpu.cc | 1487 +++++++++------------- src/core/memory/memory.cc | 1543 +++++++++++------------ 14 files changed, 1562 insertions(+), 1989 deletions(-) delete mode 100644 .DS_Store rename examples/cifar10/{alexnet.py => cnn.py} (100%) delete mode 100644 src/.DS_Store delete mode 100644 src/core/.DS_Store diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index e74703a65a58eddc3bf6ead31e5ced542081bb7d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8196 zcmeHMTWl0n82=R4;= z|37E{?|d^SXBGgkrJ&aXQ~>~^OCVoP)isJ37uSLk2`r_ANb%t2GM1B}-iMjo0`Jfe zG7vHlG7vHlG7vKGe_(+2Y*E-E`@S3v>yUwvfg6$m@qS3qB`_7>q(uMKL6v_5AQVRc zzfhZt24Nz=RDhEbX@m;OP(m4s!4U&xIN1|{Un;;!i8358IDB9*GX^IV^k=90CwzCn zl*F(O83-9z$bfi!O2CCYWHX~zpWiLh^^(c2Fj7*wWa%=dC`>71C%T87(QZ%k3T}&* zKj!yL-N*#(wBuMaZ7p-uG)4#c>Q=}0G}AWnSGgbWHErD)ZE*^=?)vvRqIts<>rXVu zqvPWZ^}Fj5^}8A->JsDi$sNhML}SCwi3!FkYwP!QoEjNBJAQ8B!53so7`zssd|rwd zX7jV$VypZ<5qUn7llgr;lb^ag)^V^?9ptNu^s~Eip6S@`Q6ujX4)SuB67R@5_L00H z#(T4lRT#0|L0*y7tZc#3JfqpNy3MnO+vgc)Jkc6(oxIoTJNB6rD{O5cxhWj-)YuLJJ5A_yJ z*UVT(+SW#l3LI0Esj{_|RnePcvBg~b3K{cs!Su#>4P#S$WqFxe9lKTGSJ-i#e#K!& z7c=SpNUS&h86O)s)*ua3p~ zg}qtV4V^JoliDa(+oz!>m6Sb&wvs4pqb=(mI1UEnU<{ssC*di$0I$MjxB_p&`|vS* z3ZKE}@GblZKf|x^JNyBE!e7Wx!R5FDqqrVxa04cB5AMY#+=ngLiHGoB?7}pjz>|0i zH5|emo<g1!o2x-7Wi{SIsxr1_#?V_`M-Vf@Bi0T&S5ek10e%*8NkxE?zR@PE%w$IXYB-Cr|1$z z_)SXmL#XnP>(1); } // Disabled as it is not used currently. @@ -90,7 +90,7 @@ class Block { void* data_ = nullptr; size_t size_ = 0; size_t offset_ = 0; - Device* ptrDevice_; + Device* ptr_device_; bool initialized_ = false; // Disabled as it is not used currently. // std::shared_ptr> ref_count_ = nullptr; diff --git a/include/singa/core/device.h b/include/singa/core/device.h index 96bc8e9c41..e9dcc1402d 100644 --- a/include/singa/core/device.h +++ b/include/singa/core/device.h @@ -66,10 +66,8 @@ class Device { /// Called by Tensor. void FreeBlock(Block* block); - void AppendInfo(string blockInfo); - void* GetRealGpuPtrInfo(const Block* block_); - void SwapOutInfo(const Block* block_); - void SwapInInfo(const Block* block_); + void AppendInfo(string block_info); + void* UpdateGpuPtrInfo(const Block* block_ptr); /// Return the size (bytes) of memory in use /// TODO(wangwei) override this function for all devices. @@ -108,7 +106,7 @@ class Device { int id() const { return id_; } - virtual void* GetRealGpuPtr(const Block* block_) = 0; + virtual void* UpdateGpuPtr(const Block* block_ptr) = 0; private: Device() {}; @@ -125,11 +123,8 @@ class Device { /// Free device memory. virtual void Free(void* ptr) = 0; - virtual void MakeMetaTable(Block* block,void* data_,int size) = 0; - virtual void Append(string blockInfo) = 0; - - virtual void SwapOut(const Block* block_) = 0; - virtual void SwapIn(const Block* block_) = 0; + virtual void AppendAfterMalloc(Block* block,void* data_ptr,int size) = 0; + virtual void Append(string block_info) = 0; protected: int id_ = 0; @@ -171,11 +166,10 @@ class CppCPU : public Device { /// Free cpu memory. void Free(void* ptr) override; - void MakeMetaTable(Block* block,void* data_,int size) override {} - void Append(string blockInfo) override {} - void* GetRealGpuPtr(const Block* block_) override {} - void SwapOut(const Block* block_) override {} - void SwapIn(const Block* block_) override {} + void AppendAfterMalloc(Block* block,void* data_ptr,int size) override {} + void Append(string block_info) override {} + void* UpdateGpuPtr(const Block* block_ptr) override {} + }; @@ -206,11 +200,9 @@ class CudaGPU : public Device { /// Free cpu memory. void Free(void* ptr) override; - void MakeMetaTable(Block* block,void* data_,int size) override {} - void Append(string blockInfo) override; - void* GetRealGpuPtr(const Block* block_) override; - void SwapOut(const Block* block_) override; - void SwapIn(const Block* block_) override; + void AppendAfterMalloc(Block* block,void* data_ptr,int size) override {} + void Append(string block_info) override; + void* UpdateGpuPtr(const Block* block_ptr) override; private: void Setup(); @@ -222,21 +214,21 @@ class CudaGPU : public Device { /// CudaCPU which uses cudaMallocHost to allocate pinned memory for host. ///SwapGPU -struct onePieceMsg{ +struct DeviceOptInfo{ /* - members: [ptr, size, MallocFree, idx] + members: [ptr, size, operation_type, idx] */ string ptr; size_t size; - int MallocFree; + int operation_type; int idx; double t; - onePieceMsg(string p, size_t s, int M, int i):ptr(p),size(s),MallocFree(M),idx(i){} + DeviceOptInfo(string p, size_t s, int M, int i):ptr(p),size(s),operation_type(M),idx(i){} }; struct BlockMeta{ /* - block Meta. + meta of swapping memory blocks */ Block* block_ = nullptr; void* data_ = nullptr; @@ -249,34 +241,39 @@ struct BlockMeta{ }; struct SwapBlock{ - + /* + meta of candidate blocks + */ string ptr; - string cat; //A1, A2, A3... + string cat; //sub category of the candidate blocks, read-read, write-read, etc. int name; size_t size; + //index of last read/write before swap out, and first read/write after swap in int r_idx; //out idx int d_idx; //in idx + //index of last read/write before swap out, and first read/write after swap in double r_time; // out time double d_time; //in time - double dt; //delta t: t2'-t1' - double pri; //look at here if big enough TODO(junzhe) - double dto; //t2-t1 - double wdto = 0; //t2-t1 weighted by swap_load - double r_idx_ready; //r_idx + buffer, could be set during selection. - //int free = -1; //when it is freed - //below as per planned. - int i1 = 0; - int i1p = 0; - int i2 = 0; - int i2p = 0; - double t1 = 0; - double t2 = 0; - double t1p = 0; - double t2p = 0; - SwapBlock(string p, size_t s, int i1, int i2, double t1, double t2): - ptr(p), size(s), r_idx(i1),d_idx(i2),r_time(t1), d_time(t2) {} + double DOA; //Duation of Absence + double AOA; //Area of Absence + double DOA_origin; //t2-t1, DOA without taking out time spent + double WDOA = 0; //weighted DOA + double majority_voting = 0; + int r_idx_ready; //r_idx + buffer + + //below are index and time for scheduling + int idx_out_start = 0; + int idx_out_end = 0; + int idx_in_end = 0; + int idx_in_start = 0; + double t_out_start = 0; + double t_out_end = 0; + double t_in_end = 0; + double t_in_start = 0; + SwapBlock(string p, size_t s, int idx_out_start, int idx_in_end, double t_out_start, double t_in_end): + ptr(p), size(s), r_idx(idx_out_start),d_idx(idx_in_end),r_time(t_out_start), d_time(t_in_end) {} }; -/// Device able to Swap memory between Nvidia GPU and Swap +/// Device able to Swap memory between Nvidia GPU and CPU class SwapGPU : public Device { public: ~SwapGPU(); @@ -300,98 +297,92 @@ class SwapGPU : public Device { /// Free cpu memory. void Free(void* ptr) override; - //Append at every index: malloc, free, read, mutable - void Append(string blockInfo) override; + //Append at every index: free, read, mutable + void Append(string block_info) override; - //append info after Malloc, pair. - void MakeMetaTable(Block* block,void* data_,int size) override; + //append info after Malloc, as Block* is not available till Malloc() done. + void AppendAfterMalloc(Block* block,void* data_ptr,int size) override; - //all the testing, without swap, during Append() - void Test_sched_switch_swap(); + //Detection and Plan + void DetectionPlan(); //test iteration, return GC - int swap_test(vectorvec_block,int &maxLen, int &location); + int Detection(vectorvec_block,int &iteration_length, int &location_of_2nd_iteration); - //entire plan, from swap_select() to swap_sched(), swap_deploy_tables() - void swap_plan(); + //entire plan, from SelectBlock() to Scheduling(), BuildMetaTables() + void Plan(); - //selection algo - vector swap_select(vectorvec_swap,vector tempLoad,double memLimit,string mode); + //block selection algo + vector SelectBlock(vectorvec_swap,vector temp_load,double mem_limit,string mode); //schedule algo - void swap_sched(vector&vec_swap_selct, vector&vec_load_temp,double &overhead,double memLimit,string mode); + void Scheduling(vector&vec_swap_selct, vector&vec_load_temp,double &overhead,double mem_limit,string mode); - //make tables Table_sched and Table_meta - void swap_construct_tables(vectorvec_swap_selct); + //make tables table_sched and table_meta + void BuildMetaTables(vectorvec_swap_selct); - //update Table_meta, during Append() - void swap_update_tables(Block* tempBlock_); + //update table_meta, during Append() + void UpdateMetaTables(Block* block_ptr); //swap/sync during Append() void DeploySwap(); //exec DelpoySwap - void DeploySwap_exec(int r_gc); - - + void DeploySwapExec(int relative_counter); //load profile as per synchronous swap. - vector swap_load_ideal(vectorvec_load,vector vec_swap_selct); + vector GetIdealLoad(vectorvec_load,vector vec_swap_selct); - //in case gpu ptr wrong. TODO(junzhe) to verify if needed. - void* GetRealGpuPtr(const Block* block_) override; + //in case gpu ptr wrong, updated it after swap_in ad hoc + void* UpdateGpuPtr(const Block* block_ptr) override; - void SwapOut(const Block* block_) override; - void SwapIn(const Block* block_) override; + //Swap Synchronous, for early iterations + void SwapOutSynchronous(const Block* block_ptr); + void SwapInSynchronous(const Block* block_ptr); - //changed to intake data_ instead - void SwapOut_idx(const int r_idx); - void SwapIn_idx(const int r_idx); + //Swap asynchronous, for middle iteraions + void SwapOut(const int idx); + void SwapIn(const int idx); private: void Setup(); - ///Tables needed - //r_idx->BlockMeta - mapTable_meta; - mapTable_block_meta; //TODO(junzhe) for measure speed only. - mapTable_not_at_device; //int refers to its r_idx of the block/meta - //mapTable_block_size; //Table block_ -> size TODO(junzhe) no need, can call block_->size() - - //schedule: idx--> r_idx, dir; sync_r_idx,dir. int 0 means D2H, 1 means H2D. - map>Table_sched; // changed to with sync_r_idx - // vectorvec_swap_selct_global; + maptable_meta; + maptable_block_meta; //for measure speed only. + maptable_not_at_device; //int refers to its r_idx of the block/meta + map>table_sched; // changed to with sync_r_idx //vec_block - vectorvec_block; //iteration 0-3 - vectorvec_block_fresh; //iteration 4 5 6 - vectorvec_block_mf; //itr 8 9 10 - vectorglobal_load; // from begining - vectororigin_load; //vec_load 3 itr. TODO(junzhe) to delete vec_load, global_load after use. - vectorvec_run; - vectoropsSequence; //sequence of operations of one middle iteration - vectorsizeSequence; //size of all operations of one middle iteration - int asyncSwapFlag = 0; //0 for sync, 1 for async. - int testFlag = 0; //0 means open for test, 1 means no need test anymore. - int gc = 0; //global counter, index, add 1 after each Malloc/Free/read/write. - int globeCounter = -1; - int maxLen = 0; - int location = 0; - int three_more_location = 0; //location at 3 more iterations later. - int three_more_globeCounter = -1; // - //design requirement TODO(junzhe) - float memLimit_ratio = 0.70; + vectorvec_block; //iterations for Detection, i.e. detect iterations. + vectorvec_block_fresh; //iterations that are used for Planning, + vectorvec_block_mf; //iterations used to construct pool + vectorglobal_load; // load from begining + vectororigin_load; //3 iteration load, for planning. + vectorvec_run; + vectoroperation_sequence; //sequence of operations of one middle iteration + vectorsize_sequence; //size of all operations of one middle iteration + + int async_swap_flag = 0; //0 for sync, 1 for async. + int past_test_flag = 0; //0 means need to test, 1 means no need test anymore. + int global_index = 0; //global counter, index, add 1 after each Malloc/Free/read/write. + int global_index_threshold = -1; + int iteration_length = 0; + int location_of_2nd_iteration = 0; //index of start of 2nd iteration + int location_of_5th_iteration = 0; //index of start of 5th iteration + int three_more_iteration_global_index_threshold = -1; + + //design specs + float mem_limit_ratio = 0.70; size_t smallest_block = 1<<20; //1 MB int data_buffer = 4; // used to control readyIdx int mutable_data_buffer = 6; - double maxLoad; - int maxIdx; - double total_swapInTime = 0; - double total_swapOutTime = 0; - double tempTime = 0; - double tempTime2 = 0; - double tempTime_baseline; //vec_run[0] time - int maxLen_threshold = 1000; + double max_load; + int max_idx; + double total_swap_in_time = 0; + double total_swap_out_time = 0; + double temp_time = 0; + double temp_time_baseline; //vec_run[0] time + int iteration_length_threshold = 1000; private: shared_ptr pool_; @@ -447,11 +438,9 @@ class OpenclDevice : public singa::Device { /// Converts the void pointer into a Buffer object, then deletes the object. /// This has the effect of freeing up device memory. void Free(void* ptr) override; - void MakeMetaTable(Block* block,void* data_,int size) override {} - void Append(string blockInfo) override {} - void* GetRealGpuPtr(const Block* block_) override {} - void SwapOut(const Block* block_) override {} - void SwapIn(const Block* block_) override {} + void AppendAfterMalloc(Block* block,void* data_ptr,int size) override {} + void Append(string block_info) override {} + void* UpdateGpuPtr(const Block* block_ptr) override {} private: diff --git a/include/singa/core/memory.h b/include/singa/core/memory.h index 343b4449de..b3dfd672ec 100644 --- a/include/singa/core/memory.h +++ b/include/singa/core/memory.h @@ -53,8 +53,6 @@ class DeviceMemPool { virtual void PoolOpt(vector &vec_mf) = 0; - virtual void SwapOut(void* data_) = 0; - virtual void SwapIn(void* data_) = 0; /// Return a pair for free and total memory managed by this pool. virtual std::pair GetMemUsage() { return std::make_pair(0u, 0u); @@ -80,8 +78,6 @@ class CnMemPool : public DeviceMemPool { void PoolOpt(vector &vec_mf) override {} - void SwapOut(void* data_) override {} - void SwapIn(void* data_) override {} std::pair GetMemUsage() override; // release all memory and set cnmem manager to unintialized @@ -110,135 +106,116 @@ class CudaMemPool : public DeviceMemPool { void PoolOpt(vector &vec_mf) override {} - void SwapOut(void* data_) override {} - void SwapIn(void* data_) override {} }; -//for SmartMemPool -struct lookUpElement{ - /* - for memory pool Malloc look-up table. - */ - int r_idx; - int d_idx; - size_t size; - size_t offset; - void* ptr; - int Occupied; //0 is free, 1 is occupied. - int crossItr; - int Occupied_backup; +//for SmartMemPool and SwapPool +struct PoolBlockMeta{ + /* + for memory pool Malloc look-up table. + */ + int r_idx; + int d_idx; + size_t size; + size_t offset; + void* ptr; + int occupied; //0 is free, 1 is occupied. + int cross_iteration; + int occupied_backup; +}; + +///struct Vertex +struct Vertex{ + int name; + size_t size; + int r; //arrive + int d; //depart + int cross_iteration =0; + pair color_range; + vector> vec_color_preoccupied; + Vertex(int n, size_t s, int r1, int d1):name(n),size(s),r(r1),d(d1){} + }; -///class mem-pool SmartMemPool + +///SmartMemPool class SmartMemPool: public DeviceMemPool { public: - SmartMemPool(const MemPoolConf &conf); //constructor - //TODO(junzhe) in Singa, void Malloc( void**, size_t); change to cudaMalloc and cudaFree. - void Malloc(void** ptr, const size_t size); - void Free(void* ptr); - ~SmartMemPool(); - void getMaxLoad(void); - std::pair GetMemUsage() override; - void Append(string blockInfo); - - void PoolOpt(vector &vec_mf) override {} - - void SwapOut(void* data_) override {} - void SwapIn(void* data_) override {} -protected: - void Init(); -private: - MemPoolConf conf_; - // whether the (global) memory pool has been initialized - bool initialized_ = false; - // lock on the initialized variable - std::mutex mtx_; - - string colorMethod; - int mallocFlag =0; //0 for cudaMalloc, 1 for coloringMalloc - int gc =0; //global counter each time Malloc/Free, add 1. - int globeCounter=-1; - int loadLogFlag =1; //record when its 1. - void* ptrPool = NULL; - int idxRange = 0; - size_t offset = 0; - size_t offsetCrossItr=0; //cross iteration offset. - int maxLen =0; - int location=0; - vector vec; - vector vec_block_RW; - vector vec_block_RWMF; - mapTable_r2d; //full duration info, cross-iteration duration. - mapTable_d2r; - //mapTable_r2Ver; - vector>Vec_r2Ver; //b. replace Table_r2Ver - map>Table_load; //gc, - mapTable_p2s; //For tracking load in Free. add when allocate, delete when deallocate. - mapTable_p2r; //ptr for arrival idx, for look up Table during free - int checkPoint=300; //for reduce number of test. - size_t maxTotalLoad; - size_t maxMemUsage; - float memRatio; -}; + SmartMemPool(const MemPoolConf &conf); //constructor + void Malloc(void** ptr, const size_t size); + void Free(void* ptr); + ~SmartMemPool(); + std::pair GetMemUsage() override; + void GetMaxLoad(void); + void Append(string blockInfo); + vector Plan(vectorvec, int &idx_range, size_t &offset, size_t &offset_cross_iteration,string color_method); + int Detection(vectorvec_string_test, int &iteration_length, int &location_2nd_iteration); + void PoolOpt(vector &vec_mf) override {} -//for Swap -struct swapLookUpElement{ - /* - book keep the block info and status - */ - void* data_ = nullptr; - void* realGpuPtr = nullptr; - void* realCpuPtr = nullptr; +protected: + void Init(); +private: + MemPoolConf conf_; + // whether the (global) memory pool has been initialized + bool initialized_ = false; + // lock on the initialized variable + std::mutex mtx_; - int location; //1 is at GPU, 2 is at CPU. 3 on the way C2G, 4 on the way G2C. - size_t size; //size may used as of now. + string color_method; + int malloc_flag = 0; //0 for cudaMalloc, 1 for coloringMalloc + int global_index = 0; //global counter each time Malloc/Free, add 1. + int global_index_threshold = -1; + int load_flag = 1; //record load at 1 + void* ptr_pool = NULL; + int idx_range = 0; + size_t offset = 0; + size_t offset_cross_iteration = 0; //cross iteration offset. + int iteration_length = 0; + int location_2nd_iteration = 0; + vector vec; + vector vec_block_rw; //read write only opt info + vector vec_block_rw_mf; //read write, malloc, free opt info + maptable_ridx_to_didx; //table match from r_idx to d_idx + maptable_didx_to_ridx; //table match from d_idx to r_idx + + vector>vec_block_meta; //vec of block meta, index in the vector refering to the r_idx + map>table_load; //global_index, + maptable_ptr_to_size; //for tracking load in Free. add when allocate, delete when deallocate. + maptable_ptr_to_ridx; //ptr for arrival idx, for look up Table during free + int check_point = 300; //for reduce number of test. + size_t max_total_load; + size_t max_mem_usage; }; -struct SwapMeta{ - /* - for copy between block and info. - */ - size_t swapSize; - void* ptr; - void* d_ptr; //not used for -}; +///SwapPool class SwapPool : public DeviceMemPool { public: - SwapPool(const MemPoolConf &conf); //constructor - //TODO(junzhe) in Singa, void Malloc( void**, size_t); change to cudaMalloc and cudaFree. - void Malloc(void** ptr, const size_t size); - void Free(void* ptr); - ~SwapPool(); - void getMaxLoad(void); - std::pair GetMemUsage() override; - void Append(string blockInfo); - - void SwapOut(void* data_); - void SwapIn(void* data_); + SwapPool(const MemPoolConf &conf); //constructor + void Malloc(void** ptr, const size_t size); + void Free(void* ptr); + ~SwapPool(); + std::pair GetMemUsage() override; + void Append(string blockInfo); - //PoolOpt() construct pool based on MF info after Swap constructed. - void PoolOpt(vector &vec_mf); + //PoolOpt() construct pool based on MF info after Swap constructed. + void PoolOpt(vector &vec_mf); protected: - void Init(); + void Init(); private: - MemPoolConf conf_; - // whether the (global) memory pool has been initialized - bool initialized_ = false; - // lock on the initialized variable - std::mutex mtx_; - vector vec_block; - size_t swapLimit = 1<<23; //8MB - int poolFlag = 0; - int pc = 0; - int maxLen_mf = 0; - void* ptrPool = nullptr; - mapTable_p2r; //ptr for arrival idx, for look up Table during free - mapTable_r2v; //r-> vertex - vector>Vec_r2Ver; //Table_r2Ver No need anymore, replaced by Table_r2v TODO(junzhe) - // mapTable_id2LookUpElement; //old TODO(junzhe) remove - // map>Table_Meta; + MemPoolConf conf_; + // whether the (global) memory pool has been initialized + bool initialized_ = false; + // lock on the initialized variable + std::mutex mtx_; + + vector vec_block; + int pool_flag = 0; + int pool_index = 0; //like global counter in device class + int iteration_length_mf = 0; //max length of malloc free operation sequences. + void* ptr_pool = nullptr; + maptable_ptr_to_ridx; //map ptr to arrival idx, for look up Table during free + maptable_pool_meta; //table of pool block meta, key with r_idx }; #endif diff --git a/src/.DS_Store b/src/.DS_Store deleted file mode 100644 index a87953d857c6cf9354df29d312f03c027a6411d3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8196 zcmeHMTWl0n82Ice&DxX3#+ zgbaiXgbaiXgbaiX{2v&gJzEsE#J(>_!#ZRjWZ;HmK)fGfbO}rZI3v-2bx`FW0SLtr zz%SJ1qCpr7FcIL4L>i%jGL%q;VsOMj8BX?C;Fkz+MxqP{3=SU{%#6Va1^wCS{xRPj zFd;FlLk2Wo$+=jZ|kmqpQD;LO0oV# zDPA)*)zY-5A=b3JWx63Y)f8`zH^g?f?3$ittg60gfA{IJ$+J`ErXPGkCWXO^0V=La z^1{LlEw|7izfhDsU(E39`3%*Sk?uo1YKpHZ&=2p;dZuH${YK6uOz{erQt!?<_R*Xn z#s@Nvl^?U+6tB!^Rwi$0p3!Dmedbxi9rTO|Pqc7`ATOBLfA~ zHPe=nw6!s#5+@a9u6%t}bS78K0pg9Lz~r|jO)rVOG_J; zH0@Sf8ShaB3X)A}+@~I3mDr#RW}LAx%F~tRwnzu#AFyFD_hHTTdb5U2(~Io7Y9o>3 z!rrFqhRztPOKg_w?bA?~h|8Wr+er;2%7t`fcT+ueOGw?ayJc%s<)x%u#dqGm zLyhufBvU!svQt%g8R=M#?rEjb<)m0PS5sBJUOk{D6!DO8@Pn25sLGUouD**f3va-C z@DZu*OZWkPf?wb_Qrt4Ez$(()Ef~cOcsp*vZPTtm#zUmPB=+MV z9>*cfph4;zMH6$Rz5R}qs4a7H3osQl|60>b^DSP%DqUwk)k^*1Ld+b{qC diff --git a/src/core/.DS_Store b/src/core/.DS_Store deleted file mode 100644 index 018d123c47540aef56047ebb82a09e1df9135f84..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKK~BUl476c|7IE1l$9!Rb5Ut7!_5(<*1P4m@z~1``zQ!B)1v9omDnwk6(1PqL z87H>WWU3~yi0JM(Y(+LAvPK~)m4>jpX*vt%6;N%ANgf~Ww)=h>4~>CNv5RxRkX|Mk z@wDf^u#Z#M4acWm3~fcC@Vnm8AecUZYK*GnZgY39Ny$BPX>yhkPIvtIQQm;=l>0UnP!r|49Qk9kPQ4Y26WZ!yB!u4 zXX}^c@vJQ^2g AjsO4v diff --git a/src/core/common/common.cc b/src/core/common/common.cc index f3d144a9c8..d6e9c5a301 100644 --- a/src/core/common/common.cc +++ b/src/core/common/common.cc @@ -22,34 +22,28 @@ #include #include #include -//TODO(junzhe) ifdef to counter verify -///only include mutable_data() and data() namespace singa { void* Block::mutable_data() { - //TODO(junzhe) go back to enable it after device done - //std::cout<<"mutable_data() "<AppendInfo(temp); + string temp_str4 = strm4.str(); + string temp = "Mutable "+temp_str2+" "+temp_str4; + ptr_device_->AppendInfo(temp); } - //TODO(junzhe) this should not happen, can verify and remove + + //update ptr after swap in done, if variable is not swapped back yet as expected. if (data_ == nullptr) { - //cout<<"to sleep"<GetRealGpuPtrInfo(this); - cout<<"print returned tempData_ "<UpdateGpuPtrInfo(this); return static_cast(tempData_) + offset_; } @@ -59,41 +53,38 @@ void* Block::mutable_data() { const void* Block::data() const { CHECK(initialized_) << "Must initialize data before reading it"; - //TODO(junzhe) go back to enable it after device done - if (ptrDevice_!=nullptr){ + + //Append block info: opt_type, ptr, time_stamp + if (ptr_device_!=nullptr){ //Append info. stringstream strm2; strm2<AppendInfo(temp); + string temp_str4 = strm4.str(); + string temp = "Read "+temp_str2+" "+temp_str4; + ptr_device_->AppendInfo(temp); } - //TODO(junzhe) this should not happen, can verify and remove + //update ptr after swap in done, if variable is not swapped back yet as expected. if (data_ == nullptr) { - //cout<<"to sleep"<GetRealGpuPtrInfo(this); - cout<<"print returned tempData_ "<UpdateGpuPtrInfo(this); return static_cast(tempData_) + offset_; } - return static_cast(data_) + offset_; } void* Block::get_data() { + //get data without calling data(), to avoid append block info. return data_; } void Block::update_data(void* data_new) { + //update data_, after the swap in completes. data_ = data_new; - std::cout<<"results update_data:: "<Append(blockInfo); } -void* CudaGPU::GetRealGpuPtr(const Block* block_){ +void* CudaGPU::UpdateGpuPtr(const Block* block_){ return nullptr; } -void CudaGPU::SwapOut(const Block* block_){ - -} - -void CudaGPU::SwapIn(const Block* block_){ - -} } // namespace singa #endif // USE_CUDA \ No newline at end of file diff --git a/src/core/device/device.cc b/src/core/device/device.cc index b2988f3615..59faddc5c6 100644 --- a/src/core/device/device.cc +++ b/src/core/device/device.cc @@ -41,9 +41,7 @@ Block* Device::NewBlock(int size) { if (size > 0) { void* ptr = Malloc(size); Block* block_ = new Block(ptr, size,0,this); - //std::cout<<"(reference) from device.cc after, data_, block_ device: "<mutable_data(); - //cout<<"FreeBlock: "<mutable_data()); - //Add Append for free here. + //append block info for free operation. stringstream strm1; strm1< SplitOptString(string s, string delimiter) { + // string delimiter + size_t pos_start = 0, pos_end, delim_len = delimiter.length(); + string token; + vector res; + while ((pos_end = s.find(delimiter, pos_start)) != string::npos) { + token = s.substr(pos_start, pos_end - pos_start); + pos_start = pos_end + delim_len; + res.push_back(token); + } + res.push_back(s.substr(pos_start)); -/// string delimiter -vector swap_split(string s, string delimiter) { - size_t pos_start = 0, pos_end, delim_len = delimiter.length(); - string token; - vector res; - while ((pos_end = s.find(delimiter, pos_start)) != string::npos) { - token = s.substr(pos_start, pos_end - pos_start); - pos_start = pos_end + delim_len; - res.push_back(token); - } - res.push_back(s.substr(pos_start)); - return res; + return res; } -///Section of converting text file -->vector of Sring --> pieceMsg -->pairMsg -->iterMsg -//vector of pairMsg is used in run. -//vector of iterMsg is used in test. -vector swap_strVec_2_pieceMsgVec(vector vec, int &idxRange){ +vector DeviceOptSeqStrToStruct(vector vec, int &idx_range){ /* - convert vector of string into vector of onePieceMsg, sorted by ptr - and then idx, and update idxRange to pieceMsgVec size. - format of onePieceMsg [ptr, size/-1, flag, idx, timestamp] + convert vector of string into vector of DeviceOptInfo, sorted by ptr + and then idx, and update idx_range to pieceMsgVec size. + format of DeviceOptInfo [ptr, size/-1, flag, idx, timestamp] flag: 1 for malloc, -1 for free, 2 for read, 3 for layer,4 for mutable - version on 5/29, with equval blockInfo length: flag, block_, size, t - */ - vectoronePieceMsgVec_; + */ + vectorvec_opt_info; for (int i=0;i v = swap_split(vec[i], " "); - int MallocFree; + vector v = SplitOptString(vec[i], " "); + int operation_type; if (v[0]=="Malloc"){ - MallocFree = 1; + operation_type = 1; }else if (v[0]=="Free"){ - MallocFree = -1; + operation_type = -1; }else if (v[0]=="Mutable"){ - MallocFree = 4; + operation_type = 4; }else if (v[0]=="Read"){ - MallocFree = 2; + operation_type = 2; }else if (v[0]=="Layer"){ - MallocFree = 3; + operation_type = 3; } - //onePieceMsg(string p, size_t s, int M, int i):ptr(p),size(s),MallocFree(M),idx(i){} + //DeviceOptInfo(string p, size_t s, int M, int i):ptr(p),size(s),operation_type(M),idx(i){} size_t result; stringstream convert(v[2]); if (!(convert>>result)){ - result =-1; - cout<<"error for converting size from str to int."<>tempTime; - tempMsg.t =tempTime; - onePieceMsgVec_.push_back(tempMsg); + convert2>>temp_time; + itm.t =temp_time; + vec_opt_info.push_back(itm); } - sort(onePieceMsgVec_.begin(),onePieceMsgVec_.end(),less_than_ptrIdx()); - idxRange = static_cast(onePieceMsgVec_.size()); - - return onePieceMsgVec_; -}// end of strVec_2_pieceMsgVec function - - -vector Swap_piece2rep (vectoronePieceMsgVec_){ - vectoroneIterMsgVec_; - string tempStr; - int tempIdx=0; - for (int i=0;i(vec_opt_info.size()); + + return vec_opt_info; +} + + +vector DeviceOptSeqRepeatableTestPreProcess(vectorvec_opt_info){ + /* + pre process Device Operation Sequence Struct info for repeatable test, + return a vector of int for fast detection. + */ + vectorvec_opt_simplified_info; + string temp_str; + int temp_idx=0; + for (int i=0;irep; // vector of size_delta, name it as rep for simlisity. - for (int i =0; ivec_rep; // vector of size_delta, name it as vec_rep for simlisity. + for (int i =0; irep, int &maxLen, int &location, int maxLen_threshold, int gc ){ - int idxRange = (int)rep.size(); - int threshold = maxLen_threshold; - vector>maxLen_location; - - for (int i=0; ithreshold){ - break; - } - for (int len=1; len<(idxRange-i);len++){ - if (maxLen>threshold){ - break; - } - if((equal(rep.begin()+i,rep.begin()+i-1+len,rep.begin()+i+len))&&(maxLenrep, int &iteration_length, int &location_of_2nd_iteration, int iteration_length_threshold, int global_index ){ + /* + repeatable test, input vector of int, + in-place update max_legth (length of iteration) + and location_of_2nd_iteration (where 2nd iteration starts) + */ + int idx_range = (int)rep.size(); + int threshold = iteration_length_threshold; + vector>iteration_length_location_of_2nd_iteration; + + for (int i=0; ithreshold){ + break; + } + for (int len=1; len<(idx_range-i);len++){ + if (iteration_length>threshold){ + break; + } + if((equal(rep.begin()+i,rep.begin()+i-1+len,rep.begin()+i+len))&&(iteration_lengthstruct2.dto); + return (struct1.DOA_origin>struct2.DOA_origin); } }; -struct less_than_wdto{ +struct sort_by_WDOA_descending{ /* - sort SwapBlock by weighted dto, descending + sort SwapBlock by weighted DOA_origin, descending + */ + inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2) + { + return (struct1.WDOA>struct2.WDOA); + } +}; + +struct sort_by_AOA_descending{ + /* + sort SwapBlock by pri, descending */ inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2) { - return (struct1.wdto>struct2.wdto); + return (struct1.AOA>struct2.AOA); } }; -// struct less_than_r_idx_ready{ -// /* -// sort SwapBlock by r_idx_ready, ascending -// */ -// inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2) -// { -// return (struct1.r_idx_readystruct2.pri); - } +struct sort_by_idx_ascending_swap{ + /* + sort DeviceOptInfo_Swap by idx. + */ + inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2) + { + return (struct1.r_idxstruct2.d_idx); + } }; -struct less_than_Idx_Swap_rvs{ - /* - sort onePieceMsg_Swap by idx. reverse - */ - inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2) - { - return (struct1.d_idx>struct2.d_idx); - } +struct sort_by_majority_voting_ascending{ + /* + sort majority voting, ascending + */ + inline bool operator() (const SwapBlock& struct1, const SwapBlock& struct2) + { + return (struct1.majority_voting load_over_limit(vectorvec_load, size_t memLimit, int start_idx, int end_idx,int maxLen){ - //input: vec_load, memLimit, range [start_idx, end_idx) - //return range overlimit [first_over_limit, first_below_limit) +pair GetOptIdxAboveLoadLimit(vectorvec_load, size_t mem_limit, int start_idx, int end_idx,int iteration_length){ + /* + get operation index (range) that above the load limit. + input: vec_load, mem_limit, range [start_idx, end_idx) + return range overlimit [first_over_limit, first_below_limit) + */ int first_over_limit = start_idx; int first_below_limit = end_idx; - for (int i = start_idx+maxLen; i < end_idx+maxLen; i++){ - if (vec_load[i] > memLimit){ - first_over_limit = i-maxLen; + for (int i = start_idx+iteration_length; i < end_idx+iteration_length; i++){ + if (vec_load[i] > mem_limit){ + first_over_limit = i-iteration_length; break; } } - for (int i = end_idx+maxLen; i > first_over_limit+maxLen; i--){ - if (vec_load[i] > memLimit){ - first_below_limit = i-1-maxLen; + for (int i = end_idx+iteration_length; i > first_over_limit+iteration_length; i--){ + if (vec_load[i] > mem_limit){ + first_below_limit = i-1-iteration_length; break; } } + if (first_over_limit == start_idx) first_over_limit = -1; + if (first_below_limit == end_idx) first_below_limit = -1; return std::make_pair(first_over_limit, first_below_limit); } -// pair load_below_limit(vectorvec_load, size_t memLimit, int start_idx, int end_idx, int maxIdx,int maxLen){ -// //input: vec_load, memLimit, range [start_idx, end_idx] -// //return range overlimit [first_over_limit, first_below_limit) -// int first_below_limit = maxIdx; -// int last_below_limit = maxIdx; - -// for (int i = first_below_limit+maxLen; i > start_idx+maxLen; i--){ -// if (vec_load[i] > memLimit){ -// first_below_limit = i+1-maxLen; -// break; -// } -// } - -// for (int i = last_below_limit+maxLen; i < end_idx+maxLen; i++){ -// if (vec_load[i] > memLimit){ -// last_below_limit = i-1-maxLen; -// break; -// } -// } - -// return std::make_pair(first_below_limit, last_below_limit); -// } - -pair load_peak(vectorvec_load_test,int maxLen){ - double maxLoad_test = 0; - int maxIdx_test = 0; - for (int i = maxLen; i < maxLen*2; i++){ - if (maxLoad_test < vec_load_test[i]){ - maxLoad_test = vec_load_test[i]; - maxIdx_test = i - maxLen; + +pair GetLoadPeak(vectorvec_load_test,int iteration_length){ + /* + return value and index of load peak + */ + double max_load_test = 0; + int max_idx_test = 0; + for (int i = iteration_length; i < iteration_length*2; i++){ + if (max_load_test < vec_load_test[i]){ + max_load_test = vec_load_test[i]; + max_idx_test = i - iteration_length; } } - return std::make_pair(maxLoad_test,maxIdx_test); + return std::make_pair(max_load_test,max_idx_test); } -void load_update(vector& vec_load,int start_idx, int end_idx, int plusMinus, size_t size,int maxLen){ - //update load [start_idx, end_idx) by plusMinus*size - for (int i = start_idx+maxLen; i(size) * plusMinus; +void UpdateLoad(vector& vec_load,int start_idx, int end_idx, int plus_minus, size_t size,int iteration_length){ + /* + update load [start_idx, end_idx) by plus_minus*size + */ + for (int i = start_idx+iteration_length; i(size) * plus_minus; } } -vector SwapGPU::swap_select(vectorvec_swap,vector tempLoad,double memLimit,string mode){ + +///define SwapGPU member functions +vector SwapGPU::SelectBlock(vectorvec_swap,vector temp_load,double mem_limit,string mode){ vectorvec_swap_selct; - //vectorvec_swap_reject; - if (mode == "dto"){ - sort(vec_swap.begin(),vec_swap.end(),less_than_dto()); + /* + select swapping blocks based on a cetain priority score or BO score; + with load updated + */ + if (mode == "DOA_origin"){ + sort(vec_swap.begin(),vec_swap.end(),sort_by_DOA_origin_descending()); } - if (mode == "pri"){ - sort(vec_swap.begin(),vec_swap.end(),less_than_pri()); + + if (mode == "AOA"){ + sort(vec_swap.begin(),vec_swap.end(),sort_by_AOA_descending()); } - if (mode == "wdto"){ - //TODO(junzhe) time complexity + + if (mode == "WDOA"){ for (int i = 0; i < vec_swap.size(); i++){ auto itm = vec_swap[i]; for (int j = itm.r_idx; j < itm.d_idx; j++){ - itm.wdto += origin_load[i+maxLen] - memLimit; + itm.WDOA += origin_load[i+iteration_length] - mem_limit; } } - sort(vec_swap.begin(),vec_swap.end(),less_than_wdto()); + sort(vec_swap.begin(),vec_swap.end(),sort_by_WDOA_descending()); } - cout<<"===============select block one by one================="< SwapGPU::swap_load_ideal(vectorvec_load,vector vec_swap_selct){ +vector SwapGPU::GetIdealLoad(vectorvec_load,vector vec_swap_selct){ + /* + get load_ideal, which is equivalent to load by synchronous swapping. + */ auto vec_load_return = vec_load; for (int i =0; i&vec_swap_selct, vector&vec_load_temp,double &overhead,double memLimit,string mode){ +void SwapGPU::Scheduling(vector&vec_swap_selct, vector&vec_load_temp,double &overhead,double mem_limit,string mode){ /* - update i1p, i2p and overhead time based on mode, such as no overhead or stick to limit. + Swap Scheduling algo + update idx_out_end, idx_in_start + compute overhead time + mode selection: no overhead or stick to limit. */ - //TODO(junzhe) wordy, can merge in common part. + overhead = 0; - cout<<"----------------swap_sched----------------"< 0){ - readyIdx = std::max(readyIdx,vec_swap_selct[i-1].i1p); + ready_idx = std::max(ready_idx,vec_swap_selct[i-1].idx_out_end); } - cout<<" -> "< vec_run[readyIdx+maxLen].t){ //TODO(junzhe) reduce time complexity. - readyIdx++; //ready means when able to finish swapOut, w/ or w/o overhead. + + itm.idx_out_start = ready_idx; + itm.t_out_start = vec_run[ready_idx+iteration_length].t; + itm.t_out_end = itm.t_out_start + SwapOutTime(itm.size); + total_swap_out_time+=SwapOutTime(itm.size); + while (itm.t_out_end > vec_run[ready_idx+iteration_length].t){ + //ready means when able to finish swapOut, w/ or w/o overhead. + ready_idx++; } - //get min compare with maxIdx and readyIdx. - readyIdx = std::min(maxIdx,readyIdx); - cout<<" || "< "< 0){ needIdx = std::min(needIdx,vec_swap_selct[i-1].i2p); } - cout<<" -> "< 0){ need_idx = std::min(need_idx,vec_swap_selct[i-1].idx_in_start); } + itm.idx_in_end = need_idx; + double prepareTime = vec_run[need_idx+iteration_length].t - SwapInTime(itm.size); + total_swap_in_time+=SwapInTime(itm.size); + while (prepareTime < vec_run[need_idx+iteration_length].t){ + need_idx--; } - needIdx = std::max(needIdx,maxIdx+1); - cout<<" || "< 0){ - // cout< itm.t2p)) { - overhead+=(vec_run[tempOverLimit_3.second+maxLen].t - itm.t2p); - cout<<"==== overhead added "< itm.t_in_start)) { + overhead+=(vec_run[temp_over_limit_3.second+iteration_length].t - itm.t_in_start); + UpdateLoad(vec_load_temp,itm.idx_in_start,temp_over_limit_3.second+1,-1,itm.size,iteration_length); + itm.idx_in_start = temp_over_limit_3.second+1; + auto temp_over_limit_4 = GetOptIdxAboveLoadLimit(vec_load_temp,mem_limit,0,iteration_length,iteration_length); } - cout<<" -> "< 0){ - readyIdx = std::max(readyIdx,vec_swap_selct[i-1].i1p); + ready_idx = std::max(ready_idx,vec_swap_selct[i-1].idx_out_end); } - itm.i1 = readyIdx; - itm.t1 = vec_run[readyIdx].t; - itm.t1p = itm.t1 + SwapOutTime(itm.size); - while (itm.t1p > vec_run[readyIdx].t){ - readyIdx++; + itm.idx_out_start = ready_idx; + itm.t_out_start = vec_run[ready_idx].t; + itm.t_out_end = itm.t_out_start + SwapOutTime(itm.size); + while (itm.t_out_end > vec_run[ready_idx].t){ + ready_idx++; } - itm.i1p = readyIdx; + itm.idx_out_end = ready_idx; vec_swap_selct[i] = itm; } - //update i2p - sort(vec_swap_selct.begin(),vec_swap_selct.end(),less_than_Idx_Swap_rvs()); + //update idx_in_start + sort(vec_swap_selct.begin(),vec_swap_selct.end(),sort_by_idx_descending_swap()); for (int i =0; i 0){ needIdx = std::min(needIdx,vec_swap_selct[i-1].i2p); } - itm.i2 = needIdx; - double prepareTime = vec_run[needIdx].t - SwapInTime(itm.size); - while (prepareTime < vec_run[needIdx].t){ - needIdx--; + int need_idx = itm.d_idx; + if (i > 0){ need_idx = std::min(need_idx,vec_swap_selct[i-1].idx_in_start); } + itm.idx_in_end = need_idx; + double prepareTime = vec_run[need_idx].t - SwapInTime(itm.size); + while (prepareTime < vec_run[need_idx].t){ + need_idx--; } - itm.i2p = needIdx; - itm.t2p = prepareTime; + itm.idx_in_start = need_idx; + itm.t_in_start = prepareTime; vec_swap_selct[i] = itm; - load_update(vec_load_temp,itm.i1p,itm.i2p+1,-1,itm.size,maxLen); //TODO(junzhe) range, right boundary + UpdateLoad(vec_load_temp,itm.idx_out_end,itm.idx_in_start+1,-1,itm.size,iteration_length); } } @@ -548,365 +532,219 @@ void SwapGPU::swap_sched(vector&vec_swap_selct, vector&vec_lo } -void SwapGPU::swap_construct_tables(vectorvec_swap_selct){ +void SwapGPU::BuildMetaTables(vectorvec_swap_selct){ + /* + construct tables: table_sched, and table_meta + */ cudaStream_t stream1; cudaStream_t stream2; - cout<<"---------------print all 1, 1', 2', 2-----------"<(vec_swap_selct.size()-1);i>=0; i--){ for (int i =0; i= 0){ - //TODO(junzhe) for time being, remove negative r_idx itms. - cout<(Table_sched.find(itm.i1)->second) = itm.r_idx; - std::get<1>(Table_sched.find(itm.i1)->second) = 0; - } - //i2p swap - if (Table_sched.find(itm.i2p) == Table_sched.end()){ - Table_sched[itm.i2p] = std::make_tuple(itm.r_idx,1,-1,-1); - } else { - std::get<0>(Table_sched.find(itm.i2p)->second) = itm.r_idx; - std::get<1>(Table_sched.find(itm.i2p)->second) = 1; - } - // i1p sync - if (Table_sched.find(itm.i1p) == Table_sched.end()){ - Table_sched[itm.i1p] = std::make_tuple(-1,-1,itm.r_idx,0); - } else { - std::get<2>(Table_sched.find(itm.i1p)->second) = itm.r_idx; - std::get<3>(Table_sched.find(itm.i1p)->second) = 0; - } - //i2 sync - if (Table_sched.find(itm.i2) == Table_sched.end()){ - Table_sched[itm.i2] = std::make_tuple(-1,-1,itm.r_idx,1); - } else { - std::get<2>(Table_sched.find(itm.i2)->second) = itm.r_idx; - std::get<3>(Table_sched.find(itm.i2)->second) = 1; - } - ///Make Table_meta - void* tempPtr = nullptr; - cudaMallocHost(&tempPtr,itm.size); //pinned memory. - BlockMeta meta; - meta.size = itm.size; - meta.cpu_ptr = tempPtr; - meta.out_stream = stream1; - meta.in_stream = stream2; - //meta.last_out_idx = vec_swap_selct[i].last_out_idx; - //meta.last_in_idx = vec_swap_selct[i].last_in_idx; - //meta.i2 = vec_swap_selct[i].i2; - Table_meta[itm.r_idx] = meta; - // } - - } - cout<<"---------------print all 1, 1', 2', 2-----------DONE"<"; - cout<(Table_sched.find(i)->second)<<" "; - cout<(Table_sched.find(i)->second)<<" "; - cout<(Table_sched.find(i)->second)<<" "; - cout<(Table_sched.find(i)->second)<(table_sched.find(itm.idx_out_start)->second) = itm.r_idx; + std::get<1>(table_sched.find(itm.idx_out_start)->second) = 0; } + //idx_in_start swap + if (table_sched.find(itm.idx_in_start) == table_sched.end()){ + table_sched[itm.idx_in_start] = std::make_tuple(itm.r_idx,1,-1,-1); + } else { + std::get<0>(table_sched.find(itm.idx_in_start)->second) = itm.r_idx; + std::get<1>(table_sched.find(itm.idx_in_start)->second) = 1; + } + // idx_out_end sync + if (table_sched.find(itm.idx_out_end) == table_sched.end()){ + table_sched[itm.idx_out_end] = std::make_tuple(-1,-1,itm.r_idx,0); + } else { + std::get<2>(table_sched.find(itm.idx_out_end)->second) = itm.r_idx; + std::get<3>(table_sched.find(itm.idx_out_end)->second) = 0; + } + //i2 sync + if (table_sched.find(itm.idx_in_end) == table_sched.end()){ + table_sched[itm.idx_in_end] = std::make_tuple(-1,-1,itm.r_idx,1); + } else { + std::get<2>(table_sched.find(itm.idx_in_end)->second) = itm.r_idx; + std::get<3>(table_sched.find(itm.idx_in_end)->second) = 1; + } + + ///Make table_meta + void* temp_ptr = nullptr; + cudaMallocHost(&temp_ptr,itm.size); //pinned memory. + BlockMeta meta; + meta.size = itm.size; + meta.cpu_ptr = temp_ptr; + meta.out_stream = stream1; + meta.in_stream = stream2; + table_meta[itm.r_idx] = meta; } } -void SwapGPU::swap_update_tables(Block* tempBlock_){ - // update Table_meta's block_ and data_; update once atfer swap test is passed. - // enable to update negative r_idx. - // it's safe in below procedure, as r_gc and r_gc_n should never be the same. - if (testFlag == 1) { +void SwapGPU::UpdateMetaTables(Block* block_ptr){ + /* + update table_meta's block_ and data_; update once atfer swap test is passed. + enable to update negative r_idx. + it's safe in below procedure, as r_global_index and relative_counter should never be the same. + */ + + if (past_test_flag == 1) { //update positive r_idx - int r_gc = (gc-location)%maxLen; - if (!(Table_meta.find(r_gc)==Table_meta.end())){ - //cout<<"r_gc, gc and size ot Table_meta "<get_data()<second.block_ = tempBlock_; - Table_meta.find(r_gc)->second.data_ = tempBlock_->get_data(); + int r_global_index = (global_index-location_of_2nd_iteration)%iteration_length; + if (!(table_meta.find(r_global_index)==table_meta.end())){ + table_meta.find(r_global_index)->second.block_ = block_ptr; + table_meta.find(r_global_index)->second.data_ = block_ptr->get_data(); } //update negative r_idx - int r_gc_n = r_gc - maxLen; - if (!(Table_meta.find(r_gc_n)==Table_meta.end())){ - //cout<<"r_gc, gc and size ot Table_meta "<get_data()<second.block_ = tempBlock_; - Table_meta.find(r_gc_n)->second.data_ = tempBlock_->get_data(); + int relative_counter = r_global_index - iteration_length; + if (!(table_meta.find(relative_counter)==table_meta.end())){ + table_meta.find(relative_counter)->second.block_ = block_ptr; + table_meta.find(relative_counter)->second.data_ = block_ptr->get_data(); } } } -int SwapGPU::swap_test(vectorvec_block,int &maxLen, int &location){ +int SwapGPU::Detection(vectorvec_block,int &iteration_length, int &location_of_2nd_iteration){ + /* + test repeatability, detect iteration, and return global_index_threshold. + */ + + ///vec_str (vec_block) to vec_opt_info, sort by ptr and idx. + int idx_range = 0; + vector vec_opt_info = DeviceOptSeqStrToStruct(vec_block,idx_range); - ///vec_str (vec_block) to vec_pieceMsg, sort by ptr and idx. - int idxRange = 0; - vector vec_pieceMsg = swap_strVec_2_pieceMsgVec(vec_block,idxRange); - cout<<"size of vec_pieceMsg & vec_block: "< vec_rep = Swap_piece2rep(vec_pieceMsg); - //int idxRange3=0; //rename TODO(junzhe) - //int maxLen=0, location =0; - repPatternDetector(vec_rep,maxLen,location,maxLen_threshold,gc); - cout<<"maxLen and location are: "< vec_rep = DeviceOptSeqRepeatableTestPreProcess(vec_opt_info); + RepeatableTest(vec_rep,iteration_length,location_of_2nd_iteration,iteration_length_threshold,global_index); + + //Note here location_of_2nd_iteration not exactly start of one iteration, //adjust to nearly start of one by restricting "Malloc" int shift_counter = 0; - for (int i=0;i v = swap_split(vec_block[location+i], " "); + for (int i=0;i v = SplitOptString(vec_block[location_of_2nd_iteration+i], " "); if (v[0]=="Malloc"){ shift_counter = i; break; } } - location =location+shift_counter; - cout<<"shift_counter is "< vec_pieceMsg = swap_strVec_2_pieceMsgVec(vec_block,idxRange); - cout<<"size of vec_pieceMsg & vec_block: "< vec_opt_info = DeviceOptSeqStrToStruct(vec_block,idx_range); + sort(vec_opt_info.begin(),vec_opt_info.end(),sort_by_idx_ascending()); + // scale down idx, to middle iteration. - tempTime_baseline = vec_pieceMsg[three_more_location].t; - for (int i=0; ione_itr(&vec_pieceMsg[location+4*maxLen],&vec_pieceMsg[location+5*maxLen]); + vectorone_itr(&vec_opt_info[location_of_2nd_iteration+4*iteration_length],&vec_opt_info[location_of_2nd_iteration+5*iteration_length]); for (int i =0; itemp_vec_run(&vec_pieceMsg[location+3*maxLen],&vec_pieceMsg[location+6*maxLen]); + + //3 iterations of vec_run and vec_load, max_idx and max_load + vectortemp_vec_run(&vec_opt_info[location_of_2nd_iteration+3*iteration_length],&vec_opt_info[location_of_2nd_iteration+6*iteration_length]); vec_run = temp_vec_run; - fstream file_vec_run("vec_run36.csv", ios::in|ios::out|ios::app); - for (int i =0; itemp_vec_run2(&vec_pieceMsg[location],&vec_pieceMsg[location+3*maxLen]); + vectortemp_vec_run2(&vec_opt_info[location_of_2nd_iteration],&vec_opt_info[location_of_2nd_iteration+3*iteration_length]); auto vec_run2 = temp_vec_run2; - fstream file_vec_run2("vec_run03.csv", ios::in|ios::out|ios::app); - for (int i =0; ivec_load(&global_load[location],&global_load[location+3*maxLen]); + + vectorvec_load(&global_load[location_of_2nd_iteration],&global_load[location_of_2nd_iteration+3*iteration_length]); origin_load = vec_load; - //3 iterations - fstream file_load_origin("load_origin03.csv", ios::in|ios::out|ios::app); - for (int i=0; ivec_load2(&global_load[location+3*maxLen],&global_load[location+6*maxLen]); - // auto origin_load2 = vec_load2; - // //3 iterations - // fstream file_load_origin2("load_origin36.csv", ios::in|ios::out|ios::app); - // for (int i=0; ivec_swap; - // size_t load_swap = 0; + for (int i =1; i= smallest_block) && (vec_run_dup[i-1].idxmaxIdx) + //SwapBlock(string p, size_t s, int idx_out_start, int i2, double t1, double t2): + //ptr(p), size(s), r_idx(idx_out_start),d_idx(i2),r_time(t1), d_time(t2) {} + if ((vec_run_dup[i].size >= smallest_block) && (vec_run_dup[i-1].idxmax_idx) && (vec_run_dup[i-1].ptr ==vec_run_dup[i].ptr) - && ((vec_run_dup[i-1].MallocFree==3) or (vec_run_dup[i-1].MallocFree==2) or (vec_run_dup[i-1].MallocFree==4))) + && ((vec_run_dup[i-1].operation_type==3) or (vec_run_dup[i-1].operation_type==2) or (vec_run_dup[i-1].operation_type==4))) { SwapBlock itm(vec_run_dup[i].ptr, vec_run_dup[i].size, vec_run_dup[i-1].idx, vec_run_dup[i].idx, vec_run_dup[i-1].t, vec_run_dup[i].t); - itm.dto = itm.d_time-itm.r_time; - itm.dt = itm.d_time-itm.r_time-SwapOutTime(itm.size)-SwapOutTime(itm.size); - if (itm.dt>=0){ - itm.pri = itm.dt * itm.size; + itm.DOA_origin = itm.d_time-itm.r_time; + itm.DOA = itm.d_time-itm.r_time-SwapOutTime(itm.size)-SwapOutTime(itm.size); + if (itm.DOA>=0){ + itm.AOA = itm.DOA * itm.size; } else { - itm.pri = itm.dt * 1/itm.size; + itm.AOA = itm.DOA * 1/itm.size; } //cat A - if (vec_run_dup[i-1].MallocFree == 3){ itm.cat = "A1"; itm.r_idx_ready = itm.r_idx; } - if (vec_run_dup[i-1].MallocFree == 2){ itm.cat = "A2"; itm.r_idx_ready = itm.r_idx + data_buffer;} - if (vec_run_dup[i-1].MallocFree == 4){ itm.cat = "A3"; itm.r_idx_ready = itm.r_idx + mutable_data_buffer;} + if (vec_run_dup[i-1].operation_type == 3){ itm.cat = "A1"; itm.r_idx_ready = itm.r_idx; } + if (vec_run_dup[i-1].operation_type == 2){ itm.cat = "A2"; itm.r_idx_ready = itm.r_idx + data_buffer;} + if (vec_run_dup[i-1].operation_type == 4){ itm.cat = "A3"; itm.r_idx_ready = itm.r_idx + mutable_data_buffer;} vec_swap.push_back(itm); - // load_swap+=itm.size; - cout< for building SwapGPU, which doesnt matter. pool_ = std::make_shared(conf); Setup(); @@ -988,21 +825,21 @@ void* SwapGPU::Malloc(int size) { CUDA_CHECK(cudaSetDevice(id_)); pool_->Malloc((void**)&ptr, size); - ///append vec_block_mf - if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter) - && ((gc - maxLen) >= three_more_globeCounter)){ - string tempStr1 ="Malloc "; + ///append vec_block_mf:for swap & pool + if ((async_swap_flag == 1) && ((global_index - 4*iteration_length) < three_more_iteration_global_index_threshold) + && ((global_index - iteration_length) >= three_more_iteration_global_index_threshold)){ + string temp_str1 ="Malloc "; stringstream strm2; strm2<Free(ptr); - ///append vec_block_mf - if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter) - && ((gc - maxLen) >= three_more_globeCounter)){ - string tempStr1 ="Free "; + ///append vec_block_mf: for swap & pool + if ((async_swap_flag == 1) && ((global_index - 4*iteration_length) < three_more_iteration_global_index_threshold) + && ((global_index - iteration_length) >= three_more_iteration_global_index_threshold)){ + string temp_str1 ="Free "; stringstream strm2; strm2< maxLen_threshold) { - testFlag = 1; - three_more_globeCounter = globeCounter + 3*maxLen; - three_more_location = location + 3*maxLen; - cout<<"compele test-swap:::::::::::::::::::::::::::::::::::::::::::::::::"< iteration_length_threshold) { + past_test_flag = 1; + three_more_iteration_global_index_threshold = global_index_threshold + 3*iteration_length; + location_of_5th_iteration = location_of_2nd_iteration + 3*iteration_length; } } ///switch flag; next idx - if ((gc+1) == three_more_globeCounter){ - swap_plan(); - asyncSwapFlag = 1; - // vectorvec_load2(&global_load[three_more_location],&global_load[three_more_location+3*maxLen]); - // origin_load = vec_load2; - // //load before swap, write in - // fstream file_load_origin("load_origin.csv", ios::in|ios::out|ios::app); - // for (int i=0; i= three_more_globeCounter + maxLen) && (!(Table_sched.find(r_gc_n) == Table_sched.end()))) { - cout<<"condition B"<= three_more_iteration_global_index_threshold + iteration_length) && (!(table_sched.find(r_global_index_n) == table_sched.end()))) { + DeploySwapExec(r_global_index_n); } - if ((gc >= three_more_globeCounter + maxLen) && (!(Table_sched.find(r_gc) == Table_sched.end()))) { - cout<<"condition C"<= three_more_iteration_global_index_threshold + iteration_length) && (!(table_sched.find(r_global_index) == table_sched.end()))) { + DeploySwapExec(r_global_index); } } } -void SwapGPU::DeploySwap_exec(int r_gc){ - cout<<"--------sched action at "<(Table_sched.find(r_gc)->second); - auto swap_dir = std::get<1>(Table_sched.find(r_gc)->second); - auto sync_idx = std::get<2>(Table_sched.find(r_gc)->second); - auto sync_dir = std::get<3>(Table_sched.find(r_gc)->second); +void SwapGPU::DeploySwapExec(int r_global_index){ + //execute DeploySwap + auto swap_idx = std::get<0>(table_sched.find(r_global_index)->second); + auto swap_dir = std::get<1>(table_sched.find(r_global_index)->second); + auto sync_idx = std::get<2>(table_sched.find(r_global_index)->second); + auto sync_dir = std::get<3>(table_sched.find(r_global_index)->second); if (swap_dir == 0){ - SwapOut_idx(swap_idx); - cout<<"----Swap Out "<second; + auto last_meta = table_meta.find(sync_idx)->second; auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count(); cudaEventSynchronize(last_meta.in_event); auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count(); - Table_not_at_device[last_meta.block_] = sync_idx; //TODO(junzhe) double check if needed. + table_not_at_device[last_meta.block_] = sync_idx; last_meta.block_->update_data(nullptr); - // cout<<"to free data_"<Free(last_meta.data_); ///append vec_block_mf - if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter) - && ((gc - maxLen) >= three_more_globeCounter)){ - string tempStr1 ="Free "; + if ((async_swap_flag == 1) && ((global_index - 4*iteration_length) < three_more_iteration_global_index_threshold) + && ((global_index - iteration_length) >= three_more_iteration_global_index_threshold)){ + string temp_str1 ="Free "; stringstream strm2; strm2<second = last_meta; + last_meta.data_ = nullptr; + table_meta.find(sync_idx)->second = last_meta; } if (sync_dir == 1){ ///sync swap-in, including sync, update block's data_ to new gpu address, update meta. - //if (!(Table_not_at_device.find(last_meta.block_)==Table_not_at_device.end())){ TODO(junzhe) - auto last_meta = Table_meta.find(sync_idx)->second; + auto last_meta = table_meta.find(sync_idx)->second; auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count(); cudaEventSynchronize(last_meta.out_event); auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count(); - Table_not_at_device.erase(last_meta.block_); + table_not_at_device.erase(last_meta.block_); last_meta.block_->update_data(last_meta.data_); - cout<<"----sync in "<second = last_meta; + table_meta.find(sync_idx)->second = last_meta; } } -void SwapGPU::Append(string blockInfo){ +void SwapGPU::Append(string block_info){ + /* + Append Operation block info after each operation + Meantime execute following operations: + insert size for non-malloc operations + update global memory load + control swap flag on and off + update table_meta and table_sched + deploy swap at every index. + test moved from start of malloc/free to end of append, only global_index+1 changed + call PoolOpt to Construct Pool + */ - vector v = swap_split(blockInfo, " "); - void* tempPtr; + vector v = SplitOptString(block_info, " "); + void* temp_ptr; stringstream convert(v[1]); - convert>>tempPtr; - auto tempBlock_ = static_cast(tempPtr); + convert>>temp_ptr; + auto block_ptr = static_cast(temp_ptr); // insert size, malloc : flag, block_, size, t; others: insert size t. if (v.size() != 4) { stringstream strm1; - strm1<size(); - string tempStr1 = strm1.str(); - blockInfo = v[0] + ' ' + v[1] + ' ' + tempStr1 + ' ' + v[2]; + strm1<size(); + string temp_str1 = strm1.str(); + block_info = v[0] + ' ' + v[1] + ' ' + temp_str1 + ' ' + v[2]; } // update global load - if (maxLen < maxLen_threshold){ + if (iteration_length < iteration_length_threshold){ if (v[0] == "Malloc"){ if (global_load.size()>0){ - global_load.push_back(global_load[global_load.size()-1]+tempBlock_->size()); + global_load.push_back(global_load[global_load.size()-1]+block_ptr->size()); } else { - global_load.push_back(tempBlock_->size()); + global_load.push_back(block_ptr->size()); } } else if (v[0] == "Free"){ - global_load.push_back(global_load[global_load.size()-1]-tempBlock_->size()); + global_load.push_back(global_load[global_load.size()-1]-block_ptr->size()); } else { global_load.push_back(global_load[global_load.size()-1]); } } //append into vec_block - vec_block.push_back(blockInfo); - - - //cout<size()<maxLen_threshold)&&((gc-globeCounter+1)==3*maxLen)){ - // fstream file_block_fresh("vec_block_fresh.csv", ios::in|ios::out|ios::app); - // for (int i =0; imaxLen_threshold) && ((gc-location)%(maxLen) == 0)){ - if (tempTime != 0){ - fstream file_time("itr_time.csv", ios::in|ios::out|ios::app); - auto t_now = (std::chrono::system_clock::now()).time_since_epoch().count(); - file_time<<(float)(t_now - tempTime)/(float)(1000000)<size() != size_sequence[r_global_index]){ + async_swap_flag = 0; + cout<<"!!!! async_swap_flag changed back to 0"<size() != sizeSequence[r_gc]){ - asyncSwapFlag = 0; - cout<<"!!!! asyncSwapFlag changed back to 0"<PoolOpt(vec_block_mf); - cout<<"==================to call PoolOpt done"<PoolOpt(vec_block_mf); } } -void* SwapGPU::GetRealGpuPtr(const Block* block_){ - // in case that block is at host memory, swapIn ad hoc. - auto r_idx = Table_not_at_device.find(block_)->second; - - // auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count(); +void* SwapGPU::UpdateGpuPtr(const Block* block_ptr){ + /* + in case that block is not at device memory, swapIn ad hoc. + used in block class to update ptr after swap in done, if variable is not swapped back yet as expected. + */ + auto r_idx = table_not_at_device.find(block_ptr)->second; cudaError_t err; - BlockMeta meta = Table_meta.find(r_idx)->second; + BlockMeta meta = table_meta.find(r_idx)->second; cudaEventCreate (&meta.in_event); - //cout<<"update block and data of r_idx: "<Malloc((void**)&ptr, meta.size); - //cout<<"expected results update_data:: "<update_data(last_meta.data_); - // cout<<"----sync in "<second = last_meta; - Table_meta.find(r_idx)->second = meta; - - // //here should be not update_data() - // auto reading_meta = Table_meta.find(Table_not_at_device.find(block_)->second)->second; - // auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count(); - // cudaEventSynchronize(reading_meta.in_event); - // auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count(); - // //cout<<"GetRealGpuPtr, overhead is: "<second<<" "<update_data(reading_meta.data_); - // //cout<<"last_meta r_idx::::::malloc due to swapIn ( "<second<update_data(static_cast(ptr)); - - cout<<"print ptr from function GetRealGpuPtr() "<second = meta; - return ptr; //TODO(junzhe) attention, based on no change here. + return ptr; } -void SwapGPU::SwapOut_idx(const int r_idx){ - //cout<<"doing asynchrous swapOut of r_idx: "< CPU, and update meta. + */ cudaError_t err; - BlockMeta meta = Table_meta.find(r_idx)->second; + BlockMeta meta = table_meta.find(idx)->second; cudaEventCreate (&meta.out_event); - //cout<<"right before cudaMemcpyAsync Out"<second = meta; - //cout<<"time for asynchrous: "<second = meta; } -void SwapGPU::SwapIn_idx(const int r_idx){ - //logic: extra meta, swap, update meta in Table - //TODO(junzhe) to clean up free(), make it in somewhere else. - auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count(); +void SwapGPU::SwapIn(const int idx){ + /* + memory copy asynchronously CPU -> GPU, and update meta. + */ + cudaError_t err; - BlockMeta meta = Table_meta.find(r_idx)->second; + BlockMeta meta = table_meta.find(idx)->second; cudaEventCreate (&meta.in_event); - //cout<<"update block and data of r_idx: "<Malloc((void**)&ptr, meta.size); + ///append vec_block_mf - if ((asyncSwapFlag == 1) && ((gc - 4*maxLen) < three_more_globeCounter) - && ((gc - maxLen) >= three_more_globeCounter)){ - string tempStr1 ="Malloc "; + if ((async_swap_flag == 1) && ((global_index - 4*iteration_length) < three_more_iteration_global_index_threshold) + && ((global_index - iteration_length) >= three_more_iteration_global_index_threshold)){ + string temp_str1 ="Malloc "; stringstream strm2; strm2<second = meta; - //meta.block_->update_data(meta.data_); //TODO(junzhe) debug only, not the right place to update. - //auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count(); - //cout<<"time for asynchrous: "<second = meta; } -void SwapGPU::SwapOut(const Block* block_){ - if (gc < 1000 && block_->size() > 1<<20) { +void SwapGPU::SwapOutSynchronous(const Block* block_ptr){ + /* + for synchronous swap, collect speed info + */ + if (global_index < 1000 && block_ptr->size() > 1<<20) { fstream file_block5("speed.csv", ios::in|ios::out|ios::app); BlockMeta meta; meta.data_ = meta.block_->get_data(); - void* tempPtr = nullptr; - cudaMallocHost(&tempPtr,block_->size()); //pinned memory. - meta.cpu_ptr = tempPtr; - Table_block_meta[block_] = meta; + void* temp_ptr = nullptr; + cudaMallocHost(&temp_ptr,block_ptr->size()); //pinned memory. + meta.cpu_ptr = temp_ptr; + table_block_meta[block_ptr] = meta; auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count(); cudaError_t err; - err = cudaMemcpy(meta.cpu_ptr, meta.data_,block_->size(),cudaMemcpyDeviceToHost); + err = cudaMemcpy(meta.cpu_ptr, meta.data_,block_ptr->size(),cudaMemcpyDeviceToHost); auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count(); - file_block5<<"Out "<size()<<' '<size()<<' '<size() > 1<<20) { +void SwapGPU::SwapInSynchronous(const Block* block_ptr){ + /* + for synchronous swap, collect speed info + */ + if (global_index < 1000 && block_ptr->size() > 1<<20) { fstream file_block5("speed.csv", ios::in|ios::out|ios::app); - BlockMeta meta = Table_block_meta.find(block_)->second; + BlockMeta meta = table_block_meta.find(block_ptr)->second; auto t1 = (std::chrono::system_clock::now()).time_since_epoch().count(); cudaError_t err; - err = cudaMemcpy(meta.data_, meta.cpu_ptr,block_->size(),cudaMemcpyHostToDevice); + err = cudaMemcpy(meta.data_, meta.cpu_ptr,block_ptr->size(),cudaMemcpyHostToDevice); auto t2 = (std::chrono::system_clock::now()).time_since_epoch().count(); - file_block5<<"In "<size()<<' '<size()<<' '< colorRange; - vector> colorOccupied; -}; -Vertex::Vertex(int n, size_t s, int r1, int d1){ - name =n; - size = s; - r = r1; - d = d1; -}//end of class Vertex - ///Section for structs and respective sorting function: -// onePieceMsg_pool, onePairMsg, oneIterMsg, version 11/30 3pm -struct onePieceMsg_pool{ +// PoolOptInfo, PoolBlockLifeTime, PoolOptSimplifiedInfo +struct PoolOptInfo{ /* - members: [ptr, size, MallocFree, idx] + members: [ptr, size, operation_type, idx] */ string ptr; size_t size; - int MallocFree; + int operation_type; int idx; - onePieceMsg_pool(string p, size_t s, int M, int i):ptr(p),size(s),MallocFree(M),idx(i){} + PoolOptInfo(string p, size_t s, int M, int i):ptr(p),size(s),operation_type(M),idx(i){} }; -struct less_than_ptrIdx{ - /* - sort onePieceMsg_pool by ptr and then idx. - */ - inline bool operator() (const onePieceMsg_pool& struct1, const onePieceMsg_pool& struct2) - { - return ((struct1.ptrstruct2.size); - } -}; - -struct less_than_size_rIdx{ - /* - sort onePairMsg by descending size and r_idx - */ - inline bool operator() (const onePairMsg& struct1, const onePairMsg& struct2) - { - return ((struct1.size>struct2.size)||((struct1.size==struct2.size)&&(struct1.r_idxstruct2.size); + } }; -struct less_than_lookupIdx{ - /* - sort lookUpElement by idx. - */ - inline bool operator() (const lookUpElement& struct1, const lookUpElement& struct2) - { - return (struct1.r_idxstruct2.size)||((struct1.size==struct2.size)&&(struct1.r_idx split(string s, string delimiter) { - size_t pos_start = 0, pos_end, delim_len = delimiter.length(); - string token; - vector res; - while ((pos_end = s.find(delimiter, pos_start)) != string::npos) { - token = s.substr(pos_start, pos_end - pos_start); - pos_start = pos_end + delim_len; - res.push_back(token); - } - res.push_back(s.substr(pos_start)); - return res; +vector SplitString(string s, string delimiter) { + /// string delimiter + size_t pos_start = 0, pos_end, delim_len = delimiter.length(); + string token; + vector res; + while ((pos_end = s.find(delimiter, pos_start)) != string::npos) { + token = s.substr(pos_start, pos_end - pos_start); + pos_start = pos_end + delim_len; + res.push_back(token); + } + res.push_back(s.substr(pos_start)); + return res; } -///Section of converting text file -->vector of Sring --> pieceMsg -->pairMsg -->iterMsg -//vector of pairMsg is used in run. -//vector of iterMsg is used in test. -vector strVec_2_pieceMsgVec(vector vec, int &idxRange){ +vector PoolOptSeqStrToStruct(vector vec, int &idx_range){ /* - convert vector of string into vector of onePieceMsg_pool, sorted by ptr and then idx, and update idxRange to pieceMsgVec size. + convert vector of string into vector of PoolOptInfo, + sorted by ptr and then idx, and update idx_range to pieceMsgVec size. */ - vectoronePieceMsg_poolVec_; + vectorvec_pool_opt_info; for (int i=0;i v = split(vec[i], " "); + vector v = SplitString(vec[i], " "); if (v[0]=="Malloc"){ //convert v[2] from str to size_t size_t result; @@ -262,410 +230,346 @@ vector strVec_2_pieceMsgVec(vector vec, int &idxRange) result =-1; cout<<"error for converting size from str to int."<(onePieceMsg_poolVec_.size()); + sort(vec_pool_opt_info.begin(),vec_pool_opt_info.end(),sort_by_ptr_idx_ascending()); + idx_range = static_cast(vec_pool_opt_info.size()); - return onePieceMsg_poolVec_; -}// end of strVec_2_pieceMsgVec function + return vec_pool_opt_info; +} -pair,vector> pieceMsgVec_2_pairOfPairMsgVec(vectoronePieceMsg_poolVec_, int idxRange){ - /* - pairMsg is grouped into 1. normal blocks 2. cross-iteration blocks. - */ - vectoronePairMsgVec_1; - vectoronePairMsgVec_2; - int i=0; - - //while loop processes a pair at each time, if got a pair. - while (i<(onePieceMsg_poolVec_.size()-1)){ - //condition A: start with free. do nothing. - if (onePieceMsg_poolVec_[i].MallocFree==-1){ - i+=1; - } - //condition B: start with Malloc, next item same ptr and is free. - if ((onePieceMsg_poolVec_[i].MallocFree==1)&& (onePieceMsg_poolVec_[i+1].MallocFree==-1)&&((onePieceMsg_poolVec_[i].ptr==onePieceMsg_poolVec_[i+1].ptr))){ - onePairMsg tempPair(onePieceMsg_poolVec_[i].idx,onePieceMsg_poolVec_[i].size,onePieceMsg_poolVec_[i].idx,onePieceMsg_poolVec_[i+1].idx); - onePairMsgVec_1.push_back(tempPair); - i+=2; - } - // condition C: start with Malloc, no free. - if ((onePieceMsg_poolVec_[i].MallocFree==1)&&(onePieceMsg_poolVec_[i].ptr!=onePieceMsg_poolVec_[i+1].ptr)){ - onePairMsg tempPair(onePieceMsg_poolVec_[i].idx,onePieceMsg_poolVec_[i].size,onePieceMsg_poolVec_[i].idx,idxRange); - onePairMsgVec_2.push_back(tempPair); - i+=1; - } - }//end of while - //condition D: if still left with the last item - if ((i,vector> PoolOptInfoToBlockLifeTime(vectorvec_pool_opt_info, int idx_range){ + /* + convert vector of opt info into vector of block life time + return a pair of vectors: 1. normal blocks 2. cross-iteration blocks. + */ + vectorvec_block_life_time1; + vectorvec_block_life_time2; + int i=0; + + //while loop processes a pair at each time, if got a pair. + while (i<(vec_pool_opt_info.size()-1)){ + //condition: start with free. do nothing. + if (vec_pool_opt_info[i].operation_type==-1){ i+=1; } - - //sort both pairVec - sort(onePairMsgVec_1.begin(),onePairMsgVec_1.end(),less_than_size_rIdx()); - sort(onePairMsgVec_2.begin(),onePairMsgVec_2.end(),less_than_size_rIdx()); - pair,vector>pairOfPairMsgVec_(onePairMsgVec_1,onePairMsgVec_2); - - return pairOfPairMsgVec_; -}//end of pieceMsgVec_2_pairOfPairMsgVec function - -///Section of coloring algorithm. mergeSeg and then FFallocation when building edges of the graph. -vector> mergeSeg(vector> colorOccupied){ - /* - version 12/9 11am -- modify to accomodate unsigned int/size_t - input:the collection of color ranges that is once occupied by some block during a block's life time. - function: merge consecutive/overlapping segments of colorOccupied - output: merged segments in ascending order. - time complexity: O(n) for run, O(n^2) for verify section(optional), where n is size of colorOccupied. - */ - sort(colorOccupied.begin(), colorOccupied.end()); - - if(colorOccupied.size()<=1){ - return colorOccupied; - } - - int m = 0; - while (m<(colorOccupied.size()-1)){ - - if ((colorOccupied[m].second +2)> colorOccupied[m+1].first){ - pairtempItem(colorOccupied[m].first,max(colorOccupied[m].second,colorOccupied[m+1].second)); - //remove m+1 and m - colorOccupied.erase(colorOccupied.begin()+m+1); - colorOccupied.erase(colorOccupied.begin()+m); - //insert the combined range - colorOccupied.insert(colorOccupied.begin()+m,tempItem); - }else{ - m+=1; - } - }//end of while loop - - //verify if mergeSeg is completed. O(n^2) optional -// if(colorOccupied.size()>1){ -// for (int i=0;i<(colorOccupied.size()-1);i++){ -// if(colorOccupied[i].second>=colorOccupied[i+1].first){ -// cout<<"error in mergeSeg"< FFallocation(vector> colorMerged,size_t size, size_t local_offset){ - /* - version 12/2 4pm - First Fit weighted coloring - return a pair standing for colorRange. - local_offset shifts the returned colorRange, allowing multiple run(). - local_offset not changable, whereas offset is changable. - */ - // condition A: if no occupied, put after the local_offset - if (colorMerged.size()==0){ - return pair(0+local_offset,size-1+local_offset); - } - - // condition B: able to fit before first block, after the local_offset - if ((size+local_offset)<(colorMerged[0].first+1)){ - return pair(0+local_offset,size-1+local_offset); + //condition: start with Malloc, next item same ptr and is free. + if ((vec_pool_opt_info[i].operation_type==1)&& (vec_pool_opt_info[i+1].operation_type==-1)&&((vec_pool_opt_info[i].ptr==vec_pool_opt_info[i+1].ptr))){ + PoolBlockLifeTime temp_block_life_time(vec_pool_opt_info[i].idx,vec_pool_opt_info[i].size,vec_pool_opt_info[i].idx,vec_pool_opt_info[i+1].idx); + vec_block_life_time1.push_back(temp_block_life_time); + i+=2; } - - size_t yLocation= -1; - if (colorMerged.size()>1) { - int n = 0; - while (n<(colorMerged.size()-1)){ - // condition C: able to fit in between middle blocks. - if ((colorMerged[n+1].first-colorMerged[n].second-1)>=size){ - yLocation = colorMerged[n].second+1; - break; - } - n+=1; - }//end of while loop. - // condition D: allocate after the last block. - if (yLocation == -1){ - yLocation = colorMerged[colorMerged.size()-1].second+1; - } - }// end of if loop, conditon C and D. - - // condition E: colorMeger len =1, allocate after the last block. - if (colorMerged.size()==1){ - yLocation = colorMerged[0].second+1; - } - - if (yLocation==-1){ - cout<<"error in FFallocation!!!"<(yLocation,yLocation+size-1); -}//end of FFallocation function + }//end of while + //condition: if still left with the last item + if ((i,vector>pair_vec_block_life_time(vec_block_life_time1,vec_block_life_time2); + + return pair_vec_block_life_time; +} -pair BFallocation(vector> colorMerged,size_t size, size_t local_offset){ - /* - version 12/11 1pm - Best Fit allocation, input and output same as FFallocation - */ - // condition A: if no occupied, put after the local_offset - if (colorMerged.size()==0){ - return pair(0+local_offset,size-1+local_offset); - } - //condition B: if size=1, able to fit before the first block - if ((colorMerged.size()==1)&&((size+local_offset)<(colorMerged[0].first+1))){ - return pair(0+local_offset,size-1+local_offset); - } - - //condition C: else of B - if ((colorMerged.size()==1)&&((size+local_offset)>=(colorMerged[0].first+1))){ - return pair(colorMerged[0].second+1,colorMerged[0].second+size); - } - - //condition D and E: - size_t yLocation=-1; - pairtempHole(-1,-1); // n, hole size between n and n+1 - if (colorMerged.size()>1) { - int n = 0; - while (n<(colorMerged.size()-1)){ - // condition C: able to fit in between middle blocks. select smallest. - if (((colorMerged[n+1].first-colorMerged[n].second-1)>=size)&&((colorMerged[n+1].first-colorMerged[n].second-1)> MergeColoredSegments(vector> vec_color_preoccupied){ + /* + merge consecutive/overlapping segments of vec_color_preoccupied + input:the collection of color ranges that is once occupied by some block during a block's life time. + output: merged segments in ascending order. + time complexity: O(n) for run, O(n^2) for verify section(optional), where n is size of vec_color_preoccupied. + */ + sort(vec_color_preoccupied.begin(), vec_color_preoccupied.end()); + + if(vec_color_preoccupied.size()<=1){ + return vec_color_preoccupied; + } + + int m = 0; + while (m<(vec_color_preoccupied.size()-1)){ + if ((vec_color_preoccupied[m].second +2)> vec_color_preoccupied[m+1].first){ + pairtempItem(vec_color_preoccupied[m].first,max(vec_color_preoccupied[m].second,vec_color_preoccupied[m+1].second)); + //remove m+1 and m + vec_color_preoccupied.erase(vec_color_preoccupied.begin()+m+1); + vec_color_preoccupied.erase(vec_color_preoccupied.begin()+m); + //insert the combined range + vec_color_preoccupied.insert(vec_color_preoccupied.begin()+m,tempItem); + }else{ + m+=1; } - - return pair(yLocation,yLocation+size-1); + }//end of while loop + + return vec_color_preoccupied; } -vector colorSomeVertices(vector pairMsgVec_, size_t &offset,string colorMethod){ - /* - color all or 1/2 vertices using mergeSeg() and FFallocation(), with update offset. - time complexity: O(n^2). - */ - size_t local_offset = offset; //feed into FFallocation, shall never change. - int m = static_cast(pairMsgVec_.size()); - //init all vertices - vectorvertices; - for (int i=0; i FirstFitAllocation(vector> vec_color_merged,size_t size, size_t local_offset){ + /* + First Fit weighted coloring + return a pair standing for color_range. + local_offset shifts the returned color_range, allowing multiple Plan(). + local_offset not changable, whereas offset is changable. + */ + // condition: if no occupied, put after the local_offset + if (vec_color_merged.size()==0){ + return pair(0+local_offset,size-1+local_offset); + } + + // condition: able to fit before first block, after the local_offset + if ((size+local_offset)<(vec_color_merged[0].first+1)){ + return pair(0+local_offset,size-1+local_offset); + } + + size_t y_location= -1; + if (vec_color_merged.size()>1) { + int n = 0; + while (n<(vec_color_merged.size()-1)){ + // condition: able to fit in between middle blocks. + if ((vec_color_merged[n+1].first-vec_color_merged[n].second-1)>=size){ + y_location = vec_color_merged[n].second+1; + break; + } + n+=1; + }//end of while loop. + // condition: allocate after the last block. + if (y_location == -1){ + y_location = vec_color_merged[vec_color_merged.size()-1].second+1; } + }// end of if loop, conditon C and D. + + // condition: colorMeger len =1, allocate after the last block. + if (vec_color_merged.size()==1){ + y_location = vec_color_merged[0].second+1; + } + + if (y_location==-1){ + cout<<"error in FirstFitAllocation!!!"<(y_location,y_location+size-1); +} - int **adj; - adj = new int*[m]; //TODO(junzhe) should be deleted somewhere. - // build edges with values 1 and 0; combine with mergeSeg and FFallocation in the loop. - for (int i=0; i>colorMerged = mergeSeg(vertices[i].colorOccupied); - - if(colorMethod=="FF"){ - vertices[i].colorRange = FFallocation(colorMerged,vertices[i].size, local_offset); - - }else{ //BF - vertices[i].colorRange = BFallocation(colorMerged,vertices[i].size, local_offset); - } - //update of offset, largest memory footprint as well. - if (vertices[i].colorRange.second >=offset){ - offset = vertices[i].colorRange.second+1; - } - }//end of for loop. +pair BestFitAllocation(vector> vec_color_merged,size_t size, size_t local_offset){ + /* + Best Fit allocation, input and output same as FirstFitAllocation + */ + // condition: if no occupied, put after the local_offset + if (vec_color_merged.size()==0){ + return pair(0+local_offset,size-1+local_offset); + } + //condition: if size=1, able to fit before the first block + if ((vec_color_merged.size()==1)&&((size+local_offset)<(vec_color_merged[0].first+1))){ + return pair(0+local_offset,size-1+local_offset); + } + + //condition: lese of second condition + if ((vec_color_merged.size()==1)&&((size+local_offset)>=(vec_color_merged[0].first+1))){ + return pair(vec_color_merged[0].second+1,vec_color_merged[0].second+size); + } + + size_t y_location=-1; + pairtemp_hole(-1,-1); // n, hole size between n and n+1 + if (vec_color_merged.size()>1) { + int n = 0; + while (n<(vec_color_merged.size()-1)){ + // condition: able to fit in between middle blocks. select smallest. + if (((vec_color_merged[n+1].first-vec_color_merged[n].second-1)>=size)&&((vec_color_merged[n+1].first-vec_color_merged[n].second-1)(y_location,y_location+size-1); } +vector AssignColorToVertices(vector vec_block_life_time, size_t &offset,string color_method){ + /* + color all or 1/2 vertices using MergeColoredSegments() and FirstFitAllocation(), with updated offset. + time complexity: O(n^2). + */ + size_t local_offset = offset; //feed into FirstFitAllocation, shall never change. + int m = static_cast(vec_block_life_time.size()); + //init all vertices + vectorvertices; + for (int i=0; i,map> cross_itr_durations(vectorvec_double, int location, int maxLen, int &doubleRange){ - - vectoronePieceMsg_poolVec_2 = strVec_2_pieceMsgVec(vec_double,doubleRange); - pair,vector>pairOfPairMsgVec_2=pieceMsgVec_2_pairOfPairMsgVec(onePieceMsg_poolVec_2,doubleRange); - - mapTable_r2d; //full duration info, cross-iteration duration. - mapTable_d2r; - for (int i=0;i,map>(Table_r2d,Table_d2r); -} - -/// main run funtion -vector run(vectorvec, int &idxRange, size_t &offset, size_t &offsetCrossItr,string colorMethod){ - /* - run function, input vector of strings, return colored vertices, - update idxRange, offset. - time complexity: O(n^2) where n is maxLen. - */ - vectoronePieceMsg_poolVec_ = strVec_2_pieceMsgVec(vec,idxRange); - pair,vector>pairOfPairMsgVec_=pieceMsgVec_2_pairOfPairMsgVec(onePieceMsg_poolVec_,idxRange); - //1. normal blocks 2. cross-iteration blocks. - vectorpairMsgVec_1 = pairOfPairMsgVec_.first; - vectorpairMsgVec_2 = pairOfPairMsgVec_.second; - - vectorvertices_2 = colorSomeVertices(pairMsgVec_2,offset,colorMethod); - for (int i=0; i>vec_color_merged = MergeColoredSegments(vertices[i].vec_color_preoccupied); + + if(color_method=="FF"){ + vertices[i].color_range = FirstFitAllocation(vec_color_merged,vertices[i].size, local_offset); + + }else{ //BF + vertices[i].color_range = BestFitAllocation(vec_color_merged,vertices[i].size, local_offset); } - offsetCrossItr = offset; - offset = offsetCrossItr*2; - vectorvertices = colorSomeVertices(pairMsgVec_1,offset,colorMethod); - //merge - vertices.insert(vertices.end(),vertices_2.begin(),vertices_2.end()); - return vertices; + //update of offset, largest memory footprint as well. + if (vertices[i].color_range.second >=offset){ + offset = vertices[i].color_range.second+1; + } + }//end of for loop. + + return vertices; } -///Section of test functions. -vector pairOfPairMsgVec_2_repSeq(pair,vector>pairOfPairMsgVec_){ - int counter_1M=0; int counter_1F=0; int counter_2=0; - vectoronePairMsgVec_1 = pairOfPairMsgVec_.first; - vectoronePairMsgVec_2 = pairOfPairMsgVec_.second; - vectoroneIterMsgVec_; - for (int i =0; i(onePairMsgVec_1[i].d_idx-onePairMsgVec_1[i].r_idx); - oneIterMsg tempIterF(temp_s_d,-1,onePairMsgVec_1[i].d_idx); - oneIterMsgVec_.push_back(tempIterF); - counter_1F++; - } - - for (int i =0; irep; // vector of size_delta, name it as rep for simlisity. - for (int i =0; i,map> GetCrossIterationBlocks(vectorvec_double, int location_2nd_iteration, int iteration_length, int &double_range){ + ///get cross-iteration duration blocks + vectorvec_pool_opt_info2 = PoolOptSeqStrToStruct(vec_double,double_range); + pair,vector>pair_vec_block_life_time2=PoolOptInfoToBlockLifeTime(vec_pool_opt_info2,double_range); + + maptable_ridx_to_didx; //full duration info, cross-iteration duration. + maptable_didx_to_ridx; + for (int i=0;i,map>(table_ridx_to_didx,table_didx_to_ridx); +} - return rep; -}//end of pairOfPairMsgVec_2_repSeq function +///Section of test functions. +vector PoolOptSeqRepeatableTestPreProcess(pair,vector>pair_vec_block_life_time){ + /* + pre process pair of vector of block life time info, for ease of repeatable test. + */ + vectorvec_block_life_time1 = pair_vec_block_life_time.first; + vectorvec_block_life_time2 = pair_vec_block_life_time.second; + vectorvec_pool_opt_simplified_info; + + //process Malloc and Free pair, i.e. normal blocks + for (int i =0; i(vec_block_life_time1[i].d_idx-vec_block_life_time1[i].r_idx); + PoolOptSimplifiedInfo tempIterF(temp_s_d,-1,vec_block_life_time1[i].d_idx); + vec_pool_opt_simplified_info.push_back(tempIterF); + } + + //process Malloc-only blocks, i.e. cross-iteration blocks + for (int i =0; ivec_rep; // vector of size_delta, name it as vec_rep for simlisity. + for (int i =0; i maxRepeatedSeg(vectorrep, int idxRange, int &maxLen, int &location){ - /* - get max repeated non-overlapping Seg of a vector, return the repeated segment, - update maxLen, and location of where Seg starts to repeat. - brtue force method using equal() - time complexity O(n^2) - */ - for (int i=0; isubSeq(&rep[location],&rep[location+maxLen]); - if(!(equal(rep.begin()+location,rep.begin()+maxLen-1+location,subSeq.begin()) && equal(rep.begin()+location+maxLen,rep.begin()+2*maxLen-1+location,subSeq.begin()))){ - cout<<"error in get the maxRep"<subSeq, int &maxLen, int &location){ - /* - to cut, in case the repeated Seg contains multiple iterations. - */ - int tempMaxLen=0; - int tempLocation =0; - int tempIdxRange = maxLen; - - vectortempSubSeq = maxRepeatedSeg(subSeq,tempIdxRange,tempMaxLen, tempLocation); - //TODO(junzhe), tunable threshold. - int threshold =50; - if (tempMaxLen>threshold){ - maxLen = tempMaxLen; - location += tempLocation; - cout<<"max length get cut"< PoolRepeatableTest(vectorrep, int idx_range, int &iteration_length, int &location_2nd_iteration){ + /* + get max repeated non-overlapping Seg of a vector, return the repeated segment, + update iteration_length, and location_2nd_iteration of where Seg starts to repeat. + brtue force method using equal() + time complexity O(n^2) + */ + for (int i=0; isub_sequence(&rep[location_2nd_iteration],&rep[location_2nd_iteration+iteration_length]); + if(!(equal(rep.begin()+location_2nd_iteration,rep.begin()+iteration_length-1+location_2nd_iteration,sub_sequence.begin()) && equal(rep.begin()+location_2nd_iteration+iteration_length,rep.begin()+2*iteration_length-1+location_2nd_iteration,sub_sequence.begin()))){ + cout<<"error in get the maxRep"<vec3, int &maxLen, int &location){ +void VerifyRepeatableTest(vectorsub_sequence, int &iteration_length, int &location_2nd_iteration){ /* - main function of test, returns globeCounter, which is when flag shall be switched, - update maxLen and location of where the repeated Seg starts. - */ - cout<<"====================== test ========================="<onePieceMsg_poolVec_3 =strVec_2_pieceMsgVec(vec3,idxRange3); - pair,vector>pairOfPairMsgVec_=pieceMsgVec_2_pairOfPairMsgVec(onePieceMsg_poolVec_3,idxRange3); - vectorrep=pairOfPairMsgVec_2_repSeq(pairOfPairMsgVec_); + to cut, in case the repeated Segment returned by PoolRepeatableTest contains multiple iterations. + */ + int temp_iteration_length = 0; + int temp_location_2nd_iteration = 0; + int temp_idx_range = iteration_length; + + //verify by testing its subsequence again + vectortempsub_sequence = PoolRepeatableTest(sub_sequence,temp_idx_range,temp_iteration_length, temp_location_2nd_iteration); - //get repeated sub vector. - vectorsubSeq = maxRepeatedSeg(rep,idxRange3,maxLen,location); - //cout<100){ //TODO(junzhe) tunable threshold. - cout<<"new location and maxLen: "<threshold){ + iteration_length = temp_iteration_length; + location_2nd_iteration += temp_location_2nd_iteration; } - return globeCounter; } + ///verify if coloring got overlapping -void overlap_test(vector vertices){ +void OverlapVerification(vector vertices){ size_t s = vertices.size(); int i,j; for (i=0; i vertices){ SmartMemPool::SmartMemPool(const MemPoolConf &conf){ - //TODO(junzhe) to figure out what to do here. - colorMethod = "BF"; + color_method = "BF"; conf_ = conf; } void SmartMemPool::Init(){ - //TODO(junzhe) Note, this is dummy here, not catter multiple GPU. mtx_.lock(); if(!initialized_){ initialized_ =true; @@ -689,291 +591,332 @@ void SmartMemPool::Init(){ } + +int SmartMemPool::Detection(vectorvec_string_test, int &iteration_length, int &location_2nd_iteration){ + /* + Testing repeatability from raw operation sequence + returns global_index_threshold, which is when flag shall be switched, + update iteration_length and location_2nd_iteration of where the repeated Seg starts. + */ + int idx_range_test=0; + vectorvec_pool_opt_info3 = PoolOptSeqStrToStruct(vec_string_test,idx_range_test); + pair,vector>pair_vec_block_life_time = PoolOptInfoToBlockLifeTime(vec_pool_opt_info3,idx_range_test); + vectorvec_rep = PoolOptSeqRepeatableTestPreProcess(pair_vec_block_life_time); + + //repeatable test with verification + vectorsub_sequence = PoolRepeatableTest(vec_rep,idx_range_test,iteration_length,location_2nd_iteration); + VerifyRepeatableTest(sub_sequence, iteration_length, location_2nd_iteration); + + //update global_index_threshold if test past, i.e. iteration_length exceed certain threshold + if (iteration_length>100){ //tunable threshold. + global_index_threshold = idx_range_test+iteration_length-(idx_range_test-location_2nd_iteration)%iteration_length; + } + return global_index_threshold; +} + + +/// main run funtion +vector SmartMemPool::Plan(vectorvec, int &idx_range, size_t &offset, size_t &offset_cross_iteration,string color_method){ + /* + Planning, i.e. Assign Color to Vertices from raw operation sequence info. + input vector of strings, return colored vertices, + update idx_range, offset. + time complexity: O(n^2) where n is iteration_length. + */ + + vectorvec_pool_opt_info = PoolOptSeqStrToStruct(vec,idx_range); + pair,vector>pair_vec_block_life_time=PoolOptInfoToBlockLifeTime(vec_pool_opt_info,idx_range); + + //coloring normal blocks and cross-iteration blocks separately, cannot be miss-matched. + vectorvec_block_life_time1 = pair_vec_block_life_time.first; + vectorvec_block_life_time2 = pair_vec_block_life_time.second; + + //color cross-iteration blocks + vectorvertices_2 = AssignColorToVertices(vec_block_life_time2,offset,color_method); + + for (int i=0; ivertices = AssignColorToVertices(vec_block_life_time1,offset,color_method); + + //merge after coloring + vertices.insert(vertices.end(),vertices_2.begin(),vertices_2.end()); + + return vertices; +} + + ///Malloc void SmartMemPool::Malloc(void** ptr, const size_t size){ - /* - 1. switch flag when gc == globeCounter, construct lookup table and malloc the whole pool. - 2. if flag=0, malloc/cudaMalloc, collect vec string - 3. if flag=1, look up table, malloc/cudaMalloc if not in the Table - 4. test repeated sequence every 100 blocks, update globeCounter. - */ - - //TODO(junzhe) Note, this is dummy here, not catter multiple GPU. - //fstream file("memInfo.text", ios::in|ios::out|ios::app); //a. - //file<vec_raw_opt_info(&vec[location_2nd_iteration],&vec[location_2nd_iteration+iteration_length]); - if (gc == globeCounter){ - /// 1. switch flag when gc == globeCounter, construct lookup table and malloc the whole pool. - - mallocFlag=1; - cout<<"switched to color-malloc"<vec_run(&vec[location],&vec[location+maxLen]); - - vectorvertices = run(vec_run, idxRange,offset,offsetCrossItr, colorMethod); + //color vertices + vectorvertices = Plan(vec_raw_opt_info,idx_range,offset,offset_cross_iteration,color_method); - //here to verify if the coloring got overlapping. TODO(junzhe) optional - //overlap_test(vertices); - - //obtain the cross-iteration duration info - int doubleRange=0; - vectorvec_double(&vec[location],&vec[location+2*maxLen]); - pair,map>pairs =cross_itr_durations(vec_double, location,maxLen,doubleRange); - Table_r2d = pairs.first; - Table_d2r = pairs.second; - - //update ptrPool - cudaMalloc(&ptrPool,offset); //poolSize or memory foot print offset. - cout<<"ptrPool is: "<second; - temp.size =vertices[i].size; - temp.offset=vertices[i].colorRange.first; - temp.ptr = (void*)((char*)ptrPool+temp.offset*sizeof(char)); - temp.Occupied =0; - temp.crossItr = vertices[i].crossItr; - temp.Occupied_backup =0; - //build tables for lookup. - Vec_r2Ver[vertices[i].r].second= temp; - } - } + //here to verify if the coloring got overlapping. for verify purpose only. + //OverlapVerification(vertices); - if(mallocFlag==0){ - /// 2. if flag=0, malloc/cudaMalloc - cudaMalloc(ptr, size); - allocatedPtr = *ptr; - //update load - if(loadLogFlag==1){ - if (gc>0){ - Table_load[gc]=make_pair(Table_load.find(gc-1)->second.first+size,Table_load.find(gc-1)->second.second); - }else{ //very first block - Table_load[gc]=make_pair(size,0); - } - } - //push_back the string for later test and run. - string tempStr1 ="Malloc "; - stringstream strm2; - strm2<second.first,Table_load.find(gc-1)->second.second+size); - } - //file<<" Condition M2, addr: "<<*ptr<second.first,Table_load.find(gc-1)->second.second+size); - } - //file<<" Condition M4, addr: "<<*ptr<second.first+size,Table_load.find(gc-1)->second.second); - } - //file<<" Condition M3, addr: "<<*ptr<vec_double(&vec[location_2nd_iteration],&vec[location_2nd_iteration+2*iteration_length]); + pair,map>pairs =GetCrossIterationBlocks(vec_double,location_2nd_iteration,iteration_length,double_range); + table_ridx_to_didx = pairs.first; + table_didx_to_ridx = pairs.second; - ///4. test repeated sequence every 100 blocks, update globeCounter. - if (((gc+1)%300==0) && (mallocFlag==0) && (globeCounter==-1)&&(gc+2>checkPoint)){ - cout<<"gc and GC before test: "<0)){ - getMaxLoad(); - loadLogFlag=0; + for (int i=0; isecond; + temp.size =vertices[i].size; + temp.offset=vertices[i].color_range.first; + temp.ptr = (void*)((char*)ptr_pool+temp.offset*sizeof(char)); + temp.occupied =0; + temp.cross_iteration = vertices[i].cross_iteration; + temp.occupied_backup =0; + //build tables for lookup. + vec_block_meta[vertices[i].r].second= temp; } - - gc++; - Table_p2s[allocatedPtr]=size; //update it for load tracking purpose. - *ptr = allocatedPtr; - ///update block_RWMF - string tempStr1 ="Malloc "; + } + /// 2. if flag=0, malloc/cudaMalloc, accumulate vec_info at the beginning iterations. + if(malloc_flag ==0){ + cudaMalloc(ptr, size); + allocated_ptr = *ptr; + //update load + if(load_flag==1){ + if (global_index>0){ + table_load[global_index]=make_pair(table_load.find(global_index-1)->second.first+size,table_load.find(global_index-1)->second.second); + }else{ //very first block + table_load[global_index]=make_pair(size,0); + } + } + //push_back the string for later test and run. + string temp_str1 ="Malloc "; stringstream strm2; - strm2<(std::chrono::system_clock::now().time_since_epoch()).count(); - stringstream strm4; - strm4<second.first,table_load.find(global_index-1)->second.second+size); + } + }else if ((vec_block_meta[lookup_idx].second.cross_iteration==1) && (vec_block_meta[lookup_idx].second.occupied==1) && (vec_block_meta[lookup_idx].second.occupied_backup ==0)) { + //condition: cross_iteration's backup + allocated_ptr = (void*)((char*)vec_block_meta[lookup_idx].second.ptr+offset_cross_iteration*sizeof(char)); + vec_block_meta[lookup_idx].second.occupied_backup=1; + table_ptr_to_ridx[allocated_ptr]=lookup_idx; + //update load + if(load_flag==1){ + table_load[global_index]=make_pair(table_load.find(global_index-1)->second.first,table_load.find(global_index-1)->second.second+size); + } + } + }else{ + //condition: size not proper or both occupied. + cudaMalloc(ptr, size); + allocated_ptr = *ptr; + //update load + if(load_flag==1){ + table_load[global_index]=make_pair(table_load.find(global_index-1)->second.first+size,table_load.find(global_index-1)->second.second); + } + } + } //end of loop for flag=1 + + ///4. test repeated sequence every 300 index, update global_index_threshold. + if (((global_index+1)%300==0) && (malloc_flag ==0) && (global_index_threshold==-1)&&(global_index+2>check_point)){ + global_index_threshold = Detection(vec,iteration_length,location_2nd_iteration); + check_point=check_point*2; + } + + ///get load info, when global_index == global_index+2iteration_length + if (global_index==(global_index_threshold+2*iteration_length)&& (global_index_threshold>0)){ + GetMaxLoad(); + load_flag=0; + } + + global_index++; + //update it for load tracking purpose. + table_ptr_to_size[allocated_ptr]=size; + + //update *ptr + *ptr = allocated_ptr; + + ///update block_RWMF + string temp_str1 ="Malloc "; + stringstream strm2; + strm2<(std::chrono::system_clock::now().time_since_epoch()).count(); + stringstream strm4; + strm4<second; + size_t deallocatedSize = table_ptr_to_size.find(ptr)->second; + + /// at the begining iterations, via cudaFree, accumulate opt info. + if ((global_index_threshold==-1)||(global_indexsecond.first-deallocatedSize,Table_load.find(gc-1)->second.second); - } - /// before flag switch, for sure all free shall be done by free() - cudaFree(ptr); - }else{ - if (!(Table_p2r.find(ptr)==Table_p2r.end())){ - int resp_rIdx = Table_p2r.find(ptr)->second; - Table_p2r.erase(ptr); - - if (ptr == Vec_r2Ver[resp_rIdx].second.ptr){ - //Condition F2, from M2 - Vec_r2Ver[resp_rIdx].second.Occupied =0; //freed, able to allocate again. - //file<<" Condition F2, addr: "<0) && ((float)((char*)ptr-((char*)ptrPool+2*offsetCrossItr*sizeof(char)))<0)){ - Vec_r2Ver[resp_rIdx].second.Occupied_backup =0; - }else{ - Vec_r2Ver[resp_rIdx].second.Occupied =0; - } - } - //update load - if(loadLogFlag==1){ - Table_load[gc]=make_pair(Table_load.find(gc-1)->second.first,Table_load.find(gc-1)->second.second-deallocatedSize); - } + //update load before free + if(load_flag==1){ + table_load[global_index]=make_pair(table_load.find(global_index-1)->second.first-deallocatedSize,table_load.find(global_index-1)->second.second); + } + // before flag switch, for sure all free shall be done by free() + cudaFree(ptr); + }else{ + /// cases that no need accumulating opt info + + /// free a ptr that is in the memory pool + if (!(table_ptr_to_ridx.find(ptr)==table_ptr_to_ridx.end())){ + int resp_rIdx = table_ptr_to_ridx.find(ptr)->second; + table_ptr_to_ridx.erase(ptr); + + if (ptr == vec_block_meta[resp_rIdx].second.ptr){ + vec_block_meta[resp_rIdx].second.occupied =0; //freed, able to allocate again. + }else if (ptr == (void*)((char*)vec_block_meta[resp_rIdx].second.ptr+offset_cross_iteration*sizeof(char))){ + vec_block_meta[resp_rIdx].second.occupied_backup =0; + } else{ + if (((float)((char*)ptr-((char*)ptr_pool+offset_cross_iteration*sizeof(char)))>0) && ((float)((char*)ptr-((char*)ptr_pool+2*offset_cross_iteration*sizeof(char)))<0)){ + vec_block_meta[resp_rIdx].second.occupied_backup =0; }else{ - //update load - if(loadLogFlag==1){ - Table_load[gc]=make_pair(Table_load.find(gc-1)->second.first-deallocatedSize,Table_load.find(gc-1)->second.second); - } - //file<<" Condition F3, addr: "<second.first,table_load.find(global_index-1)->second.second-deallocatedSize); + } + }else{ + /// free a ptr that is NOT in the memory pool + + //update load + if(load_flag==1){ + table_load[global_index]=make_pair(table_load.find(global_index-1)->second.first-deallocatedSize,table_load.find(global_index-1)->second.second); + } + cudaFree(ptr); } - gc++; - ///update block_RWMF - string tempStr1 ="Free "; - stringstream strm2; - strm2<(std::chrono::system_clock::now().time_since_epoch()).count(); - stringstream strm4; - strm4<(std::chrono::system_clock::now().time_since_epoch()).count(); + stringstream strm4; + strm4<cudaLoadLog; - for (int i=0; isecond.first); - } - size_t maxCudaLoad = *max_element(cudaLoadLog.begin(),cudaLoadLog.end()); - int idxMaxCudaLoad = static_cast(distance(cudaLoadLog.begin(),max_element(cudaLoadLog.begin(),cudaLoadLog.end()))); - - vectorcolorLoadLog; - for (int i=0; isecond.second); - } - size_t maxColorLoad = *max_element(colorLoadLog.begin(),colorLoadLog.end()); - int idxMaxColorLoad = static_cast(distance(colorLoadLog.begin(),max_element(colorLoadLog.begin(),colorLoadLog.end()))); - size_t offsetCudaLoad = Table_load.find(idxMaxColorLoad)->second.first; - - maxTotalLoad = max(maxCudaLoad,maxColorLoad+offsetCudaLoad); - maxMemUsage = max(maxCudaLoad,offset+offsetCudaLoad); - memRatio = (float)maxMemUsage/(float)maxTotalLoad; - - cout<<"=============================memory usage stats print: ================================"<vec_load_log; + for (int i=0; isecond.first); + } + size_t max_cuda_load = *max_element(vec_load_log.begin(),vec_load_log.end()); + int idx_max_cuda_load = static_cast(distance(vec_load_log.begin(),max_element(vec_load_log.begin(),vec_load_log.end()))); + + vectorvec_color_load; + for (int i=0; isecond.second); + } + size_t max_color_load = *max_element(vec_color_load.begin(),vec_color_load.end()); + int idx_max_color_load = static_cast(distance(vec_color_load.begin(),max_element(vec_color_load.begin(),vec_color_load.end()))); + size_t offset_color_load = table_load.find(idx_max_color_load)->second.first; + + max_total_load = max(max_cuda_load,max_color_load+offset_color_load); + max_mem_usage = max(max_cuda_load,offset+offset_color_load); + } std::pair SmartMemPool::GetMemUsage() { - //TODO(junzhe) note here the pair is different from that of CnMemPool. - return std::make_pair(maxMemUsage, maxTotalLoad); + //note here the pair is different from that of CnMemPool. + return std::make_pair(max_mem_usage, max_total_load); } void SmartMemPool::Append(string blockInfo) { - //TODO(junzhe) add idx later - vec_block_RW.push_back(blockInfo); - vec_block_RWMF.push_back(blockInfo); + vec_block_rw.push_back(blockInfo); + vec_block_rw_mf.push_back(blockInfo); } ///SwapPool @@ -992,121 +935,86 @@ void SwapPool::Init(){ void SwapPool::PoolOpt(vector &vec_mf) { - //TODO(junzhe) redo 9/17 - - ///process vec_mf of 3itr into blocks,maxLen - //assume format of string: MF ptr size; - //onePieceMsg_pool verified - // for (int i = 0; i< vec_mf.size();i++){ - // cout<<"print mf "<onePieceMsg_poolVec_; - maxLen_mf = vec_mf.size()/3; - cout<<"maxLen_mf "<vec_pool_opt_info; + iteration_length_mf = vec_mf.size()/3; //cos input vec_mf is of 3 iteration + + //convert raw opt info into struct: PoolOptInfo for (int i = 0;i < vec_mf.size();i++){ - vector v = split(vec_mf[i], " "); - // cout<<"print mf "< v = SplitString(vec_mf[i], " "); if (v[0]=="Malloc"){ - //convert v[2] from str to size_t size_t result; stringstream convert(v[2]); - // cout<<"1"<>result)){ - result =-1; + result = -1; cout<<"error for converting size from str to int."<pairMsgVec_; + //convert into block lifetime + vectorvec_block_life_time; int i = 0; - // cout<<"before while loop"<=0 && onePieceMsg_poolVec_[i].idx =0 && onePieceMsg_poolVec_[i+1].idx =0 && vec_pool_opt_info[i].idx =0 && vec_pool_opt_info[i+1].idx (pairMsgVec_.size()); + int m = static_cast(vec_block_life_time.size()); vectorvertices; for (int i=0; i &vec_mf) { } } - vector>colorMerged = mergeSeg(vertices[i].colorOccupied); + vector>vec_color_merged = MergeColoredSegments(vertices[i].vec_color_preoccupied); - // vertices[i].colorRange = FFallocation(colorMerged,vertices[i].size, local_offset); - vertices[i].colorRange = BFallocation(colorMerged,vertices[i].size, offset); + // vertices[i].color_range = FirstFitAllocation(vec_color_merged,vertices[i].size, local_offset); + vertices[i].color_range = BestFitAllocation(vec_color_merged,vertices[i].size, offset); //update of offset, largest memory footprint as well. - if (vertices[i].colorRange.second >=offset){ - offset = vertices[i].colorRange.second+1; + if (vertices[i].color_range.second >=offset){ + offset = vertices[i].color_range.second+1; } }//end of for loop. - cout<<"offset is "<second.size))){ + //pool_flag = 1 + if (pool_index < iteration_length_mf){ + if ((table_pool_meta.find(pool_index - iteration_length_mf) == table_pool_meta.end()) || (!(size == table_pool_meta.find(pool_index - iteration_length_mf)->second.size))){ //not in table of negative r_idx cudaError_t status = cudaMalloc(ptr, size); CHECK_EQ(status, cudaError_t::cudaSuccess); } else{ //in the table of negative r_idx - auto tempMeta = Table_r2v.find(pc-maxLen_mf)->second; - allocatedPtr = tempMeta.ptr; - *ptr = allocatedPtr; - Table_p2r[allocatedPtr]=pc-maxLen_mf; + auto temp_meta = table_pool_meta.find(pool_index - iteration_length_mf)->second; + allocated_ptr = temp_meta.ptr; + *ptr = allocated_ptr; + table_ptr_to_ridx[allocated_ptr]=pool_index - iteration_length_mf; } } else{ - //8 9 10 - int r_pc = pc%maxLen_mf; - if ((Table_r2v.find(r_pc) == Table_r2v.end()) || (!(size == Table_r2v.find(r_pc)->second.size))){ + //8 9 10th iteration + int r_pool_index = pool_index%iteration_length_mf; + if ((table_pool_meta.find(r_pool_index) == table_pool_meta.end()) || (!(size == table_pool_meta.find(r_pool_index)->second.size))){ //not here, should be abnormal cudaError_t status = cudaMalloc(ptr, size); CHECK_EQ(status, cudaError_t::cudaSuccess); } else{ //in the table - auto tempMeta = Table_r2v.find(r_pc)->second; - allocatedPtr = tempMeta.ptr; - *ptr = allocatedPtr; - Table_p2r[allocatedPtr]=r_pc; - + auto temp_meta = table_pool_meta.find(r_pool_index)->second; + allocated_ptr = temp_meta.ptr; + *ptr = allocated_ptr; + table_ptr_to_ridx[allocated_ptr]=r_pool_index; } } } - - pc++; + pool_index++; } void SwapPool::Free(void *ptr) { - if (poolFlag == 0){ + if (pool_flag == 0){ cudaError_t status = cudaFree(ptr); CHECK_EQ(status, cudaError_t::cudaSuccess); } else{ - if (Table_p2r.find(ptr)==Table_p2r.end()){ + if (table_ptr_to_ridx.find(ptr)==table_ptr_to_ridx.end()){ cudaError_t status = cudaFree(ptr); CHECK_EQ(status, cudaError_t::cudaSuccess); } - } } @@ -1215,15 +1118,7 @@ void SwapPool::Append(string blockInfo) { } -void SwapPool::SwapOut(void* data_){ - //NA -} - -void SwapPool::SwapIn(void* data_){ - //NA -} - -void getMaxLoad (){ +void GetMaxLoad (){ //empty }