Add support for Windows #5

Open · wants to merge 2 commits into master
README.md — 40 changes: 35 additions & 5 deletions
@@ -2,10 +2,11 @@

Marvin is a GPU-only neural network framework made with simplicity, hackability, speed, memory consumption, and high dimensional data in mind.

## Dependences
## Dependencies

Download [CUDA 7.5](https://developer.nvidia.com/cuda-downloads) and [cuDNN 3](https://developer.nvidia.com/cudnn). You will need to register with NVIDIA. Below are some additional steps to set up cuDNN 3:
Download [CUDA 7.5](https://developer.nvidia.com/cuda-downloads) and [cuDNN 3](https://developer.nvidia.com/cudnn). You will need to register with NVIDIA. On Windows, you will also need [Microsoft Visual Studio 2013](http://go.microsoft.com/fwlink/?LinkId=517284). Below are some additional steps to set up cuDNN 3:

### Linux and OS X
```shell
CUDA_LIB_DIR=/usr/local/cuda/lib$([[ $(uname) == "Linux" ]] && echo 64)
echo LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDA_LIB_DIR >> ~/.profile && source ~/.profile
@@ -15,17 +16,46 @@ sudo cp cuda/lib/* $CUDA_LIB_DIR
sudo cp cuda/include/* /usr/local/cuda/include
```

### Windows
- Copy ```CUDNN_PATH/bin/*``` to ```CUDA_PATH/bin```
- Copy ```CUDNN_PATH/lib/*``` to ```CUDA_PATH/lib```
- Copy ```CUDNN_PATH/include/*``` to ```CUDA_PATH/include```

## Compilation

### Linux and OS X
```shell
./compile.sh
```

### Windows
- Create a new project in Visual Studio 2013
- File -> New Project (Ctrl-Shift-N)
- Installed -> Templates -> NVIDIA -> CUDA 7.5
- Add ```marvin.hpp``` and ```marvin.cu``` to the project
- Project -> Add Existing Item (Shift-Alt-A)
- Select ```marvin.hpp``` and ```marvin.cu``` from Explorer
- Add cuDNN and cuBLAS libraries
- Project -> [Project Name] Properties -> Configuration Properties -> Linker -> Input
- Add ```cudnn.lib``` and ```cublas.lib``` to the semicolon-delimited list called ```Additional Dependencies``` (assumes both files are in ```CUDA_PATH/lib```)
- Build the project and run ```marvin.exe``` with the desired command (e.g. ```train``` or ```test```, as in the MNIST example below)

## MNIST

1. Prepare data: run examples/mnist/prepare_mnist.m in Matlab
2. Train a model: run ./examples/mnist/demo.sh in shell
3. Visualize filters: run examples/mnist/demo_vis_filter.m in Matlab
### Creating the data from scratch
1. Prepare data: run ```examples/mnist/prepare_mnist.m``` in MATLAB
2. Train a model: run ```examples/mnist/demo.sh``` in shell
3. Visualize filters: run ```examples/mnist/demo_vis_filter.m``` in MATLAB

### Using prebuilt data
1. Download four tensor files to ```examples/mnist```
- [Test images](http://vision.princeton.edu/marvin/mnist/test-images.tensor)
- [Test labels](http://vision.princeton.edu/marvin/mnist/test-labels.tensor)
- [Training images](http://vision.princeton.edu/marvin/mnist/train-images.tensor)
- [Training labels](http://vision.princeton.edu/marvin/mnist/train-labels.tensor)
2. Run Marvin from the root directory
- Train: ```marvin train examples/mnist/lenet.json```
- Test: ```marvin test examples/mnist/lenet.json examples/mnist/lenet.marvin```

## Tutorials and Documentation
Please see our website at [http://marvin.is](http://marvin.is).
marvin.hpp — 38 changes: 19 additions & 19 deletions
@@ -80,7 +80,7 @@
#include <cublas_v2.h>
#include <curand.h>
#include <cudnn.h>
#include <sys/time.h>
#include <chrono>

namespace marvin {

@@ -168,9 +168,7 @@ void checkCUBLAS(const int lineNumber, cublasStatus_t status) {
}

unsigned long long get_timestamp() {
struct timeval now;
gettimeofday (&now, NULL);
return now.tv_usec + (unsigned long long)now.tv_sec * 1000000;
return (unsigned long long)std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now().time_since_epoch()).count();
}

unsigned long long ticBegin;
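
For reference, the timing change above swaps POSIX `gettimeofday` (unavailable on MSVC) for `std::chrono`. A minimal standalone sketch of the same idea; the `main` and the printed output here are illustrative, not part of the PR:

```cpp
// Standalone sketch of the std::chrono-based timestamp used above.
// Note: high_resolution_clock's epoch is unspecified, so only differences
// between two timestamps are meaningful (unlike gettimeofday's Unix epoch).
#include <chrono>
#include <iostream>

unsigned long long get_timestamp() {
    return (unsigned long long)std::chrono::duration_cast<std::chrono::microseconds>(
        std::chrono::high_resolution_clock::now().time_since_epoch()).count();
}

int main() {
    unsigned long long ticBegin = get_timestamp();
    // ... code to be timed ...
    std::cout << "elapsed (us): " << (get_timestamp() - ticBegin) << std::endl;
    return 0;
}
```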
@@ -2850,7 +2848,7 @@ class MemoryDataLayer : public DataLayer {

template <class T>
class DiskDataLayer : public DataLayer {
std::future<void> lock;
//std::future<void (&)()> lock;
FILE* dataFILE;
Tensor<StorageT>* labelCPUall;
std::vector<size_t> ordering;
@@ -2964,7 +2962,7 @@ class DiskDataLayer : public DataLayer {
};

~DiskDataLayer(){
if (lock.valid()) lock.wait();
// if (lock.valid()) lock.wait();
delete distribution_bernoulli;
for (int i=0;i<distribution_uniform.size();++i) delete distribution_uniform[i];
if (dataFILE!=NULL) fclose(dataFILE);
@@ -3070,11 +3068,12 @@ class DiskDataLayer : public DataLayer {
};

void forward(Phase phase_){
lock.wait();
// lock.wait();
epoch = epoch_prefetch;
Kernel_convert_to_StorageT_subtract<<<CUDA_GET_BLOCKS(numel_batch_all_channel_crop), CUDA_NUM_THREADS>>>(CUDA_GET_LOOPS(numel_batch_all_channel_crop), numel_batch_all_channel_crop, numel_all_channel_crop, dataGPU, (in.size()==0? NULL: in[0]->dataGPU), out[0]->dataGPU);
std::swap(out[1]->dataGPU,labelGPU);
lock = std::async(std::launch::async,&DiskDataLayer<T>::prefetch,this);
// lock = std::async(std::launch::async,&DiskDataLayer<T>::prefetch,this);
prefetch();
};
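
The original code overlapped disk prefetching with GPU work via `std::async`/`std::future`; this change calls `prefetch()` synchronously instead (the same pattern is applied to `DropoutLayer::generateMask` further down), presumably to sidestep threading issues under VS2013, at the cost of no longer hiding I/O latency. A minimal sketch of the two variants, using a hypothetical `DataLayerSketch` rather than the PR's actual class:

```cpp
// Hypothetical DataLayerSketch (not the PR's class) contrasting asynchronous
// prefetching with the synchronous call this diff switches to.
#include <future>

struct DataLayerSketch {
    std::future<void> lock;   // handle to the in-flight prefetch

    void prefetch() { /* read the next batch from disk into a staging buffer */ }

    // Original flavor: wait for the previous prefetch, consume it, start the next one
    // so disk I/O overlaps with the GPU work of the following layers.
    void forward_async() {
        if (lock.valid()) lock.wait();
        // ... launch GPU kernels on the prefetched batch ...
        lock = std::async(std::launch::async, &DataLayerSketch::prefetch, this);
    }

    // This PR's flavor: prefetch in the calling thread; simpler, but the forward
    // pass now blocks on disk I/O for every batch.
    void forward_sync() {
        // ... launch GPU kernels on the previously prefetched batch ...
        prefetch();
    }
};
```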


@@ -3111,8 +3110,8 @@ class DiskDataLayer : public DataLayer {
checkCUDA(__LINE__, cudaMalloc(&dataGPU, numel_batch_all_channel_crop * sizeof(T)) );
memoryBytes += numel_batch_all_channel_crop * sizeof(T);

lock = std::async(std::launch::async,&DiskDataLayer<T>::prefetch,this);

// lock = std::async(std::launch::async,&DiskDataLayer<T>::prefetch,this);
prefetch();
return memoryBytes;
};
};
@@ -3547,7 +3546,7 @@ class InnerProductLayer : public Layer {
class DropoutLayer: public Layer{
ComputeT scale;
std::bernoulli_distribution* distribution;
std::future<void> lock;
// std::future<void (&)()> lock;
bool current_mask;
std::vector< StorageT* > GPUmask[2];
std::vector< StorageT* > CPUmask;
@@ -3619,13 +3618,13 @@ class DropoutLayer: public Layer{
memoryBytes += out[i]->Malloc(in[i]->dim);
}

lock = std::async(std::launch::async,&DropoutLayer::generateMask,this);

// lock = std::async(std::launch::async,&DropoutLayer::generateMask,this);
generateMask();
return memoryBytes;
};

~DropoutLayer(){
if (lock.valid()) lock.wait();
// if (lock.valid()) lock.wait();
for (int i=0;i<GPUmask[0].size();++i){
checkCUDA(__LINE__, cudaFree(GPUmask[0][i]) );
checkCUDA(__LINE__, cudaFree(GPUmask[1][i]) );
@@ -3636,9 +3635,10 @@ class DropoutLayer: public Layer{

void forward(Phase phase_){
if ( phase_==Training ){
lock.wait();
// lock.wait();
current_mask = !current_mask;
lock = std::async(std::launch::async,&DropoutLayer::generateMask,this);
// lock = std::async(std::launch::async,&DropoutLayer::generateMask,this);
generateMask();
for (int i=0;i<in.size();++i){
// zeros out some elements
GPU_elementwise_multiplication(SIZEmask[i], out[i]->dataGPU, GPUmask[current_mask][i], in[i]->dataGPU);
@@ -5565,7 +5565,7 @@ class Solver{
}else{
for (int t=0; t<threads.size(); ++t){
nets[t]->phase = Testing;
threads[t] = std::thread(&Net::stepTest, nets[t], true); //nets[t]->stepTest();
// threads[t] = std::thread(&Net::stepTest, nets[t], true); //nets[t]->stepTest();
}
for (int t=0; t<threads.size(); ++t){
threads[t].join();
@@ -5591,7 +5591,7 @@ class Solver{
nets[0]->stepTrain(false);
}else{
for (int t=0; t<threads.size(); ++t){
threads[t] = std::thread(&Net::stepTrain, nets[t], true); //nets[t]->stepTrain();
// threads[t] = std::thread(&Net::stepTrain, nets[t], true); //nets[t]->stepTrain();
}
for (int t=0; t<threads.size(); ++t){
threads[t].join();
@@ -5614,7 +5614,7 @@ class Solver{
nets[0]->eval(false);
}else{
for (int t=0; t<threads.size(); ++t){
threads[t] = std::thread(&Net::eval, nets[t], true); //nets[t]->eval();
// threads[t] = std::thread(&Net::eval, nets[t], true); //nets[t]->eval();
}
for (int t=0; t<threads.size(); ++t){
threads[t].join();
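
One detail worth flagging in the `Solver` hunks above: the `std::thread` launches are commented out, but the matching `threads[t].join()` loops remain, and calling `join()` on a thread that was never started throws `std::system_error`. A single-threaded fallback would normally drop both the launch and the join; a minimal sketch with a hypothetical `NetSketch` (not the PR's `Net`):

```cpp
// Sketch of a sequential fallback: step each net directly in the calling thread,
// so no std::thread objects are created and nothing needs to be joined.
#include <vector>

struct NetSketch {
    void stepTrain(bool threaded) { /* one training step for this net/GPU */ }
};

void step_all_sequential(std::vector<NetSketch*>& nets) {
    for (size_t t = 0; t < nets.size(); ++t) {
        nets[t]->stepTrain(false);   // false: single-threaded path, as in nets[0]->stepTrain(false)
    }
}
```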