prepare_training_files.py
# Import Modules
import gc
import os
import numpy as np
import pandas as pd
import shutil
import tensorflow as tf
from utils import *
from tqdm import tqdm
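
# NOTE: This script assumes 'utils' provides helpers such as clean_and_prepare_dir, get_tiff_image,
# rle2mask, write_tfrecord_tiles_v1 and write_tfrecord_tiles_v2. As an illustration only, a minimal
# fallback sketch of clean_and_prepare_dir (guarded so any version imported from utils takes precedence):
if 'clean_and_prepare_dir' not in globals():
    def clean_and_prepare_dir(dir_path):
        # Remove any previous output and recreate an empty directory
        if os.path.isdir(dir_path):
            shutil.rmtree(dir_path)
        os.makedirs(dir_path)
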
# Constants
patch_size = 1024
generate_stage2 = False # False: generate Stage 1 training files; True: generate Stage 2 training files based on Pseudo Labelling
# Required Folders - Modify these if required to fit your local folder structure
root_dir = 'C:/KaggleHuBMAP/'
train_data_dir = f'{root_dir}train/' # Folder for official Kaggle HuBMAP Train Data
test_data_dir = f'{root_dir}test/' # Folder for official Kaggle HuBMAP Test Data. Used for Pseudo Labelling Stage2
ext1_data_dir = f'{root_dir}ext1/' # Folder with External data from: https://www.kaggle.com/baesiann/glomeruli-hubmap-external-1024x1024
ext2_data_dir = f'{root_dir}ext2/' # Folder with External data from: https://portal.hubmapconsortium.org/search?entity_type%5B0%5D=Dataset
tfrecords_dir = f'{root_dir}tfrecords/' # Output directory for created TFRecords
tfrecords_train_dir = f'{tfrecords_dir}train/'
tfrecords_test_dir = f'{tfrecords_dir}test/'
tfrecords_ext1_dir = f'{tfrecords_dir}ext1/'
tfrecords_ext2_dir = f'{tfrecords_dir}ext2/'
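
# The train/test/ext2 folders above hold whole-slide TIFF images. A minimal sketch of what the
# get_tiff_image helper from utils presumably does (assumptions: tifffile is installed; some HuBMAP
# TIFFs are stored channel-first or with extra singleton dimensions). The real utils version may differ.
if 'get_tiff_image' not in globals():
    def get_tiff_image(tiff_path):
        import tifffile
        # Read the whole TIFF into memory and normalise it to a (height, width, 3) array
        image = np.squeeze(tifffile.imread(tiff_path))
        if image.ndim == 3 and image.shape[0] == 3:
            image = np.moveaxis(image, 0, -1)
        return image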

# Prepare TFRecords and Dataset Output Dir
clean_and_prepare_dir(tfrecords_dir)

# Only generate stage 1 train files
if not generate_stage2:
    clean_and_prepare_dir(tfrecords_train_dir)
    clean_and_prepare_dir(tfrecords_ext1_dir)
# Only generate stage 2 train files
else:
    clean_and_prepare_dir(tfrecords_test_dir)
    clean_and_prepare_dir(tfrecords_ext2_dir)
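
# Both stages below decode run-length-encoded masks and cut each slide into fixed-size patches that
# are serialized as TFRecords. Illustrative fallback sketches of the two helpers, assuming the
# standard Kaggle RLE format (1-indexed starts, column-major order); the real utils versions may
# filter empty patches, use overlap, or store additional features.
if 'rle2mask' not in globals():
    def rle2mask(rle, shape):
        # shape is passed as (width, height); returns a (height, width) uint8 mask
        s = rle.split()
        starts = np.asarray(s[0::2], dtype=int) - 1
        lengths = np.asarray(s[1::2], dtype=int)
        mask = np.zeros(shape[0] * shape[1], dtype=np.uint8)
        for start, length in zip(starts, lengths):
            mask[start:start + length] = 1
        return mask.reshape(shape).T

if 'write_tfrecord_tiles_v1' not in globals():
    def write_tfrecord_tiles_v1(image_id, image, mask, patch_size, out_dir):
        # Slice the image and mask into non-overlapping patch_size x patch_size tiles and
        # serialize each tile pair as PNG bytes into a single TFRecord file
        records = []
        with tf.io.TFRecordWriter(f'{out_dir}{image_id}.tfrec') as writer:
            for y in range(0, image.shape[0] - patch_size + 1, patch_size):
                for x in range(0, image.shape[1] - patch_size + 1, patch_size):
                    image_patch = image[y:y + patch_size, x:x + patch_size]
                    mask_patch = mask[y:y + patch_size, x:x + patch_size]
                    example = tf.train.Example(features=tf.train.Features(feature={
                        'image': tf.train.Feature(bytes_list=tf.train.BytesList(
                            value=[tf.io.encode_png(image_patch).numpy()])),
                        'mask': tf.train.Feature(bytes_list=tf.train.BytesList(
                            value=[tf.io.encode_png(mask_patch[..., np.newaxis]).numpy()])),
                    }))
                    writer.write(example.SerializeToString())
                    records.append({'image_id': image_id, 'x': x, 'y': y,
                                    'mask_pixels': int(mask_patch.sum())})
        return pd.DataFrame(records)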

#### STAGE 1 ###########################################################################################################################
# Only generate stage 1 train files
if not generate_stage2:
    #### Prepare and Process official Kaggle Train Images ##############################################################################
    # Read Train Info
    train_df = pd.read_csv(f'{root_dir}train.csv')
    print(train_df.shape)

    # Loop through all Train Images
    train_image_count = train_df.shape[0]
    for image_index in tqdm(range(train_image_count), total = train_image_count):
        # Get Image ID
        image_id = train_df['id'][image_index]
        # Get TIFF Image
        image = get_tiff_image(f'{train_data_dir}{image_id}.tiff')
        # Get Mask
        mask = rle2mask(train_df['encoding'][image_index], (image.shape[1], image.shape[0]))
        # Create Patches and TFRecords for TIFF Image
        patch_df = write_tfrecord_tiles_v1(image_id, image, mask, patch_size, tfrecords_train_dir)
        # Save per-patch metadata as a CSV file
        patch_df.to_csv(f'{tfrecords_train_dir}{image_id}_patches.csv', index = False)
        # Clean Memory
        gc.collect()
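
    # The first external dataset is already tiled to 1024x1024. An illustrative fallback sketch of
    # write_tfrecord_tiles_v2 (assumption: masks live in a parallel 'masks_1024/' folder with the
    # same filenames; the real utils version may differ):
    if 'write_tfrecord_tiles_v2' not in globals():
        def write_tfrecord_tiles_v2(image_names, data_dir, out_dir):
            # Serialize each pre-tiled image together with its mask into a single TFRecord file
            records = []
            with tf.io.TFRecordWriter(f'{out_dir}ext1.tfrec') as writer:
                for name in image_names:
                    image_bytes = tf.io.read_file(f'{data_dir}images_1024/{name}').numpy()
                    mask_bytes = tf.io.read_file(f'{data_dir}masks_1024/{name}').numpy()
                    example = tf.train.Example(features=tf.train.Features(feature={
                        'image': tf.train.Feature(bytes_list=tf.train.BytesList(value=[image_bytes])),
                        'mask': tf.train.Feature(bytes_list=tf.train.BytesList(value=[mask_bytes])),
                    }))
                    writer.write(example.SerializeToString())
                    records.append({'image_name': name})
            return pd.DataFrame(records)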

    #### Prepare and Process First External Dataset Images #############################################################################
    # List Images
    ext1_images = os.listdir(f'{ext1_data_dir}images_1024/')
    # Create TFRecords for the pre-tiled external images
    patch_df = write_tfrecord_tiles_v2(ext1_images, ext1_data_dir, tfrecords_ext1_dir)
    # Save per-patch metadata as a CSV file
    patch_df.to_csv(f'{tfrecords_ext1_dir}ext1_patches.csv', index = False)
    # Clean Memory
    gc.collect()

    #### Create Final Zip File for upload to Kaggle Datasets ###########################################################################
    shutil.make_archive(f'{root_dir}train_files_stage1', 'zip', root_dir = tfrecords_dir, base_dir = './')

#### STAGE 2 ###########################################################################################################################
# Only generate stage 2 train files
if generate_stage2:
    #### Prepare and Process official Kaggle Test Images ###############################################################################
    # Read Pseudo Label Info for the Public Test set.
    # The public test data is pseudo labelled by running inference with a selected ensemble of trained models.
    # The resulting predictions .csv file contains a predicted mask for each public test image, which we can re-use as additional training data.
    test_df = pd.read_csv(f'{root_dir}pseudolabel_test.csv')
    print(test_df.shape)
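
    # The pseudo-label csv is assumed to mirror train.csv, with the RLE mask stored in a
    # 'predicted' column instead of 'encoding' (quick sanity check of that assumption):
    assert {'id', 'predicted'}.issubset(test_df.columns), 'Unexpected pseudolabel_test.csv layout'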

    # Loop through all public Test Images
    test_image_count = test_df.shape[0]
    for image_index in tqdm(range(test_image_count), total = test_image_count):
        # Get Image ID
        image_id = test_df['id'][image_index]
        # Get TIFF Image
        image = get_tiff_image(f'{test_data_dir}{image_id}.tiff')
        # Get Mask
        mask = rle2mask(test_df['predicted'][image_index], (image.shape[1], image.shape[0]))
        # Create Patches and TFRecords for TIFF Image
        patch_df = write_tfrecord_tiles_v1(image_id, image, mask, patch_size, tfrecords_test_dir)
        # Save per-patch metadata as a CSV file
        patch_df.to_csv(f'{tfrecords_test_dir}{image_id}_patches.csv', index = False)
        # Clean Memory
        gc.collect()

    #### Prepare and Process Second External Dataset Images ############################################################################
    # The second external dataset is pseudo labelled by running inference with a selected ensemble of trained models.
    # The resulting predictions .csv file contains a predicted mask for each second external dataset image, which we can re-use as additional training data.
    ext2_df = pd.read_csv(f'{root_dir}pseudolabel_ext2.csv')
    print(ext2_df.shape)

    # Loop through all second external dataset images
    ext2_image_count = ext2_df.shape[0]
    for image_index in tqdm(range(ext2_image_count), total = ext2_image_count):
        # Get Image ID
        image_id = ext2_df['id'][image_index]
        # Get TIFF Image
        image = get_tiff_image(f'{ext2_data_dir}{image_id}.tiff')
        # Get Mask
        mask = rle2mask(ext2_df['predicted'][image_index], (image.shape[1], image.shape[0]))
        # Create Patches and TFRecords for TIFF Image
        patch_df = write_tfrecord_tiles_v1(image_id, image, mask, patch_size, tfrecords_ext2_dir)
        # Save per-patch metadata as a CSV file
        patch_df.to_csv(f'{tfrecords_ext2_dir}{image_id}_patches.csv', index = False)
        # Clean Memory
        gc.collect()

    #### Create Final Zip File for upload to Kaggle Datasets ###########################################################################
    shutil.make_archive(f'{root_dir}train_files_stage2', 'zip', root_dir = tfrecords_dir, base_dir = './')

# Final
print('=== Finished Training Files Processing')