#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Fuzzy c-means (FCM) clustering in torch: standard and SGD version.
@author: James Power <james.power@mu.ie> May 24 2019
Looking at cmeans.py from scikit-fuzzy was a big help for the basic model,
also "Fuzzy Logic With Engineering Applications" by Timothy J. Ross
- I had the second edition, so this was in chapter 11, pp 379-389.
'''
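# Background note (a summary, not from the original file): the standard
# version alternates the closed-form membership (u) and centroid (v) updates
# to reduce the usual FCM objective
#     J_m = sum_i sum_j (u[i,j] ** m) * ||x_i - v_j|| ** 2
# (with each row of u summing to 1); the SGD version instead treats the
# centroids as learnable parameters and updates them by gradient descent.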
import sys
import os
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
dtype = torch.float64
CLUSTER_DATA = 'cluster_data' # folder where the data sets are stored
class FuzzyCluster(torch.nn.Module):
'''
This does fuzzy c-means clustering, maintaining a set of centroids.
It is packaged as a torch Module, so the forward pass
re-calculates the partition matrix based on the supplied data.
Alternate this with recalc_centroids to do the clustering.
    Can optionally register the centroids as parameters and learn them.
'''
_EPS = 1e-12 # Value to use instead of 0 (to prevent div-by-zero)
def __init__(self, n_c, n_in, m=1.7):
'''
n_c is the number of clusters
n_in is the number of input features
m is the weighting parameter, controls the amount of fuzziness
'''
super(FuzzyCluster, self).__init__()
self.n_c = n_c
self.n_in = n_in
self.m = m
self.centroids = torch.rand((self.n_c, self.n_in), dtype=dtype)
self.last_u = None # Record the most recent partition matrix
def set_centroids(self, new_centroids):
self.centroids = new_centroids
def register_centroids(self):
'''
Call this to register the centroids as torch parameters,
so we can (afterwards) use backprop to learn them.
'''
init_centroids = self.centroids
del self.centroids # Delete the version that's not a Torch parameter
self.register_parameter('centroids',
torch.nn.Parameter(init_centroids))
@staticmethod
def _cdist(x1, x2):
'''
Pairwise distances between two sets of points
- this mimics the scipy function spatial.distance.cdist
Source: https://github.com/pytorch/pytorch/issues/15253
'''
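        # (Newer torch releases also provide torch.cdist(x1, x2), which
        #  could be used here instead, assuming it is available.)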
x1_norm = x1.pow(2).sum(dim=-1, keepdim=True)
x2_norm = x2.pow(2).sum(dim=-1, keepdim=True)
# res = x2^2 -2(x1 @ x2) + x1^2
res = torch.addmm(x2_norm.transpose(-2, -1),
x1, x2.transpose(-2, -1),
alpha=-2).add_(x1_norm)
        # Clamp at zero first: rounding can make res slightly negative,
        # which would give NaN under sqrt.
        res = res.clamp_min(0).sqrt().clamp_min(FuzzyCluster._EPS)
return res
def recalc_centroids(self, x, u):
'''
Re-calculate the positions of the centroids for each cluster.
x.shape = n_cases * n_in
u.shape = n_cases * self.n_c
Returns the new centroids (but does not update them):
v.shape = self.n_c * n_in
'''
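        # This is the standard FCM centroid update, i.e. a weighted mean:
        #   v[j] = sum_i (u[i,j] ** m) * x[i] / sum_i (u[i,j] ** m)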
um = u ** self.m
        # Weighted sum of the data points (multiply u**m by x):
v = torch.einsum('mi,mj->ij', um, x)
# Divide by u^m, summed by clusters:
v /= um.sum(dim=0).clamp_min_(FuzzyCluster._EPS).unsqueeze(1)
return v
def forward(self, x):
''' Calculate and return the partition matrix u,
which shows (for each x) its membership degree of each cluster
x.shape = n_cases * n_in
u.shape = n_cases * self.n_c
'''
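        # This is the standard FCM membership update,
        #   u[i,j] = 1 / sum_k (d(x_i, v_j) / d(x_i, v_k)) ** (2 / (m - 1)),
        # computed here as d ** (-2/(m-1)) followed by row normalisation.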
d = FuzzyCluster._cdist(x, self.centroids)
u = d ** (- 2. / (self.m - 1))
self.last_u = u / u.sum(dim=1, keepdim=True)
return self.last_u
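# A minimal usage sketch for FuzzyCluster (assuming `x` is an
# n_cases * n_in tensor with dtype=torch.float64):
#     fc = FuzzyCluster(n_c=3, n_in=2)
#     for _ in range(100):
#         u = fc(x)                                    # membership update
#         fc.set_centroids(fc.recalc_centroids(x, u))  # centroid update
# cmeans_cluster() below wraps this loop and adds a convergence test.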
def plot_clusters(x, fc):
u = fc(x)
num_clusters = u.shape[1]
# Hardening: assign each point to cluster with maximum membership:
crisp_clusters = u.argmax(dim=1)
# Pick a different color for the points in each cluster:
all_colors = sns.color_palette("hls", num_clusters)
pt_colors = [all_colors[i] for i in crisp_clusters]
# Plot clusters and then centroids:
plt.figure(1)
plt.scatter(x[:, 0], x[:, 1], s=1, color=pt_colors)
centroids = fc.centroids.detach().numpy()
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='x', s=175, color='black')
    plt.show()
def evaluate_clustering(dataset, fc):
'''
Compute the fuzzy partition coefficient from u.
This measures the degree of fuzziness
'''
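    # The partition coefficient is FPC = (1/N) * sum_i sum_j u[i,j] ** 2;
    # it ranges from 1/n_c (completely fuzzy) up to 1 (crisp clustering).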
x = dataset.tensors[0]
u = fc(x)
trace = torch.einsum('ij,ji->', u, u.t())
return trace / u.shape[0]
def cmeans_cluster(dataset, num_clusters, max_epochs=250, show_plots=True):
'''
Cluster the x data into num_clusters clusters, optionally plot.
Will execute up to max_epochs, unless centroids stabilise.
Returns the FuzzyCluster object (contains centroids, partition)
'''
MIN_CHANGE = 1e-5 # This is what I mean by "stabilise"
x = dataset.tensors[0]
n_in = x.shape[1]
fc = FuzzyCluster(num_clusters, n_in)
print('### Training for up to {} epochs, size = {} cases'.
format(max_epochs, x.shape[0]))
for t in range(max_epochs):
u = fc(x)
new_centroids = fc.recalc_centroids(x, u)
delta_v = F.l1_loss(new_centroids, fc.centroids)
fc.set_centroids(new_centroids)
if max_epochs < 30 or t % 10 == 0:
print('Epoch {:3d}, change={:.5f}, fuzziness={:.5f}'
.format(t, delta_v.item(),
evaluate_clustering(dataset, fc)))
if delta_v < MIN_CHANGE:
break
if show_plots:
plot_clusters(x, fc)
return fc
def sgd_cluster(dataset, num_clusters, epochs=250, show_plots=True):
'''
Cluster the given data into the given number of clusters,
but use stochastic gradient descent (SGD) and mini-batches.
Always run for the given number of epochs.
'''
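    # Note: each mini-batch recomputes the FCM centroids for that batch and
    # uses the MSE between the current (learnable) centroids and those
    # targets as the loss, so SGD with momentum blends the batch updates.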
BATCH_SIZE = 1024
data = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
n_in = dataset.tensors[0].shape[1]
fc = FuzzyCluster(num_clusters, n_in)
fc.register_centroids()
optimizer = torch.optim.SGD(fc.parameters(), lr=1e-1, momentum=0.99)
print('### Training for {} epochs, size = {} cases, batches of {}'.
format(epochs, dataset.tensors[0].shape[0], BATCH_SIZE))
for t in range(epochs):
old_centroids = fc.centroids.clone()
# Process each mini-batch in turn:
for x, y_actual in data:
u = fc(x)
new_centroids = fc.recalc_centroids(x, u)
optimizer.zero_grad()
loss = F.mse_loss(fc.centroids, new_centroids)
loss.backward()
optimizer.step()
# Epoch ending, so print progress for the whole batch:
with torch.no_grad():
delta_v = F.l1_loss(old_centroids, fc.centroids)
if epochs < 30 or t % 10 == 0:
print('Epoch {:3d}, change={:.5f}, fuzziness={:.5f}'
.format(t, delta_v.item(),
evaluate_clustering(dataset, fc)))
# End of training, so graph the results:
if show_plots:
plot_clusters(dataset.tensors[0], fc)
return fc
def read_data(filename, n_in=2):
'''
Read the x data points from the given file, one point per line;
n_in is the number of input features (i.e. co-ords for each point).
Optionally read the ground-truth categories if there.
Return a dataset with two tensors: (points, ground-truths)
'''
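    # E.g. (hypothetical file contents) with n_in=2, a line "1.5 2.25 3"
    # gives the point (1.5, 2.25) with category 3; if no categories appear
    # in the file at all, every point is given the dummy category -1.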
points = []
truths = [] # i.e. the categories the points actually belong to
pathname = os.path.join(CLUSTER_DATA, filename)
with open(pathname, 'r') as fh:
for line in fh:
nums = [n for n in line.strip().split()]
points.append([float(n) for n in nums[:n_in]])
if len(nums) == n_in+1:
truths.append(int(nums[n_in]))
if len(truths) == 0: # no ground truths supplied
truths = [-1] * len(points)
return TensorDataset(torch.tensor(points, dtype=dtype),
torch.tensor(truths, dtype=torch.long))
def read_and_cluster(filename, n_c, n_in=2, show_plots=True):
    dataset = read_data(filename, n_in)
    fc = cmeans_cluster(dataset, n_c, show_plots=show_plots)
    evaluate_clustering(dataset, fc)
    return fc
torch.manual_seed(0)
if __name__ == '__main__':
example = '7'
show_plots = True
if len(sys.argv) == 2: # One arg: example
example = sys.argv[1]
show_plots = False
    if example == '1':
        x = torch.tensor([[1, 3], [1.5, 3.2], [1.3, 2.8], [3, 1]], dtype=dtype)
        # Wrap in a dataset (with dummy categories), as cmeans_cluster
        # expects a TensorDataset rather than a raw tensor:
        dataset = TensorDataset(x, torch.tensor([-1] * x.shape[0],
                                                dtype=torch.long))
        cmeans_cluster(dataset, 2, show_plots=show_plots)
    elif example == '2':
        read_and_cluster('R15.txt', 15, show_plots=show_plots)
    elif example == '3':
        read_and_cluster('Aggregation.txt', 7, show_plots=show_plots)
    elif example == '4':
        read_and_cluster('D31.txt', 31, show_plots=show_plots)
    elif example == '5':
        read_and_cluster('jain.txt', 5, show_plots=show_plots)
    elif example == '6':
        read_and_cluster('a3.txt', 50, show_plots=show_plots)
    elif example == '7':
        dataset = read_data('birch3.txt')
        fc = sgd_cluster(dataset, 100, epochs=50, show_plots=show_plots)