Normalization
When we use KMeans, something to consider is that, since we are using euclidean distance, features with much higher variance will have an outsized impact on the clustering. For example, the data we create below has two features, but one has a much larger range of values. Look at what happens when we try to use KMeans.
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Fix the random seed so the generated clusters are reproducible
np.random.seed(0)


def _make_cluster(x1_mean, x2_mean):
    """Draw 100 points with X1 ~ N(x1_mean, 5) and X2 ~ N(x2_mean, 25).

    The two normal() calls are made in this exact order so the RNG
    stream matches the original construction.
    """
    x1 = np.random.normal(x1_mean, 5, (100, 1))
    x2 = np.random.normal(x2_mean, 25, (100, 1))
    return pd.DataFrame(np.hstack([x1, x2]), columns=['X1', 'X2'])


# Three synthetic clusters; X2's spread (sd=25) is 5x larger than X1's (sd=5)
A = _make_cluster(-17.5, 50)
B = _make_cluster(0, 0)
C = _make_cluster(17.5, -50)

# Scatter each cluster in its own color on a shared axis
fig, ax = plt.subplots()
for frame, label, color in [(A, 'A', 'red'), (B, 'B', 'green'), (C, 'C', 'blue')]:
    frame.plot.scatter(x='X1', y='X2', label=label, ax=ax, color=color)
plt.xlim([-125, 125])
plt.ylim([-125, 125])
plt.show()
In [2]:
from sklearn.cluster import KMeans

# Stack the three clusters into a single (300, 2) training set
X = pd.concat([A, B, C])

# Cluster the raw (unscaled) features -- X2's larger variance will
# dominate the euclidean distances KMeans uses, which is the point
# of this demonstration. fit() returns the estimator itself, so
# `model` is the fitted KMeans object exactly as before.
model = KMeans(n_clusters=3, random_state=1)
model.fit(X)
In [3]:
from matplotlib import colors

# Dense grid covering the plotting window: one prediction per pixel
x_grid, y_grid = np.meshgrid(np.linspace(-125, 125, 1000),
                             np.linspace(-125, 125, 1000))

# NOTE(review): this color order presumably maps each KMeans label (0,1,2)
# under random_state=1 to the scatter color of its cluster -- verify if
# the seed or sklearn version changes.
cmap = colors.ListedColormap(['red', 'blue', 'green'])

# Predict a cluster label for every grid point, then restore the 2-D grid shape
grid_points = np.column_stack([x_grid.ravel(), y_grid.ravel()])
z = model.predict(grid_points).reshape(x_grid.shape)

fig, ax = plt.subplots()

# Plot the original points; the shaded regions are drawn over them with alpha
A.plot.scatter(x='X1', y='X2', label='A', ax=ax, color='red')
B.plot.scatter(x='X1', y='X2', label='B', ax=ax, color='green')
C.plot.scatter(x='X1', y='X2', label='C', ax=ax, color='blue')

# Shade each cluster's region; origin='lower' aligns row 0 with y=-125
ax.imshow(z, interpolation='nearest',
          extent=(-125, 125, -125, 125),
          cmap=cmap,
          alpha=.4,
          aspect='auto', origin='lower')
plt.show()
plt.show()