Solution
Solution
from sklearn.cluster import KMeans
import pandas as pd
def group(n):
    """Cluster the S&P 500 companies with k-means and label each one.

    Reads ``RegressionMatrix1.csv`` and ``SP500.csv`` from the current
    working directory. The regression matrix is transposed so that each row
    is one ticker, every column is standardised to a z-score, and ``KMeans``
    is fitted on the standardised rows.

    Args:
        n: Number of clusters, passed to ``KMeans(n_clusters=n)``.

    Returns:
        The ``SP500.csv`` table indexed by cleaned ticker, inner-joined with
        the transposed regression matrix, plus a ``"Group"`` column holding
        the cluster label of each row.
    """
    # DataFrame.from_csv was removed in pandas 1.0; read_csv with index_col=0
    # and parse_dates=True reproduces the defaults it used.
    df = pd.read_csv(
        "RegressionMatrix1.csv", index_col=0, parse_dates=True, encoding="UTF-8"
    )
    sp500 = pd.read_csv(
        "SP500.csv", index_col=0, parse_dates=True, encoding="UTF-8"
    )

    # Strip dots from the tickers (e.g. "BRK.B" -> "BRKB") and index by them
    # so the rows can be joined with the transposed regression matrix.
    sp500["Ticker"] = [
        ticker.replace(".", "") for ticker in sp500["Ticker"].values
    ]
    sp500.index = sp500["Ticker"]

    # After the transpose each row is one ticker and each column one feature.
    df = df.fillna(0).transpose()

    # Population z-score per column. A constant column has zero spread and
    # would turn into NaN (0/0), which KMeans rejects; dividing by 1 instead
    # leaves such a column at 0 for every row.
    spread = df.std(ddof=0).replace(0, 1)
    z_scores = (df - df.mean()) / spread

    sp500 = pd.concat([sp500, df], axis=1, join="inner")

    model = KMeans(n_clusters=n).fit(z_scores)

    # labels_ follows the row order of z_scores, which can differ from the
    # joined frame when a ticker is missing from either file. Assigning a
    # Series keyed by ticker lets pandas align the labels by index instead of
    # by position.
    sp500["Group"] = pd.Series(model.labels_, index=z_scores.index)
    return sp500
# Split the companies into eight clusters, then report how many companies of
# each GICS sector landed in each cluster.
SP500 = group(8)
sector_counts = SP500.groupby("Group")["GICS Sector"].value_counts()
print(sector_counts)
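You will notice that your results differ from one run to the next (and from any output shown here), but this is normal for this kind of machine learning. K-means starts from randomly chosen cluster centers, so the group numbers and the exact membership of each group can change every time you run the code; you can still expect the overall groupings to be similar. If you need the same result on every run, pass a fixed `random_state` to `KMeans`.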
Source Code