# inline plotting instead of popping out
%matplotlib inline

import os
import numpy as np # numpy 1.26.0
import pandas as pd # pandas 2.1.1
import matplotlib.pyplot as plt # matplotlib 3.8.0
import seaborn as sns # seaborn 0.13.0
from sklearn.preprocessing import StandardScaler # scikit-learn 1.3.1


df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
    header = None)

df.columns = [
    'Class label', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash',
    'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols',
    'Proanthocyanins', 'Color intensity', 'Hue', 'OD280/OD315 of diluted wines',
    'Proline'
]

X = df.drop(labels='Class label', axis=1)
y = df['Class label']

df.head()


sns.set(style='whitegrid', context='notebook')

sns.pairplot(df, vars=df.columns[1:], hue="Class label", height=2.5, palette=sns.color_palette('deep', df['Class label'].unique().size))
plt.tight_layout()

if not os.path.exists('./output'):
  os.makedirs('./output')
plt.savefig('./output/fig-wine-scatter.png', dpi=300)
plt.show()


# Z-normalize data
sc = StandardScaler()
Z = sc.fit_transform(X)
# Estimate the correlation matrix
R = np.dot(Z.T, Z) / df.shape[0]

sns.set(font_scale=1.0)

ticklabels = [s for s in X.columns]

hm = sns.heatmap(
    R,
    cbar=True,
    square=True,
    yticklabels=ticklabels,
    xticklabels=ticklabels
)

plt.figure(figsize=(10, 8))
plt.tight_layout()
plt.savefig('./output/fig-wine-corr.png', dpi = 300)
plt.show()

sns.reset_orig()

<Figure size 1000x800 with 0 Axes>


eigen_vals, eigen_vecs = np.linalg.eigh(R)

print('\nEigenvalues: \n%s' % eigen_vals)

Eigenvalues: 
[0.10337794 0.16877023 0.22578864 0.25090248 0.28887994 0.34849736
 0.55102831 0.64165703 0.85322818 0.91897392 1.44607197 2.49697373
 4.70585025]


tot = sum(np.abs(eigen_vals))
var_exp = [(i / tot) for i in sorted(np.abs(eigen_vals), reverse=True)]
cum_var_exp = np.cumsum(var_exp)

plt.bar(range(1, eigen_vals.size + 1), var_exp, alpha=0.5, align='center',
        label='Individual')
plt.step(range(1, eigen_vals.size + 1), cum_var_exp, where='mid',
         label='Cumulative')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc='best')
plt.tight_layout()
plt.savefig('./output/fig-pca-var-exp.png', dpi=300)
plt.show()


# Make a list of (eigenvalue, eigenvector) tuples
eigen_pairs = [(np.abs(eigen_vals[i]), eigen_vecs[:,i]) for i in range(len(eigen_vals))]

# Sort the (eigenvalue, eigenvector) tuples from high to low
eigen_pairs.sort(reverse=True)


W = np.hstack((eigen_pairs[0][1][:, np.newaxis],
               eigen_pairs[1][1][:, np.newaxis]))
print('Projection matrix W:\n', W)

Projection matrix W:
 [[-0.1443294  -0.48365155]
 [ 0.24518758 -0.22493093]
 [ 0.00205106 -0.31606881]
 [ 0.23932041  0.0105905 ]
 [-0.14199204 -0.299634  ]
 [-0.39466085 -0.06503951]
 [-0.4229343   0.00335981]
 [ 0.2985331  -0.02877949]
 [-0.31342949 -0.03930172]
 [ 0.0886167  -0.52999567]
 [-0.29671456  0.27923515]
 [-0.37616741  0.16449619]
 [-0.28675223 -0.36490283]]


Z_pca = Z.dot(W)

colors = ['r', 'b', 'g']
markers = ['s', 'x', 'o']
for l, c, m in zip(np.unique(y.values), colors, markers):
    plt.scatter(Z_pca[y.values==l, 0], 
                Z_pca[y.values==l, 1], 
                c=c, label=l, marker=m)

plt.title('Z_pca')
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend(loc='lower left')
plt.tight_layout()
plt.savefig('./output/fig-pca-z.png', dpi=300)
plt.show()


np.save('./output/Z_pca.npy', Z_pca)


"""
# import libs, load data
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
"""

df_load = pd.read_csv('https://nthu-datalab.github.io/ml/labs/02_EDA_PCA/gen_dataset.csv')

X_load = df_load.drop(labels='Class label', axis=1)
Y_load = df_load['Class label']

df_load.head()


import numpy as np
from sklearn.preprocessing import StandardScaler

# Z-normalize data
sc = StandardScaler()
Z = sc.fit_transform(X_load)
# Estimate the correlation matrix
R = np.dot(Z.T, Z) / df_load.shape[0]

# Calculate the eigen values, eigen vectors
eigen_vals, eigen_vecs = np.linalg.eigh(R)

# Make a list of (eigenvalue, eigenvector) tuples
eigen_pairs = [(np.abs(eigen_vals[i]), eigen_vecs[:,i]) for i in range(len(eigen_vals))]

# Sort the (eigenvalue, eigenvector) tuples from high to low
eigen_pairs.sort(reverse=True)

# Form the projection matrix
W_2D = np.hstack((eigen_pairs[0][1][:, np.newaxis],
               eigen_pairs[1][1][:, np.newaxis]))

# You should form a projection matrix which projects from raw-data dimension to 3 dimension here


import os
import seaborn as sns
sns.set(style='whitegrid', context='notebook')

# import Axes3D for plottin 3d scatter
from mpl_toolkits.mplot3d import Axes3D

# cacculate z_pca(2d and 3d)
Z_pca2 = Z.dot(W_2D)

# plot settings
colors = ['r', 'b', 'g']
markers = ['s', 'x', 'o']
fig = plt.figure(figsize=(12,6))

# plot 2D
plt2 = fig.add_subplot(1,2,1)
for l, c, m in zip(np.unique(Y_load), colors, markers):
    plt2.scatter(Z_pca2[Y_load==l, 0], 
                Z_pca2[Y_load==l, 1], 
                c=c, label=l, marker=m)

plt.title('Z_pca 2D')
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend(loc='lower left')
plt.tight_layout()

# plot 3D
plt3 = fig.add_subplot(1,2,2, projection='3d')
# you should plot a 3D scatter using plt3.scatter here (see Axes3D.scatter in matplotlib)

if not os.path.exists('./output'):
    os.makedirs('./output')
plt.savefig('./output/fig-pca-2-3-z.png', dpi=300)
plt.show()

	Class label	Alcohol	Malic acid	Ash	Alcalinity of ash	Magnesium	Total phenols	Flavanoids	Nonflavanoid phenols	Proanthocyanins	Color intensity	Hue	OD280/OD315 of diluted wines	Proline
0	1	14.23	1.71	2.43	15.6	127	2.80	3.06	0.28	2.29	5.64	1.04	3.92	1065
1	1	13.20	1.78	2.14	11.2	100	2.65	2.76	0.26	1.28	4.38	1.05	3.40	1050
2	1	13.16	2.36	2.67	18.6	101	2.80	3.24	0.30	2.81	5.68	1.03	3.17	1185
3	1	14.37	1.95	2.50	16.8	113	3.85	3.49	0.24	2.18	7.80	0.86	3.45	1480
4	1	13.24	2.59	2.87	21.0	118	2.80	2.69	0.39	1.82	4.32	1.04	2.93	735

	Class label	a1	a2	a3	a4	a5	a6	a7	a8	a9	a10	a11	a12	a13	a14	a15
0	2.0	-0.016488	-1.310538	-1.552489	-0.785475	1.548429	0.476687	1.090010	-0.351870	-0.000855	-1.932941	0.499177	0.149137	-0.640413	-0.782951	-0.903561
1	0.0	-0.844201	-1.235142	-0.624408	1.502470	-0.079536	1.482053	1.178544	-1.150090	-1.040124	-1.041435	0.281037	-0.283710	-1.176802	0.718408	-0.392095
2	0.0	-0.181053	0.039422	-0.307827	0.162256	-1.283705	0.541288	0.019113	-0.470718	-1.045754	0.983150	-0.121205	-0.189225	-0.539178	0.825261	0.612889
3	2.0	-0.423555	-1.598754	1.597206	-0.239330	1.443564	2.657538	1.824393	-1.809287	1.058634	-4.058539	0.255908	-0.952422	-0.315551	1.854246	-2.369018
4	2.0	-0.499408	-0.814229	-0.178777	-1.757823	0.678134	3.552825	1.483069	-2.341943	2.155062	-4.380612	-0.239352	-1.730919	0.586125	3.902178	-2.891653

Exploring and Visualizing Data¶

Exploratory Data Analysis¶

Visualizing the Important Characteristics of a Dataset¶

Pairwise Joint Distributions¶

Correlation Matrix¶

PCA for Visualization¶

Eigendecomposition¶

Eigenvector Selection¶

Feature Transformation¶

Assignment¶

Requirements:¶