An explanation and application of Principal component analysis (PCA)¶
PCA is a statistical technique for reducing the dimensionality of a dataset. This is accomplished by linearly transforming the data into a new coordinate system where (most of) the variation in the data can be described with fewer dimensions than the initial data.
- Jolliffe I. Principal Component Analysis (2ed., Springer, 2002)
Definition and Derivation of Principal Components¶
Defining PCs¶
Suppose that $X$ is a vector of $p$ random variables, $X = (x_1,\dots,x_p)$, and that the variances of the $p$ random variables and the structure of the covariances or correlations between them are of interest. Unless $p$ is small, or the structure is very simple, it will often not be very helpful to simply look at the $p$ variances and all of the $\binom{p}{2} = \frac{p!}{2!(p-2)!} = p(p-1)/2$ correlations or covariances. An alternative approach is to look for a few $(\ll p)$ derived variables that preserve most of the information given by these variances and correlations or covariances.
Although PCA does not ignore covariances and correlations, it concentrates on variances. The first step is to look for a linear function $\alpha_1^T X$ of the elements of $X$ having maximum variance, where $\alpha_1$ is a vector of $p$ constants $\alpha_{11}, \alpha_{12}\dots,\alpha_{1p}$, and $T$ denotes transpose, so that
$$\alpha_1^T X=\alpha_{11} x_1 + \alpha_{12} x_2 +\cdots +\alpha_{1p} x_p =\sum_{j=1}^p\alpha_{1j} x_j $$
Next, look for a linear function $\alpha_2^T X$, uncorrelated with $\alpha_1^T X$, having maximum variance, and so on, so that at the $k$th stage a linear function $\alpha_k^T X$ is found that has maximum variance subject to being uncorrelated with $\alpha_1^T X, \alpha_2^T X,\dots, \alpha_{k-1}^T X$. The $k$th derived variable, $\alpha_k^T X$, is the $k$th PC. Up to $p$ PCs could be found, but it is hoped, in general, that most of the variation in $X$ will be accounted for by $m$ PCs, where $m < p$.
How to find them¶
Consider, for the moment, the case where the vector of random variables $X$ has a known covariance matrix $\Sigma$. This is the matrix whose $(i, j)$th element is the (known) covariance between the $i$th and $j$th elements of $X$ when $i \neq j$, and the variance of the $j$th element of $X$ when $i = j$. The more realistic case, where $\Sigma$ is unknown, follows by replacing $\Sigma$ by a sample covariance matrix $S$. It turns out that for $k = 1,2,··· ,p$, the $k$th PC is given by $z_k = \alpha_k^T X$ where $\alpha_k$ is an eigenvector of $\Sigma$ corresponding to its $k$th largest eigenvalue $\lambda_k$. Furthermore, if $\alpha_k$ is chosen to have unit length ($\alpha_k^T\alpha_k=1$), then $var(z_k) = \lambda_k$, where $var(z_k)$ denotes the variance of $z_k$.
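This characterization is easy to verify numerically. The sketch below uses synthetic data (the sample matrix `X` and the mixing matrix are made up for illustration) and checks that the variance of each derived variable $z_k$ equals the corresponding eigenvalue of the sample covariance matrix $S$:

```python
import numpy as np

rng = np.random.default_rng(0)
# Correlated synthetic sample: 500 observations of p = 3 variables
X = rng.normal(size=(500, 3)) @ np.array([[2.0, 0.5, 0.0],
                                          [0.0, 1.0, 0.3],
                                          [0.0, 0.0, 0.5]])

S = np.cov(X.T)                        # sample covariance matrix, playing the role of Sigma
eigvals, eigvecs = np.linalg.eigh(S)   # eigh: suited to symmetric matrices
order = np.argsort(eigvals)[::-1]      # sort eigenvalues largest-first
eigvals, eigvecs = eigvals[order], eigvecs[:, order]

Z = X @ eigvecs                        # z_k = alpha_k^T X for each observation
# The sample variance of the k-th PC equals the k-th largest eigenvalue
print(np.allclose(np.var(Z, axis=0, ddof=1), eigvals))
```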
Derive the form of the PCs¶
Consider first $\alpha_1^TX$; the vector $\alpha_1$ maximizes $var[\alpha_1^TX] = \alpha_1^T\Sigma \alpha_1$. It is clear that, as it stands, the maximum will not be achieved for finite $\alpha_1$, so a normalization constraint must be imposed. The constraint used in the derivation is $\alpha_1^T\alpha_1=1$, that is, the sum of squares of elements of $\alpha_1$ equals 1 ($\sum_{i=1}^p \alpha_{1i}^2=1$). Other constraints, for example $\text{Max}_j |\alpha_j | = 1$, may be more useful in other circumstances, and can easily be substituted later on. However, the use of constraints other than $\alpha_1^T\alpha_1 = constant$ in the derivation leads to a more difficult optimization problem, and it will produce a set of derived variables different from the PCs.
A short review of the variance of a random vector¶
- Definition 1: The expectation $E[X]$ of a random vector $X$ is given by $E[X]=(E[x_1],E[x_2],\cdots, E[x_p])^T$.
- Definition 2: The variance-covariance matrix (or simply the covariance matrix) of a random vector $X$ is given by $Cov(X) = E[(X- E[X])(X- E[X])^T]$.
- Proposition 1: $Cov(X)=E[XX^T]-E[X]E[X]^T$
- Proposition 2: $$Cov(X)= \begin{bmatrix} Var(x_1) & Cov(x_1,x_2) & \cdots & Cov(x_1,x_p)\\ Cov(x_2,x_1) & Var(x_2) & \cdots & Cov(x_2,x_p)\\ \vdots&\vdots&\ddots&\vdots\\ Cov(x_p,x_1) & Cov(x_p,x_2) & \cdots & Var(x_p) \end{bmatrix} $$
Thus, $Cov(X)$ is a symmetric matrix, since $Cov(x_i,x_j)=Cov(x_j,x_i)$.
- Proposition 3: Let $\alpha \in \mathbb{R}^p$, then $$Var(\alpha^T X)= \alpha^T Cov(X)\alpha$$
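Proposition 3 can be checked on a sample: the sample variance of the projected data $\alpha^T X$ equals $\alpha^T S \alpha$ exactly, where $S$ is the sample covariance matrix. A minimal sketch, with made-up data and an arbitrary $\alpha$:

```python
import numpy as np

rng = np.random.default_rng(1)
X = rng.normal(size=(1000, 4))           # synthetic sample, 4 variables
X[:, 1] += 0.5 * X[:, 0]                 # introduce some correlation

alpha = np.array([1.0, -2.0, 0.5, 3.0])  # arbitrary coefficient vector
S = np.cov(X.T)                          # sample estimate of Cov(X)

lhs = np.var(X @ alpha, ddof=1)          # Var(alpha^T X), estimated from the sample
rhs = alpha @ S @ alpha                  # alpha^T Cov(X) alpha
print(np.isclose(lhs, rhs))
```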
Maximizing $\alpha_1^T \Sigma \alpha_1$¶
Now, to maximize $\alpha_1^T \Sigma \alpha_1$ subject to $\alpha^T_1\alpha_1=1$, the standard approach is to use the technique of Lagrange multipliers. Maximize
$$\alpha^T_1\Sigma \alpha_1 - \lambda(\alpha_1^T \alpha_1 -1)$$where $\lambda$ is a Lagrange multiplier. Differentiation with respect to $\alpha_1$ gives
$$\Sigma \alpha_1 - \lambda \alpha_1 = 0$$, or $(\Sigma - \lambda I_p)\alpha_1=0$,
where $I_p$ is the ($p\times p $) identity matrix. Thus, $\lambda$ is an eigenvalue of $\Sigma$ and $\alpha_1$ is the corresponding eigenvector. To decide which of the p eigenvectors gives $\alpha_1^T X$ with maximum variance, note that the quantity to be maximized is
$\alpha_1^T\Sigma \alpha_1 = \alpha_1^T \lambda \alpha_1= \lambda \alpha_1^T \alpha_1 = \lambda$
so $\lambda$ must be as large as possible. Thus, $\alpha_1$ is the eigenvector corresponding to the largest eigenvalue of $\Sigma$, and $var(\alpha_1^T X) = \alpha_1^T \Sigma \alpha_1 = \lambda_1$, the largest eigenvalue.
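As a sanity check on this maximization, the top eigenvector can be compared against many random unit vectors; none should achieve a larger value of $\alpha^T S \alpha$. A sketch on synthetic data:

```python
import numpy as np

rng = np.random.default_rng(2)
X = rng.normal(size=(300, 5))
S = np.cov(X.T)

eigvals, eigvecs = np.linalg.eigh(S)     # eigh returns eigenvalues in ascending order
alpha_1 = eigvecs[:, -1]                 # last column = eigenvector of the largest eigenvalue
lambda_1 = eigvals[-1]

# Draw many random unit vectors; none should beat the top eigenvector
trials = rng.normal(size=(10000, 5))
trials /= np.linalg.norm(trials, axis=1, keepdims=True)
best_random = max(a @ S @ a for a in trials)

print(best_random <= lambda_1 + 1e-12)   # alpha_1 attains the maximum variance
```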
In general, the $k$th PC of $X$ is $\alpha_k^TX$ and $var(\alpha_k^T X) = \lambda_k$, where $\lambda_k$ is the $k$th largest eigenvalue of $\Sigma$, and $\alpha_k$ is the corresponding eigenvector. This will now be proved for $k = 2$; the proof for $k \ge 3$ is slightly more complicated, but very similar.
Maximizing $\alpha_2^T \Sigma \alpha_2$¶
The second PC, $\alpha_2^T X$, maximizes $\alpha_2^T \Sigma \alpha_2$ subject to being uncorrelated with $\alpha_1^T X$, or equivalently subject to $cov[\alpha_1^T X,\alpha_2^T X] = 0$. But
$cov[\alpha_1^T X,\alpha_2^T X]=\alpha_1^T \Sigma \alpha_2 =\alpha_2^T \Sigma \alpha_1 = \alpha_2^T \lambda_1 \alpha_1 = \lambda_1 \alpha_2^T \alpha_1 =\lambda_1 \alpha_1^T \alpha_2 $
Thus, any of the equations
$\alpha_1^T \Sigma \alpha_2 =0$, $\alpha_2^T \Sigma \alpha_1=0$, $\alpha_1^T\alpha_2 = 0$, $\alpha_2^T\alpha_1 = 0$
could be used to specify zero correlation between $\alpha_1^TX$ and $\alpha_2^TX$. Choosing the last of these (an arbitrary choice), and noting that a normalization constraint is again necessary, the quantity to be maximized is
$\alpha_2^T \Sigma \alpha_2 - \lambda(\alpha_2^T\alpha_2-1)-\phi\alpha_2^T\alpha_1$,
where $\lambda$, $\phi$ are Lagrange multipliers. Differentiation with respect to $\alpha_2$ gives
$\Sigma \alpha_2 -\lambda \alpha_2 -\phi\alpha_1 = 0$
and multiplication of this equation on the left by $\alpha_1^T$ gives
$\alpha_1^T\Sigma \alpha_2 -\lambda \alpha_1^T \alpha_2 -\phi\alpha_1^T\alpha_1 = 0$
which, since the first two terms are zero and $\alpha_1^T \alpha_1=1$, reduces to $\phi= 0$. Therefore, $\Sigma \alpha_2 -\lambda \alpha_2 = 0$, or equivalently $(\Sigma -\lambda I_p )\alpha_2 = 0$, so $\lambda$ is once more an eigenvalue of $\Sigma$, and $\alpha_2$ the corresponding eigenvector.
Again, $\lambda = \alpha_2^T \Sigma \alpha_2$, so $\lambda$ is to be as large as possible. Assuming that $\Sigma$ does not have repeated eigenvalues, $\lambda$ cannot equal $\lambda_1$: if it did, $\alpha_2$ would have to equal $\alpha_1$, violating the constraint $\alpha_1^T \alpha_2= 0$. Hence $\lambda$ is the second largest eigenvalue of $\Sigma$, and $\alpha_2$ is the corresponding eigenvector.
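The two conclusions of this derivation, $cov(z_1, z_2) = 0$ and $var(z_2) = \lambda_2$, can be verified numerically on a sample covariance matrix (synthetic data again):

```python
import numpy as np

rng = np.random.default_rng(3)
X = rng.normal(size=(400, 4))
X[:, 2] += X[:, 0]                        # correlate two of the coordinates

S = np.cov(X.T)
eigvals, eigvecs = np.linalg.eigh(S)      # ascending eigenvalue order
a1, a2 = eigvecs[:, -1], eigvecs[:, -2]   # top two eigenvectors

z1, z2 = X @ a1, X @ a2
C = np.cov(np.stack([z1, z2]))            # 2x2 covariance of the first two PC scores
print(np.isclose(C[0, 1], 0, atol=1e-10)) # the two PCs are uncorrelated
print(np.isclose(C[1, 1], eigvals[-2]))   # var(z2) equals the second largest eigenvalue
```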
Now with $\alpha_k^T \Sigma \alpha_k$¶
As stated above, it can be shown that for the third, fourth, ..., $p$th PCs, the vectors of coefficients $\alpha_3, \alpha_4, \dots , \alpha_p$ are the eigenvectors of $\Sigma$ corresponding to $\lambda_3,\lambda_4,\dots,\lambda_p$, the third largest, fourth largest, ..., and smallest eigenvalues, respectively. Furthermore,
$var[\alpha_k^T X] = \lambda_k$ for $k = 1,2,...,p.$
Application of PCA to a data set¶
- How do you do a principal component analysis?
- Standardize the range of continuous initial variables
- Compute the covariance matrix to identify correlations
- Compute the eigenvectors and eigenvalues of the covariance matrix to identify the principal components
- Create a feature vector to decide which principal components to keep
- Recast the data along the principal components axes
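The five steps above can be sketched as a single NumPy function (a minimal illustration, not the notebook's actual pipeline; the helper name `pca` and the demo data are made up):

```python
import numpy as np

def pca(data, n_components):
    """The five steps above, in NumPy (data: samples x features)."""
    # 1. Standardize each variable
    scaled = (data - data.mean(axis=0)) / data.std(axis=0, ddof=1)
    # 2. Covariance matrix of the standardized variables
    cov = np.cov(scaled.T)
    # 3. Eigenvectors / eigenvalues (eigh: cov is symmetric)
    eigvals, eigvecs = np.linalg.eigh(cov)
    # 4. Keep the eigenvectors with the largest eigenvalues (the feature vector)
    order = np.argsort(eigvals)[::-1][:n_components]
    components = eigvecs[:, order]
    # 5. Recast the data along the principal component axes
    return scaled @ components

rng = np.random.default_rng(4)
demo = rng.normal(size=(100, 6))
print(pca(demo, 2).shape)                 # two components kept: (100, 2)
```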
Exploration of the data¶
The data is in the file gym_crowdedness.csv; it records the number of people in a gym, sampled every 10 minutes over one year.
Import the libraries and load the CSV:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#from sklearn.preprocessing import StandardScaler
df = pd.read_csv('gym_crowdedness.csv')
df.head()
| number_people | date | timestamp | day_of_week | is_weekend | is_holiday | temperature | is_start_of_semester | is_during_semester | month | hour | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 37 | 2015-08-14 17:00:11-07:00 | 61211 | 4 | 0 | 0 | 71.76 | 0 | 0 | 8 | 17 |
| 1 | 45 | 2015-08-14 17:20:14-07:00 | 62414 | 4 | 0 | 0 | 71.76 | 0 | 0 | 8 | 17 |
| 2 | 40 | 2015-08-14 17:30:15-07:00 | 63015 | 4 | 0 | 0 | 71.76 | 0 | 0 | 8 | 17 |
| 3 | 44 | 2015-08-14 17:40:16-07:00 | 63616 | 4 | 0 | 0 | 71.76 | 0 | 0 | 8 | 17 |
| 4 | 45 | 2015-08-14 17:50:17-07:00 | 64217 | 4 | 0 | 0 | 71.76 | 0 | 0 | 8 | 17 |
We take a general glance at the data:
df.dtypes
number_people             int64
date                     object
timestamp                 int64
day_of_week               int64
is_weekend                int64
is_holiday                int64
temperature             float64
is_start_of_semester      int64
is_during_semester        int64
month                     int64
hour                      int64
dtype: object
df.describe()
| number_people | timestamp | day_of_week | is_weekend | is_holiday | temperature | is_start_of_semester | is_during_semester | month | hour | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 62184.000000 | 62184.000000 | 62184.000000 | 62184.000000 | 62184.000000 | 62184.000000 | 62184.000000 | 62184.000000 | 62184.000000 | 62184.000000 |
| mean | 29.072543 | 45799.437958 | 2.982504 | 0.282870 | 0.002573 | 58.557108 | 0.078831 | 0.660218 | 7.439824 | 12.236460 |
| std | 22.689026 | 24211.275891 | 1.996825 | 0.450398 | 0.050660 | 6.316396 | 0.269476 | 0.473639 | 3.445069 | 6.717631 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 38.140000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 |
| 25% | 9.000000 | 26624.000000 | 1.000000 | 0.000000 | 0.000000 | 55.000000 | 0.000000 | 0.000000 | 5.000000 | 7.000000 |
| 50% | 28.000000 | 46522.500000 | 3.000000 | 0.000000 | 0.000000 | 58.340000 | 0.000000 | 1.000000 | 8.000000 | 12.000000 |
| 75% | 43.000000 | 66612.000000 | 5.000000 | 1.000000 | 0.000000 | 62.280000 | 0.000000 | 1.000000 | 10.000000 | 18.000000 |
| max | 145.000000 | 86399.000000 | 6.000000 | 1.000000 | 1.000000 | 87.170000 | 1.000000 | 1.000000 | 12.000000 | 23.000000 |
df.shape
(62184, 11)
Then we check whether any column contains null values:
df.isnull().any()
number_people           False
date                    False
timestamp               False
day_of_week             False
is_weekend              False
is_holiday              False
temperature             False
is_start_of_semester    False
is_during_semester      False
month                   False
hour                    False
dtype: bool
In this example we will drop the column 'date' for simplicity
gym = df.drop('date',axis = 1)
1. Standardize the range of continuous initial variables¶
mean = gym.mean()
standard_deviation = gym.std()
scaled = (gym - mean) / standard_deviation
scaled
| number_people | timestamp | day_of_week | is_weekend | is_holiday | temperature | is_start_of_semester | is_during_semester | month | hour | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.349396 | 0.636545 | 0.509557 | -0.628046 | -0.05079 | 2.090257 | -0.292532 | -1.393927 | 0.162602 | 0.709110 |
| 1 | 0.701989 | 0.686232 | 0.509557 | -0.628046 | -0.05079 | 2.090257 | -0.292532 | -1.393927 | 0.162602 | 0.709110 |
| 2 | 0.481619 | 0.711056 | 0.509557 | -0.628046 | -0.05079 | 2.090257 | -0.292532 | -1.393927 | 0.162602 | 0.709110 |
| 3 | 0.657915 | 0.735879 | 0.509557 | -0.628046 | -0.05079 | 2.090257 | -0.292532 | -1.393927 | 0.162602 | 0.709110 |
| 4 | 0.701989 | 0.760702 | 0.509557 | -0.628046 | -0.05079 | 2.090257 | -0.292532 | -1.393927 | 0.162602 | 0.709110 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 62179 | -0.267642 | 0.890022 | 1.010352 | 1.592215 | -0.05079 | 0.397836 | -0.292532 | 0.717386 | -1.288748 | 0.857972 |
| 62180 | -0.355791 | 0.915093 | 1.010352 | 1.592215 | -0.05079 | 0.397836 | -0.292532 | 0.717386 | -1.288748 | 0.857972 |
| 62181 | -0.179494 | 0.940081 | 1.010352 | 1.592215 | -0.05079 | -0.292431 | -0.292532 | 0.717386 | -1.288748 | 1.006834 |
| 62182 | -0.488013 | 0.965152 | 1.010352 | 1.592215 | -0.05079 | -0.292431 | -0.292532 | 0.717386 | -1.288748 | 1.006834 |
| 62183 | -0.267642 | 0.990099 | 1.010352 | 1.592215 | -0.05079 | -0.292431 | -0.292532 | 0.717386 | -1.288748 | 1.006834 |
62184 rows × 10 columns
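The commented-out StandardScaler import would perform the same standardization, with one caveat: sklearn's StandardScaler divides by the population standard deviation (ddof=0), while pandas' .std() uses the sample standard deviation (ddof=1), so the two results differ by a constant factor of $\sqrt{n/(n-1)}$ (negligible here with 62184 rows). A small sketch with a hypothetical frame standing in for gym (sklearn itself is not needed to see the effect):

```python
import numpy as np
import pandas as pd

# Hypothetical small frame standing in for `gym`
demo = pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0], "b": [10.0, 10.0, 20.0, 40.0]})

scaled_sample = (demo - demo.mean()) / demo.std()          # ddof=1, as in the text
scaled_pop    = (demo - demo.mean()) / demo.std(ddof=0)    # what StandardScaler computes

n = len(demo)
# The two standardizations differ only by a constant factor
print(np.allclose(scaled_pop, scaled_sample * np.sqrt(n / (n - 1))))
```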
2. Compute the covariance matrix to identify correlations¶
covariance_matrix = np.cov(scaled.T)
covariance_matrix
array([[ 1.00000000e+00, 5.50218370e-01, -1.62061859e-01,
-1.73957812e-01, -4.82493487e-02, 3.73327299e-01,
1.82682899e-01, 3.35350361e-01, -9.78535478e-02,
5.52049444e-01],
[ 5.50218370e-01, 1.00000000e+00, -1.79319084e-03,
-5.08807521e-04, 2.85073776e-03, 1.84849490e-01,
9.55090525e-03, 4.46758988e-02, -2.32210763e-02,
9.99077439e-01],
[-1.62061859e-01, -1.79319084e-03, 1.00000000e+00,
7.91338197e-01, -7.58620382e-02, 1.11687310e-02,
-1.17820251e-02, -4.82362857e-03, 1.55586861e-02,
-1.91427432e-03],
[-1.73957812e-01, -5.08807521e-04, 7.91338197e-01,
1.00000000e+00, -3.18988342e-02, 2.06733408e-02,
-1.66457755e-02, -3.61271915e-02, 8.46234643e-03,
-5.17288766e-04],
[-4.82493487e-02, 2.85073776e-03, -7.58620382e-02,
-3.18988342e-02, 1.00000000e+00, -8.85265918e-02,
-1.48579083e-02, -7.07984358e-02, -9.49422885e-02,
2.84316486e-03],
[ 3.73327299e-01, 1.84849490e-01, 1.11687310e-02,
2.06733408e-02, -8.85265918e-02, 1.00000000e+00,
9.32418634e-02, 1.52475895e-01, 6.31245806e-02,
1.85120732e-01],
[ 1.82682899e-01, 9.55090525e-03, -1.17820251e-02,
-1.66457755e-02, -1.48579083e-02, 9.32418634e-02,
1.00000000e+00, 2.09862098e-01, -1.37159611e-01,
1.00907232e-02],
[ 3.35350361e-01, 4.46758988e-02, -4.82362857e-03,
-3.61271915e-02, -7.07984358e-02, 1.52475895e-01,
2.09862098e-01, 1.00000000e+00, 9.65556768e-02,
4.55808573e-02],
[-9.78535478e-02, -2.32210763e-02, 1.55586861e-02,
8.46234643e-03, -9.49422885e-02, 6.31245806e-02,
-1.37159611e-01, 9.65556768e-02, 1.00000000e+00,
-2.36235024e-02],
[ 5.52049444e-01, 9.99077439e-01, -1.91427432e-03,
-5.17288766e-04, 2.84316486e-03, 1.85120732e-01,
1.00907232e-02, 4.55808573e-02, -2.36235024e-02,
1.00000000e+00]])
plt.figure(figsize =(10,10))
sns.set(font_scale = 1.5)
sns.heatmap(covariance_matrix,
cbar = True,
annot = True,
square = True,
fmt = '.2f',
annot_kws={'size':12}
)
- If positive: the two variables increase or decrease together (correlated)
- If negative: one increases when the other decreases (inversely correlated)
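Since the variables were standardized first, the matrix above is also the correlation matrix of the original variables: its diagonal is 1 and each off-diagonal entry is a Pearson correlation. A quick check on synthetic data:

```python
import numpy as np

rng = np.random.default_rng(5)
data = rng.normal(size=(200, 3))
data[:, 1] += data[:, 0]                  # introduce some correlation

scaled = (data - data.mean(axis=0)) / data.std(axis=0, ddof=1)
cov_scaled = np.cov(scaled.T)             # covariance of the standardized variables
corr_raw = np.corrcoef(data.T)            # correlation of the original variables

print(np.allclose(cov_scaled, corr_raw))  # the two matrices coincide
```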
3. Compute the eigenvectors and eigenvalues of the covariance matrix to identify the principal components¶
eigen_values, eigen_vectors = np.linalg.eig(covariance_matrix)
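One caveat worth making explicit: unlike np.linalg.eigh, np.linalg.eig neither exploits the symmetry of the covariance matrix nor returns the eigenvalues in any particular order, which is why the largest components in the bar chart of the next step need not come first. A sketch, on a synthetic stand-in for covariance_matrix, of sorting them largest-first:

```python
import numpy as np

rng = np.random.default_rng(6)
A = rng.normal(size=(50, 10))
C = np.cov(((A - A.mean(0)) / A.std(0, ddof=1)).T)   # synthetic 10x10 covariance matrix

eigen_values, eigen_vectors = np.linalg.eig(C)
eigen_values = eigen_values.real                     # symmetric matrix: eigenvalues are real
order = np.argsort(eigen_values)[::-1]               # indices in descending eigenvalue order
eigen_values = eigen_values[order]
eigen_vectors = eigen_vectors.real[:, order]         # reorder the eigenvectors to match

print(np.all(np.diff(eigen_values) <= 0))            # now sorted largest-first
```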
4. Create a feature vector to decide which principal components to keep¶
variance_explained = []
for i in eigen_values:
variance_explained.append(i/sum(eigen_values)*100)
x = [i for i in range(len(variance_explained))]
plt.bar(x,variance_explained, width = 0.5, color = ['red', 'blue'])
plt.xticks(np.arange(10),('pca_1','pca_2', 'pca_3', 'pca_4','pca_5','pca_6','pca_7','pca_8','pca_9','pca_10'),rotation = 45)
plt.title('Variance by principal component')
plt.xlabel('Principal component')
plt.ylabel('Percentage of variance explained')
plt.show()
From the previous graphic we can select, for example, PCA_1, PCA_2, PCA_6, PCA_7 and PCA_10; together they capture 78.79% of the variance.
variance_captured = variance_explained[0]+variance_explained[1]+variance_explained[5]+variance_explained[6]+variance_explained[9]
variance_captured
78.7952503836204
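With the eigenvalues sorted, the usual way to decide how many components to keep is the cumulative percentage of variance explained. A sketch with hypothetical eigenvalues (not the ones computed from the gym data):

```python
import numpy as np

# Hypothetical eigenvalues, in the unsorted order np.linalg.eig might return them
eigen_values = np.array([3.1, 2.0, 0.2, 0.3, 0.1, 1.4, 1.1, 0.4, 0.5, 0.9])

explained = eigen_values / eigen_values.sum() * 100   # percentage per component
order = np.argsort(explained)[::-1]                   # largest-first
cumulative = np.cumsum(explained[order])

k = np.searchsorted(cumulative, 80) + 1   # smallest k capturing at least 80% of variance
print(k, round(cumulative[k - 1], 2))
```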
Now we build the feature vector
feature_vector = [eigen_vectors.T[0],eigen_vectors.T[1],eigen_vectors.T[5],eigen_vectors.T[6],eigen_vectors.T[9]]
feature_vector=np.array(feature_vector)
5. Recast the data along the principal components axes¶
scaled.shape
(62184, 10)
feature_vector.T.shape
(10, 5)
gym_recast = np.dot(scaled,feature_vector.T)
gym_recast = pd.DataFrame(gym_recast)
sns.pairplot(gym_recast)
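A useful sanity check on this last step (sketched on synthetic data, since it depends only on the algebra, not on the csv): the recast columns should be mutually uncorrelated, which is exactly the defining property of the PCs derived earlier.

```python
import numpy as np

rng = np.random.default_rng(7)
data = rng.normal(size=(500, 10))
scaled = (data - data.mean(0)) / data.std(0, ddof=1)

eigvals, eigvecs = np.linalg.eigh(np.cov(scaled.T))
order = np.argsort(eigvals)[::-1][:5]
feature_vector = eigvecs[:, order].T               # 5 x 10, rows are kept components

recast = scaled @ feature_vector.T                 # same recasting step as above
C = np.cov(recast.T)
off_diag = C - np.diag(np.diag(C))
print(np.allclose(off_diag, 0, atol=1e-10))        # PC scores are uncorrelated
```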