Commit 7d75967f authored by sorenmulli

im stuff?

parent 012e7f84
Binary image changes (image diff viewer omitted):
docs/report/tex/Billeder/clusterfuck12.png (added, 479 KiB)
docs/report/tex/Billeder/clusterfuck23.png (added, 460 KiB)
docs/report/tex/Billeder/dendogram.png (replaced, 23.7 KiB → 78.1 KiB)
docs/report/tex/Billeder/gmmloss.png (replaced, 26.2 KiB → 21.4 KiB)
docs/report/tex/Billeder/out_KDE.png (replaced, 59.8 KiB → 53.4 KiB)
docs/report/tex/Billeder/out_KNNdes.png (replaced, 58.3 KiB → 49.4 KiB)
docs/report/tex/Billeder/out_KNNrel.png (replaced, 51.3 KiB → 50.7 KiB)
@@ -46,18 +46,22 @@ In this section, a clustering is performed using the Gaussian Mixture Model and
\includegraphics[width = 0.6\linewidth]{gmmloss}
\end{figure}
%[23487.56607812 18127.38559714 14899.35424052 14243.28471954
%13453.47497572 13109.79453539 12855.24139837 12576.10894894
%12178.15906456 12210.34021972 12243.54429977]
\noindent From the illustration, the most appropriate number of clusters is 9: the cross-validated negative log likelihood attains its minimum there and increases when the number of clusters is either increased or decreased from 9, as expected.
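For reference, the selection of K by cross-validated negative log likelihood can be sketched with scikit-learn as follows (fold count, names and the helper function are illustrative assumptions; the report's clustering2 below follows the same pattern):

    from sklearn.mixture import GaussianMixture
    from sklearn.model_selection import KFold
    import numpy as np

    def select_n_components(X, K_range=range(1, 12), n_splits=10):
        # Cross-validated negative log likelihood for each candidate K
        cve = np.zeros(len(K_range))
        for i, K in enumerate(K_range):
            for train, test in KFold(n_splits=n_splits).split(X):
                gmm = GaussianMixture(n_components=K, covariance_type='full',
                                      n_init=3, init_params='kmeans').fit(X[train])
                # Sum of per-observation negative log likelihoods on held-out data
                cve[i] -= gmm.score_samples(X[test]).sum()
        return K_range[int(np.argmin(cve))]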
\subsection{Evaluation of GMM and Hierarchical Clustering}
\textit{Evaluate the quality of the clusterings using GMM label information and for hierarchical clustering with the same number of clusters as in the GMM.}
Hierarchical:
-Rand: 0.503908248144738
-Jaccard: 0.31190502504403683
-NMI: 0.015804717180120536
+Rand: 0.5328153158470461
+Jaccard: 0.19233601406747694
+NMI: 0.022295065087228606
GMM:
-Rand: 0.5116519504843748
-Jaccard: 0.1672924621901809
-NMI: 0.013523079325376108
+Rand: 0.5231302775613633
+Jaccard: 0.12366747954105438
+NMI: 0.018119058189570287
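For reference, clusterval is the 02450 course toolbox routine; the same three similarity measures can be sketched with scikit-learn (an illustrative equivalent under that assumption, not the toolbox implementation; the NMI normalization in particular may differ):

    from sklearn.metrics import rand_score, normalized_mutual_info_score
    from sklearn.metrics.cluster import pair_confusion_matrix

    def cluster_similarity(y_true, y_pred):
        # Pair-counting confusion matrix [[TN, FP], [FN, TP]] over all point pairs
        (tn, fp), (fn, tp) = pair_confusion_matrix(y_true, y_pred)
        jaccard = tp / (tp + fp + fn)  # pairs together in both / together in either
        nmi = normalized_mutual_info_score(y_true, y_pred)
        return rand_score(y_true, y_pred), jaccard, nmi

All three measures lie in [0, 1]; NMI values around 0.02 suggest the clusterings share almost no information with the tempo classes.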
\section{Outlier Detection/Anomaly Detection}
@@ -113,6 +117,21 @@ To find the most probable outliers in the dataset, the best course of action is t
\subsection{Apriori Algorithm for Frequent Itemsets and Association Rules}
\textit{Find the frequent itemsets and the association rules with high confidence based on the results of the Apriori algorithm.} \\
+Hi Per. Parameters:
+\[
+\texttt{minsup} = 0.11\qquad \texttt{minconf} = 0.6
+\]
+%{energy_q1} -> {loudness_q1} (supp: 0.226, conf: 0.674)
+%{loudness_q1} -> {energy_q1} (supp: 0.226, conf: 0.676)
+%{energy_q3} -> {loudness_q3} (supp: 0.210, conf: 0.634)
+%{loudness_q3} -> {energy_q3} (supp: 0.210, conf: 0.631)
+%{acousticness_q3, energy_q1} -> {loudness_q1} (supp: 0.142, conf: 0.740)
+%{loudness_q1, acousticness_q3} -> {energy_q1} (supp: 0.142, conf: 0.842)
+%{loudness_q1, energy_q1} -> {acousticness_q3} (supp: 0.142, conf: 0.631)
+%{valence_q1, energy_q1} -> {loudness_q1} (supp: 0.112, conf: 0.708)
+%{loudness_q1, valence_q1} -> {energy_q1} (supp: 0.112, conf: 0.833)
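The commented-out rules above can be generated with apyori along these lines (a sketch assuming the binarized rows are converted to transactions of attribute labels; the names are illustrative, not the verbatim association_mining1):

    from apyori import apriori

    def mine_rules(X_bin, labels, minsup=0.11, minconf=0.6):
        # Each binarized row becomes a transaction of the items present in it
        transactions = [[labels[j] for j in range(X_bin.shape[1]) if X_bin[i, j]]
                        for i in range(X_bin.shape[0])]
        for record in apriori(transactions, min_support=minsup, min_confidence=minconf):
            for stat in record.ordered_statistics:
                if stat.items_base:  # skip rules with an empty antecedent
                    print(f"{set(stat.items_base)} -> {set(stat.items_add)} "
                          f"(supp: {record.support:.3f}, conf: {stat.confidence:.3f})")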
\subsection{Interpretation of the Association Rules}
\textit{Interpret the generated association rules.} \\
@@ -35,27 +35,29 @@ def load_data(standardize, target, intervals):
    y_idx = list(attributeNames).index(target)
    y = X[:, y_idx]
    X = np.delete(X, (y_idx), axis = 1)
    if standardize:
        X = (X - X.mean(0)) / X.std(0)
    assert len(intervals) == 3
+    y_new = np.ones_like(y)
    low = y < intervals[0]
-    y[low] = 1
+    y_new[low] = 1
    mid = (y > intervals[0]) & (y < intervals[1])
-    y[mid] = 2
+    y_new[mid] = 2
    mid2 = (y > intervals[1]) & (y < intervals[2])
-    y[mid2] = 3
+    y_new[mid2] = 3
    high = y > intervals[2]
-    y[high] = 4
+    y_new[high] = 4
-    return X, y, attributeNames, song_names
+    return X, y_new, attributeNames, song_names
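# Note on the fix above: writing the class labels into y in place is fragile,
# since the later comparisons (y > intervals[0], etc.) would then run on values
# that may already have been overwritten with the labels 1-4; the separate
# y_new array avoids this and keeps the original targets intact. A compact
# illustrative alternative (an assumption, not the repo's code) that also bins
# values falling exactly on a threshold:
#   y_new = np.digitize(y, bins=intervals) + 1   # classes 1..4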
def load_data_binarized(quantile):
# Load needed data into proper format
@@ -2,7 +2,7 @@
from data_load import load_data, load_data_binarized
from crossvalidate import onelevel_crossvalidation, twolevel_crossvalidation, statistics_outer_twolevel
from sklearn import model_selection
-from toolbox_02450 import clusterplot
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
@@ -10,11 +10,12 @@ from sklearn.mixture import GaussianMixture
from sklearn.neighbors import NearestNeighbors
-from toolbox_02450 import clusterval
+from toolbox_02450 import clusterval, clusterplot
import matplotlib.pyplot as plt
from scipy.stats.kde import gaussian_kde
from scipy.linalg import svd
from apyori import apriori
@@ -51,7 +52,7 @@ def clustering2(X):
    reps = 3  # number of fits with different initializations, best result will be kept
    init_procedure = 'kmeans'  # 'kmeans' or 'random'
-    KRange = range(1,11)
+    KRange = range(1,12)
    T = len(KRange)
    CVE = np.zeros((T,))
@@ -64,7 +65,7 @@ def clustering2(X):
        # Fit Gaussian mixture model
        gmm = GaussianMixture(n_components=K, covariance_type=covar_type,
                              n_init=reps, init_params=init_procedure,
-                             tol=1e-6, reg_covar=1e-6, random_state = 41).fit(X)
+                             tol=1e-6, reg_covar=1e-6, random_state = 42).fit(X)

        # For each crossvalidation fold
@@ -86,6 +87,39 @@ def clustering2(X):
    plt.xlabel('K')
    plt.show()

+def clustering2enhalv(X, y, optimal_K):
+    covar_type = 'full'  # you can try out 'diag' as well
+    reps = 3  # number of fits with different initializations, best result will be kept
+    init_procedure = 'kmeans'  # 'kmeans' or 'random'
+    gmm = GaussianMixture(n_components=optimal_K, covariance_type=covar_type,
+                          n_init=reps, init_params=init_procedure,
+                          tol=1e-6, reg_covar=1e-6, random_state=41).fit(X)
+    gmm_clusterings = gmm.predict(X)
+    gmm_centers = gmm.means_
+    print(gmm_centers)
+    # extract cluster shapes (covariances of gaussians)
+    gmm_covs = gmm.covariances_
+    U, S, V = svd(X, full_matrices=False)
+    V_ = V.T
+    # Project the centered data onto principal component space
+    X_tilde = X @ V_
+    X_tilde = X_tilde[:, 1:3]  # keep PC2 and PC3 (columns 1 and 2, zero-indexed)
+    centers_tilde = gmm_centers @ V_
+    centers_tilde = centers_tilde[:, 1:3]
+    clusterplot(X_tilde, clusterid = gmm_clusterings, centroids = centers_tilde, y=y)
+    plt.xlabel("PC2")
+    plt.ylabel("PC3")
+    plt.show()

def clustering3(X, y, optimal_K):
    method = 'complete'
@@ -103,6 +137,7 @@ def clustering3(X, y, optimal_K):
    gmm_clusterings = gmm.predict(X)

    hie_rand, hie_jaccard, hie_NMI = clusterval(y, hierchical_clusterings)
    gmm_rand, gmm_jaccard, gmm_NMI = clusterval(y, gmm_clusterings)
@@ -118,9 +153,10 @@ def outlier1(X, songnames):
    # Plot kernel density estimate
    plt.bar(range(10), scores[:10], log = True)
+    print(scores[:10])
    plt.title('KDE Outlier score: Log scale')
    plt.xticks(range(10), [songnames[i].split("(")[0].split(".")[0].split("-")[0] for i in idx], rotation = 25)
-    plt.yticks([1.79112252e-05, 1.79112253e-05, 1.79112254e-05, 1.79112255e-05])
+    plt.yticks([7.90598304e-05, 7.90598320e-05, 7.90598337e-05, 7.90598352e-05])
    plt.show()

    K = 9  # TODO ????
@@ -132,7 +168,7 @@
    density = density[idx]

    # Plot k-neighbor estimate of outlier score (distances)
    plt.bar(range(10), density[:10])
-    plt.xticks(range(10), [songnames[i].split("(")[0].split(".")[0] for i in idx], rotation = 25)
+    plt.xticks(range(10), [songnames[i].split("(")[0].split(".")[0].split("-")[0] for i in idx], rotation = 25)
    plt.title('KNN density: Outlier score')
    plt.show()
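For reference, the two outlier scores plotted in outlier1 can be condensed as follows (a sketch reconstructed from the imports and plots above, with assumed names; not the verbatim repo code):

    from scipy.stats import gaussian_kde
    from sklearn.neighbors import NearestNeighbors
    import numpy as np

    def outlier_scores(X, K=9):
        # KDE score: density of each observation under a Gaussian KDE fitted on
        # the data; gaussian_kde expects shape (n_features, n_samples)
        kde_scores = gaussian_kde(X.T).evaluate(X.T)
        # KNN density: inverse mean distance to the K nearest neighbours
        # (K + 1 because each point is its own nearest neighbour)
        dist, _ = NearestNeighbors(n_neighbors=K + 1).fit(X).kneighbors(X)
        knn_density = 1.0 / dist[:, 1:].mean(axis=1)
        # Ascending sort: the lowest-density observations come first
        return np.argsort(kde_scores), np.argsort(knn_density)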
@@ -174,12 +210,13 @@ def association_mining1(X, labels):
if __name__ == "__main__":
-    #X, y, attributeNames, song_names = load_data(standardize = True, target = 'tempo', intervals = [90, 100, 110])
+    X, y, attributeNames, song_names = load_data(standardize = True, target = 'tempo', intervals = [90, 100, 110])
    #clusters = clustering1(X)
    # clustering2(X)
+    #clustering2enhalv(X, y, 9)
    # clustering3(X, y, 9)
    #song_names = list(song_names)
-    #outlier1(X, song_names)
+    outlier1(X, song_names)
-    X, attributeNames, song_names = load_data_binarized(3)
-    association_mining1(X, attributeNames)
\ No newline at end of file
+    # X, attributeNames, song_names = load_data_binarized(3)
+    # association_mining1(X, attributeNames)
\ No newline at end of file