Commit 7d75967f authored by sorenmulli

im stuff?

parent 012e7f84
Binary image changes (image diff viewer omitted):
docs/report/tex/Billeder/clusterfuck12.png (added, 479 KiB)
docs/report/tex/Billeder/clusterfuck23.png (added, 460 KiB)
docs/report/tex/Billeder/dendogram.png (replaced, 23.7 KiB → 78.1 KiB)
docs/report/tex/Billeder/gmmloss.png (replaced, 26.2 KiB → 21.4 KiB)
docs/report/tex/Billeder/out_KDE.png (replaced, 59.8 KiB → 53.4 KiB)
docs/report/tex/Billeder/out_KNNdes.png (replaced, 58.3 KiB → 49.4 KiB)
docs/report/tex/Billeder/out_KNNrel.png (replaced, 51.3 KiB → 50.7 KiB)
@@ -46,18 +46,22 @@ In this section, a clustering is performed using the Gaussian Mixture Model and
\includegraphics[width = 0.6\linewidth]{gmmloss}
\end{figure}
%[23487.56607812 18127.38559714 14899.35424052 14243.28471954
%13453.47497572 13109.79453539 12855.24139837 12576.10894894
%12178.15906456 12210.34021972 12243.54429977]
\noindent From the illustration, the most appropriate number of clusters is 9: the cross-validated negative log likelihood attains its minimum there and increases when the number of clusters is either increased or decreased from 9, as expected.
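For reference, the selection of K by cross-validated negative log likelihood can be sketched with scikit-learn as follows (fold count, names and the helper function are illustrative assumptions; the report's clustering2 below follows the same pattern):

    from sklearn.mixture import GaussianMixture
    from sklearn.model_selection import KFold
    import numpy as np

    def select_n_components(X, K_range=range(1, 12), n_splits=10):
        # Cross-validated negative log likelihood for each candidate K
        cve = np.zeros(len(K_range))
        for i, K in enumerate(K_range):
            for train, test in KFold(n_splits=n_splits).split(X):
                gmm = GaussianMixture(n_components=K, covariance_type='full',
                                      n_init=3, init_params='kmeans').fit(X[train])
                # Sum of per-observation negative log likelihoods on held-out data
                cve[i] -= gmm.score_samples(X[test]).sum()
        return K_range[int(np.argmin(cve))]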
\subsection{Evaluation of GMM and Hierarchical Clustering}
\textit{Evaluate the quality of the clusterings using GMM label information and for hierarchical clustering with the same number of clusters as in the GMM.}
Hierarchical:
-Rand: 0.503908248144738
-Jaccard: 0.31190502504403683
-NMI: 0.015804717180120536
+Rand: 0.5328153158470461
+Jaccard: 0.19233601406747694
+NMI: 0.022295065087228606
GMM:
-Rand: 0.5116519504843748
-Jaccard: 0.1672924621901809
-NMI: 0.013523079325376108
+Rand: 0.5231302775613633
+Jaccard: 0.12366747954105438
+NMI: 0.018119058189570287
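For reference, clusterval is the 02450 course toolbox routine; the same three similarity measures can be sketched with scikit-learn (an illustrative equivalent under that assumption, not the toolbox implementation; the NMI normalization in particular may differ):

    from sklearn.metrics import rand_score, normalized_mutual_info_score
    from sklearn.metrics.cluster import pair_confusion_matrix

    def cluster_similarity(y_true, y_pred):
        # Pair-counting confusion matrix [[TN, FP], [FN, TP]] over all point pairs
        (tn, fp), (fn, tp) = pair_confusion_matrix(y_true, y_pred)
        jaccard = tp / (tp + fp + fn)  # pairs together in both / together in either
        nmi = normalized_mutual_info_score(y_true, y_pred)
        return rand_score(y_true, y_pred), jaccard, nmi

All three measures lie in [0, 1]; NMI values around 0.02 suggest the clusterings share almost no information with the tempo classes.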
\section{Outlier Detection/Anomaly Detection}
@@ -113,6 +117,21 @@ To find the most probable outliers in the dataset, the best course of action is t
\subsection{Apriori Algorithm for Frequent Itemsets and Association Rules}
\textit{Find the frequent itemsets and the association rules with high confidence based on the results of the Apriori algorithm.} \\
+Hi Per. Parameters:
+\[
+\texttt{minsup} = 0.11\qquad \texttt{minconf} = 0.6
+\]
+%{energy_q1} -> {loudness_q1} (supp: 0.226, conf: 0.674)
+%{loudness_q1} -> {energy_q1} (supp: 0.226, conf: 0.676)
+%{energy_q3} -> {loudness_q3} (supp: 0.210, conf: 0.634)
+%{loudness_q3} -> {energy_q3} (supp: 0.210, conf: 0.631)
+%{acousticness_q3, energy_q1} -> {loudness_q1} (supp: 0.142, conf: 0.740)
+%{loudness_q1, acousticness_q3} -> {energy_q1} (supp: 0.142, conf: 0.842)
+%{loudness_q1, energy_q1} -> {acousticness_q3} (supp: 0.142, conf: 0.631)
+%{valence_q1, energy_q1} -> {loudness_q1} (supp: 0.112, conf: 0.708)
+%{loudness_q1, valence_q1} -> {energy_q1} (supp: 0.112, conf: 0.833)
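The commented-out rules above can be generated with apyori along these lines (a sketch assuming the binarized rows are converted to transactions of attribute labels; the names are illustrative, not the verbatim association_mining1):

    from apyori import apriori

    def mine_rules(X_bin, labels, minsup=0.11, minconf=0.6):
        # Each binarized row becomes a transaction of the items present in it
        transactions = [[labels[j] for j in range(X_bin.shape[1]) if X_bin[i, j]]
                        for i in range(X_bin.shape[0])]
        for record in apriori(transactions, min_support=minsup, min_confidence=minconf):
            for stat in record.ordered_statistics:
                if stat.items_base:  # skip rules with an empty antecedent
                    print(f"{set(stat.items_base)} -> {set(stat.items_add)} "
                          f"(supp: {record.support:.3f}, conf: {stat.confidence:.3f})")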
\subsection{Interpretation of the Association Rules}
\textit{Interpret the generated association rules.} \\
@@ -35,27 +35,29 @@ def load_data(standardize, target, intervals):
    y_idx = list(attributeNames).index(target)
    y = X[:, y_idx]
    X = np.delete(X, (y_idx), axis = 1)
    if standardize:
        X = (X - X.mean(0)) / X.std(0)
    assert len(intervals) == 3
+    y_new = np.ones_like(y)
    low = y < intervals[0]
-    y[low] = 1
+    y_new[low] = 1
    mid = (y > intervals[0]) & (y < intervals[1])
-    y[mid] = 2
+    y_new[mid] = 2
    mid2 = (y > intervals[1]) & (y < intervals[2])
-    y[mid2] = 3
+    y_new[mid2] = 3
    high = y > intervals[2]
-    y[high] = 4
+    y_new[high] = 4
-    return X, y, attributeNames, song_names
+    return X, y_new, attributeNames, song_names
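# Note on the fix above: writing the class labels into y in place is fragile,
# since the later comparisons (y > intervals[0], etc.) would then run on values
# that may already have been overwritten with the labels 1-4; the separate
# y_new array avoids this and keeps the original targets intact. A compact
# illustrative alternative (an assumption, not the repo's code) that also bins
# values falling exactly on a threshold:
#   y_new = np.digitize(y, bins=intervals) + 1   # classes 1..4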
def load_data_binarized(quantile):
# Load needed data into proper format
@@ -2,7 +2,7 @@
from data_load import load_data, load_data_binarized
from crossvalidate import onelevel_crossvalidation, twolevel_crossvalidation, statistics_outer_twolevel
from sklearn import model_selection
-from toolbox_02450 import clusterplot
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
@@ -10,11 +10,12 @@ from sklearn.mixture import GaussianMixture
from sklearn.neighbors import NearestNeighbors
-from toolbox_02450 import clusterval
+from toolbox_02450 import clusterval, clusterplot
import matplotlib.pyplot as plt
from scipy.stats.kde import gaussian_kde
from scipy.linalg import svd
from apyori import apriori
@@ -51,7 +52,7 @@ def clustering2(X):
    reps = 3  # number of fits with different initializations, best result will be kept
    init_procedure = 'kmeans'  # 'kmeans' or 'random'
-    KRange = range(1,11)
+    KRange = range(1,12)
    T = len(KRange)
    CVE = np.zeros((T,))
@@ -64,7 +65,7 @@ def clustering2(X):
        # Fit Gaussian mixture model
        gmm = GaussianMixture(n_components=K, covariance_type=covar_type,
                              n_init=reps, init_params=init_procedure,
-                             tol=1e-6, reg_covar=1e-6, random_state = 41).fit(X)
+                             tol=1e-6, reg_covar=1e-6, random_state = 42).fit(X)

        # For each crossvalidation fold
@@ -86,6 +87,39 @@ def clustering2(X):
    plt.xlabel('K')
    plt.show()

+def clustering2enhalv(X, y, optimal_K):
+    covar_type = 'full'  # you can try out 'diag' as well
+    reps = 3  # number of fits with different initializations, best result will be kept
+    init_procedure = 'kmeans'  # 'kmeans' or 'random'
+    gmm = GaussianMixture(n_components=optimal_K, covariance_type=covar_type,
+                          n_init=reps, init_params=init_procedure,
+                          tol=1e-6, reg_covar=1e-6, random_state=41).fit(X)
+    gmm_clusterings = gmm.predict(X)
+    gmm_centers = gmm.means_
+    print(gmm_centers)
+    # extract cluster shapes (covariances of gaussians)
+    gmm_covs = gmm.covariances_
+    U, S, V = svd(X, full_matrices=False)
+    V_ = V.T
+    # Project the centered data onto principal component space
+    X_tilde = X @ V_
+    X_tilde = X_tilde[:, 1:3]  # keep PC2 and PC3 (columns 1 and 2, zero-indexed)
+    centers_tilde = gmm_centers @ V_
+    centers_tilde = centers_tilde[:, 1:3]
+    clusterplot(X_tilde, clusterid = gmm_clusterings, centroids = centers_tilde, y=y)
+    plt.xlabel("PC2")
+    plt.ylabel("PC3")
+    plt.show()

def clustering3(X, y, optimal_K):
    method = 'complete'
@@ -103,6 +137,7 @@ def clustering3(X, y, optimal_K):
    gmm_clusterings = gmm.predict(X)

    hie_rand, hie_jaccard, hie_NMI = clusterval(y, hierchical_clusterings)
    gmm_rand, gmm_jaccard, gmm_NMI = clusterval(y, gmm_clusterings)
@@ -118,9 +153,10 @@ def outlier1(X, songnames):
    # Plot kernel density estimate
    plt.bar(range(10), scores[:10], log = True)
+    print(scores[:10])
    plt.title('KDE Outlier score: Log scale')
    plt.xticks(range(10), [songnames[i].split("(")[0].split(".")[0].split("-")[0] for i in idx], rotation = 25)
-    plt.yticks([1.79112252e-05, 1.79112253e-05, 1.79112254e-05, 1.79112255e-05])
+    plt.yticks([7.90598304e-05, 7.90598320e-05, 7.90598337e-05, 7.90598352e-05])
    plt.show()

    K = 9  # TODO ????
@@ -132,7 +168,7 @@
    density = density[idx]

    # Plot k-neighbor estimate of outlier score (distances)
    plt.bar(range(10), density[:10])
-    plt.xticks(range(10), [songnames[i].split("(")[0].split(".")[0] for i in idx], rotation = 25)
+    plt.xticks(range(10), [songnames[i].split("(")[0].split(".")[0].split("-")[0] for i in idx], rotation = 25)
    plt.title('KNN density: Outlier score')
    plt.show()
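For reference, the two outlier scores plotted in outlier1 can be condensed as follows (a sketch reconstructed from the imports and plots above, with assumed names; not the verbatim repo code):

    from scipy.stats import gaussian_kde
    from sklearn.neighbors import NearestNeighbors
    import numpy as np

    def outlier_scores(X, K=9):
        # KDE score: density of each observation under a Gaussian KDE fitted on
        # the data; gaussian_kde expects shape (n_features, n_samples)
        kde_scores = gaussian_kde(X.T).evaluate(X.T)
        # KNN density: inverse mean distance to the K nearest neighbours
        # (K + 1 because each point is its own nearest neighbour)
        dist, _ = NearestNeighbors(n_neighbors=K + 1).fit(X).kneighbors(X)
        knn_density = 1.0 / dist[:, 1:].mean(axis=1)
        # Ascending sort: the lowest-density observations come first
        return np.argsort(kde_scores), np.argsort(knn_density)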
@@ -174,12 +210,13 @@ def association_mining1(X, labels):
if __name__ == "__main__":
-    #X, y, attributeNames, song_names = load_data(standardize = True, target = 'tempo', intervals = [90, 100, 110])
+    X, y, attributeNames, song_names = load_data(standardize = True, target = 'tempo', intervals = [90, 100, 110])
    #clusters = clustering1(X)
    # clustering2(X)
+    #clustering2enhalv(X, y, 9)
    # clustering3(X, y, 9)
    #song_names = list(song_names)
-    #outlier1(X, song_names)
+    outlier1(X, song_names)
-    X, attributeNames, song_names = load_data_binarized(3)
-    association_mining1(X, attributeNames)
\ No newline at end of file
+    # X, attributeNames, song_names = load_data_binarized(3)
+    # association_mining1(X, attributeNames)
\ No newline at end of file