\noindent From the figure, it can be seen that the data is split into four major clusters until the last few connections of the dendrogram. To understand these four major groups, and to get an idea of whether they are somewhat divided by tempo (a rigorous test of this is done in 1.3), examples from each group and the mean tempo group (1 being low tempo, 4 being high tempo) of its songs are shown.
\begin{itemize}
\item Green group: 17 songs. Mean tempo group: \(2.7\)\\
Examples: \textit{Pyramids, Night on Bald Mountain, Zombie, Who is He and What Is He to You}
\item Red group: 39 songs. Mean tempo group: \(2.6\)\\
Examples: \textit{Mozart: Requiem in D Minor, I'll Let You Know, Romance, For Wee Folks}
\item Turquoise group: 161 songs. Mean tempo group: \(3.2\)\\
Examples: \textit{Mask Off, Xanny Family, I've Seen Footage, Bouncin}
\item Purple group: 1800 songs. Mean tempo group: \(3.3\)\\
Examples: \textit{Redbone, Master of None, Parallel Lines, Sneakin'}
\end{itemize}
From this superficial comparison, it seems that there might be a slight connection between the last two groups and songs with higher tempo. From the song names, it is hard to see genre separations, as we do not have access to a genre attribute in the data set. It might seem that music close to a rock genre is prevalent in the first group, while the second group could be linked to instrumental, slower classical music. The third group can be linked to rap, with several of the above songs by the artist \textit{Future}, while the fourth could be seen as pop music, or simply music which is not contained in the other groups.
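The four flat groups discussed above can be obtained by cutting the dendrogram. A minimal sketch using SciPy, assuming Ward linkage and a random placeholder matrix in place of the report's standardized song features:

```python
# Sketch: cut an agglomerative dendrogram into four flat clusters.
# X is placeholder data; in the report it would be the song feature matrix.
# Ward linkage is an assumption, not necessarily the report's choice.
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))                    # placeholder features

Z = linkage(X, method="ward")                    # build the dendrogram
labels = fcluster(Z, t=4, criterion="maxclust")  # cut into 4 flat clusters

print(np.unique(labels))                         # cluster ids 1..4
```

The `maxclust` criterion cuts the tree at the height that yields at most four clusters, matching how the colored groups are read off the dendrogram.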
\subsection{GMM and Component Estimation}
\textit{Cluster the data by the Gaussian Mixture Model and find the number of clusters by cross-validation. Interpret the cluster centers.}\\
...
...
In this section, a clustering is performed using the Gaussian Mixture Model, and the number of components is estimated by cross-validation.
\begin{figure}[H]
\centering
\includegraphics[width = 0.6\linewidth]{gmmloss}
\end{figure}\noindent
From the illustration, it can be concluded that the most appropriate number of components is 9, where \(-\log\mathcal{L} = 12{,}178\). The negative log-likelihood increases when the number of components is either increased (it rises slightly) or decreased from 9, which can be understood as over- and underfitting, respectively.
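The component estimation can be sketched with scikit-learn by minimizing the mean held-out negative log-likelihood over folds. The synthetic three-cluster data, the fold count, and the seeds are illustrative assumptions standing in for the report's setup:

```python
# Sketch: choose the number of GMM components by cross-validated
# negative log-likelihood. X is synthetic placeholder data; in the
# report it would be the song feature matrix.
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import KFold

rng = np.random.default_rng(1)
# Three well-separated blobs, so the "true" component count is 3.
X = np.vstack([rng.normal(loc=m, size=(100, 3)) for m in (0.0, 4.0, 8.0)])

def cv_neg_loglik(X, k, n_splits=5):
    """Mean held-out negative log-likelihood for a k-component GMM."""
    losses = []
    for train, test in KFold(n_splits, shuffle=True, random_state=0).split(X):
        gmm = GaussianMixture(n_components=k, random_state=0).fit(X[train])
        losses.append(-gmm.score(X[test]))  # score = mean log-likelihood
    return float(np.mean(losses))

ks = range(1, 7)
best_k = min(ks, key=lambda k: cv_neg_loglik(X, k))
print(best_k)
```

Plotting `cv_neg_loglik` against `k` reproduces the U-shaped curve in the figure: the held-out loss falls until the data is well covered and rises again as extra components overfit the training folds.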
\subsection{Evaluation of GMM and Hierarchical Clustering}
\textit{Evaluate the quality of the clusterings using GMM label information and for hierarchical clustering with the same number of clusters as in the GMM.}
To evaluate whether the clusterings are similar to the pre-made grouping given by the tempo attribute, three similarity measures are used: the Rand index, the Jaccard index, and normalized mutual information (NMI). The Rand index will typically be high when there are many clusters, since most pairs of observations then lie in different clusters rather than in the same cluster, which pushes the index toward one. Therefore, the Jaccard index is also used; it disregards the pairs of observations that are in different clusters in both partitions. The third measure, NMI, has a more theoretical background from information theory: it quantifies the amount of information one clustering provides about the other. The evaluation of the GMM and the hierarchical clustering is shown in the following table.
\begin{table}[H]
\centering
\begin{tabular}{lccc}
\hline
 & Rand & Jaccard & NMI \\
\hline
Hierarchical & 0.5328 & 0.1923 & 0.0223 \\
GMM & 0.5231 & 0.1237 & 0.0181 \\
\hline
\end{tabular}
\end{table}
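The three measures can be computed as follows. A sketch assuming scikit-learn, where the pair-counting Jaccard index is derived from the pair confusion matrix; the short label vectors are illustrative, not the report's data:

```python
# Sketch: Rand index, pair-counting Jaccard index, and NMI between
# two clusterings. The labels a and b below are toy examples standing
# in for the cluster labels and the tempo groups.
from sklearn.metrics import rand_score, normalized_mutual_info_score
from sklearn.metrics.cluster import pair_confusion_matrix

def jaccard_index(labels_a, labels_b):
    """Agreeing same-cluster pairs over all pairs that are in the same
    cluster in at least one of the two clusterings."""
    (tn, fp), (fn, tp) = pair_confusion_matrix(labels_a, labels_b)
    return tp / (tp + fp + fn)

a = [0, 0, 1, 1, 2, 2]   # e.g. labels from one clustering
b = [0, 0, 1, 2, 2, 2]   # e.g. tempo groups

print(rand_score(a, b))                      # counts same/different pairs
print(jaccard_index(a, b))                   # ignores different/different pairs
print(normalized_mutual_info_score(a, b))    # information-theoretic measure
```

The denominator of `jaccard_index` omits the pairs separated in both clusterings, which is exactly why it does not saturate toward one when the number of clusters is large, unlike the Rand index.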