Hej Per parameter op til marisering

012e7f84 · sorenmulli · bca818f1 · 012e7f84 · 012e7f84
Commit 012e7f84 authored 5 years ago by sorenmulli
--- a/src/data_load.py
+++ b/src/data_load.py
@@ -5,6 +5,7 @@ os.chdir(sys.path[0])
 import numpy as np 
 import pandas as pd 
+from sklearn.preprocessing import binarize
 def load_data(standardize, target, intervals):
 	# Load needed data into proper format
@@ -14,7 +15,7 @@ def load_data(standardize, target, intervals):
 	song_names = df["song_title"]
 	# Drop index value target and text informations
-	df = df.drop(df.columns[[0, 6, 14, 15, 16]], axis=1) 
+	df = df.drop(df.columns[[0, 6,9, 12, 14, 15, 16]], axis=1) 
 	raw_data = df.get_values()
 	x = raw_data[:,]
@@ -55,3 +56,42 @@ def load_data(standardize, target, intervals):
 	y[high] = 4
 	return X, y, attributeNames, song_names
+def load_data_binarized(quantile):
+	"""Load the Spotify data and one-hot encode every kept attribute by quantile bin.
+
+	Reads 'data/spotify_data.csv' (relative to the working directory), drops
+	the columns at positions 0, 6, 9, 12, 14, 15 and 16, and replaces each
+	remaining attribute by `quantile` indicator columns. Indicator k of an
+	attribute is 1 for the rows whose value is <= the k/quantile sample
+	quantile of that attribute and > the previous one (the first bin has no
+	lower bound), so a row has at most one 1 per attribute.
+
+	Args:
+		quantile: number of bins (indicator columns) per attribute.
+
+	Returns:
+		Tuple (thresholded_array, long_names, song_names):
+		thresholded_array is an (N, M*quantile) array of 0/1 floats where
+		column quantile*j + k belongs to bin k of attribute j; long_names is
+		the list of column labels "<attribute>_q<k+1>" in the same order;
+		song_names is the "song_title" column of the CSV.
+	"""
+	df = pd.read_csv('data/spotify_data.csv' , sep=',',header=0)
+	song_names = df["song_title"]
+	# Drop index, target and text columns (selected by position)
+	df = df.drop(df.columns[[0, 6,9, 12, 14, 15, 16]], axis=1)
+	# DataFrame.get_values() was removed in pandas 1.0; to_numpy(dtype=float)
+	# also replaces the element-by-element copy into a float array
+	X = df.to_numpy(dtype=float)
+	n_samples, n_attributes = X.shape
+	# Extract names of attributes
+	attributeNames = np.asarray(df.columns)
+	long_names = list()
+	thresholded_array = np.zeros((n_samples, n_attributes*quantile))
+	# Quantile levels 1/quantile, 2/quantile, ..., 1 are the same for every attribute
+	levels = [i/quantile for i in range(1, quantile + 1)]
+	for attribute_i in range(n_attributes):
+		column = X[:, attribute_i]
+		thresholds = np.quantile(column, levels)
+		# Rows already claimed by a lower bin of this attribute
+		below_previous = np.zeros(n_samples, dtype=bool)
+		for k, threshold in enumerate(thresholds):
+			long_names.append(f"{attributeNames[attribute_i]}_q{k+1}")
+			below_current = column <= threshold
+			# Each attribute owns `quantile` consecutive columns; the offset was
+			# previously hard-coded to 3, which broke every other bin count
+			thresholded_array[below_current & ~below_previous, quantile*attribute_i + k] = 1
+			below_previous = below_current
+	return thresholded_array, long_names, song_names
+if __name__ == "__main__":
+	# Manual check when run as a script: build the encoding with quantile=3
+	load_data_binarized(3)
\ No newline at end of file
--- a/src/main.py
+++ b/src/main.py
-from data_load import load_data
+from data_load import load_data, load_data_binarized
 from crossvalidate import onelevel_crossvalidation, twolevel_crossvalidation, statistics_outer_twolevel
 from sklearn import model_selection
@@ -16,6 +16,8 @@ import matplotlib.pyplot as plt
 from scipy.stats.kde import gaussian_kde
+from apyori import apriori
 def clustering1(X):
 	method = 'complete'
 	metric = 'euclidean'
@@ -148,10 +150,36 @@ def outlier1(X, songnames):
 	plt.show()
+def association_mining1(X, labels):
+	"""Mine association rules from a binary matrix with apriori and print them.
+
+	Every row of X is turned into a transaction containing the labels of its
+	non-zero columns. Rules are mined with a minimum support of 0.11 and a
+	minimum confidence of 0.6, and each rule is printed as
+	"{base} -> {added}" together with its support and confidence.
+
+	Args:
+		X: 2-D array indexed as X[row, column]; non-zero entries mark items.
+		labels: sequence giving the item label of each column of X.
+	"""
+	support_floor = .11
+	confidence_floor = .6
+	# One transaction per row: the labels of the columns that are switched on
+	transactions = [
+		[labels[col] for col in np.nonzero(X[row, :])[0].tolist()]
+		for row in range(X.shape[0])
+	]
+	found = apriori( transactions, min_support=support_floor, min_confidence=confidence_floor)
+	for record in found:
+		for stat in record.ordered_statistics:
+			antecedent = ", ".join( list( stat.items_base ) )
+			consequent = ", ".join( list( stat.items_add ) )
+			print("{%s} -> {%s}  (supp: %.3f, conf: %.3f)"%(antecedent, consequent, record.support, stat.confidence))
 if __name__ == "__main__":
-	X, y, attributeNames, song_names = load_data(standardize = True, target = 'tempo', intervals = [90, 100, 110])
+	#X, y, attributeNames, song_names = load_data(standardize = True, target = 'tempo', intervals = [90, 100, 110])
 	#clusters = clustering1(X)
 	#clustering2(X)
 	#clustering3(X, y, 9)
-	song_names = list(song_names)
+	#song_names = list(song_names)
-	outlier1(X, song_names)
+	#outlier1(X, song_names)
\ No newline at end of file
+	# Load the quantile-binarized data (quantile=3) and print the mined association rules
+	X, attributeNames, song_names = load_data_binarized(3)
+	association_mining1(X, attributeNames)
\ No newline at end of file