Skip to content
Snippets Groups Projects
Commit 012e7f84 authored by sorenmulli's avatar sorenmulli
Browse files

Hej Per parameter op til marisering

parent bca818f1
No related branches found
No related tags found
No related merge requests found
...@@ -5,6 +5,7 @@ os.chdir(sys.path[0]) ...@@ -5,6 +5,7 @@ os.chdir(sys.path[0])
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from sklearn.preprocessing import binarize
def load_data(standardize, target, intervals): def load_data(standardize, target, intervals):
# Load needed data into proper format # Load needed data into proper format
...@@ -14,7 +15,7 @@ def load_data(standardize, target, intervals): ...@@ -14,7 +15,7 @@ def load_data(standardize, target, intervals):
song_names = df["song_title"] song_names = df["song_title"]
# Drop index value target and text informations # Drop index value target and text informations
df = df.drop(df.columns[[0, 6, 14, 15, 16]], axis=1) df = df.drop(df.columns[[0, 6,9, 12, 14, 15, 16]], axis=1)
raw_data = df.get_values() raw_data = df.get_values()
x = raw_data[:,] x = raw_data[:,]
...@@ -55,3 +56,42 @@ def load_data(standardize, target, intervals): ...@@ -55,3 +56,42 @@ def load_data(standardize, target, intervals):
y[high] = 4 y[high] = 4
return X, y, attributeNames, song_names return X, y, attributeNames, song_names
def load_data_binarized(quantile):
    """Load the Spotify data set and one-hot encode each attribute by quantile bin.

    Reads ``data/spotify_data.csv`` (relative to the current working
    directory), drops the columns at positions 0, 6, 9, 12, 14, 15 and 16,
    and splits every remaining column into ``quantile`` bins bounded by its
    empirical ``i/quantile`` quantiles (``i = 1 .. quantile``). Each bin
    becomes one 0/1 output column, so every row has exactly one 1 per
    original attribute.

    Args:
        quantile: Number of bins per attribute (3 gives terciles).

    Returns:
        A tuple ``(thresholded_array, long_names, song_names)`` where
        ``thresholded_array`` has shape ``(N, M * quantile)`` with the bins
        of attribute ``a`` stored in columns ``a*quantile .. a*quantile +
        quantile - 1``, ``long_names`` holds the matching column labels
        ``"<attribute>_q<k>"`` and ``song_names`` is the ``song_title``
        column of the CSV.
    """
    # Load needed data into proper format
    df = pd.read_csv('data/spotify_data.csv', sep=',', header=0)
    song_names = df["song_title"]

    # Drop index value target and text informations
    df = df.drop(df.columns[[0, 6, 9, 12, 14, 15, 16]], axis=1)

    # Extract names of attributes
    attributeNames = np.asarray(df.columns)

    # The remaining columns are converted to float in one step, so they must
    # all be numeric (DataFrame.get_values() no longer exists in pandas >= 1.0).
    X = df.to_numpy(dtype=float)
    n_rows, n_attributes = X.shape

    fractions = [i / quantile for i in range(1, quantile + 1)]
    long_names = []
    thresholded_array = np.zeros((n_rows, n_attributes * quantile))
    for attribute_i in range(n_attributes):
        column = X[:, attribute_i]
        thresholds = np.quantile(column, fractions)
        # The stride must equal the number of bins; a fixed stride of 3 only
        # lines up with the allocated columns when quantile == 3.
        offset = quantile * attribute_i
        for k, threshold in enumerate(thresholds):
            long_names.append(f"{attributeNames[attribute_i]}_q{k+1}")
            # Cumulative indicator "value <= k-th threshold" ...
            thresholded_array[column <= threshold, offset + k] = 1
            # ... minus the lower bins already made exclusive, which leaves
            # only the rows falling between the previous and this threshold.
            thresholded_array[:, offset + k] -= thresholded_array[:, offset:offset + k].sum(axis=1)

    return thresholded_array, long_names, song_names
if __name__ == "__main__":
    # Running this module directly just exercises the three-bin binarization.
    load_data_binarized(quantile=3)
\ No newline at end of file
from data_load import load_data from data_load import load_data, load_data_binarized
from crossvalidate import onelevel_crossvalidation, twolevel_crossvalidation, statistics_outer_twolevel from crossvalidate import onelevel_crossvalidation, twolevel_crossvalidation, statistics_outer_twolevel
from sklearn import model_selection from sklearn import model_selection
...@@ -16,6 +16,8 @@ import matplotlib.pyplot as plt ...@@ -16,6 +16,8 @@ import matplotlib.pyplot as plt
from scipy.stats.kde import gaussian_kde from scipy.stats.kde import gaussian_kde
from apyori import apriori
def clustering1(X): def clustering1(X):
method = 'complete' method = 'complete'
metric = 'euclidean' metric = 'euclidean'
...@@ -148,10 +150,36 @@ def outlier1(X, songnames): ...@@ -148,10 +150,36 @@ def outlier1(X, songnames):
plt.show() plt.show()
def association_mining1(X, labels, min_support=.11, min_confidence=.6):
    """Mine association rules from a binary matrix and print them.

    Every row of ``X`` is turned into a transaction containing the labels of
    its non-zero columns. ``apriori`` is run on these transactions and each
    resulting rule is printed as ``{base} -> {add}`` together with its
    support and confidence.

    Args:
        X: 2-D array whose non-zero entries mark the items present in a row.
        labels: Sequence mapping a column index of ``X`` to an item name.
        min_support: Minimum support passed to ``apriori``.
        min_confidence: Minimum confidence passed to ``apriori``.

    Returns:
        None. The rules are only printed.
    """
    transactions = []
    for row in X:
        present = np.nonzero(row)[0].tolist()
        transactions.append([labels[column] for column in present])

    rules = apriori(transactions, min_support=min_support, min_confidence=min_confidence)
    for rule in rules:
        supp = rule.support
        for stat in rule.ordered_statistics:
            conf = stat.confidence
            x = ", ".join(list(stat.items_base))
            y = ", ".join(list(stat.items_add))
            print("{%s} -> {%s} (supp: %.3f, conf: %.3f)"%(x,y, supp, conf))
if __name__ == "__main__":
    # Earlier experiments (clustering and outlier detection on the
    # standardized data) are kept disabled, as in the committed version.
    #X, y, attributeNames, song_names = load_data(standardize = True, target = 'tempo', intervals = [90, 100, 110])
    #clusters = clustering1(X)
    #clustering2(X)
    #clustering3(X, y, 9)
    #song_names = list(song_names)
    #outlier1(X, song_names)

    # Association mining on the attributes binarized into three quantile bins.
    X, attributeNames, song_names = load_data_binarized(3)
    association_mining1(X, attributeNames)
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment