machine learning

dcf2377d · manxilin · 339d69b0 · dcf2377d · dcf2377d · dcf2377d
Commit dcf2377d authored 4 years ago by manxilin
--- a/02450/Exercises/02450ex9_Python.pdf
+++ b/02450/Exercises/02450ex9_Python.pdf
--- a/02450/toolboxes/02450Toolbox_Python/Scripts/__pycache__/bin_classifier_ensemble.cpython-38.pyc
+++ b/02450/toolboxes/02450Toolbox_Python/Scripts/__pycache__/bin_classifier_ensemble.cpython-38.pyc
--- a/02450/toolboxes/02450Toolbox_Python/Scripts/bin_classifier_ensemble.py
+++ b/02450/toolboxes/02450Toolbox_Python/Scripts/bin_classifier_ensemble.py
+import numpy as np
+
+class BinClassifierEnsemble:
+    '''
+        Simple class to aggregate multiple weak classifiers into an ensemble.
+
+        Every classifier must provide a predict(X) method. Predicted values
+        larger than 1 are set to 1 before voting, so a multi-class
+        classifier is treated as "class 0 versus the rest".
+    '''
+    # Class-level defaults; __init__ overwrites all of them per instance.
+    # A tuple (not a list) so no mutable state is shared between instances.
+    classifiers = ()
+    alpha = 0
+    cn = 0
+
+    def __init__(self, classifier_list, alpha='auto'):
+        '''
+            classifier_list   sequence of fitted classifiers
+            alpha             'auto' for equal weights 1/len(classifier_list),
+                              otherwise an array-like with one weight per
+                              classifier (flattened to one dimension)
+        '''
+        self.classifiers = classifier_list
+        self.cn = len(self.classifiers)
+        if isinstance(alpha, str) and alpha == 'auto':
+            self.alpha = np.ones((self.cn,1),dtype=float)/self.cn
+        else:
+            self.alpha = np.asarray(alpha).ravel()
+
+    def _clipped_predictions(self, X):
+        '''
+            Yields (weight, y_est) for each classifier, where y_est is the
+            transposed matrix of its predictions with values above 1 set to 1.
+        '''
+        for c_id, c in enumerate(self.classifiers):
+            # np.array makes a copy, so the clipping below cannot modify an
+            # array owned by the classifier. np.asmatrix is used because the
+            # np.mat alias was removed in NumPy 2.0.
+            y_est = np.asmatrix(np.array(c.predict(X))).T
+            y_est[y_est>1]=1 # restrict to binomial (or first-vs-rest)
+            yield self.alpha[c_id], y_est
+
+    def predict(self, X):
+        '''
+            Returns predicted class (value of y) for given X,
+            based on ensemble majority vote: 1 where the weighted sum of
+            the clipped predictions exceeds 0.5, otherwise 0.
+        '''
+        votes = np.zeros((X.shape[0],1))
+        for weight, y_est in self._clipped_predictions(X):
+            votes = votes + y_est*weight
+        return (votes.astype(float)>.5).astype(int)
+
+    def predict_proba(self, X):
+        '''
+            Returns 1 minus the weighted sum of the clipped predictions for
+            given X. When the weights sum to one this is the weighted share
+            of ensemble votes for y=0, that is: votes0/(votes0+votes1).
+
+            NOTE(review): the previous description named y=1; the computation
+            is deliberately unchanged because callers may depend on it.
+        '''
+        votes = np.ones((X.shape[0],1))
+        for weight, y_est in self._clipped_predictions(X):
+            votes = votes - y_est*weight
+        return votes.astype(float)
--- a/02450/toolboxes/02450Toolbox_Python/Scripts/bin_classifier_ensemble.pyc
+++ b/02450/toolboxes/02450Toolbox_Python/Scripts/bin_classifier_ensemble.pyc
--- a/02450/toolboxes/02450Toolbox_Python/Scripts/categoric2numeric.py
+++ b/02450/toolboxes/02450Toolbox_Python/Scripts/categoric2numeric.py
+import numpy as np
+
+def categoric2numeric(x):
+    '''
+    CATEGORIC2NUMERIC encodes one categorical column (numeric or text
+    values) as numeric columns using one out of K coding.
+
+    Usage:
+        X_num, attribute_names = categoric2numeric(x)
+
+    Input:
+        x                   categorical column of a data matrix; it is
+                            flattened to one dimension before encoding
+
+    Output:
+        X_num               N x K integer matrix with a single 1 per row,
+                            placed in the column of that row's category;
+                            columns follow the sorted unique values of x
+        attribute_names     list of the K category values as strings '''
+
+    values = np.asarray(x).ravel()
+    categories = np.unique(values)
+    # Compare every entry with every category in one step: an (N,1) column
+    # against a (1,K) row broadcasts to the full N x K membership table.
+    membership = values[:, np.newaxis] == categories[np.newaxis, :]
+    return membership.astype(int), categories.astype(str).tolist()
+    
+    
--- a/02450/toolboxes/02450Toolbox_Python/Scripts/ex9_1_2.py
+++ b/02450/toolboxes/02450Toolbox_Python/Scripts/ex9_1_2.py
@@ -46,3 +46,5 @@ for train_index, test_index in CV.split(X,y):
    k+=2
    
 show()    
+
+# The result is poor because only a single attribute is used; using too many attributes makes it very slow.
\ No newline at end of file
--- a/02450/toolboxes/02450Toolbox_Python/Scripts/ex9_2_2.py
+++ b/02450/toolboxes/02450Toolbox_Python/Scripts/ex9_2_2.py
@@ -20,7 +20,7 @@ C = len(classNames)
 # Fit model using bootstrap aggregation (boosting, AdaBoost):

 # Number of rounds of bagging
-L = 100
+L = 500

 # Weights for selecting samples in each bootstrap
 weights = np.ones((N,),dtype=float)/N

--- a/02450/toolboxes/02450Toolbox_Python/Scripts/similarity.py
+++ b/02450/toolboxes/02450Toolbox_Python/Scripts/similarity.py
+import numpy as np
+from scipy.stats import zscore
+
+
+def similarity(X, Y, method):
+    '''
+    SIMILARITY Computes similarity matrices
+
+    Usage:
+        sim = similarity(X, Y, method)
+
+    Input:
+    X   N1 x M matrix
+    Y   N2 x M matrix 
+    method   string defining one of the following similarity measure
+           'SMC', 'smc'             : Simple Matching Coefficient
+           'Jaccard', 'jac'         : Jaccard coefficient 
+           'ExtendedJaccard', 'ext' : The Extended Jaccard coefficient
+           'Cosine', 'cos'          : Cosine Similarity
+           'Correlation', 'cor'     : Correlation coefficient
+           Only the first three characters are inspected, case-insensitively.
+
+    Output:
+    sim Estimated N1 x N2 similarity matrix between X and Y
+        If input is not binary, SMC and Jaccard will make each
+        attribute binary according to x>median(x)
+
+    Raises:
+    ValueError if method is not one of the measures listed above
+
+    Copyright, Morten Morup and Mikkel N. Schmidt
+    Technical University of Denmark '''
+
+    # np.asmatrix replaces the np.mat alias, which was removed in NumPy 2.0.
+    X = np.asmatrix(X)
+    Y = np.asmatrix(Y)
+    M = np.shape(Y)[1]
+
+    key = method[:3].lower()
+    if key in ('smc', 'jac'):
+        # binarize may overwrite its arguments and np.asmatrix does not copy,
+        # so hand over copies to leave the caller's data untouched.
+        Xb, Yb = binarize(X.copy(), Y.copy())
+        both_zero = (1-Xb)*(1-Yb).T
+        if key == 'smc': # SMC
+            sim = ((Xb*Yb.T)+both_zero)/M
+        else: # Jaccard
+            sim = (Xb*Yb.T)/(M-both_zero)
+    elif key in ('ext', 'cos'):
+        XYt = X*Y.T
+        sq_x = np.sum(np.power(X,2), axis=1)      # N1 x 1 squared row norms
+        sq_y = np.sum(np.power(Y,2), axis=1).T    # 1 x N2 squared row norms
+        if key == 'ext': # Extended Jaccard
+            # The pairwise sum |x|^2 + |y|^2 is formed by broadcasting; going
+            # through log(exp(.)*exp(.)) overflows for large magnitudes.
+            sim = XYt / (sq_x + sq_y - XYt)
+        else: # Cosine
+            sim = XYt / (np.sqrt(sq_x) * np.sqrt(sq_y))
+    elif key == 'cor': # Correlation
+        # Standardise plain arrays and wrap the result, so the product below
+        # is a matrix product whatever zscore does with np.matrix input.
+        X_ = np.asmatrix(zscore(np.asarray(X),axis=1,ddof=1))
+        Y_ = np.asmatrix(zscore(np.asarray(Y),axis=1,ddof=1))
+        sim = (X_*Y_.T)/(M-1)
+    else:
+        raise ValueError('Unknown similarity method: {!r}'.format(method))
+    return sim
+        
+def binarize(X,Y=None):
+    '''
+    Force binary representation of the matrix, according to X>median(X)
+
+    Each column of a two-dimensional X is thresholded at its own median:
+    entries strictly greater than the median become 1, all others 0.
+    A matrix with a single row is thresholded at the median of that row.
+
+    Input:
+    X   two-dimensional array or matrix
+    Y   optional second array or matrix, binarized independently of X
+
+    Output:
+    A new array with the dtype and container type (ndarray or matrix) of X;
+    the input itself is not modified. If Y is given, the list
+    [binarize(X), binarize(Y)] is returned instead.
+    '''
+    if Y is not None:
+        return [binarize(X,None),binarize(Y,None)]
+
+    X = np.asanyarray(X)
+    # A single row has only one value per column, so flip it and use the
+    # median along the row; the result is flipped back before returning.
+    x_was_transposed = X.shape[0] == 1
+    if x_was_transposed:
+        X = X.T
+
+    # Flatten the medians to a plain 1 x M row so the comparison broadcasts
+    # the same way for ndarray and matrix input.
+    medians = np.asarray(np.median(X,0)).reshape(1,-1)
+    result = (np.asarray(X) > medians).astype(X.dtype)
+    if isinstance(X, np.matrix):
+        result = np.asmatrix(result)
+
+    if x_was_transposed:
+        return result.T
+    return result
+        
+
+## Example
+#import numpy as np
+#from similarity import binarize2
+#A = np.asarray([[1,2,3,4,5],[6,7,8,9,10],[1,2,3,4,5],[6,7,8,9,10]]).T
+#binarize2(A,['a','b','c','d'])
+def binarize2(X,columnnames):
+    '''
+    Binarize every column of X and append the complement of every column.
+
+    Input:
+    X             two-dimensional array or matrix
+    columnnames   one name (string) per column of X
+
+    Output:
+    X                 binarize(X) followed column-wise by 1-binarize(X)
+    new_column_names  every name suffixed with ' 50th-100th percentile',
+                      then every name suffixed with ' 0th-50th percentile'
+    '''
+    # Binarize a private copy once: binarize may work in place, and the
+    # caller's data must not change as a side effect of this call.
+    upper = binarize(np.array(X, copy=True, subok=True))
+    X = np.concatenate((upper,1-upper),axis=1)
+
+    new_column_names = [name+' 50th-100th percentile' for name in columnnames]
+    new_column_names += [name+' 0th-50th percentile' for name in columnnames]
+
+    return X, new_column_names
\ No newline at end of file
--- a/02450/toolboxes/02450Toolbox_Python/Scripts/similarity.pyc
+++ b/02450/toolboxes/02450Toolbox_Python/Scripts/similarity.pyc