Skip to content
Snippets Groups Projects
Commit dcf2377d authored by manxilin's avatar manxilin
Browse files

machine learning

parent 339d69b0
Branches
No related tags found
No related merge requests found
File added
import numpy as np
class BinClassifierEnsemble:
    '''
    Simple class to aggregate multiple weak classifiers into an ensemble.

    Every classifier in the ensemble must expose a ``predict(X)`` method
    returning one label per row of X. Labels greater than 1 are treated as 1
    (binomial, or first-vs-rest). Each classifier's vote is scaled by its
    entry in ``alpha``.
    '''
    # Class-level defaults; __init__ replaces all three on every instance.
    classifiers = []
    alpha = 0
    cn = 0

    def __init__(self, classifier_list, alpha='auto'):
        '''
        Parameters:
        classifier_list     sequence of fitted classifiers (need ``predict``)
        alpha               'auto' for equal weights 1/len(classifier_list),
                            otherwise an array-like with one weight per
                            classifier (it is flattened to 1-D)
        '''
        self.classifiers = classifier_list
        self.cn = len(self.classifiers)
        # The isinstance guard avoids an element-wise comparison when alpha
        # is an array.
        if isinstance(alpha, str) and alpha == 'auto':
            self.alpha = np.ones((self.cn, 1), dtype=float) / self.cn
        else:
            self.alpha = np.asarray(alpha).ravel()

    def _binary_vote(self, classifier, X):
        ''' Return classifier.predict(X) as an (N, 1) column, capped at 1. '''
        y_est = np.asarray(classifier.predict(X)).reshape(-1, 1)
        # np.minimum builds a new array, so the classifier's output is never
        # modified in place.
        return np.minimum(y_est, 1)

    def predict(self, X):
        '''
        Returns predicted class (value of y) for given X,
        based on the weighted ensemble vote: 1 where the weighted sum of
        votes for class 1 exceeds 0.5, otherwise 0.

        The result is an (N, 1) integer np.matrix.
        '''
        weights = np.ravel(self.alpha)
        votes = np.zeros((X.shape[0], 1))
        for c_id, c in enumerate(self.classifiers):
            votes = votes + self._binary_vote(c, X) * weights[c_id]
        return np.asmatrix((votes > .5).astype(int))

    def predict_proba(self, X):
        '''
        Returns, for each row of X, 1 minus the weighted sum of ensemble
        votes for class y=1. When the weights sum to one this is the
        weighted share of votes for y=0.

        NOTE(review): the previous docstring described this value as the
        proportion of votes for y=1; the computation has always been the
        complement. Behaviour is kept unchanged here - confirm what callers
        expect before changing it.

        The result is an (N, 1) float np.matrix.
        '''
        weights = np.ravel(self.alpha)
        votes = np.ones((X.shape[0], 1))
        for c_id, c in enumerate(self.classifiers):
            votes = votes - self._binary_vote(c, X) * weights[c_id]
        return np.asmatrix(votes.astype(float))
File added
import numpy as np
def categoric2numeric(x):
    '''
    CATEGORIC2NUMERIC one-out-of-K encodes a categorical column.

    The input (numeric or text values) is flattened, its distinct values are
    sorted with np.unique, and every sample gets a 1 in the column of the
    value it equals and 0 elsewhere.

    Usage:
        X_num, attribute_names = categoric2numeric(x)

    Input:
        x                   categorical column of a data matrix

    Output:
        X_num               N x K integer matrix, one column per distinct
                            value of x (one out of K coding)
        attribute_names     list of K strings: the distinct values of x,
                            in the same order as the columns of X_num
    '''
    values = np.asarray(x).ravel()
    categories = np.unique(values)
    # Compare every sample (rows) with every category (columns) in one
    # broadcast operation instead of looping over the categories.
    one_hot = (values[:, np.newaxis] == categories[np.newaxis, :]).astype(int)
    return one_hot, categories.astype(str).tolist()
......@@ -46,3 +46,5 @@ for train_index, test_index in CV.split(X,y):
k+=2
show()
# The result is poor because only one attribute is used; using too many attributes makes it very slow.
\ No newline at end of file
......@@ -20,7 +20,7 @@ C = len(classNames)
# Fit model using bootstrap aggregation (boosting, AdaBoost):
# Number of rounds of bagging
L = 100
L = 500
# Weights for selecting samples in each bootstrap
weights = np.ones((N,),dtype=float)/N
......
import numpy as np
from scipy.stats import zscore
def similarity(X, Y, method):
    '''
    SIMILARITY Computes similarity matrices

    Usage:
        sim = similarity(X, Y, method)

    Input:
        X           N1 x M matrix
        Y           N2 x M matrix
        method      string defining one of the following similarity measures
                    (only the first three characters are used, case is
                    ignored):
           'SMC', 'smc'             : Simple Matching Coefficient
           'Jaccard', 'jac'         : Jaccard coefficient
           'ExtendedJaccard', 'ext' : The Extended Jaccard coefficient
           'Cosine', 'cos'          : Cosine Similarity
           'Correlation', 'cor'     : Correlation coefficient

    Output:
        sim         N1 x N2 np.matrix; sim[i, j] is the similarity between
                    row i of X and row j of Y

    Raises:
        ValueError  if X and Y have different numbers of columns or the
                    method is not one of those listed above

    SMC and Jaccard first make each attribute binary according to
    x>median(x) (see binarize). X and Y themselves are never modified.

    Copyright, Morten Morup and Mikkel N. Schmidt
    Technical University of Denmark '''
    # Float copies: binarisation below must not overwrite the caller's data,
    # and plain ndarrays avoid np.mat, which was removed in NumPy 2.0.
    X = np.atleast_2d(np.array(X, dtype=float))
    Y = np.atleast_2d(np.array(Y, dtype=float))
    M = X.shape[1]
    if Y.shape[1] != M:
        raise ValueError('X and Y must have the same number of columns '
                         '(got %d and %d)' % (M, Y.shape[1]))

    method = method[:3].lower()
    if method in ('smc', 'jac'):
        X, Y = binarize(X, Y)
        both_one = X.dot(Y.T)                # attributes set in both rows
        both_zero = (1 - X).dot((1 - Y).T)   # attributes clear in both rows
        if method == 'smc':                  # SMC
            sim = (both_one + both_zero) / M
        else:                                # Jaccard
            sim = both_one / (M - both_zero)
    elif method == 'ext':                    # Extended Jaccard
        XYt = X.dot(Y.T)
        sq_x = np.sum(X ** 2, axis=1)
        sq_y = np.sum(Y ** 2, axis=1)
        # |x|^2 + |y|^2 - x.y, formed by a direct outer sum. Going through
        # log(exp(.) * exp(.)) overflows for squared norms above ~709.
        sim = XYt / (sq_x[:, np.newaxis] + sq_y[np.newaxis, :] - XYt)
    elif method == 'cos':                    # Cosine
        norm_x = np.sqrt(np.sum(X ** 2, axis=1))
        norm_y = np.sqrt(np.sum(Y ** 2, axis=1))
        sim = X.dot(Y.T) / np.outer(norm_x, norm_y)
    elif method == 'cor':                    # Correlation
        # Standardise each row (sample standard deviation, ddof=1).
        X_ = zscore(X, axis=1, ddof=1)
        Y_ = zscore(Y, axis=1, ddof=1)
        sim = X_.dot(Y_.T) / (M - 1)
    else:
        raise ValueError('Unknown similarity method: %r' % (method,))
    # Returned as np.matrix, as before, for callers relying on that type.
    return np.asmatrix(sim)
def binarize(X,Y=None):
    '''
    Force binary representation of the matrix, according to X>median(X).

    Each column is thresholded against its own median: entries strictly
    greater than the median become 1, all others 0. A matrix with a single
    row is instead thresholded against the median of that row's entries.

    Input:
        X   array or np.matrix to binarize
        Y   optional second array or np.matrix

    Output:
        If Y is None: a new array with the shape, dtype and type (ndarray or
        np.matrix) of X. Otherwise the list [binarize(X), binarize(Y)], where
        X and Y are binarized independently, each against its own medians
        (not against a median pooled over both).

    The inputs are never modified.
    '''
    if Y is not None:
        return [binarize(X, None), binarize(Y, None)]

    data = np.asarray(X)
    # A single row has no per-column spread, so compare along the row.
    single_row = data.ndim == 2 and data.shape[0] == 1
    medians = np.median(data, axis=1 if single_row else 0, keepdims=True)
    # Build a fresh 0/1 array rather than assigning into X, so the caller's
    # data stays intact.
    binary = (data > medians).astype(data.dtype)
    if isinstance(X, np.matrix):
        binary = np.asmatrix(binary)
    return binary
## Example
#import numpy as np
#from similarity import binarize2
#A = np.asarray([[1,2,3,4,5],[6,7,8,9,10],[1,2,3,4,5],[6,7,8,9,10]]).T
#binarize2(A,['a','b','c','d'])
def binarize2(X,columnnames):
    '''
    Binarize X and append the complement of every binarized column.

    Input:
        X               data matrix, passed to binarize
        columnnames     one name per column of X

    Output:
        X               binarize(X) followed column-wise by 1-binarize(X)
        new_column_names
                        each name suffixed with ' 50th-100th percentile',
                        then each name suffixed with ' 0th-50th percentile',
                        in the same order as the columns of the returned X
    '''
    upper_half = binarize(X)
    lower_half = 1-binarize(X)
    X = np.concatenate((upper_half, lower_half), axis=1)

    upper_names = [name+' 50th-100th percentile' for name in columnnames]
    lower_names = [name+' 0th-50th percentile' for name in columnnames]
    return X, upper_names + lower_names
\ No newline at end of file
File added
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment