Commit d004d8ad authored by bjje

Updated structure of lecture plan and file names (needs to be double checked)

parent 9eadc843
%% exercise 3.1.2
cdir = fileparts(mfilename('fullpath'));
[A, D] = tmg(fullfile(cdir,'../Data/textDocs.txt'));
X = full(A)';
attributeNames = cellstr(D);
%% Display the result
display(attributeNames);
display(X);
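% For intuition (a sketch, not part of the exercise): tmg comes from the
% Text to Matrix Generator toolbox, but a small bag-of-words matrix can
% also be built by hand. The names docs, vocab and Xbow below are
% illustrative only; each row of Xbow counts term occurrences in one document.
docs = {'the bag of words representation', 'words in a bag'};
tokens = cellfun(@(d) strsplit(lower(d)), docs, 'UniformOutput', false);
vocab = unique([tokens{:}]);
Xbow = zeros(numel(docs), numel(vocab));
for d = 1:numel(docs)
    [tf, loc] = ismember(tokens{d}, vocab);
    Xbow(d,:) = accumarray(loc(tf)', 1, [numel(vocab), 1])';
end
disp(vocab); disp(Xbow);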
%% exercise 3.1.3
cdir = fileparts(mfilename('fullpath'));
TMGOpts.stoplist = fullfile(cdir,'../Data/stopWords.txt');
[A, D] = tmg(fullfile(cdir,'../Data/textDocs.txt'), TMGOpts);
X = full(A)';
attributeNames = cellstr(D);
%% Display the result
display(attributeNames);
display(X);
%% exercise 3.1.4
cdir = fileparts(mfilename('fullpath'));
TMGOpts.stoplist = fullfile(cdir,'../Data/stopWords.txt');
TMGOpts.stemming = 1;
[A, D] = tmg(fullfile(cdir,'../Data/textDocs.txt'), TMGOpts);
X = full(A)';
attributeNames = cellstr(D);
%% Display the result
display(attributeNames);
display(X);
%% exercise 3.1.5
% Query vector
q = [0; 0; 0; 0; 0; 0; 0; 1; 0; 0; 0; 0; 1; 1; 0; 0; 0]';
%% Method #1 (a for loop)
N = size(X,1); % Get the number of data objects
sim = nan(N,1); % Allocate a vector for the similarity
for i = 1:N
    x = X(i,:);                         % Get the i'th data object
    sim(i) = dot(q/norm(q), x/norm(x)); % Compute cosine similarity
end
%% Method #2 (one compact line of code)
sim = (q*X')'./(sqrt(sum(X.^2,2))*sqrt(sum(q.^2)));
%% Method #3 (use the "similarity" function)
sim = similarity(X, q, 'cos');
%% Display the result
display(sim);
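% Sanity check (a sketch, not part of the exercise): all three methods
% implement the same cosine similarity, x'q/(||x|| ||q||), so their
% outputs should agree up to floating-point round-off.
sim_loop = nan(N,1);
for i = 1:N
    sim_loop(i) = dot(q/norm(q), X(i,:)/norm(X(i,:)));
end
sim_vec = (q*X')'./(sqrt(sum(X.^2,2))*sqrt(sum(q.^2)));
fprintf('Max |difference|: %g\n', max(abs(sim_loop - sim_vec)));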
%% exercise 2.1.1
% Load the data into Matlab
cdir = fileparts(mfilename('fullpath'));
[NUMERIC, TXT, RAW] = xlsread(fullfile(cdir,'../Data/nanonose.xls'));
% Extract the rows and columns corresponding to the sensor data
X = NUMERIC(:,3:10);
% Extract attribute names from the first row
attributeNames = RAW(1,4:end);
% Extract class labels from the first column
classLabels = RAW(3:end,1);
% Extract the unique class names
classNames = unique(classLabels);
% Assign each data object the index of its class name (zero-based)
[y_,y] = ismember(classLabels, classNames); y = y-1;
%% exercise 3.2.1
% Compute summary statistics of the data vector x
x = [-0.68; -2.11; 2.39; 0.26; 1.46; 1.33; 1.03; -0.41; -0.33; 0.47];
mean_x = mean(x);
std_x = std(x);
median_x = median(x);
range_x = range(x);
%% Display results
display(mean_x);
display(std_x);
display(median_x);
display(range_x);
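% Aside (a sketch): range() requires the Statistics and Machine Learning
% Toolbox; without it, the same quantity is simply
range_x_manual = max(x) - min(x);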
%% exercise 2.1.2
% Data attributes to be plotted
i = 1;
j = 2;
% Make a simple plot of the i'th attribute against the j'th attribute
mfig('NanoNose: Data'); clf;
plot(X(:,i), X(:,j),'o');
axis tight
% Make another more fancy plot that includes legend, class labels,
% attribute names, and a title
mfig('NanoNose: Classes'); clf; hold all;
C = length(classNames);
% Use a specific color for each class (easy to reuse across plots!):
colors = get(gca, 'colororder');
% Here we use the standard colours from MATLAB, but you could define your own.
for c = 0:C-1
    h = scatter(X(y==c,i), X(y==c,j), 50, 'o', ...
        'MarkerFaceColor', colors(c+1,:), ...
        'MarkerEdgeAlpha', 0, ...
        'MarkerFaceAlpha', .5);
end
% You can also avoid the loop by using e.g.:
% gscatter(X(:,i), X(:,j), classLabels)
% (but then do not call legend(classNames), as it would overwrite the
% legend with wrong entries)
legend(classNames);
axis tight
xlabel(attributeNames{i});
ylabel(attributeNames{j});
title('NanoNose data');
%% exercise 2.1.3
% Subtract the mean from the data
Y = bsxfun(@minus, X, mean(X));
% Obtain the PCA solution by calculating the SVD of Y
[U, S, V] = svd(Y);
% Compute variance explained
rho = diag(S).^2./sum(diag(S).^2);
threshold = 0.90;
% Plot variance explained
mfig('NanoNose: Var. explained'); clf;
hold on
plot(rho, 'x-');
plot(cumsum(rho), 'o-');
plot([0,length(rho)], [threshold, threshold], 'k--');
legend({'Individual','Cumulative','Threshold'}, ...
'Location','best');
ylim([0, 1]);
xlim([1, length(rho)]);
grid minor
xlabel('Principal component');
ylabel('Variance explained');
title('Variance explained by principal components');
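% A small follow-up (a sketch, not part of the original exercise): the
% smallest number of components needed to reach the threshold can be
% read off programmatically.
K90 = find(cumsum(rho) >= threshold, 1);
fprintf('%d components are needed to explain %.0f%% of the variance\n', ...
    K90, 100*threshold);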
%% exercise 2.1.4
% Indices of the principal components to be plotted
i = 1;
j = 2;
% Compute the projection onto the principal components
Z = U*S;
% Plot PCA of data
mfig('NanoNose: PCA Projection'); clf; hold all;
C = length(classNames);
colors = get(gca,'colororder');
for c = 0:C-1
    scatter(Z(y==c,i), Z(y==c,j), 50, 'o', ...
        'MarkerFaceColor', colors(c+1,:), ...
        'MarkerEdgeAlpha', 0, ...
        'MarkerFaceAlpha', .5);
end
legend(classNames);
axis tight
xlabel(sprintf('PC %d', i));
ylabel(sprintf('PC %d', j));
title('PCA Projection of NanoNose data');
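% Note (a sketch, not part of the exercise): since Y = U*S*V' with
% orthonormal V, the projection Z = U*S equals Y*V; both give the
% coordinates of the data in the principal-component basis.
Z_alt = Y*V;
fprintf('Max |U*S - Y*V|: %g\n', max(abs(Z(:) - Z_alt(:))));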
%% exercise 2.1.5
% We saw in 2.1.3 that the first 3 components explained more than 90
% percent of the variance. Let's look at their coefficients:
pcs = 1:3; % change this to look at more/fewer, or compare e.g. [2,5]
mfig('NanoNose: PCA Component Coefficients');
h = bar(V(:,pcs));
legendCell = cellstr(num2str(pcs', 'PC%-d'));
legend(legendCell, 'location','best');
xticklabels(attributeNames);
grid
xlabel('Attributes');
ylabel('Component coefficients');
title('NanoNose: PCA Component Coefficients');
% Inspecting the plot, we see that the 2nd principal component has large
% (in magnitude) coefficients for attributes A, E and H. We can confirm
% this by looking at its numerical values directly, too:
disp('PC2:')
disp(V(:,2)') % notice the transpose for display in console
% How does this translate to the actual data and its projections?
% Looking at the data for water:
% Projection of water class onto the 2nd principal component.
all_water_data = Y(y==4,:);
disp('First water observation:')
disp(all_water_data(1,:))
% Based on the coefficients and the attribute values for the observation
% displayed, would you expect the projection onto PC2 to be positive or
% negative - why? Consider *both* the magnitude and sign of *both* the
% coefficient and the attribute!
% You can compute the projection directly:
disp('...and its projection onto PC2:')
disp(all_water_data(1,:) * V(:,2))
% Does the sign match your expectation? Try to explain why.
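% To see whether the sign is systematic (a sketch, not part of the
% exercise): project every water observation onto PC2 and inspect the average.
water_proj = all_water_data * V(:,2);
fprintf('Mean projection of water onto PC2: %.2f\n', mean(water_proj));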
%% exercise 2.2.1 / exercise 3.3.1: digit visualization and image query
% Index of the digit/image to display and to use as query
i = 1;
% Similarity measure: 'SMC', 'Jaccard', 'ExtendedJaccard', 'Cosine', 'Correlation'
SimilarityMeasure = 'cos';
%% Load the digits data
cdir = fileparts(mfilename('fullpath'));
load(fullfile(cdir,'../Data/zipdata.mat'));
% Extract digits data
X = traindata(:,2:end);
y = traindata(:,1);
% Visualize the i'th digit as a vector
mfig('Digits: Data');
subplot(4,1,4);
imagesc(X(i,:));
xlabel('Pixel number');
title('Digit in vector format');
set(gca, 'YTick', []);
% Visualize the i'th digit as an image
subplot(4,1,1:3);
I = reshape(X(i,:), [16,16])';
imagesc(I);
colormap(1-gray);
axis image off
title('Digit as an image');
%% Load the image data used for the similarity query
load(fullfile(cdir,'../Data/digits.mat'));
% You can try out the CBCL face database, too:
%load(fullfile(cdir,'../Data/wildfaces_grayscale.mat'));
transpose = true; % set to true if plotted images need to be transposed
[N,M] = size(X);
imageDim = [sqrt(M),sqrt(M)];
%% Search the image database for similar images
% Index of all other images than i
noti = [1:i-1 i+1:N];
% Compute similarity between image i and all others
sim = similarity(X(i,:), X(noti,:), SimilarityMeasure);
% Sort similarities
[val, j] = sort(sim, 'descend');
%% Plot query and result
mfig('Similarity query'); clf;
subplot(3,5,1:5);
img = reshape(X(i,:),imageDim);
if transpose, img = img'; end
imagesc(img);
axis image
set(gca, 'XTick', [], 'YTick', []);
ylabel(sprintf('Image #%d', i));
title('Query image','FontWeight','bold');
for k = 1:5
    subplot(3,5,k+5)
    ii = noti(j(k));
    img = reshape(X(ii,:),imageDim);
    if transpose, img = img'; end
    imagesc(img);
    axis image
    set(gca, 'XTick', [], 'YTick', []);
    xlabel(sprintf('sim=%.2f', val(k)));
    ylabel(sprintf('Image #%d', ii));
    if k==3, title('Most similar images','FontWeight','bold'); end
end
for k = 1:5
    subplot(3,5,k+10)
    ii = noti(j(end+1-k));
    img = reshape(X(ii,:),imageDim);
    if transpose, img = img'; end
    imagesc(img);
    axis image
    set(gca, 'XTick', [], 'YTick', []);
    xlabel(sprintf('sim=%.3f', val(end+1-k)));
    ylabel(sprintf('Image #%d', ii));
    if k==3, title('Least similar images','FontWeight','bold'); end
end
colormap(gray);
%% exercise 2.2.2
% Digits to include in analysis (to include all digits, use n = 0:9);
n = [0,1];
% Number of principal components for reconstruction
K = 22;
% Digits to visualize
nD = 1:5;
%% Load data
cdir = fileparts(mfilename('fullpath'));
load(fullfile(cdir,'../Data/zipdata.mat'));
% Extract digits
X = traindata(:,2:end);
y = traindata(:,1);
classNames = {'0';'1';'2';'3';'4';'5';'6';'7';'8';'9'};
classLabels = classNames(y+1);
% Remove digits that are not to be inspected
j = ismember(y, n);
X = X(j,:);
classLabels = classLabels(j);
classNames = classNames(n+1);
y = cellfun(@(str) find(strcmp(str, classNames)), classLabels)-1;
% Subtract the mean from the data
Y = bsxfun(@minus, X, mean(X));
% Obtain the PCA solution by calculating the SVD of Y
[U, S, V] = svd(Y, 'econ');
% Compute the projection onto the principal components
Z = U*S;
% Compute variance explained
rho = diag(S).^2./sum(diag(S).^2);
%% Plot variance explained
mfig('Digits: Var. explained'); clf;
plot(rho, 'o-');
title('Variance explained by principal components');
xlabel('Principal component');
ylabel('Variance explained');
%% Plot PCA of data
mfig('Digits: PCA'); clf; hold all;
C = length(classNames);
for c = 0:C-1
    plot(Z(y==c,1), Z(y==c,2), 'o');
end
legend(classNames);
xlabel('PC 1');
ylabel('PC 2');
title('PCA of digits data');
%% Visualize the reconstructed data from the first K principal components
mfig('Digits: Reconstruction'); clf;
W = Z(:,1:K)*V(:,1:K)';
D = length(nD);
for d = 1:D
    subplot(2,D,d);
    I = reshape(X(nD(d),:), [16,16])';
    imagesc(I);
    axis image off
    title('Original');
    subplot(2,D,d+D);
    I = reshape(W(nD(d),:)+mean(X), [16,16])';
    imagesc(I);
    axis image off
    title('Reconstructed');
end
colormap(1-gray);
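% How does reconstruction quality depend on K? A sketch (Ktest and Wk are
% illustrative names, not part of the exercise): compare the mean squared
% reconstruction error of the centered data for a few choices of K.
for Ktest = [1, 5, 10, K]
    Wk = Z(:,1:Ktest)*V(:,1:Ktest)';
    fprintf('K = %3d: reconstruction MSE = %.4f\n', Ktest, mean((Y(:) - Wk(:)).^2));
end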
%% Visualize the principal components
mfig('Digits: Principal components'); clf;
N1 = ceil(sqrt(K)); N2 = ceil(K/N1);
for k = 1:K
    subplot(N2, N1, k);
    I = reshape(V(:,k), [16,16])';
    imagesc(I);
    colormap(hot);
    axis image off
    title(sprintf('PC %d',k));
end
%% exercise 3.3.2
% Generate two data objects with M random attributes
M = 5;
x = rand(1,M);
y = rand(1,M);
% Two constants
a = 1.5;
b = 1.5;
% Check the statements in the exercise
similarity(x,y,'cos') - similarity(a*x,y,'cos') % cosine under scaling
similarity(x,y,'ext') - similarity(a*x,y,'ext') % extended Jaccard under scaling
similarity(x,y,'cor') - similarity(a*x,y,'cor') % correlation under scaling
similarity(x,y,'cos') - similarity(b+x,y,'cos') % cosine under translation
similarity(x,y,'ext') - similarity(b+x,y,'ext') % extended Jaccard under translation
similarity(x,y,'cor') - similarity(b+x,y,'cor') % correlation under translation
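% Expected outcome (a sketch of the reasoning, not part of the exercise):
% for a > 0 the scaling factor cancels in the cosine's numerator and
% denominator, so cosine and correlation are unchanged by scaling;
% correlation is additionally unchanged by adding the constant b, because
% the mean subtraction removes it. Extended Jaccard is invariant to
% neither. A tolerance-based check:
tol = 1e-12;
fprintf('cos scale-invariant:       %d\n', abs(similarity(x,y,'cos') - similarity(a*x,y,'cos')) < tol);
fprintf('cor scale-invariant:       %d\n', abs(similarity(x,y,'cor') - similarity(a*x,y,'cor')) < tol);
fprintf('cor translation-invariant: %d\n', abs(similarity(x,y,'cor') - similarity(b+x,y,'cor')) < tol);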
%% exercise 2.3.1
%% Load data
cdir = fileparts(mfilename('fullpath'));
load(fullfile(cdir,'../Data/zipdata.mat'));
% Extract digits (training set)
X = traindata(:,2:end);
y = traindata(:,1);
% Extract digits (test set)
Xtest = testdata(:,2:end);
ytest = testdata(:,1);
% Subtract the training-set mean from both training and test data
Y = bsxfun(@minus, X, mean(X));
Ytest = bsxfun(@minus, Xtest, mean(X));
% Obtain the PCA solution by calculating the SVD of Y
[U, S, V] = svd(Y, 'econ');
% Numbers of principal components to use, i.e. the reduced dimensionality
Krange = [8,10,15,20,30,40,50,60,100,150];
errorRate = zeros(length(Krange),1);
for i = 1:length(Krange)
    K = Krange(i);
    % Compute the projection onto the first K principal components
    Z = Y*V(:,1:K);
    Ztest = Ytest*V(:,1:K);
    % Classify digits using a 1-nearest neighbour classifier
    model = fitcknn(Z, y, 'NumNeighbors', 1);
    yest = predict(model, Ztest);
    errorRate(i) = nnz(ytest~=yest)/length(ytest);
    % Display results
    fprintf('Error rate %.1f%%\n', errorRate(i)*100);
end
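% A small follow-up (a sketch, not part of the original exercise): report
% the projection dimensionality with the lowest test error.
[minErr, iBest] = min(errorRate);
fprintf('Lowest error rate %.1f%% at K = %d\n', 100*minErr, Krange(iBest));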
%% Visualize error rates vs. number of principal components considered
mfig('Digits: Error rate'); clf;
plot(Krange, 100*errorRate, 'o-');
xlabel('Number of principal components K');
ylabel('Error rate [%]');
%% exercise 4.2.1
% Disable xlsread warnings
warning('off', 'MATLAB:xlsread:ActiveX');
warning('off', 'MATLAB:xlsread:Mode');
% Load the data into Matlab
cdir = fileparts(mfilename('fullpath'));
[NUMERIC, TXT, RAW] = xlsread(fullfile(cdir,'../Data/iris.xls'),1,'','basic');
% Extract the rows and columns corresponding to the data
if isnan(NUMERIC(1,1))
    X = NUMERIC(2:end,:);
else
    X = NUMERIC;
end
% Extract attribute names from the first row
attributeNames = RAW(1,1:4)';
% Extract unique class names from the last column
classLabels = RAW(2:end,5)';
classNames = unique(classLabels);
% Extract class labels that match the class names
[y_,y] = ismember(classLabels, classNames); y = y'-1;
% Get the number of data objects, attributes, and classes
[N, M] = size(X);
C = length(classNames);
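% A quick sanity check (a sketch, not part of the original exercise):
% print the number of observations per class to verify the encoding of y.
for c = 1:C
    fprintf('%s: %d observations\n', classNames{c}, sum(y==c-1));
end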