Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Benchmarking Scripts for Mlpack's LMNN, Shogun's LMNN & Matlab's LMNN #123

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
105 changes: 104 additions & 1 deletion config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -794,6 +794,74 @@ methods:
normalize: True
seed: 42

LMNN:
run: ['metric']
script: methods/mlpack/lmnn.py
format: [csv, txt]
datasets:
- files: ['datasets/iris_train.csv',
'datasets/satellite_train.csv', 'datasets/ionosphere.csv',
'datasets/balance_scale.csv', 'datasets/letter_recognition.csv',
'datasets/oilspill_train.csv', 'datasets/shuttle_train.csv',
'datasets/ecoli_train.csv', 'datasets/vehicle.csv']
options:
num_targets: 3
passes: 10
range: 20
seed: 42

- files: ['datasets/letter_recognition.csv',
'datasets/shuttle_train.csv', 'datasets/isolet_train.csv',
'datasets/covtype.csv', 'datasets/optdigits_train.csv',
'datasets/mnist_all.csv', 'datasets/Twitter.csv']
options:
num_targets: 3
passes: 3
range: 100
seed: 42

- files: ['datasets/iris_train.csv',
'datasets/ecoli_train.csv', 'datasets/vehicle.csv',
'datasets/balance_scale.csv', 'datasets/ionosphere.csv']
options:
num_targets: 3
passes: 5
optimizer: bbsgd
seed: 42

- files: ['datasets/iris_train.csv',
'datasets/satellite_train.csv', 'datasets/ionosphere.csv',
'datasets/ecoli_train.csv', 'datasets/vehicle.csv',
'datasets/balance_scale.csv', 'datasets/letter_recognition.csv']
options:
num_targets: 3
passes: 5
optimizer: sgd
range: 50
step_size: 1e-07
seed: 42

- files: ['datasets/iris_train.csv',
'datasets/satellite_train.csv', 'datasets/ionosphere.csv',
'datasets/ecoli_train.csv', 'datasets/vehicle.csv',
'datasets/balance_scale.csv', 'datasets/letter_recognition.csv']
options:
num_targets: 3
max_iterations: 2000
optimizer: lbfgs
seed: 42
range: 50

- files: ['datasets/covtype.csv',
'datasets/shuttle_train.csv', 'datasets/isolet_train.csv',
'datasets/mnist_all.csv', 'datasets/letter_recognition.csv']
options:
num_targets: 3
max_iterations: 2000
optimizer: lbfgs
seed: 42
range: 100

HMMTRAIN:
run: ['metric']
script: methods/mlpack/hmm_train.py
Expand Down Expand Up @@ -884,6 +952,21 @@ methods:
new_dimensionality: 2
scaled: True

LMNN:
run: ['metric']
script: methods/matlab/lmnn.py
format: [csv, txt]
datasets:
- files: ['datasets/iris_train.csv',
'datasets/satellite_train.csv', 'datasets/ionosphere.csv',
'datasets/balance_scale.csv', 'datasets/vehicle.csv',
'datasets/oilspill_train.csv', 'datasets/ecoli_train.csv',
'datasets/letter_recognition.csv', 'datasets/shuttle_train.csv',
'datasets/isolet_train.csv', 'datasets/optdigits_train.csv',
'datasets/covtype.csv', 'datasets/mnist_all.csv']
options:
k: 3

PERCEPTRON:
run: ['metric']
script: methods/matlab/perceptron.py
Expand Down Expand Up @@ -2174,6 +2257,26 @@ methods:
options:
lambda1: 0.01

LMNN:
run: ['metric']
script: methods/shogun/lmnn.py
format: [csv, txt]
datasets:
- files: [ ['datasets/iris_train.csv'],
['datasets/ecoli_train.csv'],
['datasets/vehicle.csv'],
['datasets/ionosphere.csv'],
['datasets/shuttle_train.csv'],
['datasets/letter_recognition.csv'],
['datasets/balance_scale.csv'],
['datasets/oilspill_train.csv'],
['datasets/mnist_all.csv'],
['datasets/Twitter.csv'],
['datasets/isolet_train.csv'],
['datasets/covtype.csv']]
options:
k: 3

QDA:
run: ['metric','metric']
script: methods/shogun/qda.py
Expand Down Expand Up @@ -3109,4 +3212,4 @@ methods:
['datasets/sickEuthyroid_train.csv', 'datasets/sickEuthyroid_test.csv', 'datasets/sickEuthyroid_labels.csv'],
['datasets/abalone7_train.csv', 'datasets/abalone7_test.csv', 'datasets/abalone7_labels.csv'],
['datasets/satellite_train.csv', 'datasets/satellite_test.csv', 'datasets/satellite_labels.csv'],
['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]
['datasets/ecoli_train.csv', 'datasets/ecoli_test.csv', 'datasets/ecoli_labels.csv'] ]
2 changes: 2 additions & 0 deletions datasets/dataset-urls.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ artificial_1DSignal*.csv mlpack.org/datasets/artificial_1DSignal.tar.gz
artificial_2DSignal*.csv mlpack.org/datasets/artificial_2DSignal.tar.gz
artificial_40D*.csv mlpack.org/datasets/artificial_40D.tar.gz
artificial_5DSignal*.csv mlpack.org/datasets/artificial_5DSignal.tar.gz
balance_scale*.csv mlpack.org/datasets/balance_scale.tar.gz
bank8FM.csv mlpack.org/datasets/bank8FM.tar.gz
cal_housing.csv mlpack.org/datasets/cal_housing.tar.gz
circle_data.csv mlpack.org/datasets/circle.tar.gz
Expand All @@ -25,6 +26,7 @@ faces.csv mlpack.org/datasets/faces.tar.gz
ionosphere.csv mlpack.org/datasets/ionosphere.tar.gz
iris*.csv mlpack.org/datasets/iris.tar.gz
isolet*.csv mlpack.org/datasets/isolet.tar.gz
letter_recognition*.csv mlpack.org/datasets/letter_recognition.tar.gz
madelon*.csv mlpack.org/datasets/madelon.tar.gz
mammography*.csv mlpack.org/datasets/mammography.tar.gz
mnist*.csv mlpack.org/datasets/mnist.tar.gz
Expand Down
168 changes: 168 additions & 0 deletions methods/matlab/LMNN.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
% @file LMNN.m

function lmnn(cmd)
% LMNN Learns a metric using large-margin nearest neighbor metric learning
%
% The function uses large-margin nearest neighbor (LMNN) metric learning to
% learn a metric on the data set specified by the NxD matrix X and the
% corresponding Nx1 vector labels. The learned linear transformation is
% written to 'distance.csv'.
%
% Required options:
%   (-i) [string] Input dataset to perform LMNN on.  The last column of the
%        CSV file holds the class labels.
% Options:
%   (-k) [int]    Desired number of target neighbors (default: 3).
%
%
% This file is part of the Matlab Toolbox for Dimensionality Reduction.
% The toolbox can be obtained from http://homepage.tudelft.nl/19j49
% You are free to use, change, or redistribute this code in any way you
% want for non-commercial purposes. However, it is appreciated if you
% maintain the name of the original author.
%
% (C) Laurens van der Maaten, Delft University of Technology

inputFile = regexp(cmd, '.*?-i ([^\s]+)', 'tokens', 'once');

% Load input dataset.
X = csvread(inputFile{:});

% Use the last column of the data as the labels.
labels = X(:,end);
% Remove the label column.
X = X(:,1:end-1);

% Parse the optional number of target neighbors.  Note that the name K is
% reused below for the number of classes, so the option is stored directly
% in no_targets.  Defaults to 3 when -k is not given, which preserves the
% previous hard-coded behavior.
kToken = regexp(cmd, '.*?-k ([^\s]+)', 'tokens', 'once');
if isempty(kToken)
  no_targets = 3;
else
  no_targets = str2double(kToken{1});
end

total_time = tic;

% Initialize some variables.
[N, D] = size(X);
assert(length(labels) == N);
% Map arbitrary label values onto 1..K and build an N-x-K indicator matrix.
[lablist, ~, labels] = unique(labels);
K = length(lablist);
label_matrix = false(N, K);
label_matrix(sub2ind(size(label_matrix), (1:length(labels))', labels)) = true;
% same_label(i,j) is true iff points i and j share a class.
same_label = logical(double(label_matrix) * double(label_matrix'));
M = eye(D);
C = Inf; prev_C = Inf;

% Set learning parameters.
min_iter = 50;        % minimum number of iterations
max_iter = 1000;      % maximum number of iterations
eta = .1;             % learning rate
mu = .5;              % weighting of pull and push terms
tol = 1e-3;           % tolerance for convergence
best_C = Inf;         % best error obtained so far
best_M = M;           % best metric found so far

% Select target neighbors: for each point, the no_targets nearest
% same-class points under the Euclidean metric.
sum_X = sum(X .^ 2, 2);
DD = bsxfun(@plus, sum_X, bsxfun(@plus, sum_X', -2 * (X * X')));
DD(~same_label) = Inf; DD(1:N + 1:end) = Inf;
[~, targets_ind] = sort(DD, 2, 'ascend');
targets_ind = targets_ind(:,1:no_targets);
targets = false(N, N);
targets(sub2ind([N N], vec(repmat((1:N)', [1 no_targets])), vec(targets_ind))) = true;

% Compute pulling term between target neighbors to initialize gradient.
slack = zeros(N, N, no_targets);
G = zeros(D, D);
for i=1:no_targets
    G = G + (1 - mu) .* (X - X(targets_ind(:,i),:))' * (X - X(targets_ind(:,i),:));
end

% Perform main learning iterations (projected subgradient descent).
iter = 0;
while (prev_C - C > tol || iter < min_iter) && iter < max_iter

    % Compute pairwise distances under current metric.
    XM = X * M;
    sum_X = sum(XM .* X, 2);
    DD = bsxfun(@plus, sum_X, bsxfun(@plus, sum_X', -2 * (XM * X')));

    % Compute value of slack variables (margin violations by impostors).
    old_slack = slack;
    for i=1:no_targets
        slack(:,:,i) = ~same_label .* max(0, bsxfun(@minus, 1 + DD(sub2ind([N N], (1:N)', targets_ind(:,i))), DD));
    end

    % Compute value of cost function.
    prev_C = C;
    C = (1 - mu) .* sum(DD(targets)) + ...   % pull terms between target neighbors
        mu .* sum(slack(:));                 % push terms penalizing impostors
    
    % Maintain best solution found so far (subgradient method).
    if C < best_C
        best_C = C;
        best_M = M;
    end

    % Perform gradient update; only slack terms that changed activation
    % state contribute a delta to the running gradient G.
    for i=1:no_targets

        % Add terms for new violations.
        [r, c] = find(slack(:,:,i) > 0 & old_slack(:,:,i) == 0);
        G = G + mu .* ((X(r,:) - X(targets_ind(r, i),:))' * ...
            (X(r,:) - X(targets_ind(r, i),:)) - ...
            (X(r,:) - X(c,:))' * (X(r,:) - X(c,:)));

        % Remove terms for resolved violations.
        [r, c] = find(slack(:,:,i) == 0 & old_slack(:,:,i) > 0);
        G = G - mu .* ((X(r,:) - X(targets_ind(r, i),:))' * ...
            (X(r,:) - X(targets_ind(r, i),:)) - ...
            (X(r,:) - X(c,:))' * (X(r,:) - X(c,:)));
    end
    M = M - (eta ./ N) .* G;

    % Project metric back onto the PSD cone by dropping non-positive
    % eigenvalues.
    [V, L] = eig(M);
    V = real(V); L = real(L);
    ind = find(diag(L) > 0);
    if isempty(ind)
        warning('Projection onto PSD cone failed. All eigenvalues were negative.'); break
    end
    M = V(:,ind) * L(ind, ind) * V(:,ind)';
    if any(isinf(M(:)))
        warning('Projection onto PSD cone failed. Metric contains Inf values.'); break
    end
    if any(isnan(M(:)))
        warning('Projection onto PSD cone failed. Metric contains NaN values.'); break
    end

    % Update learning rate: grow while improving, halve on regression.
    if prev_C > C
        eta = eta * 1.01;
    else
        eta = eta * .5;
    end

    % Print out progress.
    iter = iter + 1;
    no_slack = sum(slack(:) > 0);
    if rem(iter, 10) == 0
        [~, sort_ind] = sort(DD, 2, 'ascend');
        disp(['Iteration ' num2str(iter) ': error is ' num2str(C ./ N) ...
            ', nearest neighbor error is ' num2str(sum(labels(sort_ind(:,2)) ~= labels) ./ N) ...
            ', number of constraints: ' num2str(no_slack)]);
    end
end

% Return best metric and error.
M = best_M;
C = best_C;

% Compute the linear transformation from the learned metric.
% NOTE(review): bsxfun here scales the ROWS of the singular-vector matrix
% by sqrt(S); verify against the downstream consumer that row scaling
% (rather than column scaling, L*sqrt(S)) is the intended factorization.
[L, S, ~] = svd(M);
L = bsxfun(@times, sqrt(diag(S)), L);
disp(sprintf('[INFO ] total_time: %fs', toc(total_time)))

% Save learned distance.
csvwrite('distance.csv', L);
end

% VEC Flatten an array into a single column vector (column-major order).
function x = vec(x)
  x = reshape(x, numel(x), 1);
end
Loading