https://github.com/dataiku-research/cardinal
Revision 8702a2b60bdc0d00fd91182af77c7d8e92987380 (Alexandre Abraham, 26 August 2020): Add experimenting files for icmd2020 paper
plot_confidence_vs_diversity.py
"""
Lowest confidence vs. KMeans sampling
=====================================
This example shows the importance of diversity-based approaches using a
toy example where a very unlucky initialization makes the lowest confidence
approach underperform.
"""
##############################################################################
# These are the necessary imports and initializations
from matplotlib import pyplot as plt
from matplotlib.patches import Polygon
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.svm import SVC
from cardinal.uncertainty import ConfidenceSampler
from cardinal.clustering import KMeansSampler
from cardinal.batch import RankedBatchSampler
from cardinal.random import RandomSampler
np.random.seed(7)
##############################################################################
# We simulate data where the samples of one of the classes are scattered over 3
# blobs, one of them being far away from the two others. We also select an
# initialization index such that no sample from the far-away blob is initially
# selected. This forces the decision boundary to stay far from that cluster
# and thus "tricks" the lowest confidence method.
#
# The parameters of this experiment are:
#
# * `n` is the number of points in the simulated data,
# * `batch_size` is the number of samples that will be annotated and added to
#   the training set at each iteration,
# * `n_iter` is the number of iterations in our simulation.
n = 28
batch_size = 4
n_iter = 5
X, y = make_blobs(n_samples=n, centers=[(1, 0), (0, 1), (2, 2), (4, 0)],
                  random_state=0, cluster_std=0.2)
# The initial labeled samples are drawn from clusters 0, 1 and 2 only, so
# cluster 3, the far-away blob, will be ignored by uncertainty sampling
init_idx = [i for j in range(3) for i in np.where(y == j)[0][:2]]
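# Merge clusters 1, 2 and 3 into a single class so that the problem is binary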
y[y > 1] = 1
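# A linear SVM with a very large C behaves like a hard-margin classifier;
# probability=True makes class probabilities available to the confidence-based
# sampler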
model = SVC(kernel='linear', C=1E10, probability=True)
##############################################################################
# This helper function plots our simulated points in red and blue. The ones
# that are not in the training set are shown faded. We also plot the linear
# separation estimated by the SVM.
def plot(a, b, score, selected):
    plt.xlabel('Accuracy {}%'.format(int(score * 100)), fontsize=10)
    l_to_c = {0: 'tomato', 1: 'royalblue'}
    f = (lambda x: a * x + b)
    x1, x2 = (np.min(X[:, 0]), np.max(X[:, 0]))
    y1, y2 = (np.min(X[:, 1]), np.max(X[:, 1]))

    # Compute the intersections of the decision line with the plot borders so
    # that each side of the boundary can be shaded with its predicted class
    p1, p2 = (x1, a * x1 + b), ((y1 - b) / a, y1)
    p3, p4 = (x2, a * x2 + b), ((y2 - b) / a, y2)
    p1, p2, p3, p4 = sorted([p1, p2, p3, p4])
    corners = [(x1, y1), (x1, y2), (x2, y2), (x2, y1)]
    dists = [f(x) - y for x, y in corners]
    # Rotate the corners so that those on each side of the line are contiguous
    while dists[0] > 0 or dists[-1] < 0:
        dists.append(dists.pop(0))
        corners.append(corners.pop(0))
    first_pos = next(i for i, x in enumerate(dists) if x > 0)
    plt.gca().add_patch(Polygon(
        [p3, p2] + corners[:first_pos], joinstyle='round',
        facecolor=l_to_c[model.predict([corners[0]])[0]], alpha=0.2))
    plt.gca().add_patch(Polygon(
        [p2, p3] + corners[first_pos:], joinstyle='round',
        facecolor=l_to_c[model.predict([corners[-1]])[0]], alpha=0.2))

    # Plot unselected samples first with a low alpha, then the selected ones
    for l, s in [(0, False), (1, False), (0, True), (1, True)]:
        alpha = 1. if s else 0.3
        mask = np.logical_and(selected == s, l == y)
        plt.scatter(X[mask, 0], X[mask, 1], c=l_to_c[l], alpha=alpha)

    # Plot the decision boundary estimated by the SVM
    plt.plot(*zip(p2, p3), c='purple')
    eps = 0.1
    plt.gca().set_xlim(x1 - eps, x2 + eps)
    plt.gca().set_ylim(y1 - eps, y2 + eps)
##############################################################################
# Core Active Learning Experiment
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# As presented in the introduction, this loop is the core of the active
# learning experiment. At each iteration, the model is trained on all labeled
# data and its accuracy is measured. The model is then inspected to find the
# samples on which its confidence is the lowest. This is done through cardinal
# samplers.
#
# In this experiment, we see that lowest confidence explores the far-away
# cluster only once all other samples have been labeled. KMeans uses a more
# exploratory approach and selects samples in this cluster right away.
# It is worth noticing that random sampling also has good exploration
# properties.
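#
# Stripped of the plotting code, the essential sampler calls in the loop below
# boil down to the following sketch (same ``fit`` / ``select_samples`` calls,
# shown here for the lowest confidence sampler)::
#
#     sampler = ConfidenceSampler(model, batch_size)
#     for _ in range(n_iter):
#         model.fit(X[mask], y[mask])
#         sampler.fit(X[mask], y[mask])
#         new_idx = indices[~mask][sampler.select_samples(X[~mask])]
#         mask[new_idx] = True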
samplers = [
    ('Lowest confidence', ConfidenceSampler(model, batch_size)),
    ('KMeans', KMeansSampler(batch_size)),
    ('Weighted KMeans', KMeansSampler(batch_size)),
    ('Batch', RankedBatchSampler(batch_size)),
    ('Random', RandomSampler(batch_size))
]
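# 'Weighted KMeans' and 'Batch' additionally use the confidence scores of the
# current model as sample weights; this is handled explicitly in the loop below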
plt.figure(figsize=(10, 10))
for i, (sampler_name, sampler) in enumerate(samplers):
    # mask keeps track of the samples that have been labeled so far
    mask = np.zeros(n, dtype=bool)
    indices = np.arange(n)
    mask[init_idx] = True

    for j in range(n_iter):
        model.fit(X[mask], y[mask])
        sampler.fit(X[mask], y[mask])
        w = model.coef_[0]
        plt.subplot(len(samplers), n_iter, i * n_iter + j + 1)

        if sampler_name == 'Batch':
            # This sampler expects a weight for every sample of the dataset;
            # already labeled samples are given a weight of -1 so that they
            # are not selected again
            weights = ConfidenceSampler(model, batch_size).score_samples(X)
            weights[mask] = -1
            selected = sampler.select_samples(X, samples_weights=weights)
            mask[selected] = True
        elif sampler_name == 'Weighted KMeans':
            # Weight the unlabeled samples by the uncertainty of the model
            weights = ConfidenceSampler(model, batch_size).score_samples(X[~mask])
            selected = sampler.select_samples(X[~mask], samples_weights=weights)
            mask[indices[~mask][selected]] = True
        else:
            selected = sampler.select_samples(X[~mask])
            mask[indices[~mask][selected]] = True

        if j == 0:
            plt.ylabel(sampler_name)
        plt.axis('tight')
        plt.gca().set_xticks(())
        plt.gca().set_yticks(())
        if i == 0:
            plt.gca().set_title('Iteration {}'.format(j), fontsize=10)
        plot(-w[0] / w[1], -model.intercept_[0] / w[1], model.score(X, y),
             mask.copy())
plt.tight_layout()
plt.subplots_adjust(top=0.86)
plt.gcf().suptitle('Classification accuracy of the different active learning strategies on simulated data', fontsize=12)
plt.show()