https://github.com/ntucllab/libact
Tip revision: 1079085b27bcb5d929ab3d9d0bbc2a132ec8cbdc authored by Poy on 11 August 2021, 01:37:03 UTC
[MRG] Upgrade to support newest scikit-learn version (#188)
[MRG] Upgrade to support newest scikit-learn version (#188)
Tip revision: 1079085
albl_plot.py
#!/usr/bin/env python3
"""
The script runs experiments to compare the performance of ALBL and other active
learning algorithms.
"""
import copy
import os
import numpy as np
import matplotlib.pyplot as plt
try:
from sklearn.model_selection import train_test_split
except ImportError:
from sklearn.cross_validation import train_test_split
# libact classes
from libact.base.dataset import Dataset, import_libsvm_sparse
from libact.models import SVM
from libact.query_strategies import QUIRE, UncertaintySampling, RandomSampling,\
ActiveLearningByLearning, HintSVM
from libact.labelers import IdealLabeler
def run(trn_ds, tst_ds, lbr, model, qs, quota):
    """Run one active-learning loop and record the error after every query.

    Each round asks ``qs`` for the index of the next sample, obtains the
    label for that sample from ``lbr``, stores it in ``trn_ds``, retrains
    ``model`` on ``trn_ds`` and scores it on both datasets.

    Parameters
    ----------
    trn_ds : object
        Training pool.  Must provide ``data[i][0]`` (the value handed to the
        labeler) and ``update(i, label)``.  It is modified in place.
    tst_ds : object
        Held-out dataset, only passed to ``model.score``.
    lbr : object
        Labeler providing ``label(x)``.
    model : object
        Learner providing ``train(dataset)`` and ``score(dataset)``.
    qs : object
        Query strategy providing ``make_query()``.
    quota : int
        Number of query rounds to perform.

    Returns
    -------
    E_in, E_out : numpy.ndarray
        ``1 - model.score(trn_ds)`` and ``1 - model.score(tst_ds)`` after
        each round, one entry per round.  Both are empty float arrays when
        no round is executed.
    """
    E_in, E_out = [], []

    for _ in range(quota):
        ask_id = qs.make_query()
        lb = lbr.label(trn_ds.data[ask_id][0])
        trn_ds.update(ask_id, lb)

        model.train(trn_ds)
        E_in.append(1 - model.score(trn_ds))
        E_out.append(1 - model.score(tst_ds))

    # Convert once at the end: np.append copies the whole array on every
    # call, and converting here guarantees ndarrays (so ``.tolist()`` works
    # for callers) even when the loop body never ran.
    return np.asarray(E_in, dtype=float), np.asarray(E_out, dtype=float)
def split_train_test(dataset_filepath, test_size, n_labeled):
    """Load a dataset and build partially-labeled train / test datasets.

    The file is read with ``import_libsvm_sparse`` and split with
    ``train_test_split``.  The split is redrawn until the first
    ``n_labeled`` training labels contain exactly two distinct classes.

    Parameters
    ----------
    dataset_filepath : str
        Path handed to ``import_libsvm_sparse``.
    test_size : float or int
        Forwarded unchanged to ``train_test_split``.
    n_labeled : int
        Number of leading training samples that keep their label; the
        remaining training labels are replaced by ``None``.

    Returns
    -------
    trn_ds : Dataset
        Training set in which only the first ``n_labeled`` labels are set.
    tst_ds : Dataset
        Fully labeled test set.
    y_train : array-like
        All training labels.
    fully_labeled_trn_ds : Dataset
        Training set with every label present.

    Raises
    ------
    ValueError
        If ``n_labeled`` is smaller than 2 or the dataset holds fewer than
        two distinct classes; in both cases the resampling loop below could
        never terminate.
    """
    X, y = import_libsvm_sparse(dataset_filepath).format_sklearn()

    # Fail fast on inputs for which the two-class condition is unreachable
    # instead of resampling forever.
    if n_labeled < 2:
        raise ValueError(
            "n_labeled must be at least 2 to cover two classes, got %r"
            % (n_labeled,))
    if len(np.unique(y)) < 2:
        raise ValueError(
            "dataset %r contains fewer than two distinct classes"
            % (dataset_filepath,))

    # Redraw the split until the initially labeled samples hold exactly two
    # classes.
    while True:
        X_train, X_test, y_train, y_test = \
            train_test_split(X, y, test_size=test_size)
        if len(np.unique(y_train[:n_labeled])) == 2:
            break

    # Labels after the first n_labeled entries are hidden as None.
    trn_ds = Dataset(X_train, np.concatenate(
        [y_train[:n_labeled], [None] * (len(y_train) - n_labeled)]))
    tst_ds = Dataset(X_test, y_test)
    fully_labeled_trn_ds = Dataset(X_train, y_train)

    return trn_ds, tst_ds, y_train, fully_labeled_trn_ds
def main():
    """Compare five query strategies on one dataset and plot the result.

    For each of 20 repetitions the dataset is split anew, every strategy
    gets its own copy of the partially labeled training pool, and ``run``
    records the test error after each query.  The curves are averaged per
    strategy over the repetitions and plotted against the number of queries.
    """
    # Specify the parameters here:
    # path to your binary classification dataset
    ds_name = 'australian'
    script_dir = os.path.dirname(os.path.realpath(__file__))
    dataset_filepath = os.path.join(script_dir, '%s.txt' % ds_name)
    test_size = 0.33    # the percentage of samples in the dataset that will
    # be randomly selected and assigned to the test set
    n_labeled = 10      # number of samples that are initially labeled

    def linear_svm():
        # Base learner used for evaluation and inside most strategies.
        return SVM(kernel='linear', decision_function_shape='ovr')

    def make_uncertainty(pool, n_queries):
        # NOTE(review): this SVM omits kernel='linear', unlike the one used
        # for uncertainty sampling inside ALBL below -- confirm intentional.
        return UncertaintySampling(
            pool, model=SVM(decision_function_shape='ovr'))

    def make_random(pool, n_queries):
        return RandomSampling(pool)

    def make_quire(pool, n_queries):
        return QUIRE(pool)

    def make_hintsvm(pool, n_queries):
        return HintSVM(pool, cl=1.0, ch=1.0)

    def make_albl(pool, n_queries):
        candidates = [
            UncertaintySampling(pool, model=linear_svm()),
            QUIRE(pool),
            HintSVM(pool, cl=1.0, ch=1.0),
        ]
        return ActiveLearningByLearning(
            pool,
            query_strategies=candidates,
            T=n_queries,
            uniform_sampler=True,
            model=linear_svm(),
        )

    # (line format, legend label, strategy factory), in evaluation order.
    contenders = [
        ('g', 'uncertainty sampling', make_uncertainty),
        ('k', 'random', make_random),
        ('r', 'QUIRE', make_quire),
        ('b', 'HintSVM', make_hintsvm),
        ('c', 'ALBL', make_albl),
    ]

    # curves[k] gathers one test-error curve per repetition for contender k.
    curves = [[] for _ in contenders]

    for T in range(20):    # repeat the experiment 20 times
        print("%dth experiment" % (T+1))

        trn_ds, tst_ds, y_train, fully_labeled_trn_ds = \
            split_train_test(dataset_filepath, test_size, n_labeled)
        # The first strategy works on the original pool, every other one on
        # its own deep copy taken before any query is made.
        pools = [trn_ds]
        pools.extend(copy.deepcopy(trn_ds) for _ in contenders[1:])
        lbr = IdealLabeler(fully_labeled_trn_ds)

        quota = len(y_train) - n_labeled    # number of samples to query

        # Each strategy is built right before its own run, then evaluated
        # with a fresh linear SVM as the base learner.
        for collected, pool, (_, _, factory) in zip(curves, pools,
                                                    contenders):
            strategy = factory(pool, quota)
            learner = linear_svm()
            _, test_error = run(pool, tst_ds, lbr, learner, strategy, quota)
            collected.append(test_error.tolist())

    # Average every strategy's curves over the repetitions.
    mean_curves = [np.mean(per_strategy, axis=0) for per_strategy in curves]

    # Plot the mean learning curve of every strategy.
    # The x-axis is the number of queries, and the y-axis is the
    # corresponding error rate.
    query_num = np.arange(1, quota + 1)
    for averaged, (fmt, legend_label, _) in zip(mean_curves, contenders):
        plt.plot(query_num, averaged, fmt, label=legend_label)
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    plt.title('Experiment Result')
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
               fancybox=True, shadow=True, ncol=5)
    plt.show()
# Entry point: run the experiment only when this file is executed directly.
if __name__ == '__main__':
    main()