https://github.com/stavskal/ADASYN
Tip revision: f6a7ee26a51f18baacaf7b291811c9e9a9097a9f authored by Stavrianos Skalidis on 12 April 2016, 09:42:46 UTC
Update README.md
Update README.md
Tip revision: f6a7ee2
adasyn.py
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals
import warnings
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.utils import check_array, check_random_state
from collections import Counter
__author__ = 'stavrianos'
# Link to paper: bit.ly/22KgAnP
class ADASYN(object):
"""
Oversampling parent class with the main methods required by scikit-learn:
fit, transform and fit_transform
"""
def __init__(self,
ratio=0.5,
imb_threshold=0.5,
k=5,
random_state=None,
verbose=True):
"""
:ratio:
Growth percentage with respect to initial minority
class size. For example if ratio=0.65 then after
resampling minority class(es) will have 1.65 times
its initial size
:imb_threshold:
The imbalance ratio threshold to allow/deny oversampling.
For example if imb_threshold=0.5 then minority class needs
to be at most half the size of the majority in order for
resampling to apply
:k:
Number of K-nearest-neighbors
:random_state:
seed for random number generation
:verbose:
Determines if messages will be printed to terminal or not
Extra Instance variables:
:self.X:
Feature matrix to be oversampled
:self.y:
Class labels for data
:self.clstats:
Class populations to determine minority/majority
:self.unique_classes_:
Number of unique classes
:self.maj_class_:
Label of majority class
:self.random_state_:
Seed
"""
self.ratio = ratio
self.imb_threshold = imb_threshold
self.k = k
self.random_state = random_state
self.verbose = verbose
self.clstats = {}
self.num_new = 0
self.index_new = []
def fit(self, X, y):
"""
Class method to define class populations and store them as instance
variables. Also stores majority class label
"""
self.X = check_array(X)
self.y = np.array(y).astype(np.int64)
self.random_state_ = check_random_state(self.random_state)
self.unique_classes_ = set(self.y)
# Initialize all class populations with zero
for element in self.unique_classes_:
self.clstats[element] = 0
# Count occurences of each class
for element in self.y:
self.clstats[element] += 1
# Find majority class
v = list(self.clstats.values())
k = list(self.clstats.keys())
self.maj_class_ = k[v.index(max(v))]
if self.verbose:
print(
'Majority class is %s and total number of classes is %s'
% (self.maj_class_, len(self.unique_classes_)))
def transform(self, X, y):
"""
Applies oversampling transformation to data as proposed by
the ADASYN algorithm. Returns oversampled X,y
"""
self.new_X, self.new_y = self.oversample()
def fit_transform(self, X, y):
"""
Fits the data and then returns the transformed version
"""
self.fit(X, y)
self.new_X, self.new_y = self.oversample()
self.new_X = np.concatenate((self.new_X, self.X), axis=0)
self.new_y = np.concatenate((self.new_y, self.y), axis=0)
return self.new_X, self.new_y
def generate_samples(self, x, knns, knnLabels, cl):
# List to store synthetically generated samples and their labels
new_data = []
new_labels = []
for ind, elem in enumerate(x):
# calculating k-neighbors that belong to minority (their indexes in x)
# Unfortunately knn returns the example itself as a neighbor. So it needs
# to be ignored thats why it is iterated [1:-1] and
# knnLabelsp[ind][+1].
min_knns = [ele for index,ele in enumerate(knns[ind][1:-1])
if knnLabels[ind][index +1] == cl]
if not min_knns:
continue
# generate gi synthetic examples for every minority example
for i in range(0, int(self.gi[ind])):
# randi holds an integer to choose a random minority kNNs
randi = self.random_state_.random_integers(
0, len(min_knns) - 1)
# l is a random number in [0,1)
l = self.random_state_.random_sample()
# X[min_knns[randi]] is the Xzi on equation [5]
si = self.X[elem] + \
(self.X[min_knns[randi]] - self.X[elem]) * l
new_data.append(si)
new_labels.append(self.y[elem])
self.num_new += 1
return(np.asarray(new_data), np.asarray(new_labels))
def oversample(self):
"""
Preliminary calculations before generation of
synthetic samples. Calculates and stores as instance
variables: img_degree(d),G,ri,gi as defined by equations
[1],[2],[3],[4] in the original paper
"""
try:
# Checking if variable exists, i.e. if fit() was called
self.unique_classes_ = self.unique_classes_
except:
raise RuntimeError("You need to fit() before applying tranform(),"
"or simply fit_transform()")
int_X = np.zeros([1, self.X.shape[1]])
int_y = np.zeros([1])
# Iterating through all minority classes to determine
# if they should be oversampled and to what extent
for cl in self.unique_classes_:
# Calculate imbalance degree and compare to threshold
imb_degree = float(self.clstats[cl]) / \
self.clstats[self.maj_class_]
if imb_degree > self.imb_threshold:
if self.verbose:
print('Class %s is within imbalance threshold' % cl)
else:
# G is the number of synthetic examples to be synthetically
# produced for the current minority class
self.G = (self.clstats[self.maj_class_] - self.clstats[cl]) \
* self.ratio
# ADASYN is built upon eucliden distance so p=2 default
self.nearest_neighbors_ = NearestNeighbors(n_neighbors=self.k + 1)
self.nearest_neighbors_.fit(self.X)
# keeping indexes of minority examples
minx = [ind for ind, exam in enumerate(self.X) if self.y[ind] == cl]
# Computing kNearestNeighbors for every minority example
knn = self.nearest_neighbors_.kneighbors(
self.X[minx], return_distance=False)
# Getting labels of k-neighbors of each example to determine how many of them
# are of different class than the one being oversampled
knnLabels = self.y[knn.ravel()].reshape(knn.shape)
tempdi = [Counter(i) for i in knnLabels]
# Calculating ri as defined in ADASYN paper:
# No. of k-neighbors belonging to different class than the minority divided by K
# which is ratio of friendly/non-friendly neighbors
self.ri = np.array(
[(sum(i.values())- i[cl]) / float(self.k) for i in tempdi])
# Normalizing so that ri is a density distribution (i.e.
# sum(ri)=1)
if np.sum(self.ri):
self.ri = self.ri / np.sum(self.ri)
# Calculating #synthetic_examples that need to be generated for
# each minority instance and rounding to nearest integer because
# it can't produce e.g 2.35 new examples.
self.gi = np.rint(self.ri * self.G)
# Generation of synthetic samples
inter_X, inter_y = self.generate_samples(
minx, knn, knnLabels, cl)
# in case no samples where generated at all concatenation
# won't be attempted
if len(inter_X):
int_X = np.concatenate((int_X, inter_X), axis=0)
if len(inter_y):
int_y = np.concatenate((int_y, inter_y), axis=0)
# New samples are concatenated in the beggining of the X,y arrays
# index_new contains the indiced of artificial examples
self.index_new = [i for i in range(0,self.num_new)]
return(int_X[1:-1], int_y[1:-1])