https://github.com/awslabs/datawig
Raw File
Tip revision: 18099c579c850901a58ccbfdeecc6a7738f62b1c authored by dependabot[bot] on 01 March 2023, 20:33:22 UTC
Bump mxnet from 1.4.0 to 1.9.1 in /requirements
Tip revision: 18099c5
test_column_encoders.py
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not
# use this file except in compliance with the License. A copy of the License
# is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is distributed on
# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.

"""

DataWig ColumnEncoder tests

"""

import os

import numpy as np
import pandas as pd
import pytest

from datawig import column_encoders

df = pd.DataFrame({'features': ['xwcxG pQldP Cel0n 5LaWO 2cjTu',
                                '2cjTu YizDY u1aEa Cel0n SntTK',
                                '2cjTu YizDY u1aEa Cel0n SntTK'],
                   'labels': ['xwcxG', 'SntTK', 'SntTK']})

categorical_encoder = column_encoders.CategoricalEncoder(['labels'], max_tokens=3).fit(df)
sequential_encoder = column_encoders.SequentialEncoder(['features'],
                                                       max_tokens=50, seq_len=3).fit(df)

# CategoricalEncoder Tests
def test_categorical_encoder_unfitted_fail():
    unfitted_categorical_encoder = column_encoders.CategoricalEncoder(["col_1"])
    assert not unfitted_categorical_encoder.is_fitted()
    with pytest.raises(column_encoders.NotFittedError):
        unfitted_categorical_encoder.transform(pd.DataFrame({"col_1": ['a', 'b']}))


def test_fit_categorical_encoder():
    assert categorical_encoder.is_fitted()
    assert categorical_encoder.token_to_idx == {'SntTK': 1, 'xwcxG': 2}
    assert categorical_encoder.idx_to_token == {1: 'SntTK', 2: 'xwcxG'}


def test_categorical_encoder_transform():
    assert categorical_encoder.transform(df).flatten()[0] == 2.


def test_categorical_encoder_transform_missing_token():
    assert categorical_encoder.transform(pd.DataFrame({'labels': ['foobar']})).flatten()[0] == 0


def test_categorical_encoder_max_token():
    categorical_encoder = column_encoders.CategoricalEncoder(['labels'], max_tokens=1e4).fit(df)
    assert categorical_encoder.max_tokens == 2


def test_categorical_encoder_decode_token():
    assert categorical_encoder.decode_token(1) == 'SntTK'


def test_categorical_encoder_decode_missing_token():
    assert categorical_encoder.decode_token(0) == 'MISSING'


def test_categorical_encoder_decode():
    assert categorical_encoder.decode(pd.Series([1])).values[0] == 'SntTK'


def test_categorical_encoder_decode_missing():
    assert categorical_encoder.decode(pd.Series([0])).values[0] == 'MISSING'


def test_categorical_encoder_non_negative_embedding_indices():
    assert all(categorical_encoder.transform(df).flatten() >= 0)


# SequentialEncoder Tests
def test_sequential_encoder_unfitted_fail():
    unfitted_sequential_encoder = column_encoders.SequentialEncoder(["col_1"])
    assert not unfitted_sequential_encoder.is_fitted()
    with pytest.raises(column_encoders.NotFittedError):
        unfitted_sequential_encoder.transform(pd.DataFrame({'brand': ['ab']}))


def test_fit_sequential_encoder():
    sequential_encoder_fewer_tokens = column_encoders.SequentialEncoder(['features'],
                                                                        max_tokens=5,
                                                                        seq_len=3).fit(df)
    assert (set(sequential_encoder_fewer_tokens.token_to_idx.keys()) == {'u', 'a', 'n', 'T', ' '})


def test_sequential_encoder_transform():
    encoded = pd.Series([vec.tolist() for vec in sequential_encoder.transform(df)])
    true_decoded = df['features'].apply(lambda x: x[:sequential_encoder.output_dim])
    assert all(sequential_encoder.decode(encoded) == true_decoded)


def test_sequential_encoder_transform_missing_token():
    assert (sequential_encoder.transform(pd.DataFrame({'features': ['!~']}))[0].tolist() == [0, 0,
                                                                                             0])


def test_sequential_encoder_max_token():
    sequential_encoder_short = column_encoders.SequentialEncoder("features", max_tokens=1e4,
                                                                 seq_len=2)
    sequential_encoder_short.fit(df)
    assert sequential_encoder.is_fitted()
    assert sequential_encoder_short.max_tokens == 32


def test_sequential_encoder_non_negative_embedding_indices():
    assert all(sequential_encoder.transform(df).flatten() >= 0)


def test_bow_encoder():
    bow_encoder = column_encoders.BowEncoder("features", max_tokens=5)
    assert bow_encoder.is_fitted()
    bow = bow_encoder.transform(df)[0].toarray()[0]
    true = np.array([0.615587, -0.3077935, -0.3077935, -0.41039133, 0.51298916])
    assert true == pytest.approx(bow, 1e-4)


def test_bow_encoder_multicol():
    bow_encoder = column_encoders.BowEncoder(["item_name", "product_description"], max_tokens=5)
    data = pd.DataFrame({'item_name': ['bla'], 'product_description': ['fasl']})
    bow = bow_encoder.transform(data)[0].toarray()[0]
    true = np.array([0.27500955, -0.82502865, -0.1833397, 0., -0.45834925])
    assert true == pytest.approx(bow, 1e-4)
    data_strings = ['item_name bla product_description fasl ']
    assert true == pytest.approx(bow_encoder.vectorizer.transform(data_strings).toarray()[0])


def test_categorical_encoder_numeric():
    df = pd.DataFrame({'brand': [1, 2, 3]})
    try:
        column_encoders.CategoricalEncoder("brand").fit(df)
    except TypeError:
        pytest.fail("fitting categorical encoder on integers should not fail")


def test_categorical_encoder_numeric_transform():
    df = pd.DataFrame({'brand': [1, 2, 3, 1, 2, 1, np.nan, None]})
    col_enc = column_encoders.CategoricalEncoder("brand").fit(df)
    assert np.array_equal(col_enc.transform(df), np.array([[1], [2], [3], [1], [2], [1], [0], [0]]))


def test_categorical_encoder_numeric_nan():
    df = pd.DataFrame({'brand': [1, 2, 3, None]})
    try:
        column_encoders.CategoricalEncoder("brand").fit(df)
    except TypeError:
        pytest.fail("fitting categorical encoder on integers with nulls should not fail")

def test_column_encoder_no_list_input_column():
    column_encoder = column_encoders.ColumnEncoder("0")
    assert column_encoder.input_columns == ['0']
    assert column_encoder.output_column == '0'
    with pytest.raises(ValueError):
        column_encoders.ColumnEncoder(0)
    with pytest.raises(ValueError):
        column_encoders.ColumnEncoder([0])


def test_numeric_encoder():
    df = pd.DataFrame({'a': [1, 2, 3, np.nan, None], 'b': [.1, -.1, np.nan, None, 10.5]})
    unfitted_numerical_encoder = column_encoders.NumericalEncoder(["a", 'b'], normalize=False)
    assert unfitted_numerical_encoder.is_fitted()
    fitted_unnormalized_numerical_encoder = unfitted_numerical_encoder.fit(df)
    df_unnormalized = fitted_unnormalized_numerical_encoder.transform(df.copy())

    assert np.array_equal(df_unnormalized, np.array([[1., 0.1],
                                                     [2., -0.1],
                                                     [3., 3.5],
                                                     [2.0, 3.5],
                                                     [2.0, 10.5]], dtype=np.float32))
    df_nans = pd.DataFrame({'a': [None], 'b': [np.nan]})
    df_unnormalized_nans = fitted_unnormalized_numerical_encoder.transform(df_nans.copy())
    assert np.array_equal(df_unnormalized_nans, np.array([[2., 3.5]], dtype=np.float32))

    normalized_numerical_encoder = column_encoders.NumericalEncoder(["a", 'b'], normalize=True)
    assert not normalized_numerical_encoder.is_fitted()
    normalized_numerical_encoder_fitted = normalized_numerical_encoder.fit(df)
    df_normalized = normalized_numerical_encoder_fitted.transform(df)

    assert normalized_numerical_encoder.is_fitted()
    assert np.array_equal(df_normalized, np.array([[-1.58113885, -0.88666826],
                                                   [0., -0.93882525],
                                                   [1.58113885, 0.],
                                                   [0., 0.],
                                                   [0., 1.82549345]], dtype=np.float32))

def test_tfidf_encoder():
    tfidf_encoder = column_encoders.TfIdfEncoder("features", max_tokens=5)
    assert tfidf_encoder.is_fitted() is False
    tfidf_encoder.fit(df)
    bow = tfidf_encoder.transform(df)[0].toarray()[0]
    true = np.array([0.75592893, 0.5669467, 0.18898223, 0.18898223, 0.18898223])
    assert tfidf_encoder.is_fitted() is True
    assert true == pytest.approx(bow, 1e-4)

    decoded_indices = tfidf_encoder.decode(pd.Series([0, 1, 2]))
    assert np.array_equal(decoded_indices.values, np.array([' ', 'c', 'e']))
back to top