# Source repository: https://github.com/awslabs/datawig
# Tip revision: 18099c579c850901a58ccbfdeecc6a7738f62b1c authored by dependabot[bot] on 01 March 2023, 20:33:22 UTC
# Bump mxnet from 1.4.0 to 1.9.1 in /requirements
# Bump mxnet from 1.4.0 to 1.9.1 in /requirements
# Tip revision: 18099c5
# File: conftest.py
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not
# use this file except in compliance with the License. A copy of the License
# is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is distributed on
# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
import os
import random
import shutil
import mxnet as mx
import numpy as np
import pandas as pd
import pytest
from datawig.utils import rand_string
@pytest.fixture(scope='function')
def reset_random_seed():
    """
    Seed the Python, NumPy and MXNet random number generators with 0.

    The fixture is function-scoped and not autouse, so the seeding happens
    once for each test that requests it. Nothing is yielded to the test and
    there is no teardown step.
    """
    # Same order as always: stdlib first, then NumPy, then MXNet.
    for seed_fn in (random.seed, np.random.seed, mx.random.seed):
        seed_fn(0)
    yield
@pytest.fixture
def test_dir():
    """
    Provide the path of the "resources" directory next to this file.

    The path is yielded to the test; once the test has finished, the
    directory and everything below it is deleted.

    :return: absolute path of the "resources" directory (as a string)
    """
    file_dir = os.path.dirname(os.path.realpath(__file__))
    joined = os.path.join(file_dir, "resources")
    yield joined
    # This fixture never creates the directory itself. A test that wrote
    # nothing (or failed before writing) leaves nothing to remove, and that
    # must not surface as a teardown error. Any other failure still raises.
    try:
        shutil.rmtree(joined)
    except FileNotFoundError:
        pass
@pytest.fixture
def data_frame():
    """
    Fixture that hands tests a factory for random text/label data frames.

    The fixture takes no arguments itself; a test calls the returned function
    with the settings it needs.
    """

    def _inner_impl(
            feature_col='features',
            label_col='labels',
            n_samples=500,
            word_length=5,
            num_words=100,
            vocab_size=100,
            num_labels=10):
        """
        Generates text features and categorical labels.

        A vocabulary of ``vocab_size`` random strings is created; its first
        ``num_labels`` entries act as labels and the remainder as filler
        words. Every row is a shuffled, blank separated sequence of
        ``num_words`` filler words plus exactly one label token.

        :param feature_col: name of feature column.
        :param label_col: name of label column.
        :param n_samples: how many rows to generate; 0 gives an empty frame.
        :param word_length: argument passed to ``rand_string`` for each
            vocabulary entry (presumably the string length).
        :param num_words: number of filler tokens per row.
        :param vocab_size: number of vocabulary entries (labels + fillers).
        :param num_labels: number of leading vocabulary entries used as labels.
        :return: pd.DataFrame with columns = [feature_col, label_col]
        :raises IndexError: from ``random.choice`` when a row is generated but
            the label set is empty, or the filler set is empty while
            ``num_words`` > 0.
        """
        vocab = [rand_string(word_length) for _ in range(vocab_size)]
        labels = vocab[:num_labels]
        words = vocab[num_labels:]

        def _sentence_with_label(labels=labels, words=words):
            """
            Generates a random token sequence containing a random label

            :param labels: label set
            :param words: vocabulary of tokens
            :return: blank separated token sequence and label
            """
            label = random.choice(labels)
            tokens = [random.choice(words) for _ in range(num_words)] + [label]
            sentence = " ".join(np.random.permutation(tokens))
            return sentence, label

        # Collect rows in a loop instead of unpacking zip(*rows): the unpacking
        # fails with ValueError when there are no rows at all. The order of
        # PRNG calls is unchanged, so seeded tests see the same data.
        sentences = []
        sentence_labels = []
        for _ in range(n_samples):
            sentence, label = _sentence_with_label(labels, words)
            sentences.append(sentence)
            sentence_labels.append(label)

        return pd.DataFrame({feature_col: sentences, label_col: sentence_labels})

    return _inner_impl
def synthetic_label_shift_simple(N, label_proportions, error_proba, covariates=None) -> pd.DataFrame:
    """
    Build a data frame with a synthetic label shift and one string feature.

    For each of the N rows a label index is drawn according to
    ``label_proportions``. The row's covariate is the one at that index,
    unless a uniform draw is not greater than ``error_proba``; then the
    covariate of a different, uniformly chosen label index is used.

    :param N: Number of observations
    :param label_proportions: Dirichlet vector with label proportions
    :param error_proba: Probability of sampling from a different covariate distribution
    :param covariates: list of covariate names, one per label; generated with
        ``rand_string(6)`` when omitted
    :return: pd.DataFrame with columns ['covariate', 'label'], where label is
        'label_' followed by the drawn label index
    """
    num_classes = len(label_proportions)
    if covariates is None:
        covariates = [rand_string(6) for _ in range(num_classes)]

    rows = []
    for _ in range(N):
        label = np.random.choice(range(num_classes), p=label_proportions)
        picked = label
        if not np.random.rand() > error_proba:
            # Noisy row: take the covariate of any label except the drawn one.
            alternatives = [idx for idx in range(num_classes) if idx != label]
            picked = np.random.choice(alternatives)
        rows.append((covariates[picked], 'label_' + str(label)))

    return pd.DataFrame(rows, columns=['covariate', 'label'])