Content - 6225d1b8d360a0e8320aae4f201708b365209382 - 304011e/syntheticMake.py

visit type:

Tip revision: e677a38a563ef5a9eefde04725a247c26424d92d authored by Juliane Müller on 11 February 2021, 08:05:13 UTC
Update README.md

Tip revision: e677a38

syntheticMake.py

# https://machinelearningmastery.com/generate-test-datasets-python-scikit-learn/ (another resource)
# resource: https://towardsdatascience.com/exploratory-data-analysis-with-pandas-508a5e8a5964

from pathlib import Path

import numpy as np
import pandas as pd

# Variables -- RETHINK THESE
# --measurements
# a1: height
# a2: weight
# a3: waist circumference

# --lifestyle
# b1: education
# b2: workout frequency / week
# b3: smoker

# c1: gender

# o1: BMI class -> (uw, h, ov, o, exo) -- underweight, healthy, overweight, obese, extremely obese (a1, a2)
# o2: cardiac risk class -> (low, med, high) males with high a2 have increased cardiac risk (a3, b2, b3, c1)


mu1, sigma1 = 1.6, 0.1  # height: mean and std. dev. in meters
mu2, sigma2 = 10, 1  # latent score behind the lifestyle group: mean and std. dev.
n = 500  # number of synthetic samples


def _percentile_codes(values, interior_percentiles):
    """Cut *values* into classes at percentiles of their own distribution.

    Args:
        values: 1-D array of raw samples.
        interior_percentiles: increasing percentiles strictly between 0 and
            100 that separate the classes.

    Returns:
        ``(edges, codes)``: the bin edges and, per sample, an integer class in
        ``0 .. len(interior_percentiles)``.

    The 100th percentile must not be passed as an edge: np.digitize uses
    half-open bins, so the largest sample would fall past that edge and be
    given an extra class of its own.
    """
    edges = np.percentile(values, q=interior_percentiles)
    return edges, np.digitize(values, edges)


quintile_cuts = [20, 40, 60, 80]  # four cuts -> five classes, coded 0..4

# grp 1: measurements (a1 = height, a2 = weight, a3 = waist circumference)
# a2 and a3 are linear in a1 plus a little noise, so the group is correlated
# without its members being exact copies of each other.
a1 = np.random.normal(mu1, sigma1, n)
a2 = np.random.normal(2, 0.01, n) + a1 * 31
a3 = np.random.normal(0.07, 0.01, n) + a1 * 0.4

# grp 2: lifestyle (b1 = education, b2 = workout frequency, b3 = smoker)
# NOTE(review): all three are binned from the same draw b1_raw, so b2 is
# identical to b1 and b3 is a two-class coarsening of it -- confirm intended.
b1_raw = np.random.normal(mu2, sigma2, n)
b1_bins, b1 = _percentile_codes(b1_raw, quintile_cuts)
b2_bins, b2 = _percentile_codes(b1_raw, quintile_cuts)
b3_bins, b3 = _percentile_codes(b1_raw, [50])  # "smoker": 0 below the median, 1 otherwise

c1 = np.random.randint(0, 2, n)  # binary variable = "gender"

# grp 3: outcome measures computed from the other groups; the point is to
# show a transitive relationship between dimensions.
o1_raw = a2 / (a1 * a1)  # BMI = weight / height^2
o1_bins, o1 = _percentile_codes(o1_raw, quintile_cuts)  # binned BMI, 0..4

# Arbitrary weighted score that decides the cardiac risk class.
o2_raw = np.random.normal(1, 0.01, n) + (4 * a3 + 10 * b2) + (50 * b3 + 2 * c1)
o2_bins, o2 = _percentile_codes(o2_raw, [33, 66.5])  # three classes, 0..2
# print(o2)
# o2 = np.random.normal(1, 0.01, n) + np.multiply(a2, b2)


# Assemble the table: one column per generated variable, one row per sample.
df = pd.DataFrame({
    'a1': a1,
    'a2': a2,
    'a3': a3,
    'b1': b1,
    'b2': b2,
    'b3': b3,
    'c1': c1,
    'o1': o1,
    'o2': o2,
})

# The binned variables are integer codes at this point; turn them into
# qualitative labels. np.digitize yields one code more than the nominal number
# of classes whenever a sample is >= the last bin edge, so every table also
# maps that overflow code (5, 5, 2, 5 and 3 for b1, b2, b3, o1 and o2) onto
# the top label.
category_labels = {
    # education
    'b1': {0: "some HS", 1: "HS", 2: "Uni", 3: "Grad", 4: "Grad+", 5: "Grad+"},
    # workouts per week
    'b2': {0: "< one time", 1: "one - two times", 2: "three times", 3: "four times", 4: "> four times", 5: "> four times"},
    # non-smoker / smoker
    'b3': {0: "NS", 1: "S", 2: "S"},
    # gender
    'c1': {0: "F", 1: "M"},
    # BMI class: underweight, healthy, overweight, obese, morbidly obese
    'o1': {0: "UW", 1: "H", 2: "OW", 3: "OB", 4: "MOB", 5: "MOB"},
    # cardiac risk class
    'o2': {0: "low", 1: "med", 2: "high", 3: "high"},
}
for column, labels in category_labels.items():
    df[column] = df[column].replace(labels)

# Show the column dtypes and the table itself as a quick sanity check.
print(df.dtypes)
print(df)

# Turn the row number into a regular 'index' column so it is written as data.
df.reset_index(inplace=True)

# to_csv does not create missing folders, so make sure the target exists first.
output_path = Path('resources/synthetic-body.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)  # the pandas row index itself is not written

Browse the archive

https://github.com/JulianeMu/IntegratedDualAnalysisAproach_MDA