Scikit-learn is the most useful and robust library for machine learning in Python.
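# NOTE: the familySize column is already present in the CSV; it appears to have
# been derived from `children` with a helper like the one below (kept for reference).
# def family_size(children):
#     if children > 5:
#         return "Large"
#     elif children > 2:
#         return 'Mid'
#     else:
#         return "Small"
# df['familySize'] = df['children'].apply(lambda x: family_size(x))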
import pandas as pd
# Load the dataset; galton_handson.csv is expected in the current working directory
df = pd.read_csv("galton_handson.csv")
# Display the loaded DataFrame as the cell output
df
/Users/mac/anaconda3/lib/python3.10/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed). from pandas.core import (
family | father | mother | midparentHeight | children | childNum | gender | childHeight | familySize | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 78.5 | 67.0 | 75.43 | 4 | 1.0 | male | 73.2 | Mid |
1 | 1 | 78.5 | 67.0 | 75.43 | 4 | 2.0 | female | 69.2 | Mid |
2 | 1 | 78.5 | 67.0 | 75.43 | 4 | 3.0 | female | 69.0 | Mid |
3 | 1 | 78.5 | 67.0 | 75.43 | 4 | 4.0 | female | 69.0 | Mid |
4 | 2 | 75.5 | 66.5 | 73.66 | 4 | 1.0 | male | 73.5 | Mid |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
929 | 203 | 62.0 | 66.0 | 66.64 | 3 | 1.0 | male | 64.0 | Mid |
930 | 203 | 62.0 | 66.0 | 66.64 | 3 | 2.0 | female | 62.0 | Mid |
931 | 203 | 62.0 | 66.0 | 66.64 | 3 | 3.0 | female | 61.0 | Mid |
932 | 204 | 62.5 | 63.0 | 65.27 | 2 | 1.0 | male | 66.5 | Small |
933 | 204 | 62.5 | 63.0 | 65.27 | 2 | 2.0 | female | 57.0 | Small |
934 rows × 9 columns
# Drop the family column before modelling
# NOTE(review): presumably `family` is only a family identifier with no predictive meaning — confirm
df = df.drop(columns='family')
# Display the DataFrame without the dropped column
df
father | mother | midparentHeight | children | childNum | gender | childHeight | familySize | |
---|---|---|---|---|---|---|---|---|
0 | 78.5 | 67.0 | 75.43 | 4 | 1.0 | male | 73.2 | Mid |
1 | 78.5 | 67.0 | 75.43 | 4 | 2.0 | female | 69.2 | Mid |
2 | 78.5 | 67.0 | 75.43 | 4 | 3.0 | female | 69.0 | Mid |
3 | 78.5 | 67.0 | 75.43 | 4 | 4.0 | female | 69.0 | Mid |
4 | 75.5 | 66.5 | 73.66 | 4 | 1.0 | male | 73.5 | Mid |
... | ... | ... | ... | ... | ... | ... | ... | ... |
929 | 62.0 | 66.0 | 66.64 | 3 | 1.0 | male | 64.0 | Mid |
930 | 62.0 | 66.0 | 66.64 | 3 | 2.0 | female | 62.0 | Mid |
931 | 62.0 | 66.0 | 66.64 | 3 | 3.0 | female | 61.0 | Mid |
932 | 62.5 | 63.0 | 65.27 | 2 | 1.0 | male | 66.5 | Small |
933 | 62.5 | 63.0 | 65.27 | 2 | 2.0 | female | 57.0 | Small |
934 rows × 8 columns
# Inspect column dtypes and non-null counts to spot missing values and mistyped columns
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 934 entries, 0 to 933 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 father 912 non-null float64 1 mother 899 non-null float64 2 midparentHeight 927 non-null object 3 children 934 non-null int64 4 childNum 929 non-null float64 5 gender 915 non-null object 6 childHeight 934 non-null float64 7 familySize 934 non-null object dtypes: float64(4), int64(1), object(3) memory usage: 58.5+ KB
# midparentHeight was read as object dtype: replace commas with dots in the
# string values, then cast the whole column to float in a single chained step
df['midparentHeight'] = df['midparentHeight'].str.replace(",", '.').astype(float)
# Re-check the dtypes to confirm the conversion
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 934 entries, 0 to 933 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 father 912 non-null float64 1 mother 899 non-null float64 2 midparentHeight 927 non-null float64 3 children 934 non-null int64 4 childNum 929 non-null float64 5 gender 915 non-null object 6 childHeight 934 non-null float64 7 familySize 934 non-null object dtypes: float64(5), int64(1), object(2) memory usage: 58.5+ KB
# Count rows that are exact duplicates of an earlier row
df.duplicated().sum()
1
# Remove duplicated rows (pandas keeps the first occurrence by default)
df = df.drop_duplicates()
# Confirm that no duplicated rows remain
df.duplicated().sum()
0
# Feature matrix: every column except the target
X = df.drop(columns=['childHeight'])
# Target vector: the child height we are trying to predict
y = df['childHeight']
# Display the features
X
father | mother | midparentHeight | children | childNum | gender | familySize | |
---|---|---|---|---|---|---|---|
0 | 78.5 | 67.0 | 75.43 | 4 | 1.0 | male | Mid |
1 | 78.5 | 67.0 | 75.43 | 4 | 2.0 | female | Mid |
2 | 78.5 | 67.0 | 75.43 | 4 | 3.0 | female | Mid |
3 | 78.5 | 67.0 | 75.43 | 4 | 4.0 | female | Mid |
4 | 75.5 | 66.5 | 73.66 | 4 | 1.0 | male | Mid |
... | ... | ... | ... | ... | ... | ... | ... |
929 | 62.0 | 66.0 | 66.64 | 3 | 1.0 | male | Mid |
930 | 62.0 | 66.0 | 66.64 | 3 | 2.0 | female | Mid |
931 | 62.0 | 66.0 | 66.64 | 3 | 3.0 | female | Mid |
932 | 62.5 | 63.0 | 65.27 | 2 | 1.0 | male | Small |
933 | 62.5 | 63.0 | 65.27 | 2 | 2.0 | female | Small |
933 rows × 7 columns
# Import the TTS from sklearn
from sklearn.model_selection import train_test_split
# Train test split. No test_size is passed, so scikit-learn's default test share applies;
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=120)
# Display the training features
X_train
father | mother | midparentHeight | children | childNum | gender | familySize | |
---|---|---|---|---|---|---|---|
664 | NaN | 63.0 | 68.02 | 8 | 1.0 | male | Large |
212 | 71.0 | 62.0 | 68.98 | 5 | 2.0 | male | Mid |
629 | 68.0 | 64.0 | 68.56 | 4 | 3.0 | female | Mid |
557 | 69.0 | NaN | 67.44 | 9 | 5.0 | male | Large |
409 | 70.0 | 58.0 | 66.32 | 5 | 3.0 | female | Mid |
... | ... | ... | ... | ... | ... | ... | ... |
382 | 70.5 | 62.0 | 68.73 | 8 | 5.0 | NaN | Large |
735 | 67.0 | 65.0 | 68.60 | 6 | 5.0 | female | Large |
158 | 71.0 | 65.5 | 70.87 | 6 | 5.0 | female | Large |
768 | 67.0 | 63.5 | 67.79 | 8 | 5.0 | male | Large |
679 | 68.0 | 63.0 | 68.02 | 1 | 1.0 | male | Small |
699 rows × 7 columns
Rule: Any change to the data should be justified
# Count missing values per column (computed on the full DataFrame, not only the training split)
df.isna().sum()
father 22 mother 35 midparentHeight 7 children 0 childNum 5 gender 19 childHeight 0 familySize 0 dtype: int64
# Names of the categorical (object-dtype) features
cat_cols = X_train.select_dtypes(include="object").columns
# Names of the numeric features
num_cols = X_train.select_dtypes(include="number").columns
# Show both lists: the first via print, the second as the cell output
print(cat_cols)
num_cols
Index(['gender', 'familySize'], dtype='object')
Index(['father', 'mother', 'midparentHeight', 'children', 'childNum'], dtype='object')
from sklearn.impute import SimpleImputer
# Categorical imputer: fill missing values with the constant string 'NA',
# learned (fit) on the categorical training columns only
impute_na = SimpleImputer(strategy='constant', fill_value='NA')
impute_na.fit(X_train[cat_cols])
# Numeric imputer: fill missing values with each column's training median
impute_median = SimpleImputer(strategy='median')
impute_median.fit(X_train[num_cols])
SimpleImputer(strategy='median')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SimpleImputer(strategy='median')
from sklearn import set_config
# Make scikit-learn transformers return DataFrames instead of NumPy arrays
set_config(transform_output='pandas')
# Impute the training split with the already-fitted imputers
X_train_cat_imputed = impute_na.transform(X_train[cat_cols])
X_train_num_imputed = impute_median.transform(X_train[num_cols])
# Impute the testing split with the same (training-fitted) imputers
X_test_cat_imputed = impute_na.transform(X_test[cat_cols])
X_test_num_imputed = impute_median.transform(X_test[num_cols])
# Display the imputed categorical test data
X_test_cat_imputed
gender | familySize | |
---|---|---|
785 | male | Large |
705 | female | Mid |
357 | male | Mid |
288 | female | Large |
289 | female | Large |
... | ... | ... |
415 | female | Large |
392 | female | Large |
732 | male | Large |
931 | female | Mid |
709 | male | Large |
234 rows × 2 columns
In order for features to be interpreted by a machine learning algorithm, the data must be in a numeric form (integers or floats).
Ordinal encoding is used for converting categorical data into numeric values that preserve the categories' inherent ordering.
from sklearn.preprocessing import OrdinalEncoder
# Columns to encode as ordinal
ordinal_cols = ['familySize']
# Category counts before encoding, for comparison with the encoded counts below
print(X_test_cat_imputed['familySize'].value_counts())
# Family sizes from smallest to largest; a category's position in this list
# becomes its encoded value
fam_size_order = ["Small", "Mid", "Large"]
# OrdinalEncoder expects one ordered category list per encoded column
ordinal_category_orders = [fam_size_order]
# Build the encoder and fit it on the training data (fit returns the encoder itself)
ord_encoder = OrdinalEncoder(categories=ordinal_category_orders).fit(
    X_train_cat_imputed[ordinal_cols]
)
# Encode both splits with the training-fitted encoder
X_train_ordinal_enc = ord_encoder.transform(X_train_cat_imputed[ordinal_cols])
X_test_ordinal_enc = ord_encoder.transform(X_test_cat_imputed[ordinal_cols])
# Category counts after encoding
X_test_ordinal_enc['familySize'].value_counts()
familySize Large 128 Mid 88 Small 18 Name: count, dtype: int64
familySize 2.0 128 1.0 88 0.0 18 Name: count, dtype: int64
# Count how often each gender value occurs (value_counts skips missing values by default)
df['gender'].value_counts()
gender male 467 female 447 Name: count, dtype: int64
How can we represent these values as numbers?
Can we replace every category with numeric value such as "Male" -> 0, "Female" -> 1?
No! An ML algorithm might interpret rows labelled "Female" as carrying more weight than the others, since 1 > 0.
One-hot encoding: It creates a binary column for each class in the column.
from sklearn.preprocessing import OneHotEncoder
# One-hot encoder with dense output; categories not seen during fit are
# ignored at transform time instead of raising an error
ohe_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
# Every categorical feature that is not ordinal gets one-hot encoded
ohe_cols = X_train_cat_imputed.drop(ordinal_cols, axis=1).columns
print(ohe_cols)
# Display the (unfitted) encoder
ohe_encoder
Index(['gender'], dtype='object')
OneHotEncoder(handle_unknown='ignore', sparse_output=False)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
OneHotEncoder(handle_unknown='ignore', sparse_output=False)
# Learn the categories from the training data only
ohe_encoder.fit(X_train_cat_imputed[ohe_cols])
# Encode both splits with the training-fitted encoder
X_test_cat_ohe = ohe_encoder.transform(X_test_cat_imputed[ohe_cols])
X_train_cat_ohe = ohe_encoder.transform(X_train_cat_imputed[ohe_cols])
# Preview the first five encoded test rows
X_test_cat_ohe.head()
gender_NA | gender_female | gender_male | |
---|---|---|---|
785 | 0.0 | 0.0 | 1.0 |
705 | 0.0 | 1.0 | 0.0 |
357 | 0.0 | 0.0 | 1.0 |
288 | 0.0 | 1.0 | 0.0 |
289 | 0.0 | 1.0 | 0.0 |
Feature scaling is the process of making sure that all the values in a dataset are within a certain range.
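Standardization rescales each feature to have a mean of 0 and a standard deviation of 1. It does not change the shape of the distribution, so the result is close to a standard normal distribution only if the original feature was already roughly normal.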
Z-score is the number of standard deviations above and below the mean that the value falls. For example, a Z-score of 2 indicates that an observation is two standard deviations above the average while a Z-score of -2 signifies it is two standard deviations below the mean.
. | . |
---|---|
![]() |
![]() |
The following features need to be scaled (Yes, No):
# Obtain summary statistics (rounded to 2 decimals) for the imputed numeric
# training data before scaling, to see how the feature ranges differ
X_train_num_imputed.describe().round(2)
father | mother | midparentHeight | children | childNum | |
---|---|---|---|---|---|
count | 699.00 | 699.00 | 699.00 | 699.00 | 699.00 |
mean | 69.25 | 64.12 | 69.17 | 6.21 | 3.52 |
std | 2.47 | 2.28 | 1.79 | 2.79 | 2.37 |
min | 62.00 | 58.00 | 64.40 | 1.00 | 1.00 |
25% | 68.00 | 63.00 | 68.14 | 4.00 | 2.00 |
50% | 69.00 | 64.00 | 69.18 | 6.00 | 3.00 |
75% | 71.00 | 66.00 | 70.16 | 8.00 | 5.00 |
max | 78.50 | 70.50 | 75.43 | 15.00 | 15.00 |
# New import for scaler
from sklearn.preprocessing import StandardScaler
# Create the scaler and fit it on the training data only
# (fit returns the scaler itself)
scaler = StandardScaler().fit(X_train_num_imputed)
# Scale both splits using the statistics learned from the training data
X_train_num_scaled = scaler.transform(X_train_num_imputed)
X_test_num_scaled = scaler.transform(X_test_num_imputed)
# Display the scaled training data
X_train_num_scaled
father | mother | midparentHeight | children | childNum | |
---|---|---|---|---|---|
664 | -0.100151 | -0.490184 | -0.643400 | 0.643644 | -1.062764 |
212 | 0.710095 | -0.928229 | -0.107311 | -0.432689 | -0.640198 |
629 | -0.505273 | -0.052139 | -0.341850 | -0.791466 | -0.217631 |
557 | -0.100151 | -0.052139 | -0.967288 | 1.002421 | 0.627503 |
409 | 0.304972 | -2.680409 | -1.592726 | -0.432689 | -0.217631 |
... | ... | ... | ... | ... | ... |
382 | 0.507534 | -0.928229 | -0.246917 | 0.643644 | 0.627503 |
735 | -0.910396 | 0.385906 | -0.319513 | -0.073911 | 0.627503 |
158 | 0.710095 | 0.604928 | 0.948116 | -0.073911 | 0.627503 |
768 | -0.910396 | -0.271162 | -0.771838 | 0.643644 | 0.627503 |
679 | -0.505273 | -0.490184 | -0.643400 | -1.867799 | -1.062764 |
699 rows × 5 columns
from sklearn.pipeline import make_pipeline
# Fresh (unfitted) preprocessors for the pipeline
impute_median = SimpleImputer(strategy='median')
scaler = StandardScaler()
# Numeric preprocessing: median imputation followed by standardization
num_pipe = make_pipeline(impute_median, scaler)
# Fit on the numeric training columns and transform them in one call
X_train_num_tf = num_pipe.fit_transform(X_train[num_cols])
# Transform the testing data with the training-fitted pipeline
X_test_num_tf = num_pipe.transform(X_test[num_cols])
# Display the processed numeric test data
X_test_num_tf
father | mother | midparentHeight | children | childNum | |
---|---|---|---|---|---|
785 | -0.100151 | 1.261996 | 0.004375 | 0.643644 | 0.627503 |
705 | -0.505273 | -1.804319 | -1.548052 | -0.791466 | 0.204936 |
357 | 0.507534 | -0.490184 | 0.054633 | -0.791466 | -1.062764 |
288 | 0.304972 | 0.385906 | 0.518127 | -0.073911 | -0.217631 |
289 | 0.304972 | 0.385906 | 0.518127 | -0.073911 | 0.204936 |
... | ... | ... | ... | ... | ... |
415 | -0.100151 | 1.919063 | 1.294340 | 1.361199 | 0.204936 |
392 | 0.426509 | -0.621598 | -0.091675 | 0.284866 | 1.472636 |
732 | -0.910396 | 0.385906 | -0.319513 | -0.073911 | -0.640198 |
931 | -2.936010 | 0.823951 | -1.414029 | -1.150244 | -0.217631 |
709 | -0.505273 | -2.242364 | -1.849602 | 1.361199 | -0.217631 |
234 rows × 5 columns
# Impute missing family sizes with the most frequent training category.
# A constant 'NA' fill would feed OrdinalEncoder a category that is not in
# fam_size_order, and the encoder raises on unknown categories by default,
# so any row with a missing familySize would crash the pipeline.
# (Variable name kept unchanged so other cells that reference it still work.)
impute_na_ord = SimpleImputer(strategy='most_frequent')
# Family sizes from smallest to largest; a category's position in this list
# becomes its encoded value
fam_size_order = ["Small", "Mid", "Large"]
# OrdinalEncoder expects one ordered category list per encoded column
ordinal_category_orders = [fam_size_order]
# Instantiate the encoder and include the list of ordered values as an argument
ord_encoder = OrdinalEncoder(categories=ordinal_category_orders)
# Final scaler so the encoded category numbers are on the same scale as the numeric features
scaler_ord = StandardScaler()
# Ordinal preprocessing: impute -> ordinal-encode -> standardize
ord_pipe = make_pipeline(impute_na_ord, ord_encoder, scaler_ord)
# Fit the pipeline on the training data only
ord_pipe.fit(X_train[ordinal_cols])
# Transform the training data
X_train_ordinal_tf = ord_pipe.transform(X_train[ordinal_cols])
# Transform the test data
X_test_ordinal_tf = ord_pipe.transform(X_test[ordinal_cols])
# Display the processed ordinal test data
X_test_ordinal_tf
familySize | |
---|---|
785 | 0.799627 |
705 | -0.735921 |
357 | -0.735921 |
288 | 0.799627 |
289 | 0.799627 |
... | ... |
415 | 0.799627 |
392 | 0.799627 |
732 | 0.799627 |
931 | -0.735921 |
709 | 0.799627 |
234 rows × 1 columns
# Nominal preprocessing steps: fill missing values with the constant 'NA',
# then one-hot encode (dense output, unknown categories ignored at transform time)
impute_na = SimpleImputer(strategy='constant', fill_value="NA")
ohe_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
# Chain imputer and encoder into one pipeline
ohe_pipe = make_pipeline(impute_na, ohe_encoder)
# Fit on the nominal training columns and transform them in one call
X_train_ohe_tf = ohe_pipe.fit_transform(X_train[ohe_cols])
# Transform the test data with the training-fitted pipeline
X_test_ohe_tf = ohe_pipe.transform(X_test[ohe_cols])
# Display the one-hot encoded test data
X_test_ohe_tf
gender_NA | gender_female | gender_male | |
---|---|---|---|
785 | 0.0 | 0.0 | 1.0 |
705 | 0.0 | 1.0 | 0.0 |
357 | 0.0 | 0.0 | 1.0 |
288 | 0.0 | 1.0 | 0.0 |
289 | 0.0 | 1.0 | 0.0 |
... | ... | ... | ... |
415 | 0.0 | 1.0 | 0.0 |
392 | 0.0 | 1.0 | 0.0 |
732 | 0.0 | 0.0 | 1.0 |
931 | 0.0 | 1.0 | 0.0 |
709 | 0.0 | 0.0 | 1.0 |
234 rows × 3 columns
from sklearn.compose import ColumnTransformer
########### Numerical pipeline
# Median imputation followed by standardization
impute_median = SimpleImputer(strategy='median')
scaler = StandardScaler()
num_pipe = make_pipeline(impute_median, scaler)

########### Ordinal pipeline
# Impute missing family sizes with the most frequent training category.
# A constant 'NA' fill would feed OrdinalEncoder a category that is not in
# fam_size_order, and the encoder raises on unknown categories by default,
# so any row with a missing familySize would crash the transformer.
# (Variable name kept unchanged so other cells that reference it still work.)
impute_na_ord = SimpleImputer(strategy='most_frequent')
# Family sizes from smallest to largest; a category's position in this list
# becomes its encoded value
fam_size_order = ["Small", "Mid", "Large"]
# OrdinalEncoder expects one ordered category list per encoded column
ordinal_category_orders = [fam_size_order]
ord_encoder = OrdinalEncoder(categories=ordinal_category_orders)
# Final scaler so the encoded category numbers are on the same scale as the numeric features
scaler_ord = StandardScaler()
ord_pipe = make_pipeline(impute_na_ord, ord_encoder, scaler_ord)

######### Nominal pipeline
# Constant 'NA' fill, then one-hot encoding; 'NA' simply becomes its own indicator column
impute_na = SimpleImputer(strategy='constant', fill_value = "NA")
ohe_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe_pipe = make_pipeline(impute_na, ohe_encoder)

######### Build pipelines tuples
# Each tuple is (name, transformer, columns the transformer is applied to)
num_tuple = ('numeric', num_pipe, num_cols)
ord_tuple = ('ordinal', ord_pipe, ordinal_cols)
ohe_tuple = ('categorical', ohe_pipe, ohe_cols)

######### Create ColumnTransformer
# verbose_feature_names_out=False keeps the output column names free of the
# transformer-name prefix; remainder='passthrough' keeps any column not
# listed in the tuples unchanged instead of dropping it
col_transformer = ColumnTransformer(
    [num_tuple, ord_tuple, ohe_tuple],
    remainder='passthrough',
    verbose_feature_names_out=False,
)
# Display the (unfitted) transformer
col_transformer
ColumnTransformer(remainder='passthrough', transformers=[('numeric', Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')), ('standardscaler', StandardScaler())]), Index(['father', 'mother', 'midparentHeight', 'children', 'childNum'], dtype='object')), ('ordinal', Pipeline(steps=[('simpleimputer', SimpleImputer(fill_value='NA', strategy='constant'))... OrdinalEncoder(categories=[['Small', 'Mid', 'Large']])), ('standardscaler', StandardScaler())]), ['familySize']), ('categorical', Pipeline(steps=[('simpleimputer', SimpleImputer(fill_value='NA', strategy='constant')), ('onehotencoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))]), Index(['gender'], dtype='object'))], verbose_feature_names_out=False)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
ColumnTransformer(remainder='passthrough', transformers=[('numeric', Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')), ('standardscaler', StandardScaler())]), Index(['father', 'mother', 'midparentHeight', 'children', 'childNum'], dtype='object')), ('ordinal', Pipeline(steps=[('simpleimputer', SimpleImputer(fill_value='NA', strategy='constant'))... OrdinalEncoder(categories=[['Small', 'Mid', 'Large']])), ('standardscaler', StandardScaler())]), ['familySize']), ('categorical', Pipeline(steps=[('simpleimputer', SimpleImputer(fill_value='NA', strategy='constant')), ('onehotencoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))]), Index(['gender'], dtype='object'))], verbose_feature_names_out=False)
Index(['father', 'mother', 'midparentHeight', 'children', 'childNum'], dtype='object')
SimpleImputer(strategy='median')
StandardScaler()
['familySize']
SimpleImputer(fill_value='NA', strategy='constant')
OrdinalEncoder(categories=[['Small', 'Mid', 'Large']])
StandardScaler()
Index(['gender'], dtype='object')
SimpleImputer(fill_value='NA', strategy='constant')
OneHotEncoder(handle_unknown='ignore', sparse_output=False)
passthrough
# Fit the ColumnTransformer on the training data and transform it in one call
X_train_processed = col_transformer.fit_transform(X_train)
# Transform the testing data with the training-fitted transformer
X_test_processed = col_transformer.transform(X_test)
# Preview the first rows of the processed training data
X_train_processed.head()
father | mother | midparentHeight | children | childNum | familySize | gender_NA | gender_female | gender_male | |
---|---|---|---|---|---|---|---|---|---|
664 | -0.100151 | -0.490184 | -0.643400 | 0.643644 | -1.062764 | 0.799627 | 0.0 | 0.0 | 1.0 |
212 | 0.710095 | -0.928229 | -0.107311 | -0.432689 | -0.640198 | -0.735921 | 0.0 | 0.0 | 1.0 |
629 | -0.505273 | -0.052139 | -0.341850 | -0.791466 | -0.217631 | -0.735921 | 0.0 | 1.0 | 0.0 |
557 | -0.100151 | -0.052139 | -0.967288 | 1.002421 | 0.627503 | 0.799627 | 0.0 | 0.0 | 1.0 |
409 | 0.304972 | -2.680409 | -1.592726 | -0.432689 | -0.217631 | -0.735921 | 0.0 | 1.0 | 0.0 |
ColumnTransformer transforms heterogeneous data types at once: it lets you apply different transformers to different columns of your data.