import pandas as pd
# Load the Ames housing data from the local CSV file
df = pd.read_csv("ames-housing.csv")
# Scatter plot of living area (x) against sale price (y).
# NOTE(review): the trailing ';' is presumably there to hide the Axes repr in the notebook output.
df.plot.scatter(x='Living Area Sqft', y='SalePrice');
/Users/mac/anaconda3/lib/python3.10/site-packages/pandas/core/arrays/masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed). from pandas.core import (
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn import set_config
# Ask scikit-learn transformers to return pandas DataFrames
set_config(transform_output='pandas')

# Read the Galton data and discard the column that is not used as a feature
df = pd.read_csv("galton_handson.csv").drop(columns='family')

# midparentHeight is read as text: swap commas for decimal points, then cast to float
df['midparentHeight'] = df['midparentHeight'].str.replace(",", '.').astype(float)

# Remove duplicated rows
df = df.drop_duplicates()

# Features: every column except the target
X = df.drop(columns='childHeight')
# Target: the value we want to predict
y = df['childHeight']
X.head(5)
father | mother | midparentHeight | children | childNum | gender | familySize | |
---|---|---|---|---|---|---|---|
0 | 78.5 | 67.0 | 75.43 | 4 | 1.0 | male | Mid |
1 | 78.5 | 67.0 | 75.43 | 4 | 2.0 | female | Mid |
2 | 78.5 | 67.0 | 75.43 | 4 | 3.0 | female | Mid |
3 | 78.5 | 67.0 | 75.43 | 4 | 4.0 | female | Mid |
4 | 75.5 | 66.5 | 73.66 | 4 | 1.0 | male | Mid |
# Train/test split (fixed random_state so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Column groups: numeric, ordinal (ordered categories) and nominal (remaining object columns)
num_cols = X_train.select_dtypes("number").columns
ordinal_cols = ['familySize']
ohe_cols = X_train.select_dtypes("object").drop(columns=ordinal_cols).columns

########### Numerical pipeline
# Fill missing values with the median, then standardise
impute_median = SimpleImputer(strategy='median')
scaler = StandardScaler()
num_pipe = make_pipeline(impute_median, scaler)

########### Ordinal pipeline
# Impute with the most frequent category. A constant placeholder such as 'NA'
# is not one of the categories listed below, so OrdinalEncoder (which rejects
# unknown categories by default) would raise as soon as a value was missing.
impute_na_ord = SimpleImputer(strategy='most_frequent')
# Order of the familySize categories, from lowest to highest
fam_size_order = ["Small", "Mid", "Large"]
# OrdinalEncoder expects one ordered list per ordinal column
ordinal_category_orders = [fam_size_order]
ord_encoder = OrdinalEncoder(categories=ordinal_category_orders)
# Scale the encoded category numbers like the other numeric features
scaler_ord = StandardScaler()
ord_pipe = make_pipeline(impute_na_ord, ord_encoder, scaler_ord)

######### Nominal pipeline
# NOTE(review): missing nominal values are filled with the constant "male",
# which only makes sense while 'gender' is the sole nominal column.
impute_na = SimpleImputer(strategy='constant', fill_value="male")
# Dense one-hot output; categories unseen during fit are encoded as all zeros
ohe_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe_pipe = make_pipeline(impute_na, ohe_encoder)

######### Build (name, transformer, columns) tuples
num_tuple = ('numeric', num_pipe, num_cols)
ord_tuple = ('ordinal', ord_pipe, ordinal_cols)
ohe_tuple = ('categorical', ohe_pipe, ohe_cols)

######### Create ColumnTransformer
# verbose_feature_names_out=False keeps the output column names free of the
# transformer-name prefix
col_transformer = ColumnTransformer(
    [num_tuple, ord_tuple, ohe_tuple],
    remainder='passthrough',
    verbose_feature_names_out=False,
)
# Fit on the training data only, so nothing is learned from the test set
col_transformer.fit(X_train)
# Apply the fitted transformer to both splits
X_train_tf = col_transformer.transform(X_train)
X_test_tf = col_transformer.transform(X_test)
# View the processed training data
X_train_tf
father | mother | midparentHeight | children | childNum | familySize | gender_female | gender_male | |
---|---|---|---|---|---|---|---|---|
385 | 0.519228 | -0.922169 | -0.260641 | 0.676793 | 1.898582 | 0.813335 | 1.0 | 0.0 |
453 | 0.096910 | 1.030101 | 0.834581 | -0.784453 | -0.262833 | -0.698689 | 1.0 | 0.0 |
347 | 0.308069 | -0.054494 | 0.210361 | 1.042104 | 1.898582 | 0.813335 | 1.0 | 0.0 |
602 | -0.325409 | 0.379344 | 0.091192 | 0.676793 | -0.695116 | 0.813335 | 0.0 | 1.0 |
622 | -0.536568 | -0.054494 | -0.357112 | 1.407416 | 1.034016 | 0.813335 | 1.0 | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
767 | -0.958887 | -0.271412 | -0.794066 | 0.676793 | 0.169450 | 0.813335 | 0.0 | 1.0 |
72 | 1.448329 | 2.114696 | 2.508626 | 0.676793 | 0.601733 | 0.813335 | 1.0 | 0.0 |
908 | -1.592365 | -1.789845 | -2.292194 | -0.419141 | 0.169450 | -0.698689 | 0.0 | 1.0 |
235 | 0.730388 | -2.657521 | -1.344514 | -1.515075 | -0.695116 | -2.210713 | 0.0 | 1.0 |
37 | 1.997344 | -0.922169 | 0.732436 | 0.676793 | 0.601733 | 0.813335 | 1.0 | 0.0 |
699 rows × 8 columns
from sklearn.linear_model import LinearRegression
# Alternative estimator (not used here):
# from sklearn.linear_model import SGDRegressor
# Create the linear regression and fit it on the processed training data;
# fit() returns the estimator itself, so the result can be bound directly
model = LinearRegression().fit(X_train_tf, y_train)
model
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LinearRegression()
When the fit function is called, the model learns the relationship between the features and the target and finds the weights that minimize the loss.
# Names of the transformed features, as reported by the column transformer
feature_names = col_transformer.get_feature_names_out()
# Learned parameters, rounded to two decimals for display
b_i = model.coef_.round(2)
b_0 = model.intercept_.round(2)
# Report features, weights and bias, one per line
print(
    f"features: {feature_names}",
    f"coffiecients (weights): {b_i}",
    f"intercept (bias): {b_0}",
    sep="\n",
)
features: ['father' 'mother' 'midparentHeight' 'children' 'childNum' 'familySize' 'gender_female' 'gender_male'] coffiecients (weights): [ 0.63 0.33 0.45 0.58 -1.55 0.19 -1.73 1.73] intercept (bias): 66.62
# One "weight * feature" term per coefficient; every term after the first is
# indented by 24 spaces so the terms line up under the first one when printed
terms = [
    f"{'' if pos == 0 else ' ' * 24}{weight} * {name}"
    for pos, (weight, name) in enumerate(zip(b_i, feature_names))
]
eq = " +\n".join(terms)
print(f"Model equation: y_hat = {eq}")
Model equation: y_hat = 0.63 * father + 0.33 * mother + 0.45 * midparentHeight + 0.58 * children + -1.55 * childNum + 0.19 * familySize + -1.73 * gender_female + 1.73 * gender_male
# Feature values for one new child, in the same order as X_train's columns
new_child = [70, 64, 69.6, 3, 3, 'male', 'Mid']
# Build the one-row frame directly so pandas infers a proper dtype per column
# (filling an empty frame through .loc leaves every column as object), and use
# a dedicated name so the feature matrix X is not overwritten
new_child_df = pd.DataFrame([new_child], columns=X_train.columns)
# Apply the already-fitted preprocessing, then predict
new_child_tf = col_transformer.transform(new_child_df)
model.predict(new_child_tf)
array([68.24359064])
MAE, MSE, and RMSE scores all depend on the scale and units of the target. Being off by 5,000 USD on the sale of a house isn't too bad, but being off by 5,000 USD on the sale of a car would be horrible!
# Mean squared error via scikit-learn
from sklearn.metrics import mean_squared_error
# Predict on the processed test features
y_pred_test = model.predict(X_test_tf)
# Compare the predictions with the true test targets
test_MSE = mean_squared_error(y_true=y_test, y_pred=y_pred_test)
print(f'Model Testing MSE: {test_MSE:,.2f}')
Model Testing MSE: 3.46
. | . | . |
---|---|---|
![]() |
![]() |
![]() |
import pandas as pd
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn import set_config
set_config(transform_output='pandas')
# Load the medical dataset from the local CSV file
df = pd.read_csv('medical_data.csv')
# Summary statistics of Additional_charges (used as the target further down)
df['Additional_charges'].describe()
count 1000.000000 mean 13124.934863 std 6677.691402 min 3241.339760 25% 8121.383834 50% 11698.462430 75% 16493.908180 max 30087.650940 Name: Additional_charges, dtype: float64
# Remove the State column
droplist = ['State']
df = df.drop(columns=droplist)

# Normalise the inconsistent spellings found in the Gender column
df['Gender'] = (
    df['Gender']
    .replace(['male', 'm', 'M'], 'Male')
    .replace(['F', 'f'], 'Female')
)

# Features are everything except the target column
X = df.drop(columns='Additional_charges')
y = df['Additional_charges']

# Hold out a test set (fixed random_state so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# --- Categorical branch ---
# make_column_selector picks the object-dtype columns when the transformer is fitted
cat_selector = make_column_selector(dtype_include='object')
# Fill missing values with the most frequent category, then one-hot encode
freq_imputer = SimpleImputer(strategy='most_frequent')
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
cat_pipe = make_pipeline(freq_imputer, ohe)
# (name, transformer, columns) tuple for the ColumnTransformer
cat_tuple = ('categorical', cat_pipe, cat_selector)

# --- Numeric branch ---
# make_column_selector picks the numeric columns when the transformer is fitted
num_selector = make_column_selector(dtype_include='number')
# Fill missing values with the column mean, then standardise
scaler = StandardScaler()
mean_imputer = SimpleImputer(strategy='mean')
num_pipe = make_pipeline(mean_imputer, scaler)
# (name, transformer, columns) tuple for the ColumnTransformer
num_tuple = ('numeric', num_pipe, num_selector)

# Combine both branches into one preprocessing step
preprocessor = ColumnTransformer(
    [cat_tuple, num_tuple],
    verbose_feature_names_out=False,
)

# Chain the preprocessing and a linear regression into a single estimator
model = LinearRegression()
model_pipe = make_pipeline(preprocessor, model)
model_pipe
Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('categorical', Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='most_frequent')), ('onehotencoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))]), <sklearn.compose._column_transformer.make_column_selector object at 0x7fa3282ff610>), ('numeric', Pipeline(steps=[('simpleimputer', SimpleImputer()), ('standardscaler', StandardScaler())]), <sklearn.compose._column_transformer.make_column_selector object at 0x7fa3252ce8c0>)], verbose_feature_names_out=False)), ('linearregression', LinearRegression())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('categorical', Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='most_frequent')), ('onehotencoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))]), <sklearn.compose._column_transformer.make_column_selector object at 0x7fa3282ff610>), ('numeric', Pipeline(steps=[('simpleimputer', SimpleImputer()), ('standardscaler', StandardScaler())]), <sklearn.compose._column_transformer.make_column_selector object at 0x7fa3252ce8c0>)], verbose_feature_names_out=False)), ('linearregression', LinearRegression())])
ColumnTransformer(transformers=[('categorical', Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='most_frequent')), ('onehotencoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))]), <sklearn.compose._column_transformer.make_column_selector object at 0x7fa3282ff610>), ('numeric', Pipeline(steps=[('simpleimputer', SimpleImputer()), ('standardscaler', StandardScaler())]), <sklearn.compose._column_transformer.make_column_selector object at 0x7fa3252ce8c0>)], verbose_feature_names_out=False)
<sklearn.compose._column_transformer.make_column_selector object at 0x7fa3282ff610>
SimpleImputer(strategy='most_frequent')
OneHotEncoder(handle_unknown='ignore', sparse_output=False)
<sklearn.compose._column_transformer.make_column_selector object at 0x7fa3252ce8c0>
SimpleImputer()
StandardScaler()
LinearRegression()
# Fit the whole pipeline (preprocessing + linear regression) on the training data
model_pipe.fit(X_train, y_train)
Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('categorical', Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='most_frequent')), ('onehotencoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))]), <sklearn.compose._column_transformer.make_column_selector object at 0x7fa3282ff610>), ('numeric', Pipeline(steps=[('simpleimputer', SimpleImputer()), ('standardscaler', StandardScaler())]), <sklearn.compose._column_transformer.make_column_selector object at 0x7fa3252ce8c0>)], verbose_feature_names_out=False)), ('linearregression', LinearRegression())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('columntransformer', ColumnTransformer(transformers=[('categorical', Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='most_frequent')), ('onehotencoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))]), <sklearn.compose._column_transformer.make_column_selector object at 0x7fa3282ff610>), ('numeric', Pipeline(steps=[('simpleimputer', SimpleImputer()), ('standardscaler', StandardScaler())]), <sklearn.compose._column_transformer.make_column_selector object at 0x7fa3252ce8c0>)], verbose_feature_names_out=False)), ('linearregression', LinearRegression())])
ColumnTransformer(transformers=[('categorical', Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='most_frequent')), ('onehotencoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))]), <sklearn.compose._column_transformer.make_column_selector object at 0x7fa3282ff610>), ('numeric', Pipeline(steps=[('simpleimputer', SimpleImputer()), ('standardscaler', StandardScaler())]), <sklearn.compose._column_transformer.make_column_selector object at 0x7fa3252ce8c0>)], verbose_feature_names_out=False)
<sklearn.compose._column_transformer.make_column_selector object at 0x7fa3282ff610>
SimpleImputer(strategy='most_frequent')
OneHotEncoder(handle_unknown='ignore', sparse_output=False)
<sklearn.compose._column_transformer.make_column_selector object at 0x7fa3252ce8c0>
SimpleImputer()
StandardScaler()
LinearRegression()
def regression_metrics(y_true, y_pred, label='', verbose=True, output_dict=False):
    """Compute MAE, MSE, RMSE and R^2 for a set of regression predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        label: Text shown in the printed header and stored under ``'Label'``
            in the returned dict.
        verbose: When truthy, print a formatted report of the four metrics.
        output_dict: When truthy, return the metrics as a dict.

    Returns:
        A dict with keys ``'Label'``, ``'MAE'``, ``'MSE'``, ``'RMSE'`` and
        ``'R^2'`` when ``output_dict`` is truthy, otherwise ``None``.
    """
    # Get metrics
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    # Derive RMSE from MSE instead of passing squared=False, which is
    # deprecated in scikit-learn 1.4 and removed in 1.6. For a single target
    # the two are identical; for multi-output targets this is the root of the
    # averaged MSE.
    rmse = mse ** 0.5
    r_squared = r2_score(y_true, y_pred)

    if verbose:
        # Print result with label and header
        header = "-" * 60
        print(header, f"Regression Metrics: {label}", header, sep='\n')
        print(f"- MAE = {mae:,.3f}")
        print(f"- MSE = {mse:,.3f}")
        print(f"- RMSE = {rmse:,.3f}")
        print(f"- R^2 = {r_squared:,.3f}")

    if output_dict:
        # Previously the dict was built but never returned to the caller
        return {'Label': label, 'MAE': mae,
                'MSE': mse, 'RMSE': rmse, 'R^2': r_squared}
    return None
# Training split: predict with the fitted pipeline, then report the metrics
y_train_pred = model_pipe.predict(X_train)
results_train = regression_metrics(y_true=y_train, y_pred=y_train_pred,
                                   label='Training Data')
# Blank line between the two reports
print()
# Test split: same steps on the held-out data
y_test_pred = model_pipe.predict(X_test)
results_test = regression_metrics(y_true=y_test, y_pred=y_test_pred,
                                  label='Test Data')
------------------------------------------------------------ Regression Metrics: Training Data ------------------------------------------------------------ - MAE = 1,376.776 - MSE = 2,609,667.246 - RMSE = 1,615.446 - R^2 = 0.943 ------------------------------------------------------------ Regression Metrics: Test Data ------------------------------------------------------------ - MAE = 1,366.751 - MSE = 2,747,847.920 - RMSE = 1,657.663 - R^2 = 0.933
Explore Kaggle and select a dataset for regression modeling. Ensure the dataset adheres to the following criteria:
**Any attempt to use generative AI will be considered cheating!**