from sklearn.pipeline import make_pipeline
# Categorical columns that are expanded into one-hot indicator columns.
cat_vars = [
    "gender",
    "ever_married",
    "Residence_type",
    "work_type",
    "smoking_status",
]

# Dense one-hot encoding; categories not seen during fit are ignored
# instead of raising at transform time.
one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# Columns forwarded to the output untouched.
binary_flag_cols = ["hypertension", "heart_disease"]

# Every column not listed above is imputed (SimpleImputer defaults) and
# then standardised.
impute_then_scale = make_pipeline(SimpleImputer(), StandardScaler())

ct = make_column_transformer(
    (one_hot, cat_vars),
    ("passthrough", binary_flag_cols),
    remainder=impute_then_scale,
    verbose_feature_names_out=False,
)

# Fit on the training split only; the validation and test splits are
# transformed with the statistics/categories learned from training data.
X_train_ct = ct.fit_transform(X_train)
X_val_ct = ct.transform(X_val)
X_test_ct = ct.transform(X_test)
# Sanity check: print the shape and the total number of missing values of
# each transformed split.
# NOTE(review): `.isna()` requires DataFrame-like outputs -- presumably
# pandas output is enabled for the transformer elsewhere; confirm.
for name, X in (
    ("train", X_train_ct),
    ("val", X_val_ct),
    ("test", X_test_ct),
):
    # First sum gives per-column NA counts, second sum collapses to a total.
    num_na = X.isna().sum().sum()
    print(f"The {name} set has shape {X.shape} & with {num_na} NAs.")
The train set has shape (3066, 20) & with 0 NAs.
The val set has shape (1022, 20) & with 0 NAs.
The test set has shape (1022, 20) & with 0 NAs.