Overview¶
The feature-grouper
package provides functions and a scikit-learn
transformer class for applying a simple yet effective form of
dimensionality reduction based on hierarchical clustering of correlated
features.
Example usage:
import numpy as np
import pandas as pd
from sklearn import datasets
import feature_grouper
iris = datasets.load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
print(iris_df.head())
"""
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
"""
threshold = 0.5 # correlation coefficient threshold for clustering
fg = feature_grouper.FeatureGrouper(threshold)
iris_trans = fg.fit_transform(iris_df)
loadings = pd.DataFrame(fg.components_, columns=iris.feature_names)
print(loadings)
"""
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
0 0.471405 0.0 0.471405 0.471405
1 0.000000 1.0 0.000000 0.000000
"""
# column 0 is now a linear combination of correlated features
# sepal length, petal length, and petal width.
print(iris_trans.head())
"""
0 1
0 3.158410 3.5
1 3.064129 3.0
2 2.922708 3.2
3 2.969848 3.1
4 3.111270 3.6
"""