This example compares two dimensionality reduction strategies:
- univariate feature selection with ANOVA
- feature agglomeration with Ward hierarchical clustering
Both methods are compared on a regression problem, using BayesianRidge as the supervised estimator.
# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
# License: BSD 3 clause

print(__doc__)

import shutil
import tempfile

import numpy as np
import matplotlib.pyplot as plt
from scipy import linalg, ndimage

from sklearn.feature_extraction.image import grid_to_graph
from sklearn import feature_selection
from sklearn.cluster import FeatureAgglomeration
from sklearn.linear_model import BayesianRidge
from sklearn.pipeline import Pipeline
from sklearn.externals.joblib import Memory
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
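Note that `sklearn.externals.joblib` was removed in newer versions of scikit-learn (0.23 and later), so the caching import above will fail there. A minimal sketch of the equivalent setup, assuming the standalone joblib package is installed (whose Memory constructor takes a `location` argument instead of the older `cachedir`):

import tempfile
from joblib import Memory  # standalone joblib package, replaces sklearn.externals.joblib

cachedir = tempfile.mkdtemp()
mem = Memory(location=cachedir, verbose=1)  # same role as Memory(cachedir=cachedir, verbose=1)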
Generate data
n_samples = 200
size = 40  # image size
roi_size = 15
snr = 5.
np.random.seed(0)
mask = np.ones([size, size], dtype=np.bool)

coef = np.zeros((size, size))
coef[0:roi_size, 0:roi_size] = -1.
coef[-roi_size:, -roi_size:] = 1.

X = np.random.randn(n_samples, size ** 2)
for x in X:  # smooth data
    x[:] = ndimage.gaussian_filter(x.reshape(size, size), sigma=1.0).ravel()
X -= X.mean(axis=0)
X /= X.std(axis=0)

y = np.dot(X, coef.ravel())
noise = np.random.randn(y.shape[0])
noise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.)) / linalg.norm(noise, 2)
y += noise_coef * noise  # add noise
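As an aside (not part of the original example), the scaling of `noise_coef` above is chosen so that `20 * ln(||signal|| / ||added noise||)` equals `snr`. A minimal sketch to verify the data shapes and the resulting signal-to-noise ratio:

# Sanity checks on the simulated data (illustrative only)
assert X.shape == (n_samples, size ** 2)  # 200 samples x 1600 pixel features
assert y.shape == (n_samples,)

signal = np.dot(X, coef.ravel())    # noiseless target, y before noise was added
added_noise = noise_coef * noise
print(20 * np.log(linalg.norm(signal, 2)
                  / linalg.norm(added_noise, 2)))  # prints ~5.0, i.e. snr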
Compute the coefficients of a Bayesian Ridge with GridSearch
cv = KFold(2)  # cross-validation generator for model selection
ridge = BayesianRidge()
cachedir = tempfile.mkdtemp()
mem = Memory(cachedir=cachedir, verbose=1)

# Ward agglomeration followed by BayesianRidge
connectivity = grid_to_graph(n_x=size, n_y=size)
ward = FeatureAgglomeration(n_clusters=10, connectivity=connectivity,
                            memory=mem)
clf = Pipeline([('ward', ward), ('ridge', ridge)])
# Select the optimal number of parcels with grid search
clf = GridSearchCV(clf, {'ward__n_clusters': [10, 20, 30]}, n_jobs=1, cv=cv)
clf.fit(X, y)  # set the best parameters
coef_ = clf.best_estimator_.steps[-1][1].coef_
coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_)
coef_agglomeration_ = coef_.reshape(size, size)

# Anova univariate feature selection followed by BayesianRidge
f_regression = mem.cache(feature_selection.f_regression)  # caching function
anova = feature_selection.SelectPercentile(f_regression)
clf = Pipeline([('anova', anova), ('ridge', ridge)])
# Select the optimal percentage of features with grid search
clf = GridSearchCV(clf, {'anova__percentile': [5, 10, 20]}, cv=cv)
clf.fit(X, y)  # set the best parameters
coef_ = clf.best_estimator_.steps[-1][1].coef_
coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_.reshape(1, -1))
coef_selection_ = coef_.reshape(size, size)
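After fitting, GridSearchCV exposes the selected hyper-parameters through its `best_params_` and `best_score_` attributes. A small sketch for inspecting what the search picked (the printed values shown in the comments are illustrative, not taken from the original run):

# Inspect the winning configuration of the last grid search (the ANOVA one,
# since `clf` was rebound above); values depend on the simulated data.
print(clf.best_params_)  # e.g. {'anova__percentile': 20}
print(clf.best_score_)   # mean cross-validated score (R^2) of the best setting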
Out:
________________________________________________________________________________
[Memory] Calling sklearn.cluster.hierarchical.ward_tree...
ward_tree(array([[-0.451933, ..., -0.675318],
       ...,
       [ 0.275706, ..., -1.085711]]),
<1600x1600 sparse matrix of type '<type 'numpy.int64'>'
        with 7840 stored elements in COOrdinate format>, n_clusters=None)
________________________________________________________ward_tree - 0.2s, 0.0min
________________________________________________________________________________
[Memory] Calling sklearn.cluster.hierarchical.ward_tree...
ward_tree(array([[ 0.905206, ...,  0.161245],
       ...,
       [-0.849835, ..., -1.091621]]),
<1600x1600 sparse matrix of type '<type 'numpy.int64'>'
        with 7840 stored elements in COOrdinate format>, n_clusters=None)
________________________________________________________ward_tree - 0.2s, 0.0min
________________________________________________________________________________
[Memory] Calling sklearn.cluster.hierarchical.ward_tree...
ward_tree(array([[ 0.905206, ..., -0.675318],
       ...,
       [-0.849835, ..., -1.085711]]),
<1600x1600 sparse matrix of type '<type 'numpy.int64'>'
        with 7840 stored elements in COOrdinate format>, n_clusters=None)
________________________________________________________ward_tree - 0.2s, 0.0min
________________________________________________________________________________
[Memory] Calling sklearn.feature_selection.univariate_selection.f_regression...
f_regression(array([[-0.451933, ...,  0.275706],
       ...,
       [-0.675318, ..., -1.085711]]),
array([ 25.267703, ..., -25.026711]))
_____________________________________________________f_regression - 0.0s, 0.0min
________________________________________________________________________________
[Memory] Calling sklearn.feature_selection.univariate_selection.f_regression...
f_regression(array([[ 0.905206, ..., -0.849835],
       ...,
       [ 0.161245, ..., -1.091621]]),
array([ -27.447268, ..., -112.638768]))
_____________________________________________________f_regression - 0.0s, 0.0min
________________________________________________________________________________
[Memory] Calling sklearn.feature_selection.univariate_selection.f_regression...
f_regression(array([[ 0.905206, ..., -0.849835],
       ...,
       [-0.675318, ..., -1.085711]]),
array([ -27.447268, ..., -25.026711]))
_____________________________________________________f_regression - 0.0s, 0.0min
Invert the transformation to plot the results on an image
plt.close('all')
plt.figure(figsize=(7.3, 2.7))
plt.subplot(1, 3, 1)
plt.imshow(coef, interpolation="nearest", cmap=plt.cm.RdBu_r)
plt.title("True weights")
plt.subplot(1, 3, 2)
plt.imshow(coef_selection_, interpolation="nearest", cmap=plt.cm.RdBu_r)
plt.title("Feature Selection")
plt.subplot(1, 3, 3)
plt.imshow(coef_agglomeration_, interpolation="nearest", cmap=plt.cm.RdBu_r)
plt.title("Feature Agglomeration")
plt.subplots_adjust(0.04, 0.0, 0.98, 0.94, 0.16, 0.26)
plt.show()

# Attempt to remove the temporary cachedir, but don't worry if it fails
shutil.rmtree(cachedir, ignore_errors=True)
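The plot gives a qualitative comparison of the two recovered weight maps. If a numeric summary is wanted, one simple option (not part of the original example, and only a rough measure since ridge shrinkage rescales the coefficients) is the Frobenius-norm error against the true weights:

# Rough numeric comparison of the recovered weight maps (illustrative only)
err_selection = linalg.norm(coef - coef_selection_)        # Frobenius norm
err_agglomeration = linalg.norm(coef - coef_agglomeration_)
print("selection error:     %.3f" % err_selection)
print("agglomeration error: %.3f" % err_agglomeration)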
Total running time of the script: (0 minutes 1.336 seconds)
Download Python source code:
plot_feature_agglomeration_vs_univariate_selection.py
Download IPython notebook:
plot_feature_agglomeration_vs_univariate_selection.ipynb