Compares FeatureHasher and DictVectorizer by using both to vectorize text documents.
The example demonstrates syntax and speed only; it doesn't actually do anything useful with the extracted vectors. See the example scripts {document_classification_20newsgroups,clustering}.py for actual learning on text documents.
A discrepancy between the number of terms reported for DictVectorizer and for FeatureHasher is to be expected due to hash collisions.
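As a minimal illustrative sketch (not part of the example script below), the effect is easy to see on a couple of hand-written frequency dicts: DictVectorizer allocates exactly one column per distinct token, while FeatureHasher maps tokens into a fixed number of columns, so with a deliberately tiny n_features distinct tokens may collide.

# Minimal sketch, not part of the script below; n_features=4 is chosen
# only to make collisions likely on a toy input.
from sklearn.feature_extraction import DictVectorizer, FeatureHasher

docs = [{"cat": 2, "dog": 1}, {"dog": 2, "fish": 1}]

dv = DictVectorizer()
X_dv = dv.fit_transform(docs)
print(len(dv.vocabulary_))      # 3 columns, one per distinct token

fh = FeatureHasher(n_features=4)  # tiny hash table
X_fh = fh.transform(docs)
print(X_fh.shape)               # (2, 4); colliding tokens share a column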
# Author: Lars Buitinck
# License: BSD 3 clause

from __future__ import print_function
from collections import defaultdict
import re
import sys
from time import time

import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction import DictVectorizer, FeatureHasher


def n_nonzero_columns(X):
    """Returns the number of non-zero columns in a CSR matrix X."""
    return len(np.unique(X.nonzero()[1]))


def tokens(doc):
    """Extract tokens from doc.

    This uses a simple regex to break strings into tokens. For a more
    principled approach, see CountVectorizer or TfidfVectorizer.
    """
    return (tok.lower() for tok in re.findall(r"\w+", doc))


def token_freqs(doc):
    """Extract a dict mapping tokens from doc to their frequencies."""
    freq = defaultdict(int)
    for tok in tokens(doc):
        freq[tok] += 1
    return freq


categories = [
    'alt.atheism',
    'comp.graphics',
    'comp.sys.ibm.pc.hardware',
    'misc.forsale',
    'rec.autos',
    'sci.space',
    'talk.religion.misc',
]
# Uncomment the following line to use a larger set (11k+ documents)
#categories = None

print(__doc__)
print("Usage: %s [n_features_for_hashing]" % sys.argv[0])
print("    The default number of features is 2**18.")
print()

try:
    n_features = int(sys.argv[1])
except IndexError:
    n_features = 2 ** 18
except ValueError:
    print("not a valid number of features: %r" % sys.argv[1])
    sys.exit(1)


print("Loading 20 newsgroups training data")
raw_data = fetch_20newsgroups(subset='train', categories=categories).data
data_size_mb = sum(len(s.encode('utf-8')) for s in raw_data) / 1e6
print("%d documents - %0.3fMB" % (len(raw_data), data_size_mb))
print()

print("DictVectorizer")
t0 = time()
vectorizer = DictVectorizer()
vectorizer.fit_transform(token_freqs(d) for d in raw_data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_size_mb / duration))
print("Found %d unique terms" % len(vectorizer.get_feature_names()))
print()

print("FeatureHasher on frequency dicts")
t0 = time()
hasher = FeatureHasher(n_features=n_features)
X = hasher.transform(token_freqs(d) for d in raw_data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_size_mb / duration))
print("Found %d unique terms" % n_nonzero_columns(X))
print()

print("FeatureHasher on raw tokens")
t0 = time()
hasher = FeatureHasher(n_features=n_features, input_type="string")
X = hasher.transform(tokens(d) for d in raw_data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_size_mb / duration))
print("Found %d unique terms" % n_nonzero_columns(X))
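Since hash collisions are the main source of the discrepancy in reported term counts, one rough way to gauge their impact is to rerun the hashing step with several table sizes and count how many columns are actually used. The following hedged sketch is not part of the original example; it assumes raw_data, token_freqs and n_nonzero_columns from the script above are already defined in the session.

# Follow-up sketch (assumes the script above has been run):
# fewer non-empty columns than distinct terms indicates collisions.
for bits in (14, 16, 18):
    hasher = FeatureHasher(n_features=2 ** bits)
    X = hasher.transform(token_freqs(d) for d in raw_data)
    print("2**%d features -> %d non-empty columns" % (bits, n_nonzero_columns(X)))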
Total running time of the script: (0 minutes 0.000 seconds)
Download Python source code: hashing_vs_dict_vectorizer.py
Download IPython notebook: hashing_vs_dict_vectorizer.ipynb