This is an example showing how scikit-learn can be used to classify documents by topic using a bag-of-words approach. This example uses a scipy.sparse matrix to store the features instead of standard numpy arrays.
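To make the approach concrete before setting up the dataset, here is a minimal standalone sketch (not part of the example script; the three toy documents are made up for illustration) of how TfidfVectorizer turns raw text into a scipy.sparse feature matrix:

import scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus, for illustration only
toy_docs = [
    "the quick brown fox",
    "the lazy dog",
    "quick quick dog",
]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(toy_docs)  # one row per document, one column per term

assert sp.issparse(X)  # stored as a scipy.sparse matrix, not a dense numpy array
print("n_samples: %d, n_features: %d" % X.shape)
print(sorted(vectorizer.vocabulary_))   # the learned bag-of-words vocabulary

The example script below does exactly this, only on the 20 newsgroups documents read from disk.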
The dataset used in this example is the 20 newsgroups dataset and should be downloaded from http://mlcomp.org (free registration required):
http://mlcomp.org/datasets/379
Once downloaded, unzip the archive somewhere on your filesystem. For instance in:
% mkdir -p ~/data/mlcomp
% cd ~/data/mlcomp
% unzip /path/to/dataset-379-20news-18828_XXXXX.zip
You should get a folder ~/data/mlcomp/379 with a file named metadata and subfolders raw, train and test holding the text documents organized by newsgroups.
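If you want a quick sanity check at this point (a sketch only, assuming the archive was unpacked to ~/data/mlcomp/379 as shown above), you can confirm that layout from Python:

import os

# Path assumed from the unzip step above; adjust if you unpacked elsewhere
root = os.path.expanduser("~/data/mlcomp/379")
print(sorted(os.listdir(root)))
# Expected entries, per the description above: ['metadata', 'raw', 'test', 'train']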
Then set the MLCOMP_DATASETS_HOME environment variable pointing to the root folder holding the uncompressed archive:
% export MLCOMP_DATASETS_HOME="~/data/mlcomp"
Then you are ready to run this example using your favorite Python shell:
% ipython examples/mlcomp_sparse_document_classification.py
# Author: Olivier Grisel <olivier.grisel@ensta.org>
# License: BSD 3 clause

from __future__ import print_function

from time import time
import sys
import os
import numpy as np
import scipy.sparse as sp
import matplotlib.pyplot as plt

from sklearn.datasets import load_mlcomp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB


print(__doc__)

if 'MLCOMP_DATASETS_HOME' not in os.environ:
    print("MLCOMP_DATASETS_HOME not set; please follow the above instructions")
    sys.exit(0)

# Load the training set
print("Loading 20 newsgroups training set... ")
news_train = load_mlcomp('20news-18828', 'train')
print(news_train.DESCR)
print("%d documents" % len(news_train.filenames))
print("%d categories" % len(news_train.target_names))

print("Extracting features from the dataset using a sparse vectorizer")
t0 = time()
vectorizer = TfidfVectorizer(encoding='latin1')
X_train = vectorizer.fit_transform((open(f).read()
                                    for f in news_train.filenames))
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_train.shape)
assert sp.issparse(X_train)
y_train = news_train.target

# Load the test set (the timer is started before the load so that the
# reported duration actually covers it)
print("Loading 20 newsgroups test set... ")
t0 = time()
news_test = load_mlcomp('20news-18828', 'test')
print("done in %fs" % (time() - t0))

print("Predicting the labels of the test set...")
print("%d documents" % len(news_test.filenames))
print("%d categories" % len(news_test.target_names))

print("Extracting features from the dataset using the same vectorizer")
t0 = time()
X_test = vectorizer.transform((open(f).read() for f in news_test.filenames))
y_test = news_test.target
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X_test.shape)
Benchmark classifiers
def benchmark(clf_class, params, name):
    print("parameters:", params)
    t0 = time()
    clf = clf_class(**params).fit(X_train, y_train)
    print("done in %fs" % (time() - t0))

    if hasattr(clf, 'coef_'):
        print("Percentage of non zeros coef: %f"
              % (np.mean(clf.coef_ != 0) * 100))

    print("Predicting the outcomes of the testing set")
    t0 = time()
    pred = clf.predict(X_test)
    print("done in %fs" % (time() - t0))

    print("Classification report on test set for classifier:")
    print(clf)
    print()
    print(classification_report(y_test, pred,
                                target_names=news_test.target_names))

    cm = confusion_matrix(y_test, pred)
    print("Confusion matrix:")
    print(cm)

    # Show confusion matrix
    plt.matshow(cm)
    plt.title('Confusion matrix of the %s classifier' % name)
    plt.colorbar()


print("Testbenching a linear classifier...")
parameters = {
    'loss': 'hinge',
    'penalty': 'l2',
    'n_iter': 50,
    'alpha': 0.00001,
    'fit_intercept': True,
}

benchmark(SGDClassifier, parameters, 'SGD')

print("Testbenching a MultinomialNB classifier...")
parameters = {'alpha': 0.01}

benchmark(MultinomialNB, parameters, 'MultinomialNB')

plt.show()