"""Clustering text documents using k-means.

This example uses the 20 newsgroups dataset to demonstrate clustering of
text documents with (MiniBatch)KMeans on tf-idf or hashed features,
optionally reduced in dimensionality with latent semantic analysis (LSA).
"""

# Author: Peter Prettenhofer <[email protected]>
#         Lars Buitinck
# License: BSD 3 clause

import logging
import sys
from optparse import OptionParser
from time import time

import numpy as np

from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import (
    HashingVectorizer,
    TfidfTransformer,
    TfidfVectorizer,
)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

# parse commandline arguments
op = OptionParser()
op.add_option(
    "--lsa",
    dest="n_components",
    type="int",
    help="Preprocess documents with latent semantic analysis.",
)
op.add_option(
    "--no-minibatch",
    action="store_false",
    dest="minibatch",
    default=True,
    help="Use ordinary k-means algorithm (in batch mode).",
)
op.add_option(
    "--no-idf",
    action="store_false",
    dest="use_idf",
    default=True,
    help="Disable Inverse Document Frequency feature weighting.",
)
op.add_option(
    "--use-hashing",
    action="store_true",
    default=False,
    help="Use a hashing feature vectorizer",
)
op.add_option(
    "--n-features",
    type="int",
    default=10000,
    help="Maximum number of features (dimensions) to extract from text.",
)
op.add_option(
    "--verbose",
    action="store_true",
    dest="verbose",
    default=False,
    help="Print progress reports inside k-means algorithm.",
)

print(__doc__)
op.print_help()
print()


def is_interactive():
    """Return True when there is no __main__ file (Jupyter/IPython console)."""
    return not hasattr(sys.modules["__main__"], "__file__")


# work-around for Jupyter notebook and IPython console
argv = [] if is_interactive() else sys.argv[1:]
(opts, args) = op.parse_args(argv)
if len(args) > 0:
    # OptionParser.error() prints the message and exits the process with
    # status 2, so no explicit sys.exit() is needed afterwards.
    op.error("this script takes no arguments.")


# #############################################################################
# Load some categories from the training set
categories = [
    "alt.atheism",
    "talk.religion.misc",
    "comp.graphics",
    "sci.space",
]
# Uncomment the following to do the analysis on all the categories
# categories = None

print("Loading 20 newsgroups dataset for categories:")
print(categories)

dataset = fetch_20newsgroups(
    subset="all", categories=categories, shuffle=True, random_state=42
)

print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))
print()

# Ground-truth labels are used only for evaluation metrics, and to pick
# the number of clusters (one per newsgroup category).
labels = dataset.target
true_k = np.unique(labels).shape[0]

print("Extracting features from the training dataset using a sparse vectorizer")
t0 = time()
if opts.use_hashing:
    if opts.use_idf:
        # Perform an IDF normalization on the output of HashingVectorizer
        hasher = HashingVectorizer(
            n_features=opts.n_features,
            stop_words="english",
            alternate_sign=False,
            norm=None,
        )
        vectorizer = make_pipeline(hasher, TfidfTransformer())
    else:
        vectorizer = HashingVectorizer(
            n_features=opts.n_features,
            stop_words="english",
            alternate_sign=False,
            norm="l2",
        )
else:
    vectorizer = TfidfVectorizer(
        max_df=0.5,
        max_features=opts.n_features,
        min_df=2,
        stop_words="english",
        use_idf=opts.use_idf,
    )
X = vectorizer.fit_transform(dataset.data)

print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)
print()

if opts.n_components:
    print("Performing dimensionality reduction using LSA")
    t0 = time()
    # Vectorizer results are normalized, which makes KMeans behave as
    # spherical k-means for better results. Since LSA/SVD results are
    # not normalized, we have to redo the normalization.
    svd = TruncatedSVD(opts.n_components)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)

    X = lsa.fit_transform(X)

    print("done in %fs" % (time() - t0))

    explained_variance = svd.explained_variance_ratio_.sum()
    print(
        "Explained variance of the SVD step: {}%".format(int(explained_variance * 100))
    )

    print()


# #############################################################################
# Do the actual clustering

if opts.minibatch:
    km = MiniBatchKMeans(
        n_clusters=true_k,
        init="k-means++",
        n_init=1,
        init_size=1000,
        batch_size=1000,
        verbose=opts.verbose,
    )
else:
    km = KMeans(
        n_clusters=true_k,
        init="k-means++",
        max_iter=100,
        n_init=1,
        verbose=opts.verbose,
    )

print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

# Supervised metrics compare cluster assignments against the true
# newsgroup labels; the silhouette score is label-free.
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels, km.labels_))
print(
    "Silhouette Coefficient: %0.3f"
    % metrics.silhouette_score(X, km.labels_, sample_size=1000)
)

print()


if not opts.use_hashing:
    # Only the non-hashing vectorizers can map feature indices back to terms.
    print("Top terms per cluster:")

    if opts.n_components:
        # Project cluster centers back to the original tf-idf term space
        # before ranking terms.
        original_space_centroids = svd.inverse_transform(km.cluster_centers_)
        order_centroids = original_space_centroids.argsort()[:, ::-1]
    else:
        order_centroids = km.cluster_centers_.argsort()[:, ::-1]

    terms = vectorizer.get_feature_names_out()
    for i in range(true_k):
        print("Cluster %d:" % i, end="")
        for ind in order_centroids[i, :10]:
            print(" %s" % terms[ind], end="")
        print()