"""Clustering text documents using k-means.

This example uses the 20 newsgroups dataset to demonstrate clustering of
text documents with (MiniBatch)KMeans on tf-idf or hashed features,
optionally reduced in dimensionality with latent semantic analysis (LSA).
"""

# Author: Peter Prettenhofer <[email protected]>
#         Lars Buitinck
# License: BSD 3 clause

import logging
import sys
from optparse import OptionParser
from time import time

import numpy as np

from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import (
    HashingVectorizer,
    TfidfTransformer,
    TfidfVectorizer,
)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

# parse commandline arguments
op = OptionParser()
op.add_option(
    "--lsa",
    dest="n_components",
    type="int",
    help="Preprocess documents with latent semantic analysis.",
)
op.add_option(
    "--no-minibatch",
    action="store_false",
    dest="minibatch",
    default=True,
    help="Use ordinary k-means algorithm (in batch mode).",
)
op.add_option(
    "--no-idf",
    action="store_false",
    dest="use_idf",
    default=True,
    help="Disable Inverse Document Frequency feature weighting.",
)
op.add_option(
    "--use-hashing",
    action="store_true",
    default=False,
    help="Use a hashing feature vectorizer",
)
op.add_option(
    "--n-features",
    type="int",
    default=10000,
    help="Maximum number of features (dimensions) to extract from text.",
)
op.add_option(
    "--verbose",
    action="store_true",
    dest="verbose",
    default=False,
    help="Print progress reports inside k-means algorithm.",
)

print(__doc__)
op.print_help()
print()


def is_interactive():
    """Return True when there is no __main__ file (Jupyter/IPython console)."""
    return not hasattr(sys.modules["__main__"], "__file__")


# work-around for Jupyter notebook and IPython console
argv = [] if is_interactive() else sys.argv[1:]
(opts, args) = op.parse_args(argv)
if len(args) > 0:
    # OptionParser.error() prints the message and exits the process with
    # status 2, so no explicit sys.exit() is needed afterwards.
    op.error("this script takes no arguments.")


# #############################################################################
# Load some categories from the training set
categories = [
    "alt.atheism",
    "talk.religion.misc",
    "comp.graphics",
    "sci.space",
]
# Uncomment the following to do the analysis on all the categories
# categories = None

print("Loading 20 newsgroups dataset for categories:")
print(categories)

dataset = fetch_20newsgroups(
    subset="all", categories=categories, shuffle=True, random_state=42
)

print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))
print()

# Ground-truth labels are used only for evaluation metrics, and to pick
# the number of clusters (one per newsgroup category).
labels = dataset.target
true_k = np.unique(labels).shape[0]

print("Extracting features from the training dataset using a sparse vectorizer")
t0 = time()
if opts.use_hashing:
    if opts.use_idf:
        # Perform an IDF normalization on the output of HashingVectorizer
        hasher = HashingVectorizer(
            n_features=opts.n_features,
            stop_words="english",
            alternate_sign=False,
            norm=None,
        )
        vectorizer = make_pipeline(hasher, TfidfTransformer())
    else:
        vectorizer = HashingVectorizer(
            n_features=opts.n_features,
            stop_words="english",
            alternate_sign=False,
            norm="l2",
        )
else:
    vectorizer = TfidfVectorizer(
        max_df=0.5,
        max_features=opts.n_features,
        min_df=2,
        stop_words="english",
        use_idf=opts.use_idf,
    )
X = vectorizer.fit_transform(dataset.data)

print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)
print()

if opts.n_components:
    print("Performing dimensionality reduction using LSA")
    t0 = time()
    # Vectorizer results are normalized, which makes KMeans behave as
    # spherical k-means for better results. Since LSA/SVD results are
    # not normalized, we have to redo the normalization.
    svd = TruncatedSVD(opts.n_components)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)

    X = lsa.fit_transform(X)

    print("done in %fs" % (time() - t0))

    explained_variance = svd.explained_variance_ratio_.sum()
    print(
        "Explained variance of the SVD step: {}%".format(int(explained_variance * 100))
    )

    print()


# #############################################################################
# Do the actual clustering

if opts.minibatch:
    km = MiniBatchKMeans(
        n_clusters=true_k,
        init="k-means++",
        n_init=1,
        init_size=1000,
        batch_size=1000,
        verbose=opts.verbose,
    )
else:
    km = KMeans(
        n_clusters=true_k,
        init="k-means++",
        max_iter=100,
        n_init=1,
        verbose=opts.verbose,
    )

print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

# Supervised metrics compare cluster assignments against the true
# newsgroup labels; the silhouette score is label-free.
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels, km.labels_))
print(
    "Silhouette Coefficient: %0.3f"
    % metrics.silhouette_score(X, km.labels_, sample_size=1000)
)

print()


if not opts.use_hashing:
    # Only the non-hashing vectorizers can map feature indices back to terms.
    print("Top terms per cluster:")

    if opts.n_components:
        # Project cluster centers back to the original tf-idf term space
        # before ranking terms.
        original_space_centroids = svd.inverse_transform(km.cluster_centers_)
        order_centroids = original_space_centroids.argsort()[:, ::-1]
    else:
        order_centroids = km.cluster_centers_.argsort()[:, ::-1]

    terms = vectorizer.get_feature_names_out()
    for i in range(true_k):
        print("Cluster %d:" % i, end="")
        for ind in order_centroids[i, :10]:
            print(" %s" % terms[ind], end="")
        print()