|
11 | 11 | - initial_centroids , initial centroid values generated by utility function (mentioned
|
12 | 12 | in usage).
|
13 | 13 | - maxiter , maximum number of iterations to process.
|
14 |
| - - heterogeneity , empty list that will be filled with hetrogeneity values if passed |
| 14 | + - heterogeneity , empty list that will be filled with heterogeneity values if passed |
15 | 15 | to kmeans func.
|
16 | 16 | Usage:
|
17 |
| - 1. define 'k' value, 'X' features array and 'hetrogeneity' empty list |
| 17 | + 1. define 'k' value, 'X' features array and 'heterogeneity' empty list |
18 | 18 | 2. create initial_centroids,
|
19 | 19 | initial_centroids = get_initial_centroids(
|
20 | 20 | X,
|
|
31 | 31 | record_heterogeneity=heterogeneity,
|
32 | 32 | verbose=True # whether to print logs in console or not (default=False)
|
33 | 33 | )
|
34 |
| - 4. Plot the loss function, hetrogeneity values for every iteration saved in |
35 |
| - hetrogeneity list. |
| 34 | + 4. Plot the loss function and heterogeneity values for every iteration saved in |
| 35 | + heterogeneity list. |
36 | 36 | plot_heterogeneity(
|
37 | 37 | heterogeneity,
|
38 | 38 | k
|
@@ -198,13 +198,10 @@ def report_generator(
|
198 | 198 | df: pd.DataFrame, clustering_variables: np.ndarray, fill_missing_report=None
|
199 | 199 | ) -> pd.DataFrame:
|
200 | 200 | """
|
201 |
| - Function generates easy-erading clustering report. It takes 2 arguments as an input: |
202 |
| - DataFrame - dataframe with predicted cluester column; |
203 |
| - FillMissingReport - dictionary of rules how we are going to fill missing |
204 |
| - values of for final report generate (not included in modeling); |
205 |
| - in order to run the function following libraries must be imported: |
206 |
| - import pandas as pd |
207 |
| - import numpy as np |
| 201 | + Generates a clustering report. This function takes 2 arguments as input: |
| 202 | + df - dataframe with predicted cluster column |
| 203 | + fill_missing_report - dictionary of rules on how we are going to fill in missing |
| 204 | + values for the final generated report (not included in modelling); |
208 | 205 | >>> data = pd.DataFrame()
|
209 | 206 | >>> data['numbers'] = [1, 2, 3]
|
210 | 207 | >>> data['col1'] = [0.5, 2.5, 4.5]
|
@@ -306,10 +303,10 @@ def report_generator(
|
306 | 303 | a.columns = report.columns # rename columns to match report
|
307 | 304 | report = report.drop(
|
308 | 305 | report[report.Type == "count"].index
|
309 |
| - ) # drop count values except cluster size |
| 306 | + ) # drop count values except for cluster size |
310 | 307 | report = pd.concat(
|
311 | 308 | [report, a, clustersize, clusterproportion], axis=0
|
312 |
| - ) # concat report with clustert size and nan values |
| 309 | + ) # concat report with cluster size and nan values |
313 | 310 | report["Mark"] = report["Features"].isin(clustering_variables)
|
314 | 311 | cols = report.columns.tolist()
|
315 | 312 | cols = cols[0:2] + cols[-1:] + cols[2:-1]
|
|
0 commit comments