</div>
</div>
## ANOVASelector

`ANOVASelector` operates on categorical labels with continuous features. It uses the
[one-way ANOVA F-test](https://en.wikipedia.org/wiki/F-test#Multiple-comparison_ANOVA_problems) to decide which
features to choose.
It supports five selection methods: `numTopFeatures`, `percentile`, `fpr`, `fdr`, `fwe`:
* `numTopFeatures` chooses a fixed number of top features according to the ANOVA F-test.
* `percentile` is similar to `numTopFeatures` but chooses a fraction of all features instead of a fixed number.
* `fpr` chooses all features whose p-values are below a threshold, thus controlling the false positive rate of selection.
* `fdr` uses the [Benjamini-Hochberg procedure](https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure) to choose all features whose false discovery rate is below a threshold.
* `fwe` chooses all features whose p-values are below a threshold. The threshold is scaled by 1/numFeatures, thus controlling the family-wise error rate of selection.

By default, the selection method is `numTopFeatures`, with the default number of top features set to 50.
The user can choose a selection method using `setSelectorType`.

**Examples**

Assume that we have a DataFrame with the columns `id`, `features`, and `label`, which is used as
our target to be predicted:

~~~
id | features                       | label
---|--------------------------------|---------
 1 | [1.7, 4.4, 7.6, 5.8, 9.6, 2.3] | 3.0
 2 | [8.8, 7.3, 5.7, 7.3, 2.2, 4.1] | 2.0
 3 | [1.2, 9.5, 2.5, 3.1, 8.7, 2.5] | 3.0
 4 | [3.7, 9.2, 6.1, 4.1, 7.5, 3.8] | 2.0
 5 | [8.9, 5.2, 7.8, 8.3, 5.2, 3.0] | 4.0
 6 | [7.9, 8.5, 9.2, 4.0, 9.4, 2.1] | 4.0
~~~
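
To make the criterion concrete, the F-value of each column can be computed directly from this table. A minimal pure-Python sketch of the one-way ANOVA F-statistic — an illustration of what `ANOVASelector` ranks features by, not the Spark implementation:

```python
# One-way ANOVA F-statistic per feature, on the toy data above:
# 3 label groups (2.0, 3.0, 4.0), 6 features, n = 6 rows.

features = [
    [1.7, 4.4, 7.6, 5.8, 9.6, 2.3],
    [8.8, 7.3, 5.7, 7.3, 2.2, 4.1],
    [1.2, 9.5, 2.5, 3.1, 8.7, 2.5],
    [3.7, 9.2, 6.1, 4.1, 7.5, 3.8],
    [8.9, 5.2, 7.8, 8.3, 5.2, 3.0],
    [7.9, 8.5, 9.2, 4.0, 9.4, 2.1],
]
labels = [3.0, 2.0, 3.0, 2.0, 4.0, 4.0]

def anova_f(values, labels):
    """One-way ANOVA F-statistic for a single feature column."""
    groups = {}
    for v, y in zip(values, labels):
        groups.setdefault(y, []).append(v)
    n, k = len(values), len(groups)
    grand_mean = sum(values) / n
    # Between-group and within-group sums of squares.
    ssb = sum(len(g) * (sum(g) / len(g) - grand_mean) ** 2 for g in groups.values())
    ssw = sum((v - sum(g) / len(g)) ** 2 for g in groups.values() for v in g)
    return (ssb / (k - 1)) / (ssw / (n - k))

f_values = [anova_f([row[j] for row in features], labels) for j in range(6)]
best = max(range(6), key=lambda j: f_values[j])
print(best)  # 5 -> the last column has the highest F-value (~9.33)
```

The last column scores far higher than the rest because its values cluster tightly within each label group while the group means differ.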

If we use `ANOVASelector` with `numTopFeatures = 1`, the
last column in our `features` is chosen as the most useful feature.

## FValueSelector

`FValueSelector` operates on continuous labels with continuous features. It uses the
[F-test for regression](https://en.wikipedia.org/wiki/F-test#Regression_problems) to decide which
features to choose.
It supports five selection methods: `numTopFeatures`, `percentile`, `fpr`, `fdr`, `fwe`:
* `numTopFeatures` chooses a fixed number of top features according to the F-test for regression.
* `percentile` is similar to `numTopFeatures` but chooses a fraction of all features instead of a fixed number.
* `fpr` chooses all features whose p-values are below a threshold, thus controlling the false positive rate of selection.
* `fdr` uses the [Benjamini-Hochberg procedure](https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure) to choose all features whose false discovery rate is below a threshold.
* `fwe` chooses all features whose p-values are below a threshold. The threshold is scaled by 1/numFeatures, thus controlling the family-wise error rate of selection.

By default, the selection method is `numTopFeatures`, with the default number of top features set to 50.
The user can choose a selection method using `setSelectorType`.

**Examples**

Assume that we have a DataFrame with the columns `id`, `features`, and `label`, which is used as
our target to be predicted:

~~~
id | features                       | label
---|--------------------------------|---------
 1 | [6.0, 7.0, 0.0, 7.0, 6.0, 0.0] | 4.6
 2 | [0.0, 9.0, 6.0, 0.0, 5.0, 9.0] | 6.6
 3 | [0.0, 9.0, 3.0, 0.0, 5.0, 5.0] | 5.1
 4 | [0.0, 9.0, 8.0, 5.0, 6.0, 4.0] | 7.6
 5 | [8.0, 9.0, 6.0, 5.0, 4.0, 4.0] | 9.0
 6 | [8.0, 9.0, 6.0, 4.0, 0.0, 0.0] | 9.0
~~~
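
As with ANOVA, the regression F-value of each column can be computed by hand from this table. A minimal pure-Python sketch of the univariate F-test for regression — an illustration of what `FValueSelector` ranks features by, not the Spark implementation:

```python
# Univariate F-test for regression per feature, on the toy data above.
# For one feature x and label y, with Pearson correlation r and n rows:
#   F = r^2 * (n - 2) / (1 - r^2)

features = [
    [6.0, 7.0, 0.0, 7.0, 6.0, 0.0],
    [0.0, 9.0, 6.0, 0.0, 5.0, 9.0],
    [0.0, 9.0, 3.0, 0.0, 5.0, 5.0],
    [0.0, 9.0, 8.0, 5.0, 6.0, 4.0],
    [8.0, 9.0, 6.0, 5.0, 4.0, 4.0],
    [8.0, 9.0, 6.0, 4.0, 0.0, 0.0],
]
labels = [4.6, 6.6, 5.1, 7.6, 9.0, 9.0]

def regression_f(x, y):
    """F-statistic of the simple linear regression of y on x."""
    n = len(x)
    mx, my = sum(x) / n, sum(y) / n
    sxy = sum((a - mx) * (b - my) for a, b in zip(x, y))
    sxx = sum((a - mx) ** 2 for a in x)
    syy = sum((b - my) ** 2 for b in y)
    r2 = sxy * sxy / (sxx * syy)  # squared Pearson correlation
    return r2 * (n - 2) / (1 - r2)

f_values = [regression_f([row[j] for row in features], labels) for j in range(6)]
best = max(range(6), key=lambda j: f_values[j])
print(best)  # 2 -> the 3rd column has the highest F-value (~6.37)
```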

If we use `FValueSelector` with `numTopFeatures = 1`, the
3rd column in our `features` is chosen as the most useful feature.