[SPARK-42452][BUILD] Remove hadoop-2 profile from Apache Spark 3.5.0

### What changes were proposed in this pull request?
This PR aims to remove the `hadoop-2` profile from Apache Spark 3.5.0.

### Why are the changes needed?
Spark 3.4.0 no longer releases a Hadoop 2 binary distribution (SPARK-42447), and the Hadoop 2 GitHub Actions job was already removed after SPARK-42447, so we can remove the `hadoop-2` profile from Apache Spark 3.5.0.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Pass GitHub Actions.

Closes apache#40788 from LuciferYang/SPARK-42452.

Authored-by: yangjie01 <[email protected]>
Signed-off-by: Chao Sun <[email protected]>
k8smeetup · Apr 18, 2023 · 816ebac · 816ebac
1 parent 5cb1c63
commit 816ebac
Show file tree

Hide file tree

Showing 14 changed files with 3 additions and 408 deletions.
diff --git a/assembly/README b/assembly/README
@@ -9,4 +9,4 @@ This module is off by default. To activate it specify the profile in the command
 
 If you need to build an assembly for a different version of Hadoop the
 hadoop-version system property needs to be set as in this example:
-  -Dhadoop.version=2.7.4
+  -Dhadoop.version=3.3.5
diff --git a/dev/deps/spark-deps-hadoop-2-hive-2.3 b/dev/deps/spark-deps-hadoop-2-hive-2.3
diff --git a/dev/run-tests-jenkins.py b/dev/run-tests-jenkins.py
@@ -178,9 +178,6 @@ def main():
     # Switch to a Maven-based build if the PR title contains "test-maven":
     if "test-maven" in ghprb_pull_title:
         os.environ["SPARK_JENKINS_BUILD_TOOL"] = "maven"
-    # Switch the Hadoop profile based on the PR title:
-    if "test-hadoop2" in ghprb_pull_title:
-        os.environ["SPARK_JENKINS_BUILD_PROFILE"] = "hadoop2"
     if "test-hadoop3" in ghprb_pull_title:
         os.environ["SPARK_JENKINS_BUILD_PROFILE"] = "hadoop3"
     # Switch the Scala profile based on the PR title:

diff --git a/dev/run-tests.py b/dev/run-tests.py
@@ -217,7 +217,6 @@ def get_hadoop_profiles(hadoop_version):
     """
 
     sbt_maven_hadoop_profiles = {
-        "hadoop2": ["-Phadoop-2"],
         "hadoop3": ["-Phadoop-3"],
     }
 

diff --git a/dev/test-dependencies.sh b/dev/test-dependencies.sh
@@ -34,7 +34,6 @@ HADOOP_MODULE_PROFILES="-Phive-thriftserver -Pmesos -Pkubernetes -Pyarn -Phive \
     -Pspark-ganglia-lgpl -Pkinesis-asl -Phadoop-cloud"
 MVN="build/mvn"
 HADOOP_HIVE_PROFILES=(
-    hadoop-2-hive-2.3
     hadoop-3-hive-2.3
 )
 
@@ -85,8 +84,6 @@ $MVN -q versions:set -DnewVersion=$TEMP_VERSION -DgenerateBackupPoms=false > /de
 for HADOOP_HIVE_PROFILE in "${HADOOP_HIVE_PROFILES[@]}"; do
   if [[ $HADOOP_HIVE_PROFILE == **hadoop-3-hive-2.3** ]]; then
     HADOOP_PROFILE=hadoop-3
-  else
-    HADOOP_PROFILE=hadoop-2
   fi
   echo "Performing Maven install for $HADOOP_HIVE_PROFILE"
   $MVN $HADOOP_MODULE_PROFILES -P$HADOOP_PROFILE jar:jar jar:test-jar install:install clean -q

diff --git a/docs/building-spark.md b/docs/building-spark.md
@@ -79,10 +79,6 @@ Example:
 
     ./build/mvn -Pyarn -Dhadoop.version=3.3.0 -DskipTests clean package
 
-If you want to build with Hadoop 2.x, enable `hadoop-2` profile:
-
-    ./build/mvn -Phadoop-2 -Pyarn -Dhadoop.version=2.8.5 -DskipTests clean package
-
 ## Building With Hive and JDBC Support
 
 To enable Hive integration for Spark SQL along with its JDBC server and CLI,

diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml
@@ -208,13 +208,6 @@
   </dependencies>
 
   <profiles>
-    <!--
-     hadoop-3 profile is activated by default so hadoop-2 profile
-     also needs to be declared here for building with -Phadoop-2.
-    -->
-    <profile>
-      <id>hadoop-2</id>
-    </profile>
     <!--
      Hadoop 3 simplifies the classpath, and adds a new committer base class which
      enables store-specific committers.

diff --git a/pom.xml b/pom.xml
@@ -3504,25 +3504,6 @@
     http://hadoop.apache.org/docs/ra.b.c/hadoop-project-dist/hadoop-common/dependency-analysis.html
     -->
 
-    <profile>
-      <id>hadoop-2</id>
-      <properties>
-        <!-- make sure to update IsolatedClientLoader whenever this version is changed -->
-        <hadoop.version>2.7.4</hadoop.version>
-        <curator.version>2.7.1</curator.version>
-        <commons-io.version>2.4</commons-io.version>
-        <!--
-          the declaration site above of these variables explains why we need to re-assign them here
-        -->
-        <hadoop-client-api.artifact>hadoop-client</hadoop-client-api.artifact>
-        <hadoop-client-runtime.artifact>hadoop-yarn-api</hadoop-client-runtime.artifact>
-        <hadoop-client-minicluster.artifact>hadoop-client</hadoop-client-minicluster.artifact>
-        <gcs-connector.version>hadoop2-2.2.11</gcs-connector.version>
-        <!-- SPARK-36547: Please don't upgrade the version below, otherwise there will be an error on building Hadoop 2.7 package -->
-        <scala-maven-plugin.version>4.3.0</scala-maven-plugin.version>
-      </properties>
-    </profile>
-
     <profile>
       <id>hadoop-3</id>
       <!-- Default hadoop profile. Uses global properties. -->

diff --git a/python/pyspark/install.py b/python/pyspark/install.py
@@ -26,7 +26,7 @@
 
 DEFAULT_HADOOP = "hadoop3"
 DEFAULT_HIVE = "hive2.3"
-SUPPORTED_HADOOP_VERSIONS = ["hadoop2", "hadoop3", "without-hadoop"]
+SUPPORTED_HADOOP_VERSIONS = ["hadoop3", "without-hadoop"]
 SUPPORTED_HIVE_VERSIONS = ["hive2.3"]
 UNSUPPORTED_COMBINATIONS = []  # type: ignore
-Original file line number
+Diff line change
@@ Expand Up / @@ -217,7 +217,6 @@ def get_hadoop_profiles(hadoop_version): @@
         """
         sbt_maven_hadoop_profiles = {
-            "hadoop2": ["-Phadoop-2"],
             "hadoop3": ["-Phadoop-3"],
         }
@@ Expand Down @@