From 05b8cf8512c36eb432ed59937c2469da60467c1a Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Wed, 12 Jun 2024 20:33:44 -0700 Subject: [PATCH 1/5] try to add spark-connect test --- .github/workflows/test-spark-connect.yml | 33 ++++++++++++++++++++++++ test/test_spark.py | 28 ++++++++++---------- test/test_spark_connect.py | 30 +++++++++++++++++++++ 3 files changed, 76 insertions(+), 15 deletions(-) create mode 100644 .github/workflows/test-spark-connect.yml create mode 100644 test/test_spark_connect.py diff --git a/.github/workflows/test-spark-connect.yml b/.github/workflows/test-spark-connect.yml new file mode 100644 index 0000000..68ca5a6 --- /dev/null +++ b/.github/workflows/test-spark-connect.yml @@ -0,0 +1,33 @@ +name: Main +on: [push, pull_request] +jobs: + build: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + PYTHON_VERSION: ["3.11", "3.12"] + JOBLIB_VERSION: ["1.3.0", "1.4.2"] + PIN_MODE: [false, true] + PYSPARK_VERSION: ["3.5.1"] + name: Run test with spark connect ${{ matrix.PYSPARK_VERSION }}, pin_mode ${{ matrix.PIN_MODE }}, python ${{ matrix.PYTHON_VERSION }}, joblib ${{ matrix.JOBLIB_VERSION }} + steps: + - uses: actions/checkout@v3 + - name: Setup python ${{ matrix.PYTHON_VERSION }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.PYTHON_VERSION }} + architecture: x64 + - name: Install python packages + run: | + pip install joblib==${{ matrix.JOBLIB_VERSION }} scikit-learn>=0.23.1 pytest pylint + pip install 'numpy==1.25.1' 'pyarrow==12.0.1' 'pandas<=2.0.3' + # Add Python deps for Spark Connect. + pip install 'grpcio>=1.48,<1.57' 'grpcio-status>=1.48,<1.57' 'protobuf==3.20.3' 'googleapis-common-protos==1.56.4' + pip install "pyspark[connect]==${{ matrix.PYSPARK_VERSION }}" + - name: Run pylint + run: | + ./run-pylint.sh + - name: Run test suites + run: | + PYSPARK_PIN_THREAD=${{ matrix.PIN_MODE }} ./run-tests.sh diff --git a/test/test_spark.py b/test/test_spark.py index 9c78257..238a5cb 100644 --- a/test/test_spark.py +++ b/test/test_spark.py @@ -39,21 +39,7 @@ register_spark() -class TestSparkCluster(unittest.TestCase): - spark = None - - @classmethod - def setup_class(cls): - cls.spark = ( - SparkSession.builder.master("local-cluster[1, 2, 1024]") - .config("spark.task.cpus", "1") - .config("spark.task.maxFailures", "1") - .getOrCreate() - ) - - @classmethod - def teardown_class(cls): - cls.spark.stop() +class JoblibsparkTest: def test_simple(self): def inc(x): @@ -117,6 +103,18 @@ def test_fn(x): assert len(os.listdir(tmp_dir)) == 0 +class TestSparkCluster(JoblibsparkTest, unittest.TestCase): + def setUp(self): + self.spark = ( + SparkSession.builder.master("local-cluster[1, 2, 1024]") + .config("spark.task.cpus", "1") + .config("spark.task.maxFailures", "1") + .getOrCreate() + ) + + def tearDown(self): + self.spark.stop() + @unittest.skipIf(Version(pyspark.__version__).release < (3, 4, 0), "Resource group is only supported since spark 3.4.0") class TestGPUSparkCluster(unittest.TestCase): diff --git a/test/test_spark_connect.py b/test/test_spark_connect.py new file mode 100644 index 0000000..a20bb8c --- /dev/null +++ b/test/test_spark_connect.py @@ -0,0 +1,30 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import unittest + +from pyspark.sql import SparkSession + +from test.test_spark import JoblibsparkTest + +class TestsOnSparkConnect(JoblibsparkTest, unittest.TestCase): + + def setUp(self) -> None: + self.spark = SparkSession.builder.remote("sc://localhost").getOrCreate() + + def tearDown(self) -> None: + self.spark.stop() From 3a74ef5353761d6cffea37b8243e58f7b5358b07 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Wed, 12 Jun 2024 20:34:48 -0700 Subject: [PATCH 2/5] revert --- test/test_spark_connect.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_spark_connect.py b/test/test_spark_connect.py index a20bb8c..944090a 100644 --- a/test/test_spark_connect.py +++ b/test/test_spark_connect.py @@ -24,7 +24,7 @@ class TestsOnSparkConnect(JoblibsparkTest, unittest.TestCase): def setUp(self) -> None: - self.spark = SparkSession.builder.remote("sc://localhost").getOrCreate() + self.spark = SparkSession.builder.remote("local[2]").getOrCreate() def tearDown(self) -> None: self.spark.stop() From a4b8896e073f7042a2f421cef4ce1f2952f8a365 Mon Sep 17 00:00:00 2001 From: Lu Wang <38018689+lu-wang-dl@users.noreply.github.com> Date: Thu, 18 Jul 2024 13:50:38 -0700 Subject: [PATCH 3/5] Update test-spark-connect.yml --- .github/workflows/test-spark-connect.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-spark-connect.yml b/.github/workflows/test-spark-connect.yml index 68ca5a6..064ad01 100644 --- a/.github/workflows/test-spark-connect.yml +++ b/.github/workflows/test-spark-connect.yml @@ -21,7 +21,7 @@ jobs: - name: Install python packages run: | pip install joblib==${{ matrix.JOBLIB_VERSION }} scikit-learn>=0.23.1 pytest pylint - pip install 'numpy==1.25.1' 'pyarrow==12.0.1' 'pandas<=2.0.3' + pip install setuptools 'numpy==1.25.1' 'pyarrow==12.0.1' 'pandas<=2.0.3' # Add Python deps for Spark Connect. pip install 'grpcio>=1.48,<1.57' 'grpcio-status>=1.48,<1.57' 'protobuf==3.20.3' 'googleapis-common-protos==1.56.4' pip install "pyspark[connect]==${{ matrix.PYSPARK_VERSION }}" From a9d8ae082c92b1247ee9245d9dfde6ae66a88afa Mon Sep 17 00:00:00 2001 From: Lu Wang <38018689+lu-wang-dl@users.noreply.github.com> Date: Thu, 18 Jul 2024 13:54:01 -0700 Subject: [PATCH 4/5] Update test-spark-connect.yml --- .github/workflows/test-spark-connect.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-spark-connect.yml b/.github/workflows/test-spark-connect.yml index 064ad01..84d8182 100644 --- a/.github/workflows/test-spark-connect.yml +++ b/.github/workflows/test-spark-connect.yml @@ -20,8 +20,8 @@ jobs: architecture: x64 - name: Install python packages run: | - pip install joblib==${{ matrix.JOBLIB_VERSION }} scikit-learn>=0.23.1 pytest pylint - pip install setuptools 'numpy==1.25.1' 'pyarrow==12.0.1' 'pandas<=2.0.3' + pip install setuptools joblib==${{ matrix.JOBLIB_VERSION }} scikit-learn>=0.23.1 pytest pylint + pip install 'numpy==1.25.1' 'pyarrow==12.0.1' 'pandas<=2.0.3' # Add Python deps for Spark Connect. pip install 'grpcio>=1.48,<1.57' 'grpcio-status>=1.48,<1.57' 'protobuf==3.20.3' 'googleapis-common-protos==1.56.4' pip install "pyspark[connect]==${{ matrix.PYSPARK_VERSION }}" From 22cab485d1235399a37d86b3bde4b98f8edb37eb Mon Sep 17 00:00:00 2001 From: Lu Wang <38018689+lu-wang-dl@users.noreply.github.com> Date: Thu, 18 Jul 2024 13:59:10 -0700 Subject: [PATCH 5/5] Update test-spark-connect.yml --- .github/workflows/test-spark-connect.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-spark-connect.yml b/.github/workflows/test-spark-connect.yml index 84d8182..675ec1d 100644 --- a/.github/workflows/test-spark-connect.yml +++ b/.github/workflows/test-spark-connect.yml @@ -9,7 +9,7 @@ jobs: PYTHON_VERSION: ["3.11", "3.12"] JOBLIB_VERSION: ["1.3.0", "1.4.2"] PIN_MODE: [false, true] - PYSPARK_VERSION: ["3.5.1"] + PYSPARK_VERSION: ["4.0.0.dev1"] name: Run test with spark connect ${{ matrix.PYSPARK_VERSION }}, pin_mode ${{ matrix.PIN_MODE }}, python ${{ matrix.PYTHON_VERSION }}, joblib ${{ matrix.JOBLIB_VERSION }} steps: - uses: actions/checkout@v3 @@ -21,7 +21,7 @@ jobs: - name: Install python packages run: | pip install setuptools joblib==${{ matrix.JOBLIB_VERSION }} scikit-learn>=0.23.1 pytest pylint - pip install 'numpy==1.25.1' 'pyarrow==12.0.1' 'pandas<=2.0.3' + pip install 'numpy==1.26.4' 'pyarrow==12.0.1' 'pandas<=2.0.3' # Add Python deps for Spark Connect. pip install 'grpcio>=1.48,<1.57' 'grpcio-status>=1.48,<1.57' 'protobuf==3.20.3' 'googleapis-common-protos==1.56.4' pip install "pyspark[connect]==${{ matrix.PYSPARK_VERSION }}"