diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 754a7cd95676..81777db1706b 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -225,8 +225,8 @@ jobs:
   # Build: build Spark and run the tests for specified modules.
   build:
     name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }}"
-    needs: precondition
-    if: fromJson(needs.precondition.outputs.required).build == 'true'
+    needs: [precondition, precompile]
+    if: (!cancelled()) && fromJson(needs.precondition.outputs.required).build == 'true'
     runs-on: ubuntu-latest
     timeout-minutes: 120
     strategy:
@@ -333,13 +333,14 @@ jobs:
         key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
         restore-keys: |
           build-
-    - name: Cache Coursier local repository
-      uses: actions/cache@v4
+    - name: Restore Coursier local repository
+      uses: actions/cache/restore@v4
       with:
         path: ~/.cache/coursier
-        key: ${{ matrix.java }}-${{ matrix.hadoop }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
+        key: ${{ runner.os }}-${{ matrix.java }}-${{ matrix.hadoop }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
         restore-keys: |
-          ${{ matrix.java }}-${{ matrix.hadoop }}-coursier-
+          ${{ runner.os }}-${{ matrix.java }}-${{ matrix.hadoop }}-coursier-
+          ${{ runner.os }}-coursier-
     - name: Free up disk space
       run: |
         if [ -f ./dev/free_disk_space ]; then
@@ -364,6 +365,21 @@ jobs:
       run: |
         python3.11 -m pip install 'numpy>=1.22' pyarrow 'pandas==2.3.3' pyyaml scipy unittest-xml-reporting 'lxml==4.9.4' 'grpcio==1.76.0' 'grpcio-status==1.76.0' 'protobuf==6.33.0' 'zstandard==0.25.0'
         python3.11 -m pip list
+    - name: Download precompiled artifact
+      id: download-precompiled
+      if: needs.precompile.result == 'success'
+      continue-on-error: true
+      uses: actions/download-artifact@v4
+      with:
+        name: spark-compile-${{ inputs.branch }}-${{ github.run_id }}
+    - name: Extract precompiled artifact
+      id: extract-precompiled
+      if: steps.download-precompiled.outcome == 'success'
+      continue-on-error: true
+      shell: bash  # bash -eo pipefail: a failed zstd must fail the step, not be masked by tar
+      run: |
+        zstd -dc compile-artifact.tar.zst | tar -xf -
+        rm compile-artifact.tar.zst
     # Run the tests.
     - name: Run tests
       env: ${{ fromJSON(inputs.envs) }}
@@ -374,9 +390,13 @@ jobs:
         # Hive "other tests" test needs larger metaspace size based on experiment.
         if [[ "$MODULES_TO_TEST" == "hive" ]] && [[ "$EXCLUDED_TAGS" == "org.apache.spark.tags.SlowHiveTest" ]]; then export METASPACE_SIZE=2g; fi
         # SPARK-46283: should delete the following env replacement after SPARK 3.x EOL
-        if [[ "$MODULES_TO_TEST" == *"streaming-kinesis-asl"* ]] && [[ "${{ inputs.branch }}" =~ ^branch-3 ]]; then 
+        if [[ "$MODULES_TO_TEST" == *"streaming-kinesis-asl"* ]] && [[ "${{ inputs.branch }}" =~ ^branch-3 ]]; then
           MODULES_TO_TEST=${MODULES_TO_TEST//streaming-kinesis-asl, /}
         fi
+        if [ "${{ steps.extract-precompiled.outcome }}" = "success" ]; then
+          export SKIP_SCALA_BUILD=true
+          echo "Reusing precompiled artifact, skipping local SBT build."
+        fi
         export SERIAL_SBT_TESTS=1
         ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
     - name: Upload test results to report
@@ -493,8 +513,86 @@ jobs:
           cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-pyspark-${{ env.PYSPARK_IMAGE_TO_TEST }}-cache:${{ inputs.branch }}
 
 
+  precompile:
+    needs: precondition
+    if: >-
+      (!cancelled()) && (
+      fromJson(needs.precondition.outputs.required).build == 'true' ||
+      fromJson(needs.precondition.outputs.required).pyspark == 'true' ||
+      fromJson(needs.precondition.outputs.required).pyspark-pandas == 'true' ||
+      fromJson(needs.precondition.outputs.required).pyspark-install == 'true' ||
+      fromJson(needs.precondition.outputs.required).sparkr == 'true' ||
+      fromJson(needs.precondition.outputs.required).docker-integration-tests == 'true' ||
+      fromJson(needs.precondition.outputs.required).k8s-integration-tests == 'true' ||
+      fromJson(needs.precondition.outputs.required).tpcds-1g == 'true')
+    name: "Precompile Spark"
+    runs-on: ubuntu-latest
+    timeout-minutes: 60
+    # Optional optimization: if this job fails or is cancelled, the downstream
+    # jobs fall back to running the SBT build locally as before.
+    continue-on-error: true
+    env:
+      HADOOP_PROFILE: ${{ inputs.hadoop }}
+      HIVE_PROFILE: hive2.3
+      GITHUB_PREV_SHA: ${{ github.event.before }}
+    steps:
+    - name: Checkout Spark repository
+      uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
+        repository: apache/spark
+        ref: ${{ inputs.branch }}
+    - name: Sync the current branch with the latest in Apache Spark
+      if: github.repository != 'apache/spark'
+      run: |
+        echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
+        git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
+        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
+        git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
+    - name: Cache SBT and Maven
+      uses: actions/cache@v4
+      with:
+        path: |
+          build/apache-maven-*
+          build/*.jar
+          ~/.sbt
+        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
+        restore-keys: |
+          build-
+    - name: Cache Coursier local repository
+      uses: actions/cache@v4
+      with:
+        path: ~/.cache/coursier
+        key: ${{ runner.os }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
+        restore-keys: |
+          ${{ runner.os }}-coursier-
+    - name: Install Java ${{ inputs.java }}
+      uses: actions/setup-java@v4
+      with:
+        distribution: zulu
+        java-version: ${{ inputs.java }}
+    - name: Build Spark
+      run: |
+        ./build/sbt -Phadoop-3 -Pyarn -Pspark-ganglia-lgpl -Phadoop-cloud -Phive \
+          -Pkubernetes -Pjvm-profiler -Pkinesis-asl -Phive-thriftserver \
+          -Pdocker-integration-tests -Pkubernetes-integration-tests -Pvolcano \
+          Test/package streaming-kinesis-asl-assembly/assembly connect/assembly assembly/package
+    - name: Package compile output
+      shell: bash  # bash -eo pipefail: a failed find/tar must not yield a truncated artifact
+      run: |
+        find . -type d -name target -not -path './build/*' -not -path './.git/*' -print0 \
+          | tar --null -cf - -T - | zstd -c -T0 > compile-artifact.tar.zst
+        ls -lh compile-artifact.tar.zst
+    - name: Upload compile artifact
+      uses: actions/upload-artifact@v4
+      with:
+        name: spark-compile-${{ inputs.branch }}-${{ github.run_id }}
+        path: compile-artifact.tar.zst
+        retention-days: 1
+        if-no-files-found: error
+
   pyspark:
-    needs: [precondition, infra-image]
+    needs: [precondition, infra-image, precompile]
     # always run if pyspark == 'true', even infra-image is skip (such as non-master job)
     if: (!cancelled()) && (fromJson(needs.precondition.outputs.required).pyspark == 'true' || fromJson(needs.precondition.outputs.required).pyspark-pandas == 'true')
     name: "Build modules: ${{ matrix.modules }}"
@@ -584,13 +682,13 @@ jobs:
         key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
         restore-keys: |
           build-
-    - name: Cache Coursier local repository
-      uses: actions/cache@v4
+    - name: Restore Coursier local repository
+      uses: actions/cache/restore@v4
       with:
         path: ~/.cache/coursier
-        key: pyspark-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
+        key: ${{ runner.os }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
         restore-keys: |
-          pyspark-coursier-
+          ${{ runner.os }}-coursier-
     - name: Free up disk space
       shell: 'script -q -e -c "bash {0}"'
       run: ./dev/free_disk_space_container
@@ -615,11 +713,30 @@ jobs:
       uses: conda-incubator/setup-miniconda@v3
       with:
         miniforge-version: latest
+    - name: Download precompiled artifact
+      id: download-precompiled
+      if: needs.precompile.result == 'success'
+      continue-on-error: true
+      uses: actions/download-artifact@v4
+      with:
+        name: spark-compile-${{ inputs.branch }}-${{ github.run_id }}
+    - name: Extract precompiled artifact
+      id: extract-precompiled
+      if: steps.download-precompiled.outcome == 'success'
+      continue-on-error: true
+      shell: bash  # bash -eo pipefail: a failed zstd must fail the step, not be masked by tar
+      run: |
+        zstd -dc compile-artifact.tar.zst | tar -xf -
+        rm compile-artifact.tar.zst
     # Run the tests.
     - name: Run tests
       env: ${{ fromJSON(inputs.envs) }}
       shell: 'script -q -e -c "bash {0}"'
       run: |
+        if [ "${{ steps.extract-precompiled.outcome }}" = "success" ]; then
+          export SKIP_SCALA_BUILD=true
+          echo "Reusing precompiled artifact, skipping local SBT build."
+        fi
         if [[ "$MODULES_TO_TEST" == *"pyspark-errors"* ]]; then
           export PATH=$CONDA/bin:$PATH
           export SKIP_PACKAGING=false
@@ -657,7 +774,7 @@ jobs:
         path: "**/target/unit-tests.log"
 
   sparkr:
-    needs: [precondition, infra-image]
+    needs: [precondition, infra-image, precompile]
     # always run if sparkr == 'true', even infra-image is skip (such as non-master job)
     if: (!cancelled()) && fromJson(needs.precondition.outputs.required).sparkr == 'true'
     name: "Build modules: sparkr"
@@ -702,13 +819,13 @@ jobs:
         key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
         restore-keys: |
           build-
-    - name: Cache Coursier local repository
-      uses: actions/cache@v4
+    - name: Restore Coursier local repository
+      uses: actions/cache/restore@v4
       with:
         path: ~/.cache/coursier
-        key: sparkr-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
+        key: ${{ runner.os }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
         restore-keys: |
-          sparkr-coursier-
+          ${{ runner.os }}-coursier-
     - name: Free up disk space
       run: ./dev/free_disk_space_container
     - name: Install Java ${{ inputs.java }}
@@ -716,6 +833,21 @@ jobs:
       with:
         distribution: zulu
         java-version: ${{ inputs.java }}
+    - name: Download precompiled artifact
+      id: download-precompiled
+      if: needs.precompile.result == 'success'
+      continue-on-error: true
+      uses: actions/download-artifact@v4
+      with:
+        name: spark-compile-${{ inputs.branch }}-${{ github.run_id }}
+    - name: Extract precompiled artifact
+      id: extract-precompiled
+      if: steps.download-precompiled.outcome == 'success'
+      continue-on-error: true
+      shell: bash  # bash -eo pipefail: a failed zstd must fail the step, not be masked by tar
+      run: |
+        zstd -dc compile-artifact.tar.zst | tar -xf -
+        rm compile-artifact.tar.zst
     - name: Run tests
       env: ${{ fromJSON(inputs.envs) }}
       run: |
@@ -723,6 +855,10 @@ jobs:
         # R issues at docker environment
         export TZ=UTC
         export _R_CHECK_SYSTEM_CLOCK_=FALSE
+        if [ "${{ steps.extract-precompiled.outcome }}" = "success" ]; then
+          export SKIP_SCALA_BUILD=true
+          echo "Reusing precompiled artifact, skipping local SBT build."
+        fi
         ./dev/run-tests --parallelism 1 --modules sparkr
     - name: Upload test results to report
       if: always()
@@ -820,13 +956,13 @@ jobs:
         key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
         restore-keys: |
           build-
-    - name: Cache Coursier local repository
-      uses: actions/cache@v4
+    - name: Restore Coursier local repository
+      uses: actions/cache/restore@v4
       with:
         path: ~/.cache/coursier
-        key: docs-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
+        key: ${{ runner.os }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
         restore-keys: |
-          docs-coursier-
+          ${{ runner.os }}-coursier-
     - name: Cache Maven local repository
       uses: actions/cache@v4
       with:
@@ -1002,13 +1138,13 @@ jobs:
         key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
         restore-keys: |
           build-
-    - name: Cache Coursier local repository
-      uses: actions/cache@v4
+    - name: Restore Coursier local repository
+      uses: actions/cache/restore@v4
       with:
         path: ~/.cache/coursier
-        key: docs-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
+        key: ${{ runner.os }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
         restore-keys: |
-          docs-coursier-
+          ${{ runner.os }}-coursier-
     - name: Cache Maven local repository
       uses: actions/cache@v4
       with:
@@ -1111,8 +1247,8 @@ jobs:
 
   # Any TPC-DS related updates on this job need to be applied to tpcds-1g-gen job of benchmark.yml as well
   tpcds-1g:
-    needs: precondition
-    if: fromJson(needs.precondition.outputs.required).tpcds-1g == 'true'
+    needs: [precondition, precompile]
+    if: (!cancelled()) && fromJson(needs.precondition.outputs.required).tpcds-1g == 'true'
     name: Run TPC-DS queries with SF=1
     runs-on: ubuntu-latest
     timeout-minutes: 120
@@ -1141,18 +1277,33 @@ jobs:
         key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
         restore-keys: |
           build-
-    - name: Cache Coursier local repository
-      uses: actions/cache@v4
+    - name: Restore Coursier local repository
+      uses: actions/cache/restore@v4
       with:
         path: ~/.cache/coursier
-        key: tpcds-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
+        key: ${{ runner.os }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
         restore-keys: |
-          tpcds-coursier-
+          ${{ runner.os }}-coursier-
     - name: Install Java ${{ inputs.java }}
       uses: actions/setup-java@v4
       with:
         distribution: zulu
         java-version: ${{ inputs.java }}
+    - name: Download precompiled artifact
+      id: download-precompiled
+      if: needs.precompile.result == 'success'
+      continue-on-error: true
+      uses: actions/download-artifact@v4
+      with:
+        name: spark-compile-${{ inputs.branch }}-${{ github.run_id }}
+    - name: Extract precompiled artifact
+      id: extract-precompiled
+      if: steps.download-precompiled.outcome == 'success'
+      continue-on-error: true
+      shell: bash  # bash -eo pipefail: a failed zstd must fail the step, not be masked by tar
+      run: |
+        zstd -dc compile-artifact.tar.zst | tar -xf -
+        rm compile-artifact.tar.zst
     - name: Cache TPC-DS generated data
       id: cache-tpcds-sf-1
       uses: actions/cache@v4
@@ -1213,8 +1364,8 @@ jobs:
         path: "**/target/unit-tests.log"
 
   docker-integration-tests:
-    needs: precondition
-    if: fromJson(needs.precondition.outputs.required).docker-integration-tests == 'true'
+    needs: [precondition, precompile]
+    if: (!cancelled()) && fromJson(needs.precondition.outputs.required).docker-integration-tests == 'true'
     name: Run Docker integration tests
     runs-on: ubuntu-latest
     timeout-minutes: 120
@@ -1250,21 +1401,40 @@ jobs:
         key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
         restore-keys: |
           build-
-    - name: Cache Coursier local repository
-      uses: actions/cache@v4
+    - name: Restore Coursier local repository
+      uses: actions/cache/restore@v4
       with:
         path: ~/.cache/coursier
-        key: docker-integration-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
+        key: ${{ runner.os }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
         restore-keys: |
-          docker-integration-coursier-
+          ${{ runner.os }}-coursier-
     - name: Install Java ${{ inputs.java }}
       uses: actions/setup-java@v4
       with:
         distribution: zulu
         java-version: ${{ inputs.java }}
+    - name: Download precompiled artifact
+      id: download-precompiled
+      if: needs.precompile.result == 'success'
+      continue-on-error: true
+      uses: actions/download-artifact@v4
+      with:
+        name: spark-compile-${{ inputs.branch }}-${{ github.run_id }}
+    - name: Extract precompiled artifact
+      id: extract-precompiled
+      if: steps.download-precompiled.outcome == 'success'
+      continue-on-error: true
+      shell: bash  # bash -eo pipefail: a failed zstd must fail the step, not be masked by tar
+      run: |
+        zstd -dc compile-artifact.tar.zst | tar -xf -
+        rm compile-artifact.tar.zst
     - name: Run tests
       env: ${{ fromJSON(inputs.envs) }}
       run: |
+        if [ "${{ steps.extract-precompiled.outcome }}" = "success" ]; then
+          export SKIP_SCALA_BUILD=true
+          echo "Reusing precompiled artifact, skipping local SBT build."
+        fi
         ./dev/run-tests --parallelism 1 --modules docker-integration-tests --included-tags org.apache.spark.tags.DockerTest
     - name: Upload test results to report
       if: always()
@@ -1280,8 +1450,8 @@ jobs:
         path: "**/target/unit-tests.log"
 
   k8s-integration-tests:
-    needs: precondition
-    if: fromJson(needs.precondition.outputs.required).k8s-integration-tests == 'true'
+    needs: [precondition, precompile]
+    if: (!cancelled()) && fromJson(needs.precondition.outputs.required).k8s-integration-tests == 'true'
     name: Run Spark on Kubernetes Integration test
     runs-on: ubuntu-latest
     timeout-minutes: 120
@@ -1309,13 +1479,13 @@ jobs:
         key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
         restore-keys: |
           build-
-    - name: Cache Coursier local repository
-      uses: actions/cache@v4
+    - name: Restore Coursier local repository
+      uses: actions/cache/restore@v4
       with:
         path: ~/.cache/coursier
-        key: k8s-integration-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
+        key: ${{ runner.os }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
         restore-keys: |
-          k8s-integration-coursier-
+          ${{ runner.os }}-coursier-
     - name: Free up disk space
       run: |
         if [ -f ./dev/free_disk_space ]; then
@@ -1326,6 +1496,21 @@ jobs:
       with:
         distribution: zulu
         java-version: ${{ inputs.java }}
+    - name: Download precompiled artifact
+      id: download-precompiled
+      if: needs.precompile.result == 'success'
+      continue-on-error: true
+      uses: actions/download-artifact@v4
+      with:
+        name: spark-compile-${{ inputs.branch }}-${{ github.run_id }}
+    - name: Extract precompiled artifact
+      id: extract-precompiled
+      if: steps.download-precompiled.outcome == 'success'
+      continue-on-error: true
+      shell: bash  # bash -eo pipefail: a failed zstd must fail the step, not be masked by tar
+      run: |
+        zstd -dc compile-artifact.tar.zst | tar -xf -
+        rm compile-artifact.tar.zst
     - name: Install R
       run: |
         sudo apt update
diff --git a/.github/workflows/maven_test.yml b/.github/workflows/maven_test.yml
index 155c2bceb588..7288a17ec7c2 100644
--- a/.github/workflows/maven_test.yml
+++ b/.github/workflows/maven_test.yml
@@ -52,9 +52,97 @@ on:
         type: string
         default: '{}'
 jobs:
+  # Precompile Spark with Maven once and publish target/ + ~/.m2/.../spark as
+  # an artifact for the matrix entries below to consume. Optional: any failure
+  # here degrades the matrix to its original local `clean install` path.
+  precompile-maven:
+    name: "Precompile Spark with Maven"
+    runs-on: ${{ inputs.os }}
+    # If this job fails or is cancelled, the matrix entries fall back to
+    # running `mvn clean install` locally as before.
+    continue-on-error: true
+    env:
+      HADOOP_PROFILE: ${{ inputs.hadoop }}
+      HIVE_PROFILE: hive2.3
+      SPARK_LOCAL_IP: localhost
+      GITHUB_PREV_SHA: ${{ github.event.before }}
+    steps:
+      - name: Checkout Spark repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          repository: apache/spark
+          ref: ${{ inputs.branch }}
+      - name: Sync the current branch with the latest in Apache Spark
+        if: github.repository != 'apache/spark'
+        run: |
+          echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
+          git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
+          git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
+          git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
+      - name: Cache SBT and Maven
+        # TODO(SPARK-54466): https://github.com/actions/runner-images/issues/13341
+        if: ${{ runner.os != 'macOS' }}
+        uses: actions/cache@v4
+        with:
+          path: |
+            build/apache-maven-*
+            build/*.jar
+            ~/.sbt
+          key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
+          restore-keys: |
+            build-
+      - name: Cache Maven local repository
+        # TODO(SPARK-54466): https://github.com/actions/runner-images/issues/13341
+        if: ${{ runner.os != 'macOS' }}
+        uses: actions/cache@v4
+        with:
+          path: ~/.m2/repository
+          key: java${{ inputs.java }}-maven-${{ hashFiles('**/pom.xml') }}
+          restore-keys: |
+            java${{ inputs.java }}-maven-
+      - name: Install Java ${{ inputs.java }}
+        uses: actions/setup-java@v4
+        with:
+          distribution: zulu
+          java-version: ${{ inputs.java }}
+      - name: Build Spark with Maven
+        shell: |
+          bash -c "if script -qec true 2>/dev/null; then script -qec bash\ {0}; else script -qe /dev/null bash {0}; fi"
+        run: |
+          set -e
+          export MAVEN_OPTS="-Xss64m -Xmx4g -Xms4g -XX:ReservedCodeCacheSize=128m -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
+          export MAVEN_CLI_OPTS="--no-transfer-progress"
+          export JAVA_VERSION=${{ inputs.java }}
+          ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pjvm-profiler -Pspark-ganglia-lgpl -Pkinesis-asl -Djava.version=${JAVA_VERSION/-ea} clean install
+      - name: Package compile output
+        shell: bash  # bash -eo pipefail: a failed find/tar must not yield a truncated artifact
+        run: |
+          # Exclude assembly/ from the artifact: 11 of 12 matrix entries wipe it
+          # right after extraction (SPARK-51628 regression-test-for-SPARK-51600),
+          # and the connect entry rebuilds it via `mvn install -pl assembly`.
+          find . \( -path './build' -o -path './.git' -o -path './assembly' \) -prune \
+            -o -type d -name target -print0 \
+            | tar --null -cf - -T - | zstd -c -T0 > compile-target.tar.zst
+          if [ -d "$HOME/.m2/repository/org/apache/spark" ]; then
+            tar -C "$HOME/.m2/repository/org/apache" -cf - spark | zstd -c -T0 > compile-m2-spark.tar.zst
+          fi
+          ls -lh compile-*.tar.zst
+      - name: Upload compile artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: spark-maven-compile-${{ inputs.branch }}-java${{ inputs.java }}-${{ github.run_id }}
+          path: |
+            compile-target.tar.zst
+            compile-m2-spark.tar.zst
+          retention-days: 1
+          if-no-files-found: error
+
   # Build: build Spark and run the tests for specified modules using maven
   build:
     name: "Build modules using Maven: ${{ matrix.modules }} ${{ matrix.comment }}"
+    needs: precompile-maven
+    if: (!cancelled())
     runs-on: ${{ inputs.os }}
     timeout-minutes: 150
     strategy:
@@ -179,6 +267,26 @@ jobs:
         run: |
           python3.11 -m pip install 'numpy>=1.22' pyarrow 'pandas==2.3.3' pyyaml scipy unittest-xml-reporting 'grpcio==1.76.0' 'grpcio-status==1.76.0' 'protobuf==6.33.0' 'zstandard==0.25.0'
           python3.11 -m pip list
+      - name: Download precompiled artifact
+        id: download-precompiled
+        if: needs.precompile-maven.result == 'success'
+        continue-on-error: true
+        uses: actions/download-artifact@v4
+        with:
+          name: spark-maven-compile-${{ inputs.branch }}-java${{ matrix.java }}-${{ github.run_id }}
+      - name: Extract precompiled artifact
+        id: extract-precompiled
+        if: steps.download-precompiled.outcome == 'success'
+        continue-on-error: true
+        shell: bash  # bash -eo pipefail: a failed zstd must fail the step, not be masked by tar
+        run: |
+          zstd -dc compile-target.tar.zst | tar -xf -
+          rm compile-target.tar.zst
+          if [ -f compile-m2-spark.tar.zst ]; then
+            mkdir -p "$HOME/.m2/repository/org/apache"
+            zstd -dc compile-m2-spark.tar.zst | tar -C "$HOME/.m2/repository/org/apache" -xf -
+            rm compile-m2-spark.tar.zst
+          fi
       # Run the tests using script command.
       # BSD's script command doesn't support -c option, and the usage is different from Linux's one.
       # The kind of script command is tested by `script -qec true`.
@@ -198,13 +306,28 @@ jobs:
           export ENABLE_KINESIS_TESTS=0
           # Replace with the real module name, for example, connector#kafka-0-10 -> connector/kafka-0-10
           export TEST_MODULES=`echo "$MODULES_TO_TEST" | sed -e "s%#%/%g"`
-          ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pjvm-profiler -Pspark-ganglia-lgpl -Pkinesis-asl -Djava.version=${JAVA_VERSION/-ea} clean install
-
-          if [ "$MODULES_TO_TEST" != "connect" ]; then
-            echo "Clean up the assembly module before maven testing"
-            ./build/mvn $MAVEN_CLI_OPTS clean -pl assembly
+          if [ "${{ steps.extract-precompiled.outcome }}" = "success" ]; then
+            echo "Reusing precompiled artifact, skipping local Maven clean install."
+            # SPARK-51628 regression coverage is naturally preserved on the reuse
+            # path: the precompile artifact excludes assembly/, so non-connect
+            # tests already run with the assembly module's jars dir missing.
+            # Connect tests strongly depend on a built assembly module; rebuild
+            # it here.
+            if [ "$MODULES_TO_TEST" = "connect" ]; then
+              echo "Building assembly module for connect tests."
+              ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pjvm-profiler -Pspark-ganglia-lgpl -Pkinesis-asl -Djava.version=${JAVA_VERSION/-ea} -pl assembly install
+            fi
+          else
+            ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pjvm-profiler -Pspark-ganglia-lgpl -Pkinesis-asl -Djava.version=${JAVA_VERSION/-ea} clean install
+            # SPARK-51628: wipe the assembly module so tests exercise the
+            # SPARK-51600 prepend fallback path. Connect tests strongly depend
+            # on a built assembly module, so they are excluded.
+            if [ "$MODULES_TO_TEST" != "connect" ]; then
+              echo "Clean up the assembly module before maven testing"
+              ./build/mvn $MAVEN_CLI_OPTS clean -pl assembly
+            fi
           fi
-          
+
           if [[ "$INCLUDED_TAGS" != "" ]]; then
             ./build/mvn $MAVEN_CLI_OPTS -pl "$TEST_MODULES" -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pjvm-profiler -Pspark-ganglia-lgpl -Pkinesis-asl -Djava.version=${JAVA_VERSION/-ea} -Dtest.include.tags="$INCLUDED_TAGS" test -fae
           elif [[ "$MODULES_TO_TEST" == "connect" && "$INPUT_BRANCH" == "branch-4.0" ]]; then
diff --git a/.github/workflows/python_hosted_runner_test.yml b/.github/workflows/python_hosted_runner_test.yml
index 659171b901d3..364c2b29d3b4 100644
--- a/.github/workflows/python_hosted_runner_test.yml
+++ b/.github/workflows/python_hosted_runner_test.yml
@@ -56,8 +56,84 @@ on:
         type: string
         default: '{}'
 jobs:
+  # Precompile Spark with SBT once and publish target/ as an artifact for the
+  # matrix entries below to consume. Optional: any failure here degrades the
+  # matrix to its original local SBT build path.
+  precompile:
+    name: "Precompile Spark"
+    runs-on: ${{ inputs.os }}
+    # If this job fails or is cancelled, the matrix entries fall back to
+    # running the SBT build locally as before.
+    continue-on-error: true
+    env:
+      HADOOP_PROFILE: ${{ inputs.hadoop }}
+      HIVE_PROFILE: hive2.3
+      SPARK_LOCAL_IP: localhost
+      GITHUB_PREV_SHA: ${{ github.event.before }}
+    steps:
+      - name: Checkout Spark repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          repository: apache/spark
+          ref: ${{ inputs.branch }}
+      - name: Sync the current branch with the latest in Apache Spark
+        if: github.repository != 'apache/spark'
+        run: |
+          echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
+          git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
+          git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
+          git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
+      - name: Cache SBT and Maven
+        # TODO(SPARK-54466): https://github.com/actions/runner-images/issues/13341
+        if: ${{ runner.os != 'macOS' }}
+        uses: actions/cache@v4
+        with:
+          path: |
+            build/apache-maven-*
+            build/*.jar
+            ~/.sbt
+          key: build-${{ runner.os }}-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
+          restore-keys: |
+            build-${{ runner.os }}-
+      - name: Cache Coursier local repository
+        # TODO(SPARK-54466): https://github.com/actions/runner-images/issues/13341
+        if: ${{ runner.os != 'macOS' }}
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/coursier
+          key: ${{ runner.os }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
+          restore-keys: |
+            ${{ runner.os }}-coursier-
+      - name: Install Java ${{ inputs.java }}
+        uses: actions/setup-java@v4
+        with:
+          distribution: zulu
+          java-version: ${{ inputs.java }}
+      - name: Build Spark
+        run: |
+          ./build/sbt -Phadoop-3 -Pyarn -Pspark-ganglia-lgpl -Phadoop-cloud -Phive \
+            -Pkubernetes -Pjvm-profiler -Pkinesis-asl -Phive-thriftserver \
+            -Pdocker-integration-tests -Pvolcano \
+            Test/package streaming-kinesis-asl-assembly/assembly connect/assembly assembly/package
+      - name: Package compile output
+        shell: bash  # bash -eo pipefail: a failed find/tar must not yield a truncated artifact
+        run: |
+          find . -type d -name target -not -path './build/*' -not -path './.git/*' -print0 \
+            | tar --null -cf - -T - | zstd -c -T0 > compile-artifact.tar.zst
+          ls -lh compile-artifact.tar.zst
+      - name: Upload compile artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: spark-compile-${{ inputs.os }}-${{ inputs.branch }}-${{ github.run_id }}
+          path: compile-artifact.tar.zst
+          retention-days: 1
+          if-no-files-found: error
+
   build:
     name: "PySpark test on macos: ${{ matrix.modules }}"
+    needs: precompile
+    if: (!cancelled())
     runs-on: ${{ inputs.os }}
     timeout-minutes: 120
     strategy:
@@ -131,9 +207,9 @@ jobs:
         uses: actions/cache@v4
         with:
           path: ~/.cache/coursier
-          key: pyspark-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
+          key: ${{ runner.os }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
           restore-keys: |
-            pyspark-coursier-
+            ${{ runner.os }}-coursier-
       - name: Install Java ${{ matrix.java }}
         uses: actions/setup-java@v4
         with:
@@ -153,10 +229,29 @@ jobs:
           python${{matrix.python}} -m pip cache purge
       - name: List Python packages
         run: python${{matrix.python}} -m pip list
+      - name: Download precompiled artifact
+        id: download-precompiled
+        if: needs.precompile.result == 'success'
+        continue-on-error: true
+        uses: actions/download-artifact@v4
+        with:
+          name: spark-compile-${{ inputs.os }}-${{ inputs.branch }}-${{ github.run_id }}
+      - name: Extract precompiled artifact
+        id: extract-precompiled
+        if: steps.download-precompiled.outcome == 'success'
+        continue-on-error: true
+        shell: bash  # bash -eo pipefail: a failed zstd must fail the step, not be masked by tar
+        run: |
+          zstd -dc compile-artifact.tar.zst | tar -xf -
+          rm compile-artifact.tar.zst
       # Run the tests.
       - name: Run tests
         env: ${{ fromJSON(inputs.envs) }}
         run: |
+          if [ "${{ steps.extract-precompiled.outcome }}" = "success" ]; then
+            export SKIP_SCALA_BUILD=true
+            echo "Reusing precompiled artifact, skipping local SBT build."
+          fi
           if [[ "$MODULES_TO_TEST" == *"pyspark-errors"* ]]; then
             export SKIP_PACKAGING=false
             echo "Python Packaging Tests Enabled!"
diff --git a/dev/run-tests.py b/dev/run-tests.py
index 780b876b6123..3c375ea15214 100755
--- a/dev/run-tests.py
+++ b/dev/run-tests.py
@@ -618,7 +618,8 @@ def main():
         run_build_tests()
 
     # spark build
-    build_apache_spark(build_tool, extra_profiles)
+    if os.environ.get("SKIP_SCALA_BUILD", "false") != "true":
+        build_apache_spark(build_tool, extra_profiles)
 
     # backwards compatibility checks
     if build_tool == "sbt":
@@ -627,7 +628,8 @@ def main():
         detect_binary_inop_with_mima(extra_profiles)
         # Since we did not build assembly/package before running dev/mima, we need to
         # do it here because the tests still rely on it; see SPARK-13294 for details.
-        build_spark_assembly_sbt(extra_profiles, should_run_java_style_checks)
+        if os.environ.get("SKIP_SCALA_BUILD", "false") != "true":
+            build_spark_assembly_sbt(extra_profiles, should_run_java_style_checks)
 
     # run the test suites
     run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags, included_tags)